├── .gitignore ├── config ├── crd │ ├── metagpu.yaml │ └── bases │ │ └── mlops.cnvrg.io_metagpus.yaml └── config.yaml ├── chart ├── values.yaml ├── Chart.yaml └── templates │ ├── svc.yml │ ├── svcmon.yml │ ├── scc.yml │ ├── cm.yml │ ├── rbac.yml │ └── ds.yml ├── buf.work.yaml ├── cmd ├── mgctl │ ├── create.go │ ├── ping.go │ ├── config.go │ ├── utils.go │ ├── kill.go │ ├── main.go │ ├── enforce.go │ └── get.go ├── mgex │ ├── main.go │ ├── readme.md │ └── exporter.go └── mgdp │ └── main.go ├── hack ├── scripts │ ├── test2.py │ ├── test3.py │ └── test.py ├── dp.yaml ├── dep-gpu.yaml └── remote-dev.yaml ├── pkg ├── mgsrv │ ├── deviceapi │ │ ├── buf.yaml │ │ ├── buf.lock │ │ └── device │ │ │ └── v1 │ │ │ ├── utils.go │ │ │ ├── device.proto │ │ │ └── device.go │ ├── interceptor.go │ └── server.go ├── allocator │ ├── types.go │ ├── allocator.go │ └── allocator_test.go ├── podexec │ ├── types.go │ ├── copycache.go │ └── podexec.go ├── gpumgr │ ├── enforcer.go │ ├── device.go │ ├── enforcer_test.go │ ├── process.go │ ├── container.go │ └── mgr.go ├── plugin │ ├── types.go │ ├── nvidia.go │ └── server.go ├── ctlutils │ └── utils.go ├── nvmlutils │ └── utils.go └── sharecfg │ └── sharecfg.go ├── buf.gen.yaml ├── Dockerfile.dev ├── LICENSE ├── Dockerfile ├── Makefile ├── pkged.go ├── README.md ├── go.mod ├── deploy └── static.yaml ├── .github └── workflows │ └── docker-image.yml └── gen └── proto └── go └── device └── v1 └── device_grpc.pb.go /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | bin -------------------------------------------------------------------------------- /config/crd/metagpu.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chart/values.yaml: -------------------------------------------------------------------------------- 1 | tag: latest 2 | ocp: false 
-------------------------------------------------------------------------------- /buf.work.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | directories: 3 | - pkg/mgsrv/deviceapi -------------------------------------------------------------------------------- /cmd/mgctl/create.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // TODO: create logic for container and device visibility level tokens generations 4 | -------------------------------------------------------------------------------- /hack/scripts/test2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | tf.get_logger().setLevel('INFO') 3 | gpus = tf.config.list_physical_devices('GPU') 4 | print(gpus) -------------------------------------------------------------------------------- /pkg/mgsrv/deviceapi/buf.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | deps: 3 | - buf.build/googleapis/googleapis 4 | lint: 5 | use: 6 | - DEFAULT 7 | breaking: 8 | use: 9 | - FILE -------------------------------------------------------------------------------- /chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: metagpu-device-plugin 3 | description: Metagpu device plugin 4 | type: application 5 | version: 1.0.0 6 | appVersion: 1.0.0 7 | -------------------------------------------------------------------------------- /hack/scripts/test3.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | tf.get_logger().setLevel('INFO') 3 | print("===============================") 4 | print(tf.config.list_physical_devices('GPU')) 5 | print("===============================") -------------------------------------------------------------------------------- 
/pkg/mgsrv/deviceapi/buf.lock: -------------------------------------------------------------------------------- 1 | # Generated by buf. DO NOT EDIT. 2 | version: v1 3 | deps: 4 | - remote: buf.build 5 | owner: googleapis 6 | repository: googleapis 7 | commit: f78a83d0b4bd469fa605445ada4c6249 8 | -------------------------------------------------------------------------------- /pkg/allocator/types.go: -------------------------------------------------------------------------------- 1 | package allocator 2 | 3 | type DeviceLoad struct { 4 | Metagpus []string 5 | } 6 | 7 | type DeviceAllocation struct { 8 | LoadMap []*DeviceLoad 9 | AvailableDevIds []string 10 | AllocationSize int 11 | TotalSharesPerGpu int 12 | MetagpusAllocations []string 13 | } 14 | -------------------------------------------------------------------------------- /pkg/podexec/types.go: -------------------------------------------------------------------------------- 1 | package podexec 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "sync" 7 | ) 8 | 9 | type mgctlCopyCache struct { 10 | mu sync.Mutex 11 | cache map[string]bool 12 | } 13 | 14 | type podExec struct { 15 | podName string 16 | podNs string 17 | containerName string 18 | cmd []string 19 | stdin io.Reader 20 | stdout *bytes.Buffer 21 | } 22 | -------------------------------------------------------------------------------- /chart/templates/svc.yml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: Service 3 | apiVersion: v1 4 | metadata: 5 | name: metagpu-device-plugin 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | app: "metagpu-exporter" 9 | spec: 10 | selector: 11 | name: metagpu-device-plugin 12 | ports: 13 | - protocol: TCP 14 | port: 50052 15 | name: grcp 16 | - protocol: TCP 17 | port: 2112 18 | name: metrics -------------------------------------------------------------------------------- /buf.gen.yaml: -------------------------------------------------------------------------------- 1 
| version: v1 2 | managed: 3 | enabled: true 4 | optimize_for: CODE_SIZE 5 | go_package_prefix: 6 | default: github.com/AccessibleAI/metagpu-device-plugin/gen/proto/go 7 | except: 8 | - buf.build/googleapis/googleapis 9 | plugins: 10 | - name: go 11 | opt: paths=source_relative 12 | out: gen/proto/go 13 | - name: go-grpc 14 | opt: paths=source_relative 15 | out: gen/proto/go -------------------------------------------------------------------------------- /pkg/podexec/copycache.go: -------------------------------------------------------------------------------- 1 | package podexec 2 | 3 | func NewMgctlCopyCache() *mgctlCopyCache { 4 | return &mgctlCopyCache{cache: make(map[string]bool)} 5 | } 6 | 7 | func (c *mgctlCopyCache) setCache(podId string) { 8 | c.mu.Lock() 9 | defer c.mu.Unlock() 10 | c.cache[podId] = true 11 | } 12 | 13 | func (c *mgctlCopyCache) isCached(podId string) bool { 14 | c.mu.Lock() 15 | defer c.mu.Unlock() 16 | _, cached := c.cache[podId] 17 | return cached 18 | } 19 | -------------------------------------------------------------------------------- /chart/templates/svcmon.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: metagpu-exporter 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | app: "metagpu-exporter" 8 | cnvrg-infra-prometheus: cnvrg-infra-cnvrg 9 | spec: 10 | selector: 11 | matchLabels: 12 | app: "metagpu-exporter" 13 | namespaceSelector: 14 | matchNames: 15 | - {{ .Release.Namespace }} 16 | endpoints: 17 | - port: "metrics" 18 | path: "/metrics" 19 | interval: "15s" -------------------------------------------------------------------------------- /cmd/mgctl/ping.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/ctlutils" 5 | log "github.com/sirupsen/logrus" 6 | 
"github.com/spf13/cobra" 7 | "github.com/spf13/viper" 8 | ) 9 | 10 | var pingCmd = &cobra.Command{ 11 | Use: "ping", 12 | Short: "ping server to check connectivity", 13 | Run: func(cmd *cobra.Command, args []string) { 14 | conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 15 | if conn == nil { 16 | log.Fatalf("can't initiate connection to metagpu server") 17 | } 18 | defer conn.Close() 19 | }, 20 | } 21 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | accelerator: nvidia 2 | processesDiscoveryPeriod: 5 3 | deviceCacheTTL: 3600 4 | jwtSecret: topSecret 5 | mgctlTar: /tmp/mgctl 6 | mgctlAutoInject: true 7 | serverAddr: 0.0.0.0:50052 8 | memoryEnforcer: true 9 | deviceToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 10 | containerToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMSJ9.o5v6Zdi1FKXQevRjuSbABBX1vIRYgN3Daz9iXabuFFA 11 | nodename: "" 12 | deviceSharing: 13 | - resourceName: cnvrg.io/metagpu 14 | autoReshare: true 15 | metaGpus: 2 16 | uuid: [ "*" ] 17 | -------------------------------------------------------------------------------- /Dockerfile.dev: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.6.0-base-ubuntu20.04 2 | 3 | ENV NVIDIA_DISABLE_REQUIRE="true" 4 | ENV NVIDIA_VISIBLE_DEVICES=all 5 | ENV NVIDIA_DRIVER_CAPABILITIES=utility 6 | 7 | ENV PATH=${PATH}:/usr/local/go/bin:/opt/workdir/.go/bin 8 | ENV GOPATH=/opt/workdir/.go 9 | ENV GOCACHE=/opt/workdir/.go/.cache 10 | 11 | WORKDIR /opt/workdir 12 | RUN apt update -y && apt install curl wget vim git gcc make -y 13 | RUN wget https://go.dev/dl/go1.17.11.linux-amd64.tar.gz 14 | RUN rm -rf /usr/local/go \ 15 | && tar -C /usr/local 
-xzf go1.17.11.linux-amd64.tar.gz \ 16 | && mkdir -p /opt/workdir/.go/github.com 17 | RUN go install github.com/go-delve/delve/cmd/dlv@latest 18 | CMD ["/bin/bash", "-c", "sleep inf"] 19 | -------------------------------------------------------------------------------- /hack/scripts/test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | tf.get_logger().setLevel('INFO') 3 | gpus = tf.config.list_physical_devices('GPU') 4 | if gpus: 5 | # Restrict TensorFlow to only allocate 1GB of memory on the first GPU 6 | try: 7 | tf.config.set_logical_device_configuration(gpus[0],[tf.config.LogicalDeviceConfiguration(memory_limit=1024)]) 8 | logical_gpus = tf.config.list_logical_devices('GPU') 9 | print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") 10 | except RuntimeError as e: 11 | # Virtual devices must be set before GPUs have been initialized 12 | print(e) 13 | print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))) 14 | while True: 15 | print(tf.reduce_sum(tf.random.normal([1000, 1000]))) -------------------------------------------------------------------------------- /chart/templates/scc.yml: -------------------------------------------------------------------------------- 1 | {{- if eq .Values.ocp true }} 2 | kind: SecurityContextConstraints 3 | apiVersion: security.openshift.io/v1 4 | metadata: 5 | annotations: 6 | mlops.cnvrg.io/default-loader: "false" 7 | mlops.cnvrg.io/own: "false" 8 | mlops.cnvrg.io/updatable: "false" 9 | name: metagpu-device-plugin 10 | allowHostDirVolumePlugin: true 11 | allowHostIPC: true 12 | allowHostNetwork: true 13 | allowHostPID: true 14 | allowHostPorts: true 15 | allowPrivilegeEscalation: true 16 | allowPrivilegedContainer: true 17 | readOnlyRootFilesystem: false 18 | requiredDropCapabilities: null 19 | allowedCapabilities: 20 | - '*' 21 | allowedUnsafeSysctls: 22 | - '*' 23 | fsGroup: 24 | type: RunAsAny 25 | runAsUser: 26 | type: 
RunAsAny 27 | seLinuxContext: 28 | type: RunAsAny 29 | seccompProfiles: 30 | - '*' 31 | supplementalGroups: 32 | type: RunAsAny 33 | users: 34 | - system:serviceaccount:{{ .Release.Namespace }}:metagpu-device-plugin 35 | volumes: 36 | - '*' 37 | {{- end }} -------------------------------------------------------------------------------- /chart/templates/cm.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: metagpu-device-plugin-config 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | config.yaml: | 8 | accelerator: nvidia 9 | processesDiscoveryPeriod: 5 10 | deviceCacheTTL: 3600 11 | jwtSecret: topSecret 12 | mgctlTar: /tmp/mgctl 13 | mgctlAutoInject: true 14 | serverAddr: 0.0.0.0:50052 15 | memoryEnforcer: true 16 | deviceToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 17 | containerToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMSJ9.o5v6Zdi1FKXQevRjuSbABBX1vIRYgN3Daz9iXabuFFA 18 | deviceSharing: 19 | - resourceName: cnvrg.io/metagpu 20 | autoReshare: true 21 | metaGpus: 2 22 | uuid: [ "*" ] 23 | --- 24 | apiVersion: v1 25 | kind: ConfigMap 26 | metadata: 27 | name: metagpu-presence 28 | namespace: {{ .Release.Namespace }} 29 | data: 30 | enabled: "true" 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 cnvrg.io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pkg/gpumgr/enforcer.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | log "github.com/sirupsen/logrus" 5 | "time" 6 | ) 7 | 8 | func (m *GpuMgr) StartMemoryEnforcer() { 9 | log.Info("starting gpu memory enforcer") 10 | go func() { 11 | for { 12 | for _, p := range m.enforce() { 13 | p.Kill() 14 | } 15 | time.Sleep(5 * time.Second) 16 | } 17 | }() 18 | } 19 | 20 | func (m *GpuMgr) enforce() (gpuProcForKill []*GpuProcess) { 21 | for _, c := range m.gpuContainers { 22 | for _, p := range c.Processes { 23 | if d := m.getGpuDeviceByUuid(p.DeviceUuid); d != nil { 24 | maxAllowedMem := d.Memory.ShareSize * uint64(c.PodMetagpuRequest) 25 | if p.GpuMemory > maxAllowedMem && p.Pid != 0 && maxAllowedMem > 0 { 26 | log.Infof("out of memory: %dMB/%dMB, pod: %s going to be terminated", p.GpuMemory, maxAllowedMem, c.PodId) 27 | gpuProcForKill = append(gpuProcForKill, p) 28 | } 29 | } 30 | } 31 | } 32 | return 33 | } 34 | 35 | func (m *GpuMgr) getGpuDeviceByUuid(uuid string) *GpuDevice { 36 | for _, d := range m.GpuDevices { 37 | if d.UUID == uuid { 38 | 
return d 39 | } 40 | } 41 | return nil 42 | } 43 | -------------------------------------------------------------------------------- /pkg/plugin/types.go: -------------------------------------------------------------------------------- 1 | package plugin 2 | 3 | import ( 4 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/sharecfg" 5 | "google.golang.org/grpc" 6 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 7 | "time" 8 | ) 9 | 10 | type DeviceManager interface { 11 | GetPluginDevices() []*pluginapi.Device 12 | GetDeviceSharingConfig() *sharecfg.DeviceSharingConfig 13 | GetUnixSocket() string 14 | ParseRealDeviceId(metaDevicesIds []string) (realDeviceId []string) 15 | MetagpuAllocation(allocationSize int, availableDevIds []string) ([]string, error) 16 | } 17 | 18 | type DeviceUuid string 19 | 20 | type MetaGpuDevicePlugin struct { 21 | DeviceManager 22 | server *grpc.Server 23 | socket string 24 | stop chan interface{} 25 | MetaGpuRecalculation chan bool 26 | } 27 | 28 | type NvidiaDeviceManager struct { 29 | Devices []*MetaDevice 30 | cacheTTL time.Duration 31 | processesDiscoveryPeriod time.Duration 32 | shareCfg *sharecfg.DeviceSharingConfig 33 | } 34 | 35 | type MetaDevice struct { 36 | UUID string 37 | Index int 38 | } 39 | -------------------------------------------------------------------------------- /chart/templates/rbac.yml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metagpu-device-plugin 5 | namespace: {{ .Release.Namespace }} 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - pods 11 | verbs: 12 | - list 13 | - get 14 | - create 15 | - apiGroups: 16 | - "" 17 | resources: 18 | - pods/exec 19 | verbs: 20 | - create 21 | - apiGroups: 22 | - "" 23 | resources: 24 | - configmaps 25 | resourceNames: 26 | - metagpu-device-plugin-config 27 | verbs: 28 | - get 29 | - update 30 | --- 31 | 
apiVersion: v1 32 | kind: ServiceAccount 33 | metadata: 34 | name: metagpu-device-plugin 35 | namespace: {{ .Release.Namespace }} 36 | --- 37 | apiVersion: rbac.authorization.k8s.io/v1 38 | kind: ClusterRoleBinding 39 | metadata: 40 | name: metagpu-device-plugin 41 | namespace: {{ .Release.Namespace }} 42 | roleRef: 43 | apiGroup: rbac.authorization.k8s.io 44 | kind: ClusterRole 45 | name: metagpu-device-plugin 46 | subjects: 47 | - kind: ServiceAccount 48 | name: metagpu-device-plugin 49 | namespace: {{ .Release.Namespace }} -------------------------------------------------------------------------------- /hack/dp.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: nvidia-device-plugin-daemonset 5 | spec: 6 | selector: 7 | matchLabels: 8 | name: nvidia-device-plugin-ds 9 | template: 10 | metadata: 11 | labels: 12 | name: nvidia-device-plugin-ds 13 | spec: 14 | nodeSelector: 15 | accelerator: nvidia 16 | tolerations: 17 | - operator: Exists 18 | priorityClassName: "system-node-critical" 19 | containers: 20 | - image: nvcr.io/nvidia/k8s-device-plugin:v0.9.0 21 | name: nvidia-device-plugin-ctr 22 | args: ["--fail-on-init-error=true"] 23 | resources: 24 | requests: 25 | cpu: 100m 26 | memory: 100Mi 27 | limits: 28 | cpu: 500m 29 | memory: 500Mi 30 | securityContext: 31 | allowPrivilegeEscalation: false 32 | capabilities: 33 | drop: ["ALL"] 34 | volumeMounts: 35 | - name: device-plugin 36 | mountPath: /var/lib/kubelet/device-plugins 37 | volumes: 38 | - name: device-plugin 39 | hostPath: 40 | path: /var/lib/kubelet/device-plugins -------------------------------------------------------------------------------- /pkg/ctlutils/utils.go: -------------------------------------------------------------------------------- 1 | package ctlutils 2 | 3 | import ( 4 | "context" 5 | "net" 6 | "time" 7 | 8 | log "github.com/sirupsen/logrus" 9 | "google.golang.org/grpc" 10 | 
"google.golang.org/grpc/metadata" 11 | "os" 12 | ) 13 | 14 | func GetGrpcMetaGpuSrvClientConn(address string) *grpc.ClientConn { 15 | log.Debugf("initiating gRPC connection to %s", address) 16 | 17 | c, err := dial(address, 3*time.Second) 18 | if err != nil { 19 | log.Errorf("failed to connect to server 🙀, err: %s", err) 20 | os.Exit(1) 21 | } 22 | log.Debugf("connected to %s", address) 23 | return c 24 | } 25 | 26 | func AuthenticatedContext(token string) context.Context { 27 | ctx := context.Background() 28 | md := metadata.Pairs("Authorization", token) 29 | return metadata.NewOutgoingContext(ctx, md) 30 | } 31 | 32 | func dial(socket string, timeout time.Duration) (*grpc.ClientConn, error) { 33 | opts := []grpc.DialOption{ 34 | grpc.WithInsecure(), 35 | grpc.WithBlock(), 36 | grpc.WithContextDialer(func(ctx context.Context, s string) (net.Conn, error) { 37 | c, e := net.DialTimeout("tcp", socket, timeout) 38 | if e != nil { 39 | log.Fatalf("error connecting to the server, e: %s", e) 40 | } 41 | return c, e 42 | }), 43 | } 44 | c, err := grpc.Dial(socket, opts...) 
45 | if err != nil { 46 | return nil, err 47 | } 48 | 49 | return c, nil 50 | } 51 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.17.3 as builder 2 | ARG BUILD_SHA 3 | ARG BUILD_VERSION 4 | WORKDIR /root/.go/src/metagpu 5 | COPY go.mod go.mod 6 | COPY go.sum go.sum 7 | RUN go mod download 8 | COPY cmd cmd 9 | COPY pkg pkg 10 | COPY gen gen 11 | RUN go mod tidy 12 | RUN go build \ 13 | -ldflags="-extldflags=-Wl,-z,lazy -s -w -X 'main.Build=${BUILD_SHA}' -X 'main.Version=${BUILD_VERSION}'" \ 14 | -o mgdp cmd/mgdp/main.go 15 | RUN go build \ 16 | -ldflags="-X 'main.Build=${BUILD_SHA}' -X 'main.Version=${BUILD_VERSION}'" \ 17 | -o mgctl cmd/mgctl/*.go 18 | RUN go build \ 19 | -ldflags="-X 'main.Build=${BUILD_SHA}' -X 'main.Version=${BUILD_VERSION}'" \ 20 | -o mgex cmd/mgex/*.go 21 | 22 | FROM nvidia/cuda:11.6.0-base-ubuntu20.04 23 | 24 | ENV NVIDIA_DISABLE_REQUIRE="true" 25 | ENV NVIDIA_VISIBLE_DEVICES=all 26 | ENV NVIDIA_DRIVER_CAPABILITIES=utility 27 | 28 | LABEL io.k8s.display-name="cnvrg.io Meta GPU Device Plugin" 29 | LABEL name="cnvrg.io MetaGPU Device Plugin" 30 | LABEL vendor="cnvrg.io" 31 | LABEL version="N/A" 32 | LABEL release="N/A" 33 | LABEL summary="cnvrg.io MetaGPU device plugin for Kubernetes" 34 | LABEL description="See summary" 35 | COPY --from=builder /root/.go/src/metagpu/mgdp /usr/bin/mgdp 36 | COPY --from=builder /root/.go/src/metagpu/mgctl /usr/bin/mgctl 37 | COPY --from=builder /root/.go/src/metagpu/mgex /usr/bin/mgex 38 | RUN cp /usr/bin/mgctl /tmp -------------------------------------------------------------------------------- /cmd/mgctl/config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | pbdevice "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 5 | 
"github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/ctlutils" 6 | log "github.com/sirupsen/logrus" 7 | "github.com/spf13/cobra" 8 | "github.com/spf13/viper" 9 | ) 10 | 11 | var ( 12 | configCmdParams = []param{ 13 | {name: "metagpu", shorthand: "m", value: 0, usage: "set metagpus quantity (gpu shares)"}, 14 | {name: "auto", shorthand: "a", value: false, usage: "automatically configure GPU shares"}, 15 | } 16 | ) 17 | 18 | var configCmd = &cobra.Command{ 19 | Use: "config", 20 | Short: "change configs on running metagpu device plugin instance", 21 | Run: func(cmd *cobra.Command, args []string) { 22 | patchConfigs() 23 | }, 24 | } 25 | 26 | func patchConfigs() { 27 | if viper.GetInt32("metagpu") != 0 { 28 | metaGpus := viper.GetInt32("metagpu") 29 | log.Info(metaGpus) 30 | conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 31 | if conn == nil { 32 | log.Fatalf("can't initiate connection to metagpu server") 33 | } 34 | defer conn.Close() 35 | device := pbdevice.NewDeviceServiceClient(conn) 36 | 37 | request := &pbdevice.PatchConfigsRequest{MetaGpus: metaGpus} 38 | if _, err := device.PatchConfigs(ctlutils.AuthenticatedContext(viper.GetString("token")), request); err != nil { 39 | log.Error(err) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /hack/dep-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: gpu-test-with-gpu 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: gpu-test-with-gpu 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: gpu-test-with-gpu 14 | spec: 15 | hostPID: true 16 | tolerations: 17 | - operator: "Exists" 18 | containers: 19 | - name: gpu-test-with-gpu 20 | image: tensorflow/tensorflow:latest-gpu 21 | command: 22 | - /usr/local/bin/python 23 | - -c 24 | - | 25 | import tensorflow as tf 26 | tf.get_logger().setLevel('INFO') 27 
| gpus = tf.config.list_physical_devices('GPU') 28 | if gpus: 29 | # Restrict TensorFlow to only allocate 1GB of memory on the first GPU 30 | try: 31 | tf.config.set_logical_device_configuration(gpus[0],[tf.config.LogicalDeviceConfiguration(memory_limit=1024)]) 32 | logical_gpus = tf.config.list_logical_devices('GPU') 33 | print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") 34 | except RuntimeError as e: 35 | # Virtual devices must be set before GPUs have been initialized 36 | print(e) 37 | print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))) 38 | while True: 39 | print(tf.reduce_sum(tf.random.normal([1000, 1000]))) 40 | resources: 41 | limits: 42 | nvidia.com/gpu: "1" -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #rsync -r /Users/dima/.go/src/github.com/AccessibleAI/metagpu-device-plugin/docs/* rancher@212.199.86.38:/tmp/docs 2 | 3 | build: 4 | go build -ldflags="-X 'main.Build=$$(git rev-parse --short HEAD)' -X 'main.Version=1.0.0'" -v -o bin/mgdp cmd/mgdp/main.go 5 | 6 | build-exporter: 7 | go build -ldflags="-X 'main.Build=$$(git rev-parse --short HEAD)' -X 'main.Version=1.0.0'" -v -o bin/mgex cmd/mgex/*.go 8 | 9 | remote-sync: 10 | kubectl cp ./ $(shell kubectl get pods -lapp=dev-metagpu -A -ojson | jq -r '.items[] | .metadata.namespace + "/" + .metadata.name'):/opt/workdir/.go/github.com/metagpu 11 | 12 | remote-debug: 13 | dlv debug --headless --listen=:2345 --api-version=2 --accept-multiclient ./cmd/mgdp/main.go -- start 14 | 15 | docker-dev-build: 16 | docker buildx build --platform linux/amd64 --push -t cnvrg/golang-cuda11-6-dvl:latest -f Dockerfile.dev . 
17 | 18 | docker-build: build-proto 19 | docker build \ 20 | --platform linux/x86_64 \ 21 | --build-arg BUILD_SHA=$(shell git rev-parse --short HEAD) \ 22 | --build-arg BUILD_VERSION=1.0.0 \ 23 | -t docker.io/cnvrg/metagpu-device-plugin:$(shell git rev-parse --abbrev-ref HEAD) . 24 | 25 | build-mgctl: 26 | go build -ldflags="-X 'main.Build=$$(git rev-parse --short HEAD)' -X 'main.Version=1.0.0'" -v -o bin/mgctl cmd/mgctl/*.go 27 | 28 | docker-push: 29 | docker push docker.io/cnvrg/metagpu-device-plugin:$(shell git rev-parse --abbrev-ref HEAD) 30 | 31 | build-proto: 32 | buf mod update pkg/mgsrv/deviceapi 33 | buf lint 34 | buf build 35 | buf generate 36 | 37 | generate-manifests: 38 | helm template chart/ -n cnvrg --set tag=$(shell git rev-parse --abbrev-ref HEAD) > deploy/static.yaml 39 | 40 | .PHONY: deploy 41 | deploy: 42 | helm template chart/ --set tag=$(shell git rev-parse --abbrev-ref HEAD) | kubectl apply -f - 43 | 44 | test: 45 | go test ./pkg/... -v 46 | 47 | test-allocator: 48 | go test ./pkg/allocator/... -v 49 | 50 | test-gpumgr: 51 | go test ./pkg/gpumgr/... -v -------------------------------------------------------------------------------- /config/crd/bases/mlops.cnvrg.io_metagpus.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | annotations: 6 | controller-gen.kubebuilder.io/version: (devel) 7 | creationTimestamp: null 8 | name: metagpus.mlops.cnvrg.io 9 | spec: 10 | group: mlops.cnvrg.io 11 | names: 12 | kind: MetaGpu 13 | listKind: MetaGpuList 14 | plural: metagpus 15 | singular: metagpu 16 | scope: Namespaced 17 | versions: 18 | - additionalPrinterColumns: 19 | - jsonPath: .spec.foo 20 | name: Foo 21 | type: string 22 | name: v1 23 | schema: 24 | openAPIV3Schema: 25 | properties: 26 | apiVersion: 27 | description: 'APIVersion defines the versioned schema of this representation 28 | of an object. 
Servers should convert recognized schemas to the latest 29 | internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' 30 | type: string 31 | kind: 32 | description: 'Kind is a string value representing the REST resource this 33 | object represents. Servers may infer this from the endpoint the client 34 | submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' 35 | type: string 36 | metadata: 37 | type: object 38 | spec: 39 | properties: 40 | foo: 41 | type: string 42 | type: object 43 | status: 44 | properties: 45 | message: 46 | type: string 47 | type: object 48 | type: object 49 | served: true 50 | storage: true 51 | subresources: 52 | status: {} 53 | status: 54 | acceptedNames: 55 | kind: "" 56 | plural: "" 57 | conditions: [] 58 | storedVersions: [] 59 | -------------------------------------------------------------------------------- /pkg/mgsrv/interceptor.go: -------------------------------------------------------------------------------- 1 | package mgsrv 2 | 3 | import ( 4 | "context" 5 | log "github.com/sirupsen/logrus" 6 | "github.com/spf13/viper" 7 | "google.golang.org/grpc" 8 | "time" 9 | ) 10 | 11 | type MetaGpuServerStream struct { 12 | grpc.ServerStream 13 | ctx context.Context 14 | } 15 | 16 | func (s *MetaGpuServerStream) Context() context.Context { 17 | return s.ctx 18 | } 19 | 20 | func (s *MetaGpuServer) streamServerInterceptor() grpc.StreamServerInterceptor { 21 | return func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { 22 | wrapper := &MetaGpuServerStream{ServerStream: ss} 23 | if !s.IsMethodPublic(info.FullMethod) { 24 | visibility, err := authorize(ss.Context()) 25 | if err != nil { 26 | return err 27 | } 28 | wrapper.ctx = context.WithValue(ss.Context(), TokenVisibilityClaimName, 
visibility) 29 | wrapper.ctx = context.WithValue(wrapper.ctx, "containerVl", string(ContainerVisibility)) 30 | wrapper.ctx = context.WithValue(wrapper.ctx, "deviceVl", string(DeviceVisibility)) 31 | wrapper.ctx = context.WithValue(wrapper.ctx, "gpuMgr", s.gpuMgr) 32 | 33 | } 34 | return handler(srv, wrapper) 35 | } 36 | } 37 | 38 | func (s *MetaGpuServer) unaryServerInterceptor() grpc.UnaryServerInterceptor { 39 | return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { 40 | start := time.Now() 41 | 42 | if !s.IsMethodPublic(info.FullMethod) { 43 | visibility, err := authorize(ctx) 44 | if err != nil { 45 | return nil, err 46 | } 47 | ctx = context.WithValue(ctx, TokenVisibilityClaimName, visibility) 48 | ctx = context.WithValue(ctx, "containerVl", string(ContainerVisibility)) 49 | ctx = context.WithValue(ctx, "deviceVl", string(DeviceVisibility)) 50 | } 51 | ctx = context.WithValue(ctx, "gpuMgr", s.gpuMgr) 52 | h, err := handler(ctx, req) 53 | if viper.GetBool("verbose") { 54 | log.Infof("[method: %s duration: %s]", info.FullMethod, time.Since(start)) 55 | } 56 | return h, err 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /pkg/gpumgr/device.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/sharecfg" 5 | "github.com/NVIDIA/go-nvml/pkg/nvml" 6 | "os" 7 | 8 | //"github.com/NVIDIA/go-nvml/pkg/nvml" 9 | log "github.com/sirupsen/logrus" 10 | ) 11 | 12 | type DeviceMemory struct { 13 | Total uint64 14 | Free uint64 15 | Used uint64 16 | ShareSize uint64 17 | } 18 | 19 | type DeviceUtilization struct { 20 | Gpu uint32 21 | Memory uint32 22 | } 23 | 24 | type GpuDevice struct { 25 | UUID string 26 | Index int 27 | Shares int 28 | ResourceName string 29 | Utilization *DeviceUtilization 30 | Memory 
*DeviceMemory 31 | Nodename string 32 | } 33 | 34 | func NewGpuDevice(uuid string, index int, utilization nvml.Utilization, memory nvml.Memory) *GpuDevice { 35 | d := &GpuDevice{ 36 | UUID: uuid, 37 | Index: index, 38 | Utilization: &DeviceUtilization{Gpu: utilization.Gpu, Memory: utilization.Memory / uint32(MB)}, 39 | } 40 | 41 | // set gpu share configs 42 | d.setGpuShareConfigs() 43 | // set nodename 44 | d.setNodename() 45 | // set gpu memory usage 46 | d.setGpuMemoryUsage(memory) 47 | return d 48 | } 49 | 50 | func (d *GpuDevice) setNodename() { 51 | hostname, err := os.Hostname() 52 | if err != nil { 53 | log.Errorf("failed to detect hostname, err: %s", err) 54 | } 55 | d.Nodename = hostname 56 | } 57 | 58 | func (d *GpuDevice) setGpuShareConfigs() { 59 | deviceSharingConfigs := sharecfg.NewDeviceSharingConfig() 60 | if deviceSharing, err := deviceSharingConfigs.GetDeviceSharingConfigs(d.UUID); err != nil { 61 | log.Fatalf("bad configs, unable to find sharing configs for device: %s", d.UUID) 62 | } else { 63 | d.Shares = deviceSharing.MetagpusPerGpu 64 | d.ResourceName = deviceSharing.ResourceName 65 | } 66 | } 67 | 68 | func (d *GpuDevice) setGpuMemoryUsage(memory nvml.Memory) { 69 | d.Memory = &DeviceMemory{ 70 | Total: memory.Total / MB, 71 | Free: memory.Free / MB, 72 | Used: memory.Used / MB, 73 | ShareSize: memory.Total / uint64(d.Shares) / MB, 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /cmd/mgctl/utils.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | pbdevice "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 6 | "github.com/atomicgo/cursor" 7 | "github.com/jedib0t/go-pretty/v6/table" 8 | "strings" 9 | ) 10 | 11 | type TableOutput struct { 12 | data []byte 13 | header table.Row 14 | footer table.Row 15 | body []table.Row 16 | lastPosition int 17 | } 18 | 19 | func (o 
*TableOutput) rowsCount() int { 20 | return 2 + len(o.body) 21 | } 22 | 23 | func (o *TableOutput) Write(data []byte) (n int, err error) { 24 | o.data = append(o.data, data...) 25 | return len(data), nil 26 | } 27 | 28 | func (o *TableOutput) print() { 29 | if o.lastPosition > 0 { 30 | cursor.ClearLinesUp(o.lastPosition) 31 | } 32 | fmt.Printf("%s", o.data) 33 | o.lastPosition = o.rowsCount() 34 | } 35 | 36 | func (o *TableOutput) buildTable() { 37 | o.data = nil 38 | rowConfigAutoMerge := table.RowConfig{AutoMerge: true} 39 | t := table.NewWriter() 40 | t.SetOutputMirror(o) 41 | t.AppendHeader(o.header, rowConfigAutoMerge) 42 | t.AppendRows(o.body) 43 | t.SetStyle(table.StyleColoredGreenWhiteOnBlack) 44 | t.AppendFooter(o.footer) 45 | t.Render() 46 | } 47 | 48 | func getTotalRequests(containers []*pbdevice.GpuContainer) (totalRequest int) { 49 | for _, c := range containers { 50 | totalRequest += int(c.MetagpuRequests) 51 | } 52 | return 53 | } 54 | 55 | func getTotalShares(devices map[string]*pbdevice.Device) (totalShares int) { 56 | for _, d := range devices { 57 | totalShares += int(d.Shares) 58 | } 59 | return 60 | } 61 | 62 | func getTotalMemoryUsedByProcesses(containers []*pbdevice.GpuContainer) (totalUsedMem int) { 63 | for _, c := range containers { 64 | for _, p := range c.DeviceProcesses { 65 | totalUsedMem += int(p.Memory) 66 | } 67 | } 68 | return 69 | } 70 | 71 | func formatContainerDeviceIndexes(container *pbdevice.GpuContainer) string { 72 | var devIdxs []string 73 | for _, d := range container.ContainerDevices { 74 | devIdxs = append(devIdxs, fmt.Sprintf("%d", d.Device.Index)) 75 | } 76 | if len(devIdxs) > 0 { 77 | return strings.Join(devIdxs, ":") 78 | } 79 | return "-" 80 | } 81 | -------------------------------------------------------------------------------- /pkg/mgsrv/deviceapi/device/v1/utils.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | pb 
"github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 5 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/gpumgr" 6 | ) 7 | 8 | func listDeviceProcesses(podId string, gpuMgr *gpumgr.GpuMgr) (containers []*pb.GpuContainer) { 9 | 10 | for _, container := range gpuMgr.GetProcesses(podId) { 11 | var gpuProcesses []*pb.DeviceProcess 12 | 13 | for _, p := range container.Processes { 14 | gpuProcesses = append(gpuProcesses, &pb.DeviceProcess{ 15 | Uuid: p.DeviceUuid, 16 | Pid: p.Pid, 17 | Memory: p.GpuMemory, 18 | Cmdline: p.GetShortCmdLine(), 19 | User: p.User, 20 | ContainerId: p.ContainerId, 21 | GpuUtilization: p.GpuUtilization, 22 | }) 23 | } 24 | var gpuDevices []*pb.ContainerDevice 25 | for _, device := range container.Devices { 26 | gpuDevices = append(gpuDevices, &pb.ContainerDevice{ 27 | Device: &pb.Device{ 28 | Uuid: device.GpuDevice.UUID, 29 | Index: uint32(device.GpuDevice.Index), 30 | Shares: uint32(device.GpuDevice.Shares), 31 | GpuUtilization: device.GpuDevice.Utilization.Gpu, 32 | MemoryUtilization: device.GpuDevice.Utilization.Memory, 33 | MemoryTotal: device.GpuDevice.Memory.Total, 34 | MemoryFree: device.GpuDevice.Memory.Free, 35 | MemoryUsed: device.GpuDevice.Memory.Used, 36 | MemoryShareSize: device.GpuDevice.Memory.ShareSize, 37 | ResourceName: device.GpuDevice.ResourceName, 38 | NodeName: device.GpuDevice.Nodename, 39 | }, 40 | AllocatedShares: device.AllocatedShares, 41 | }) 42 | } 43 | containers = append(containers, &pb.GpuContainer{ 44 | ContainerId: container.ContainerId, 45 | ContainerName: container.ContainerName, 46 | PodId: container.PodId, 47 | PodNamespace: container.PodNamespace, 48 | MetagpuRequests: container.PodMetagpuRequest, 49 | ResourceName: container.ResourceName, 50 | NodeName: container.Nodename, 51 | ContainerDevices: gpuDevices, 52 | DeviceProcesses: gpuProcesses, 53 | }) 54 | } 55 | return 56 | } 57 | 
-------------------------------------------------------------------------------- /pkg/gpumgr/enforcer_test.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | . "github.com/onsi/ginkgo" 5 | . "github.com/onsi/gomega" 6 | log "github.com/sirupsen/logrus" 7 | "github.com/spf13/viper" 8 | "testing" 9 | ) 10 | 11 | func TestAllocator(t *testing.T) { 12 | viper.SetConfigName("config") 13 | viper.SetConfigType("yaml") 14 | viper.AddConfigPath("../../config/") 15 | 16 | if err := viper.ReadInConfig(); err != nil { 17 | log.Fatalf("config file not found, err: %s", err) 18 | } 19 | RegisterFailHandler(Fail) 20 | RunSpecs(t, "Enforcer Suite") 21 | } 22 | 23 | var _ = Describe("enforcer", func() { 24 | 25 | Context("enforce", func() { 26 | 27 | It("not oom", func() { 28 | mgr := &GpuMgr{} 29 | mgr.setGpuDevices() 30 | if len(mgr.GpuDevices) < 0 { 31 | log.Fatalf("no gpu devices detected, can't continue unit testing") 32 | } 33 | mgr.gpuContainers = []*GpuContainer{{ 34 | PodMetagpuRequest: 1, 35 | Processes: []*GpuProcess{{ 36 | Pid: 100, 37 | DeviceUuid: mgr.GpuDevices[0].UUID, 38 | GpuUtilization: 0, 39 | GpuMemory: mgr.GpuDevices[0].Memory.ShareSize, 40 | }}, 41 | }} 42 | 43 | res := mgr.enforce() 44 | Expect(len(res)).To(Equal(0)) 45 | }) 46 | 47 | It("oom", func() { 48 | 49 | mgr := &GpuMgr{} 50 | mgr.setGpuDevices() 51 | if len(mgr.GpuDevices) < 0 { 52 | log.Fatalf("no gpu devices detected, can't continue unit testing") 53 | } 54 | mgr.gpuContainers = []*GpuContainer{{ 55 | PodMetagpuRequest: 1, 56 | Processes: []*GpuProcess{{ 57 | Pid: 100, 58 | DeviceUuid: mgr.GpuDevices[0].UUID, 59 | GpuUtilization: 0, 60 | GpuMemory: mgr.GpuDevices[0].Memory.ShareSize + 1, 61 | }}, 62 | }} 63 | 64 | res := mgr.enforce() 65 | Expect(len(res)).To(Equal(1)) 66 | }) 67 | 68 | It("false positive oom", func() { 69 | 70 | mgr := &GpuMgr{} 71 | mgr.setGpuDevices() 72 | if len(mgr.GpuDevices) < 0 { 73 | log.Fatalf("no 
gpu devices detected, can't continue unit testing") 74 | } 75 | mgr.gpuContainers = []*GpuContainer{{ 76 | PodMetagpuRequest: 1, 77 | Processes: []*GpuProcess{{ 78 | Pid: 100, 79 | DeviceUuid: mgr.GpuDevices[0].UUID, 80 | GpuMemory: 0, 81 | }}, 82 | }} 83 | res := mgr.enforce() 84 | Expect(len(res)).To(Equal(0)) 85 | }) 86 | }) 87 | }) 88 | -------------------------------------------------------------------------------- /pkg/nvmlutils/utils.go: -------------------------------------------------------------------------------- 1 | package nvmlutils 2 | 3 | import ( 4 | "github.com/NVIDIA/go-nvml/pkg/nvml" 5 | log "github.com/sirupsen/logrus" 6 | ) 7 | 8 | func init() { 9 | ret := nvml.Init() 10 | ErrorCheck(ret) 11 | } 12 | 13 | func GetDevices() (devices []*nvml.Device) { 14 | 15 | for i := 0; i < GetTotalDevices(); i++ { 16 | device, ret := nvml.DeviceGetHandleByIndex(i) 17 | ErrorCheck(ret) 18 | devices = append(devices, &device) 19 | } 20 | return 21 | } 22 | 23 | func GetTotalDevices() int { 24 | count, ret := nvml.DeviceGetCount() 25 | ErrorCheck(ret) 26 | return count 27 | } 28 | 29 | func GetComputeRunningProcesses(deviceIdx int) []nvml.ProcessInfo { 30 | processes, ret := getDeviceByIdx(deviceIdx).GetComputeRunningProcesses() 31 | ErrorCheck(ret) 32 | return processes 33 | } 34 | 35 | func GetAccountingStats(deviceIdx int, pid uint32) *nvml.AccountingStats { 36 | stats, ret := getDeviceByIdx(deviceIdx).GetAccountingStats(pid) 37 | ErrorCheck(ret) 38 | return &stats 39 | } 40 | 41 | func SystemGetCudaDriverVersion() int { 42 | cudaVersion, ret := nvml.SystemGetCudaDriverVersion() 43 | ErrorCheck(ret) 44 | return cudaVersion 45 | } 46 | 47 | func SystemGetDriverVersion() string { 48 | driver, ret := nvml.SystemGetDriverVersion() 49 | ErrorCheck(ret) 50 | return driver 51 | } 52 | 53 | func GetDeviceMemory(device *nvml.Device) *nvml.Memory { 54 | memInfo, ret := device.GetMemoryInfo() 55 | ErrorCheck(ret) 56 | return &memInfo 57 | } 58 | 59 | func 
GetDeviceByUUID(uuid string) *nvml.Device { 60 | for _, device := range GetDevices() { 61 | devUuid, ret := device.GetUUID() 62 | ErrorCheck(ret) 63 | if devUuid == uuid { 64 | return device 65 | } 66 | } 67 | return nil 68 | } 69 | 70 | func GetDeviceUUID(device *nvml.Device) string { 71 | uuid, ret := device.GetUUID() 72 | ErrorCheck(ret) 73 | return uuid 74 | } 75 | 76 | func ErrorCheck(ret nvml.Return) { 77 | if ret == nvml.ERROR_NOT_FOUND { 78 | log.Warnf("nvml error: ERROR_NOT_FOUND: [a query to find an object was unsuccessful]") 79 | return 80 | } 81 | if ret == nvml.ERROR_NOT_SUPPORTED { 82 | log.Warnf("nvml error: ERROR_NOT_SUPPORTED: [device doesn't support this feature]") 83 | return 84 | } 85 | if ret == nvml.ERROR_NO_PERMISSION { 86 | log.Warnf("nvml error: ERROR_NO_PERMISSION: [user doesn't have permission to perform this operation]") 87 | return 88 | } 89 | if ret != nvml.SUCCESS { 90 | log.Fatalf("fatal error during nvml operation: %s", nvml.ErrorString(ret)) 91 | } 92 | } 93 | 94 | func getDeviceByIdx(deviceIdx int) *nvml.Device { 95 | device, ret := nvml.DeviceGetHandleByIndex(deviceIdx) 96 | ErrorCheck(ret) 97 | return &device 98 | } 99 | -------------------------------------------------------------------------------- /pkg/gpumgr/process.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | "github.com/prometheus/procfs" 5 | "github.com/shirou/gopsutil/v3/process" 6 | log "github.com/sirupsen/logrus" 7 | "path/filepath" 8 | ) 9 | 10 | type GpuProcess struct { 11 | Pid uint32 12 | DeviceUuid string 13 | GpuUtilization uint32 14 | GpuMemory uint64 15 | Cmdline []string 16 | User string 17 | ContainerId string 18 | } 19 | 20 | func (p *GpuProcess) SetProcessCmdline() { 21 | if pr, err := process.NewProcess(int32(p.Pid)); err == nil { 22 | var e error 23 | p.Cmdline, e = pr.CmdlineSlice() 24 | if e != nil { 25 | log.Error(e) 26 | } 27 | } else { 28 | log.Error(err) 29 | } 30 | } 
31 | 32 | func (p *GpuProcess) SetProcessUsername() { 33 | if pr, err := process.NewProcess(int32(p.Pid)); err == nil { 34 | var e error 35 | p.User, e = pr.Username() 36 | if e != nil { 37 | log.Error(e) 38 | } 39 | } else { 40 | log.Error(err) 41 | } 42 | } 43 | 44 | func (p *GpuProcess) Kill() error { 45 | if pr, err := process.NewProcess(int32(p.Pid)); err == nil { 46 | return pr.Kill() 47 | } else { 48 | return err 49 | } 50 | } 51 | 52 | func (p *GpuProcess) SetProcessContainerId() { 53 | if proc, err := procfs.NewProc(int(p.Pid)); err == nil { 54 | var e error 55 | var cgroups []procfs.Cgroup 56 | cgroups, e = proc.Cgroups() 57 | if e != nil { 58 | log.Error(e) 59 | } 60 | if len(cgroups) == 0 { 61 | log.Errorf("cgroups list for %d is empty", p.Pid) 62 | } 63 | ExitContainerIdSet: 64 | if p.ContainerId == "" { 65 | for _, g := range cgroups { 66 | for _, c := range g.Controllers { 67 | if c == "memory" { 68 | p.ContainerId = filepath.Base(g.Path) 69 | goto ExitContainerIdSet 70 | } 71 | } 72 | } 73 | log.Warnf("unable to set containerId for pid: %d", p.Pid) 74 | } 75 | } 76 | } 77 | 78 | func (p *GpuProcess) GetShortCmdLine() string { 79 | if len(p.Cmdline) == 0 { 80 | return "-" 81 | } 82 | return p.Cmdline[0] 83 | } 84 | 85 | func (p *GpuProcess) GetDevice(devices []*GpuDevice) *GpuDevice { 86 | for _, device := range devices { 87 | if device.UUID == p.DeviceUuid { 88 | return device 89 | } 90 | } 91 | return nil 92 | } 93 | 94 | func NewGpuProcess(pid, gpuUtil uint32, gpuMem uint64, devUuid string) *GpuProcess { 95 | p := &GpuProcess{ 96 | Pid: pid, 97 | GpuUtilization: gpuUtil, 98 | GpuMemory: gpuMem, 99 | DeviceUuid: devUuid, 100 | } 101 | p.SetProcessUsername() 102 | p.SetProcessCmdline() 103 | p.SetProcessContainerId() 104 | return p 105 | } 106 | -------------------------------------------------------------------------------- /hack/remote-dev.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 
apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: dev-metagpu 5 | namespace: default 6 | spec: 7 | selector: 8 | matchLabels: 9 | app: dev-metagpu 10 | template: 11 | metadata: 12 | labels: 13 | app: dev-metagpu 14 | spec: 15 | nodeSelector: 16 | accelerator: nvidia 17 | tolerations: 18 | - operator: Exists 19 | hostPID: true 20 | hostNetwork: true 21 | serviceAccountName: metagpu-device-plugin 22 | containers: 23 | - name: dev-metagpu 24 | imagePullPolicy: Always 25 | image: cnvrg/golang-cuda11-6-dvl:latest 26 | ports: 27 | - containerPort: 2345 28 | - containerPort: 50052 29 | securityContext: 30 | privileged: true 31 | volumeMounts: 32 | - name: device-plugin 33 | mountPath: /var/lib/kubelet/device-plugins 34 | - mountPath: /host/proc 35 | mountPropagation: HostToContainer 36 | name: proc 37 | readOnly: true 38 | volumes: 39 | - name: device-plugin 40 | hostPath: 41 | path: /var/lib/kubelet/device-plugins 42 | - hostPath: 43 | path: /proc 44 | name: proc 45 | --- 46 | apiVersion: v1 47 | kind: Service 48 | metadata: 49 | name: dev-metagpu 50 | namespace: default 51 | spec: 52 | ports: 53 | - name: tcp 54 | port: 2345 55 | selector: 56 | app: dev-metagpu 57 | --- 58 | apiVersion: rbac.authorization.k8s.io/v1 59 | kind: ClusterRole 60 | metadata: 61 | name: metagpu-device-plugin 62 | namespace: default 63 | rules: 64 | - apiGroups: 65 | - "" 66 | resources: 67 | - pods 68 | verbs: 69 | - list 70 | - get 71 | - create 72 | - apiGroups: 73 | - "" 74 | resources: 75 | - pods/exec 76 | verbs: 77 | - create 78 | - apiGroups: 79 | - "" 80 | resources: 81 | - configmaps 82 | resourceNames: 83 | - metagpu-device-plugin-config 84 | verbs: 85 | - get 86 | - update 87 | --- 88 | apiVersion: v1 89 | kind: ServiceAccount 90 | metadata: 91 | name: metagpu-device-plugin 92 | namespace: default 93 | --- 94 | apiVersion: rbac.authorization.k8s.io/v1 95 | kind: ClusterRoleBinding 96 | metadata: 97 | name: metagpu-device-plugin 98 | namespace: default 99 | roleRef: 100 | 
apiGroup: rbac.authorization.k8s.io 101 | kind: ClusterRole 102 | name: metagpu-device-plugin 103 | subjects: 104 | - kind: ServiceAccount 105 | name: metagpu-device-plugin 106 | namespace: default -------------------------------------------------------------------------------- /pkged.go: -------------------------------------------------------------------------------- 1 | // Code generated by pkger; DO NOT EDIT. 2 | 3 | // +build !skippkger 4 | 5 | package main 6 | 7 | import ( 8 | "github.com/markbates/pkger" 9 | "github.com/markbates/pkger/pkging/mem" 10 | ) 11 | 12 | var _ = pkger.Apply(mem.UnmarshalEmbed([]byte(`1f8b08000000000000ffec584d8fdb3613fe2b01cf5a33de64f3bef12d40d020872d8a34eda5c8614c8da4c9921c96a4bc7617fbdf0b52b22dadbdb6036c801e74b0c967e611bf861c0ee74190ad3888c583a82936ed72a6d8c80f4a6108b4d4f8e1b35476e5ebabca838ac416f41528851a3d44f65725ae48e195d36d4d36b5f291bc5808f947401f644906e4ac6619bc922fd07e213e1bc73efe06b1118b971871217e058362210c64f4919558085188afe06b8c47a6b224fb631d7c613ed6ce8b2dc92d44d588c55f6226be15e2f7081ac522fa167bf00521b0150b11127a55a2435ba2559bc5ab9a35d87ac6be966b193641928de82d68d9da00153608257a51884ffc0b690ca997b450b39a535f9d31b2b4325114438304f2ad0b68a5e6dab7e189d255f33752f1d2c331c58a5cee95d3672ed9ba10beb5910c8a4284e815db551ac04774b9f7655b118b422c3711d3378a8df31882ac34441c0aea7fc8656c2390452f3585d80b709d6b7ee322ef2a12ba163ba0c83579683d2e87ca32c01ea01ac3f2fae666fefe40b05f712cefc197e14768b222d4e580ac35b948ea50b2ffbe22d8cf52360606ec1dc9832ddb48fa882ab4cba871af30e5cd1ea4ef0648bd1d80e16a8406e623747df36e846fe6d703fca4cba8078bbebe79fd7e8cf66335a0f80955ba3b5a8b42a0555c92ad075509c1ce87780901dfbd1d49c882df0c252aac86b0c161e3f27b3a7803ecd024e83dfb34ae4a431ac0c1d9a982e548d5665719ab1b080d29f64e364a9f50e51f84788ee2c087bca94fb37c5ed773b4a0c0da0b68fd213e438b7c87f61429adf00513c8b40b869679473a355093620be4a5f3ecd047c2273ecd50540d6add48032e44dfaad87a1c731c6a8d91d0cb9aaf221bfdc33e132af4fcac429abcc10e3dedc12e38e381bfdf838ee8ef116283de803d4672fdfe7dd67f0f15ed9203
45963547ecec7ee9dd73406c6d3ec2237972de327ab0a1626f8e6a5b4b8a4b94764b7077f58cac244bb3d57c28d980d1b355f2406977f485545ebdc9b298ed960a19d1b8fe8ad90d1f963484e94e025d0f45cab543589918d8c7a1a8665c3bf464d08ee4141986d8628c1ed4a87f0ed9c10c458eb51ee2a79f78ac34aaa8693c9340b6d65869aa9bd128c22628d05ae21a5567cc03556fa39d3c62889a47abf0c4ccf902279655e84ae2fe06ea3e335d18900ab9a47a5f8d615bef6f1fd38509a990a6d5911ce4c5cd82bf5b8e58665f06cb7ca9588cddbf6c6274836afedbaeee4eb81dfe319984a0887a4dda73ce730e25126ebd7e12d2a44256a4711be27456c8b51ad76e5791616323ac4751505f93aae601da8f24b2c981c081c68ce22939b45c0aaeba8db877cf217ab275566dacea8b7df3bdbd4521ba1308cb7b4fdd35d109f60764843b9f9da4dd74fac3b9afc93656f37763fcff0cd3ae118558a12d3939d2c139df46690da806ae5f5fc672ac37f337af6fceb073910ef3a5bc6d3c718adcfa156e23be13bce6aeac4e330ee3b313e433334ebbb5b421fd0c8600f573cd8d4e4adde6937896e73caf376788d7b271a0ee4eb0a8b47042edb97dd650e90ae93cf0316ddea50155eb512ea924df3ebb9a0777ceb3a4ed1e4e0d5ec2cb57d4b7427cc510774f2fdb6add8976efae4e74cb651ae4e241bce4bbf8363d88fb77e44f7ed47fe25b2e7f5a07b2e699e132f7f327fa40f9313c9fcdff271e1f1f0b5175cbfbf05888063d4e498c2989312531a624c694c49892185312634a624c498c2989312531a624c694c49892185312634a62fc179318ff020000ffff010000ffffa4fc0ea4c6210000`))) 13 | -------------------------------------------------------------------------------- /cmd/mgctl/kill.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/spf13/cobra" 5 | ) 6 | 7 | var killCmd = &cobra.Command{ 8 | Use: "kill", 9 | Short: "kill process", 10 | Run: func(cmd *cobra.Command, args []string) { 11 | //killGpuProcess() 12 | }, 13 | } 14 | 15 | //func killGpuProcess() { 16 | // conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 17 | // if conn == nil { 18 | // log.Fatalf("can't initiate connection to metagpu server") 19 | // } 20 | // defer conn.Close() 21 | // device := pbdevice.NewDeviceServiceClient(conn) 22 | // hostname, err := 
os.Hostname() 23 | // if err != nil { 24 | // log.Errorf("faild to detect podId, err: %s", err) 25 | // } 26 | // ldr := &pbdevice.GetGpuContainersRequest{PodId: hostname} 27 | // resp, err := device.GetGpuContainers(ctlutils.AuthenticatedContext(viper.GetString("token")), ldr) 28 | // if err != nil { 29 | // log.Errorf("falid to list device processes, err: %s ", err) 30 | // return 31 | // } 32 | // 33 | // killProcessTemplate := &promptui.SelectTemplates{ 34 | // Label: "{{ . }}?", 35 | // Active: `> {{ printf "[Pid:%d] %s" .Pid .Uuid | cyan }}`, 36 | // Inactive: ` {{ printf "[Pid:%d] %s" .Pid .Uuid | faint }}`, 37 | // Selected: `> {{ printf "[Pid:%d] %s" .Pid .Uuid | cyan }}`, 38 | // Details: ` 39 | //--------- Kill GPU process ---------- 40 | //{{ "Cmd:" | faint }} {{ .Cmdline }} 41 | //{{ "GpuMemory:" | faint }} {{ .Memory }}MB 42 | //{{ "Pod name:" | faint }} {{ .PodName }} 43 | //{{ "Pod namespace:" | faint }} {{ .PodNamespace }}`, 44 | // } 45 | // 46 | // killProcessSelect := promptui.Select{ 47 | // Label: "Select a process", 48 | // Items: resp.DevicesProcesses, 49 | // Size: 10, 50 | // Templates: killProcessTemplate, 51 | // } 52 | // idx, _, err := killProcessSelect.Run() 53 | // if err != nil { 54 | // log.Error(err) 55 | // return 56 | // } 57 | // process := resp.DevicesProcesses[idx] 58 | // var confirmTemplate = &promptui.SelectTemplates{ 59 | // Label: `{{ . }}?`, 60 | // Active: `> {{ . | red}}`, 61 | // Inactive: ` {{ . | faint}} `, 62 | // Selected: `> {{ . 
| red }}`, 63 | // } 64 | // confirmDelete := promptui.Select{ 65 | // Label: fmt.Sprintf("Killing PID: %d on device: %s, are you sure?", process.Pid, process.Uuid), 66 | // Items: []string{"No", "Yes"}, 67 | // Templates: confirmTemplate, 68 | // } 69 | // _, confirm, err := confirmDelete.Run() 70 | // if err != nil { 71 | // log.Error(err) 72 | // return 73 | // } 74 | // 75 | // if confirm == "Yes" { 76 | // killRequest := &pbdevice.KillGpuProcessRequest{Pid: process.Pid} 77 | // if _, err := device.KillGpuProcess(ctlutils.AuthenticatedContext(viper.GetString("token")), killRequest); err != nil { 78 | // log.Fatalf("error killing process, err: %s", err) 79 | // } else { 80 | // log.Infof("%d killed", process.Pid) 81 | // } 82 | // } 83 | //} 84 | -------------------------------------------------------------------------------- /pkg/mgsrv/deviceapi/device/v1/device.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package device.v1; 4 | 5 | service DeviceService{ 6 | rpc GetGpuContainers (GetGpuContainersRequest) returns (GetGpuContainersResponse){} 7 | rpc StreamGpuContainers (StreamGpuContainersRequest) returns (stream StreamGpuContainersResponse){} 8 | rpc GetDevices(GetDevicesRequest) returns (GetDevicesResponse){} 9 | rpc KillGpuProcess(KillGpuProcessRequest) returns (KillGpuProcessResponse){} 10 | rpc PatchConfigs(PatchConfigsRequest) returns (PatchConfigsResponse){} 11 | rpc GetMetaDeviceInfo(GetMetaDeviceInfoRequest) returns (GetMetaDeviceInfoResponse){} 12 | 13 | rpc PingServer(PingServerRequest) returns (PingServerResponse){} 14 | 15 | } 16 | 17 | 18 | 19 | message DeviceProcess{ 20 | string uuid = 1; 21 | uint32 pid = 2; 22 | uint64 memory = 3; 23 | string cmdline = 4; 24 | string user = 5; 25 | string container_id = 6; 26 | uint32 gpu_utilization = 10; 27 | 28 | } 29 | 30 | message ContainerDevice{ 31 | Device device = 1; 32 | int32 allocated_shares = 2; 33 | } 34 | 35 | message 
GpuContainer{ 36 | string container_id = 1; 37 | string container_name = 2; 38 | string pod_id = 3; 39 | string pod_namespace = 4; 40 | int64 metagpu_requests = 5; 41 | string resource_name = 6; 42 | string node_name = 7; 43 | repeated DeviceProcess device_processes = 8; 44 | repeated ContainerDevice container_devices = 9; 45 | } 46 | 47 | message Device{ 48 | string uuid = 1; 49 | uint32 index = 2; 50 | uint32 shares = 3; 51 | uint32 gpu_utilization = 4; 52 | uint32 memory_utilization = 5; 53 | uint64 memory_total = 6; 54 | uint64 memory_free = 7; 55 | uint64 memory_used = 8; 56 | uint64 memory_share_size = 9; 57 | string resource_name = 10; 58 | string node_name = 11; 59 | } 60 | 61 | message StreamGpuContainersRequest{ 62 | string pod_id = 1; 63 | } 64 | message StreamGpuContainersResponse{ 65 | string visibility_level = 1; 66 | repeated GpuContainer gpu_containers = 2; 67 | } 68 | 69 | message GetGpuContainersRequest{ 70 | string pod_id = 1; 71 | } 72 | message GetGpuContainersResponse{ 73 | string visibility_level = 1; 74 | repeated GpuContainer gpu_containers = 2; 75 | } 76 | 77 | message KillGpuProcessRequest{ 78 | uint32 pid = 1; 79 | } 80 | message KillGpuProcessResponse{} 81 | 82 | message PatchConfigsRequest{ 83 | int32 meta_gpus = 1; 84 | } 85 | message PatchConfigsResponse{} 86 | 87 | message GetDevicesRequest{} 88 | message GetDevicesResponse{ 89 | map device = 1; 90 | } 91 | 92 | message GetMetaDeviceInfoRequest{} 93 | message GetMetaDeviceInfoResponse{ 94 | string node = 1; 95 | map metadata = 2; 96 | repeated Device devices = 3; 97 | } 98 | 99 | message PingServerRequest{} 100 | message PingServerResponse{} 101 | -------------------------------------------------------------------------------- /chart/templates/ds.yml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: metagpu-device-plugin 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | selector: 
8 | matchLabels: 9 | name: metagpu-device-plugin 10 | template: 11 | metadata: 12 | annotations: 13 | scheduler.alpha.kubernetes.io/critical-pod: "" 14 | labels: 15 | name: metagpu-device-plugin 16 | spec: 17 | tolerations: 18 | - key: CriticalAddonsOnly 19 | operator: Exists 20 | - key: nvidia.com/gpu 21 | operator: Exists 22 | effect: NoSchedule 23 | hostPID: true 24 | hostNetwork: true 25 | serviceAccountName: metagpu-device-plugin 26 | nodeSelector: 27 | accelerator: nvidia 28 | containers: 29 | - name: metagpu-device-plugin 30 | image: "docker.io/cnvrg/metagpu-device-plugin:{{ .Values.tag }}" 31 | imagePullPolicy: Always 32 | command: 33 | - /usr/bin/mgdp 34 | - start 35 | - -c 36 | - /etc/metagpu-device-plugin 37 | ports: 38 | - containerPort: 50052 39 | securityContext: 40 | privileged: true 41 | env: 42 | - name: METAGPU_DEVICE_PLUGIN_NODENAME 43 | valueFrom: 44 | fieldRef: 45 | fieldPath: spec.nodeName 46 | - name: POD_IP 47 | valueFrom: 48 | fieldRef: 49 | fieldPath: status.podIP 50 | - name: MG_CTL_TOKEN 51 | value: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 52 | volumeMounts: 53 | - name: device-plugin 54 | mountPath: /var/lib/kubelet/device-plugins 55 | - name: config 56 | mountPath: /etc/metagpu-device-plugin 57 | - mountPath: /host/proc 58 | mountPropagation: HostToContainer 59 | name: proc 60 | readOnly: true 61 | - name: metagpu-exporter 62 | image: "docker.io/cnvrg/metagpu-device-plugin:{{ .Values.tag }}" 63 | imagePullPolicy: Always 64 | command: 65 | - /usr/bin/mgex 66 | - start 67 | - -t 68 | - eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 69 | ports: 70 | - containerPort: 2112 71 | volumes: 72 | - name: device-plugin 73 | hostPath: 74 | path: /var/lib/kubelet/device-plugins 75 | - name: config 76 | configMap: 77 | name: 
metagpu-device-plugin-config 78 | - hostPath: 79 | path: /proc 80 | name: proc -------------------------------------------------------------------------------- /pkg/gpumgr/container.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/podexec" 5 | log "github.com/sirupsen/logrus" 6 | "github.com/spf13/viper" 7 | v1core "k8s.io/api/core/v1" 8 | "regexp" 9 | "strings" 10 | ) 11 | 12 | type ContainerDevice struct { 13 | GpuDevice *GpuDevice 14 | AllocatedShares int32 15 | } 16 | 17 | type GpuContainer struct { 18 | ContainerId string 19 | ContainerName string 20 | PodId string 21 | PodNamespace string 22 | PodMetagpuRequest int64 23 | ResourceName string 24 | Nodename string 25 | Processes []*GpuProcess 26 | Devices []*ContainerDevice 27 | } 28 | 29 | func getContainerId(pod *v1core.Pod, containerName string) (containerId string) { 30 | for _, status := range pod.Status.ContainerStatuses { 31 | if status.Name == containerName { 32 | idx := strings.Index(status.ContainerID, "//") 33 | if idx != -1 { 34 | return status.ContainerID[idx+2:] 35 | } else { 36 | log.WithField("pod", pod.Name).Error("can't extract container id") 37 | } 38 | } 39 | } 40 | return 41 | } 42 | 43 | func (c *GpuContainer) setAllocatedGpus(gpuDevices []*GpuDevice) { 44 | l := log.WithField("pod", c.PodId) 45 | pe, err := podexec.NewPodExec(c.ContainerName, c.PodId, c.PodNamespace) 46 | if err != nil { 47 | l.Error(err) 48 | return 49 | } 50 | output, err := pe.RunCommand([]string{"printenv", "CNVRG_META_GPU_DEVICES"}) 51 | if err != nil { 52 | l.Error(err) 53 | return 54 | } 55 | var gpuAllocationMap = make(map[string]int32) 56 | for _, metaDeviceId := range strings.Split(output, ",") { 57 | r, _ := regexp.Compile("cnvrg-meta-\\d+-\\d+-") 58 | deviceUuid := strings.TrimSuffix(r.ReplaceAllString(metaDeviceId, ""), "\n") 59 | if _, ok := 
gpuAllocationMap[deviceUuid]; ok { 60 | gpuAllocationMap[deviceUuid] = gpuAllocationMap[deviceUuid] + 1 61 | } else { 62 | gpuAllocationMap[deviceUuid] = 0 63 | } 64 | } 65 | 66 | for uuid, allocatedShares := range gpuAllocationMap { 67 | for _, device := range gpuDevices { 68 | if device.UUID == uuid { 69 | c.Devices = append(c.Devices, &ContainerDevice{ 70 | GpuDevice: device, 71 | AllocatedShares: allocatedShares, 72 | }) 73 | } 74 | } 75 | } 76 | } 77 | 78 | func NewGpuContainer(containerId, containerName, podId, ns, resourceName, nodename string, metagpuRequests int64, gpuDevices []*GpuDevice) *GpuContainer { 79 | p := &GpuContainer{ 80 | ContainerId: containerId, 81 | PodId: podId, 82 | ContainerName: containerName, 83 | PodNamespace: ns, 84 | PodMetagpuRequest: metagpuRequests, 85 | ResourceName: resourceName, 86 | Nodename: nodename, 87 | } 88 | // discover allocated GPUs 89 | p.setAllocatedGpus(gpuDevices) 90 | // inject mgctl bin 91 | if viper.GetBool("mgctlAutoInject") { 92 | podexec.CopymgctlToContainer(p.ContainerName, p.PodId, p.PodNamespace) 93 | } 94 | return p 95 | } 96 | -------------------------------------------------------------------------------- /cmd/mgex/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | log "github.com/sirupsen/logrus" 6 | "github.com/spf13/cobra" 7 | "github.com/spf13/viper" 8 | "os" 9 | "path" 10 | "runtime" 11 | "strconv" 12 | "strings" 13 | ) 14 | 15 | type param struct { 16 | name string 17 | shorthand string 18 | value interface{} 19 | usage string 20 | required bool 21 | } 22 | 23 | var ( 24 | Version string 25 | Build string 26 | rootCmd = &cobra.Command{ 27 | Use: "mgexporter", 28 | Short: "mgexporter - Metagpu metrics exporter", 29 | } 30 | version = &cobra.Command{ 31 | Use: "version", 32 | Short: "Print metagpu metric exporter version and build sha", 33 | Run: func(cmd *cobra.Command, args []string) { 34 | fmt.Printf("🐾 
version: %s build: %s \n", Version, Build) 35 | }, 36 | } 37 | startParams = []param{ 38 | {name: "metrics-addr", shorthand: "a", value: "0.0.0.0:2112", usage: "listen address"}, 39 | {name: "mgsrv", shorthand: "s", value: "127.0.0.1:50052", usage: "metagpu device plugin gRPC server address"}, 40 | {name: "token", shorthand: "t", value: "", usage: "metagpu server authenticate token"}, 41 | } 42 | start = &cobra.Command{ 43 | Use: "start", 44 | Short: "start metagpu metrics exporter", 45 | Run: func(cmd *cobra.Command, args []string) { 46 | startExporter() 47 | }, 48 | } 49 | ) 50 | 51 | func init() { 52 | cobra.OnInitialize(initConfig) 53 | setParams(startParams, start) 54 | rootCmd.AddCommand(version) 55 | rootCmd.AddCommand(start) 56 | } 57 | 58 | func initConfig() { 59 | viper.AutomaticEnv() 60 | viper.SetEnvPrefix("MG_EX") 61 | viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_")) 62 | setupLogging() 63 | } 64 | 65 | func setParams(params []param, command *cobra.Command) { 66 | for _, param := range params { 67 | switch v := param.value.(type) { 68 | case int: 69 | command.PersistentFlags().IntP(param.name, param.shorthand, v, param.usage) 70 | case string: 71 | command.PersistentFlags().StringP(param.name, param.shorthand, v, param.usage) 72 | case bool: 73 | command.PersistentFlags().BoolP(param.name, param.shorthand, v, param.usage) 74 | } 75 | if err := viper.BindPFlag(param.name, command.PersistentFlags().Lookup(param.name)); err != nil { 76 | panic(err) 77 | } 78 | } 79 | } 80 | 81 | func setupLogging() { 82 | 83 | // Set log verbosity 84 | if viper.GetBool("verbose") { 85 | log.SetLevel(log.DebugLevel) 86 | log.SetReportCaller(true) 87 | log.SetFormatter(&log.TextFormatter{ 88 | FullTimestamp: true, 89 | CallerPrettyfier: func(frame *runtime.Frame) (function string, file string) { 90 | fileName := fmt.Sprintf(" [%s]", path.Base(frame.Function)+":"+strconv.Itoa(frame.Line)) 91 | return "", fileName 92 | }, 93 | }) 94 | } else { 95 | 
log.SetLevel(log.InfoLevel) 96 | log.SetFormatter(&log.TextFormatter{FullTimestamp: true}) 97 | } 98 | // Set log format 99 | if viper.GetBool("json-log") { 100 | log.SetFormatter(&log.JSONFormatter{}) 101 | } 102 | // Logs are always goes to STDOUT 103 | log.SetOutput(os.Stdout) 104 | } 105 | 106 | func main() { 107 | 108 | if err := rootCmd.Execute(); err != nil { 109 | fmt.Println(err) 110 | os.Exit(1) 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /cmd/mgctl/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | log "github.com/sirupsen/logrus" 6 | "github.com/spf13/cobra" 7 | "github.com/spf13/viper" 8 | "os" 9 | "path" 10 | "runtime" 11 | "strconv" 12 | "strings" 13 | ) 14 | 15 | type param struct { 16 | name string 17 | shorthand string 18 | value interface{} 19 | usage string 20 | required bool 21 | } 22 | 23 | var ( 24 | Version string 25 | Build string 26 | rootParams = []param{ 27 | {name: "json-log", shorthand: "", value: false, usage: "output logs in json format"}, 28 | {name: "verbose", shorthand: "", value: false, usage: "enable verbose logs"}, 29 | {name: "addr", shorthand: "s", value: "localhost:50052", usage: "address to access the metagpu server"}, 30 | {name: "token", shorthand: "t", value: "", usage: "authentication token"}, 31 | {name: "output", shorthand: "o", value: "table", usage: "output format, one of: table|json|raw"}, 32 | } 33 | ) 34 | 35 | var metaGpuCtlVersion = &cobra.Command{ 36 | Use: "version", 37 | Short: "Print metagpuctl version and build sha", 38 | Run: func(cmd *cobra.Command, args []string) { 39 | fmt.Printf("🐾 version: %s build: %s \n", Version, Build) 40 | }, 41 | } 42 | 43 | var rootCmd = &cobra.Command{ 44 | Use: "mgctl", 45 | Short: "mgctl - cli client for metagpu management and monitoring", 46 | } 47 | 48 | func init() { 49 | cobra.OnInitialize(initConfig) 50 | 
setParams(configCmdParams, configCmd) 51 | setParams(processGetParams, processesGetCmd) 52 | setParams(rootParams, rootCmd) 53 | // processes 54 | getCmd.AddCommand(processesGetCmd) 55 | getCmd.AddCommand(getDevicesCmd) 56 | // root commands 57 | rootCmd.AddCommand(configCmd) 58 | rootCmd.AddCommand(enforceCmd) 59 | rootCmd.AddCommand(killCmd) 60 | rootCmd.AddCommand(getCmd) 61 | rootCmd.AddCommand(pingCmd) 62 | rootCmd.AddCommand(metaGpuCtlVersion) 63 | 64 | } 65 | 66 | func initConfig() { 67 | viper.AutomaticEnv() 68 | viper.SetEnvPrefix("MG_CTL") 69 | viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_")) 70 | setupLogging() 71 | } 72 | 73 | func setParams(params []param, command *cobra.Command) { 74 | for _, param := range params { 75 | switch v := param.value.(type) { 76 | case int: 77 | command.PersistentFlags().IntP(param.name, param.shorthand, v, param.usage) 78 | case string: 79 | command.PersistentFlags().StringP(param.name, param.shorthand, v, param.usage) 80 | case bool: 81 | command.PersistentFlags().BoolP(param.name, param.shorthand, v, param.usage) 82 | } 83 | if err := viper.BindPFlag(param.name, command.PersistentFlags().Lookup(param.name)); err != nil { 84 | panic(err) 85 | } 86 | } 87 | } 88 | 89 | func setupLogging() { 90 | 91 | // Set log verbosity 92 | if viper.GetBool("verbose") { 93 | log.SetLevel(log.DebugLevel) 94 | log.SetFormatter(&log.TextFormatter{ 95 | FullTimestamp: true, 96 | CallerPrettyfier: func(frame *runtime.Frame) (function string, file string) { 97 | fileName := fmt.Sprintf(" [%s]", path.Base(frame.Function)+":"+strconv.Itoa(frame.Line)) 98 | return "", fileName 99 | }, 100 | }) 101 | } else { 102 | log.SetLevel(log.InfoLevel) 103 | log.SetFormatter(&log.TextFormatter{FullTimestamp: true}) 104 | } 105 | 106 | // Set log format 107 | if viper.GetBool("json-log") { 108 | log.SetFormatter(&log.JSONFormatter{}) 109 | } 110 | 111 | // Logs are always goes to STDOUT 112 | log.SetOutput(os.Stdout) 113 | } 114 | 115 | func main() { 
116 | 117 | if err := rootCmd.Execute(); err != nil { 118 | fmt.Println(err) 119 | os.Exit(1) 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /cmd/mgctl/enforce.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/spf13/cobra" 5 | ) 6 | 7 | var enforceCmd = &cobra.Command{ 8 | Use: "enforce", 9 | Short: "enforce memory limits", 10 | Run: func(cmd *cobra.Command, args []string) { 11 | //enforceMemoryLimits() 12 | }, 13 | } 14 | 15 | //// 16 | ////func enforceMemoryLimits() { 17 | //// conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 18 | //// if conn == nil { 19 | //// log.Fatalf("can't initiate connection to metagpu server") 20 | //// } 21 | //// defer conn.Close() 22 | //// device := pbdevice.NewDeviceServiceClient(conn) 23 | //// hostname, err := os.Hostname() 24 | //// if err != nil { 25 | //// log.Errorf("faild to detect podId, err: %s", err) 26 | //// } 27 | //// request := &pbdevice.StreamProcessesRequest{PodId: hostname} 28 | //// stream, err := device.StreamProcesses(ctlutils.AuthenticatedContext(viper.GetString("token")), request) 29 | //// if err != nil { 30 | //// log.Fatal(err) 31 | //// } 32 | //// 33 | //// refreshCh := make(chan bool) 34 | //// sigCh := make(chan os.Signal, 1) 35 | //// signal.Notify(sigCh, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) 36 | //// 37 | //// to := &TableOutput{} 38 | //// to.header = table.Row{"Idx", "Pod", "Used Mem", "Meta Mem"} 39 | //// 40 | //// go func() { 41 | //// for { 42 | //// time.Sleep(1 * time.Second) 43 | //// refreshCh <- true 44 | //// } 45 | //// }() 46 | //// 47 | //// for { 48 | //// select { 49 | //// case <-sigCh: 50 | //// cursor.ClearLine() 51 | //// log.Info("shutting down") 52 | //// os.Exit(0) 53 | //// case <-refreshCh: 54 | //// processResp, err := stream.Recv() 55 | //// if err == io.EOF { 56 | // break 57 | 
// } 58 | // if err != nil { 59 | // log.Fatalf("error watching gpu processes, err: %s", err) 60 | // } 61 | // deviceResp, err := device.GetDevices(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetDevicesRequest{}) 62 | // if err != nil { 63 | // log.Errorf("falid to list devices, err: %s ", err) 64 | // return 65 | // } 66 | // to.body, to.footer = composeMemEnforceListAndFooter(processResp.DevicesProcesses, deviceResp.Device) 67 | // to.buildTable() 68 | // to.print() 69 | // 70 | // for _, p := range processResp.DevicesProcesses { 71 | // d := deviceResp.Device[p.Uuid] 72 | // if p.Memory > d.MemoryShareSize*uint64(p.MetagpuRequests) { 73 | // killRequest := &pbdevice.KillGpuProcessRequest{Pid: p.Pid} 74 | // _, _ = device.KillGpuProcess(ctlutils.AuthenticatedContext(viper.GetString("token")), killRequest) 75 | // } 76 | // } 77 | // } 78 | // } 79 | //} 80 | // 81 | //func composeMemEnforceListAndFooter(processes []*pbdevice.DeviceProcess, devices map[string]*pbdevice.Device) (body []table.Row, footer table.Row) { 82 | // 83 | // type enforceObj struct { 84 | // uuid string 85 | // podName string 86 | // memUsed uint64 87 | // maxMem uint64 88 | // } 89 | // 90 | // var el = make(map[string]*enforceObj) 91 | // 92 | // for _, p := range processes { 93 | // d := devices[p.Uuid] 94 | // el[p.PodName] = &enforceObj{ 95 | // uuid: p.Uuid, 96 | // podName: p.PodName, 97 | // memUsed: p.Memory, 98 | // maxMem: d.MemoryShareSize * uint64(p.MetagpuRequests), 99 | // } 100 | // } 101 | // 102 | // for _, eObj := range el { 103 | // podName := fmt.Sprintf("\033[32m%s\033[0m", eObj.podName) 104 | // if eObj.memUsed > eObj.maxMem { 105 | // podName = fmt.Sprintf("\033[31m%s\033[0m", eObj.podName) 106 | // } 107 | // body = append(body, table.Row{eObj.uuid, podName, eObj.memUsed, eObj.maxMem}) 108 | // } 109 | // 110 | // footer = table.Row{"", "", "", ""} 111 | // 112 | // return 113 | // 114 | //} 115 | 
-------------------------------------------------------------------------------- /pkg/sharecfg/sharecfg.go: -------------------------------------------------------------------------------- 1 | package sharecfg 2 | 3 | import ( 4 | "fmt" 5 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/nvmlutils" 6 | "github.com/NVIDIA/go-nvml/pkg/nvml" 7 | log "github.com/sirupsen/logrus" 8 | "github.com/spf13/viper" 9 | ) 10 | 11 | type DeviceSharingConfig struct { 12 | Uuid []string 13 | ResourceName string 14 | MetagpusPerGpu int 15 | AutoReshare bool 16 | } 17 | 18 | type DevicesSharingConfigs struct { 19 | Configs []*DeviceSharingConfig 20 | } 21 | 22 | var shareCfg *DevicesSharingConfigs 23 | 24 | func NewDeviceSharingConfig() *DevicesSharingConfigs { 25 | if shareCfg != nil { 26 | return shareCfg 27 | } 28 | var cfg []*DeviceSharingConfig 29 | if err := viper.UnmarshalKey("deviceSharing", &cfg); err != nil { 30 | log.Fatal(err) 31 | } 32 | shareCfg = &DevicesSharingConfigs{Configs: cfg} 33 | shareCfg.ValidateSharingConfiguration() 34 | shareCfg.AutoReshare() 35 | return shareCfg 36 | } 37 | 38 | func (c *DevicesSharingConfigs) ValidateSharingConfiguration() { 39 | if len(c.Configs) == 0 { 40 | log.Fatalf("mission gpu sharing configuration, can't proceed") 41 | } 42 | if len(c.Configs) > 1 { 43 | for _, devCfg := range c.Configs { 44 | for _, uuid := range devCfg.Uuid { 45 | if uuid == "*" { 46 | log.Fatalf("wrong gpu sharing configuration, "+ 47 | "'deviceSharing' with uuid: [ * ] must have sinlge (1) entry, but have: %d", len(c.Configs)) 48 | } 49 | } 50 | } 51 | } 52 | } 53 | 54 | func (c *DevicesSharingConfigs) AutoReshare() { 55 | for _, cfg := range c.Configs { 56 | if cfg.AutoReshare { 57 | cfg.GpuAutoResharing() 58 | continue 59 | } 60 | log.Infof("autoReshare disabled for: %s, skipping re-configuration", cfg.ResourceName) 61 | } 62 | } 63 | 64 | func (c *DevicesSharingConfigs) GetDeviceSharingConfigs(devUuid string) (*DeviceSharingConfig, 
error) {
	for _, devCfg := range c.Configs {
		for _, uuid := range devCfg.Uuid {
			if uuid == devUuid || uuid == "*" {
				return devCfg, nil
			}
		}
	}
	return nil, fmt.Errorf("device uuid: %s not found in sharing configs", devUuid)
}

// GpuAutoResharing re-configures the number of shares per GPU.
// Currently it pins the share count to a fixed 100 metagpus per GPU.
func (c *DeviceSharingConfig) GpuAutoResharing() {
	log.Info("autoResharing enabled, re-configuring gpu shares")
	c.MetagpusPerGpu = 100
	// the following code is sharing GPU by memory,
	// currently we are not using it, and I don't think we ever will
	// but, never say never, thus it's here

	//nvmlDevice := c.getFirstDevice()
	//if nvmlDevice != nil {
	//	mem := nvmlutils.GetDeviceMemory(nvmlDevice)
	//	if mem.Total > 0 {
	//		c.MetagpusPerGpu = int((mem.Total / (1024 * 1024)) / 1024)
	//	}
	//}

	// TODO: make sharing configurations persistent
}

// GetShareSize returns the size of a single GPU share in MB, computed from
// the total memory of the first matching device. It returns 0 when no
// device is available or when MetagpusPerGpu is not a positive number.
func (c *DeviceSharingConfig) GetShareSize() int {
	// BUG FIX: guard MetagpusPerGpu > 0 to avoid a division-by-zero panic
	// when the config entry omits metagpusPerGpu.
	if c.MetagpusPerGpu <= 0 {
		return 0
	}
	nvmlDevice := c.getFirstDevice()
	if nvmlDevice != nil {
		mem := nvmlutils.GetDeviceMemory(nvmlDevice)
		if mem.Total > 0 {
			return int((mem.Total / (1024 * 1024)) / uint64(c.MetagpusPerGpu))
		}
	}
	return 0
}

// getFirstDevice resolves the first physical device covered by this config:
// the first device on the node for a wildcard config, otherwise the device
// matching the first configured uuid. Returns nil when nothing is found.
func (c *DeviceSharingConfig) getFirstDevice() *nvml.Device {
	if c.isWildcardSharing() {
		devices := nvmlutils.GetDevices()
		// BUG FIX: was `len(devices) < 0`, which is never true (len is
		// non-negative), so the guard never fired and devices[0] could
		// panic with an index-out-of-range on GPU-less nodes.
		if len(devices) == 0 {
			log.Error("can't execute autoReshare, the devices list is empty")
			return nil
		}
		return devices[0]
	}
	// BUG FIX: was `len(c.Uuid) < 0` — same always-false comparison;
	// also fixed the message typo "es empty" -> "is empty".
	if len(c.Uuid) == 0 {
		log.Error("can't execute autoReshare, uuid config list is empty")
		return nil
	}
	return nvmlutils.GetDeviceByUUID(c.Uuid[0])
}

// isWildcardSharing reports whether this config applies to all devices,
// i.e. its uuid list contains the "*" wildcard.
func (c *DeviceSharingConfig) isWildcardSharing() bool {
	for _, uuid := range c.Uuid {
		if uuid == "*" {
			return true
		}
	}
	return false
}

-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MetaGPU Device Plugin for Kubernetes 2 | 3 | The metagpu device plugin (`mgdp`) allows you to share one or more Nvidia GPUs between 4 | different K8s workloads. 5 | 6 | ### Motivation 7 | K8s doesn't provide a support for the GPU sharing. 8 | Meaning user must allocate entire GPU to his workload, even if the actual GPU usage 9 | is much bellow of 100%. 10 | This project will help to improve the GPU utilization by allowing GPU sharing between 11 | multiple K8s workloads. 12 | 13 | 14 | ### How it works 15 | The `mgdp` is based on [Nvidia Container Runtime](https://github.com/NVIDIA/nvidia-container-runtime) 16 | and on [go-nvml](https://github.com/NVIDIA/go-nvml) 17 | One for the features the nvidia container runtime providers, is an ability 18 | to specify the visible GPU devices Ids by using env vars `NVIDIA_VISIBLE_DEVICES`. 19 | 20 | The most short & simple explanation of the `mgdp` logic is: 21 | 1. `mgdp` detects all the GPU devices Ids 22 | 2. From the real GPU deices Ids, it's generates a meta-devices Ids 23 | 3. `mgdp` advertise these meta-devices Ids to the K8s 24 | 4. Once a user requests for a gpu fraction, for example 0.5 GPU, `mgdp` will allocate 50 meta-devices IDs 25 | 5. The 50 meta-gpus are bounded to 1 real device id, this real device ID will be injected to the container 26 | 27 | In addition, each metagpu container will have `mgctl` binary. 28 | The `mgctl` is an alternative for `nvidia-smi`. 29 | The `mgctl` improves security and provides better K8s integration. 30 | 31 | ### The sharing configurations 32 | By default, `mgdp` will share each of your GPU devices to 100 meta-gpus. 33 | For example, if you've a machine with 2 GPUs, `mgdp` will generate 200 metagpus. 
34 | Requesting for 50 metagpus, will give you 0.5 GPU, requesting 150 metagpus, 35 | will give you 1.5 metagpus. 36 | 37 | 38 | ### [MetaGPU demo from Cnvrg's MLCon 2.0](https://www.youtube.com/watch?v=hsP9GXUtNNs) 39 | 40 | ### Deployment 41 | 1. clone the repo 42 | 2. use helm chart to install or dump manifest and install manually 43 | 44 | ### Install with helm chart 45 | ```bash 46 | # cd into cloned directory and run 47 | # for openshift set ocp=true 48 | helm install chart --set ocp=false -ncnvrg 49 | ``` 50 | 51 | ### Install with raw K8s manifests 52 | ```bash 53 | # cd into cloned directory and run 54 | # for openshift set ocp=true 55 | helm template chart --set ocp=false -ncnvrg > meatgpu.yaml 56 | kubectl apply -f meatgpu.yaml 57 | ``` 58 | 59 | 60 | ### Test the Metagpu 61 | ```bash 62 | cat < 0 { 86 | for _, devLoad := range a.LoadMap { 87 | if devLoad == nil { 88 | continue 89 | } 90 | if devLoad.getFreeShares() >= gpuFractionsRequest { 91 | var devicesToAdd []string 92 | for i, device := range devLoad.Metagpus { 93 | if i == gpuFractionsRequest { 94 | break 95 | } 96 | devicesToAdd = append(devicesToAdd, device) 97 | } 98 | a.MetagpusAllocations = append(a.MetagpusAllocations, devicesToAdd...) 
99 | devLoad.removeDevices(devicesToAdd) 100 | break 101 | } 102 | } 103 | } 104 | // if still missing allocations, 105 | // meaning wasn't able to allocate required fractions from the same GPU 106 | // will try to allocate a fractions from different GPUs 107 | if len(a.MetagpusAllocations) != a.AllocationSize { 108 | allocationsLeft := a.AllocationSize 109 | ExitMultiGpuFractionAlloc: 110 | if allocationsLeft > 0 { 111 | for _, devLoad := range a.LoadMap { 112 | if devLoad == nil { 113 | continue 114 | } 115 | for _, device := range devLoad.Metagpus { 116 | a.MetagpusAllocations = append(a.MetagpusAllocations, device) 117 | allocationsLeft-- 118 | if allocationsLeft == 0 { 119 | goto ExitMultiGpuFractionAlloc 120 | } 121 | } 122 | } 123 | } 124 | } 125 | if len(a.MetagpusAllocations) != a.AllocationSize { 126 | log.Errorf("error during allocation, the allocationSize: %d doesn't match total allocated devices: %d", a.AllocationSize, len(a.MetagpusAllocations)) 127 | } 128 | } 129 | 130 | func (l *DeviceLoad) getFreeShares() int { 131 | return len(l.Metagpus) 132 | } 133 | 134 | func (l *DeviceLoad) removeDevices(devIds []string) { 135 | for _, devId := range devIds { 136 | for i, v := range l.Metagpus { 137 | if v == devId { 138 | l.Metagpus = append(l.Metagpus[:i], l.Metagpus[i+1:]...) 
139 | } 140 | } 141 | } 142 | } 143 | 144 | func metaDeviceIdToDeviceIndex(metaDeviceId string) (deviceIndex int) { 145 | r, _ := regexp.Compile("-\\d+-") 146 | s := strings.ReplaceAll(r.FindString(metaDeviceId), "-", "") 147 | idx, err := strconv.Atoi(s) 148 | if err != nil { 149 | log.Errorf("can't detect physical device ID from meta device id, err: %s", err) 150 | } 151 | return idx 152 | 153 | } 154 | -------------------------------------------------------------------------------- /deploy/static.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: metagpu-device-plugin/templates/rbac.yml 3 | apiVersion: v1 4 | kind: ServiceAccount 5 | metadata: 6 | name: metagpu-device-plugin 7 | namespace: cnvrg 8 | --- 9 | # Source: metagpu-device-plugin/templates/cm.yml 10 | apiVersion: v1 11 | kind: ConfigMap 12 | metadata: 13 | name: metagpu-device-plugin-config 14 | namespace: cnvrg 15 | data: 16 | config.yaml: | 17 | accelerator: nvidia 18 | processesDiscoveryPeriod: 5 19 | deviceCacheTTL: 3600 20 | jwtSecret: topSecret 21 | mgctlTar: /tmp/mgctl 22 | mgctlAutoInject: true 23 | serverAddr: 0.0.0.0:50052 24 | memoryEnforcer: true 25 | deviceToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 26 | containerToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMSJ9.o5v6Zdi1FKXQevRjuSbABBX1vIRYgN3Daz9iXabuFFA 27 | deviceSharing: 28 | - resourceName: cnvrg.io/metagpu 29 | autoReshare: true 30 | metaGpus: 2 31 | uuid: [ "*" ] 32 | --- 33 | # Source: metagpu-device-plugin/templates/rbac.yml 34 | apiVersion: rbac.authorization.k8s.io/v1 35 | kind: ClusterRole 36 | metadata: 37 | name: metagpu-device-plugin 38 | namespace: cnvrg 39 | rules: 40 | - apiGroups: 41 | - "" 42 | resources: 43 | - pods 44 | verbs: 45 | - list 46 | - get 47 | - create 48 | - apiGroups: 49 | - 
"" 50 | resources: 51 | - pods/exec 52 | verbs: 53 | - create 54 | - apiGroups: 55 | - "" 56 | resources: 57 | - configmaps 58 | resourceNames: 59 | - metagpu-device-plugin-config 60 | verbs: 61 | - get 62 | - update 63 | --- 64 | # Source: metagpu-device-plugin/templates/rbac.yml 65 | apiVersion: rbac.authorization.k8s.io/v1 66 | kind: ClusterRoleBinding 67 | metadata: 68 | name: metagpu-device-plugin 69 | namespace: cnvrg 70 | roleRef: 71 | apiGroup: rbac.authorization.k8s.io 72 | kind: ClusterRole 73 | name: metagpu-device-plugin 74 | subjects: 75 | - kind: ServiceAccount 76 | name: metagpu-device-plugin 77 | namespace: cnvrg 78 | --- 79 | # Source: metagpu-device-plugin/templates/svc.yml 80 | kind: Service 81 | apiVersion: v1 82 | metadata: 83 | name: metagpu-device-plugin 84 | namespace: cnvrg 85 | labels: 86 | app: "metagpu-exporter" 87 | spec: 88 | selector: 89 | name: metagpu-device-plugin 90 | ports: 91 | - protocol: TCP 92 | port: 50052 93 | name: grcp 94 | - protocol: TCP 95 | port: 2112 96 | name: metrics 97 | --- 98 | # Source: metagpu-device-plugin/templates/ds.yml 99 | apiVersion: apps/v1 100 | kind: DaemonSet 101 | metadata: 102 | name: metagpu-device-plugin 103 | namespace: cnvrg 104 | spec: 105 | selector: 106 | matchLabels: 107 | name: metagpu-device-plugin 108 | template: 109 | metadata: 110 | annotations: 111 | scheduler.alpha.kubernetes.io/critical-pod: "" 112 | labels: 113 | name: metagpu-device-plugin 114 | spec: 115 | tolerations: 116 | - key: CriticalAddonsOnly 117 | operator: Exists 118 | - key: nvidia.com/gpu 119 | operator: Exists 120 | effect: NoSchedule 121 | priorityClassName: "system-node-critical" 122 | imagePullSecrets: 123 | - name: regcred 124 | hostPID: true 125 | hostNetwork: true 126 | serviceAccountName: metagpu-device-plugin 127 | nodeSelector: 128 | accelerator: nvidia 129 | containers: 130 | - name: metagpu-device-plugin 131 | image: "docker.io/cnvrg/metagpu-device-plugin:DEV-13690-tot-mem-cmd-line" 132 | imagePullPolicy: 
Always 133 | command: 134 | - /usr/bin/mgdp 135 | - start 136 | - -c 137 | - /etc/metagpu-device-plugin 138 | ports: 139 | - containerPort: 50052 140 | securityContext: 141 | privileged: true 142 | env: 143 | - name: POD_IP 144 | valueFrom: 145 | fieldRef: 146 | fieldPath: status.podIP 147 | - name: MG_CTL_TOKEN 148 | value: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 149 | volumeMounts: 150 | - name: device-plugin 151 | mountPath: /var/lib/kubelet/device-plugins 152 | - name: config 153 | mountPath: /etc/metagpu-device-plugin 154 | - mountPath: /host/proc 155 | mountPropagation: HostToContainer 156 | name: proc 157 | readOnly: true 158 | - name: metagpu-exporter 159 | image: "docker.io/cnvrg/metagpu-device-plugin:DEV-13690-tot-mem-cmd-line" 160 | imagePullPolicy: Always 161 | command: 162 | - /usr/bin/mgex 163 | - start 164 | - -t 165 | - eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 166 | ports: 167 | - containerPort: 2112 168 | volumes: 169 | - name: device-plugin 170 | hostPath: 171 | path: /var/lib/kubelet/device-plugins 172 | - name: config 173 | configMap: 174 | name: metagpu-device-plugin-config 175 | - hostPath: 176 | path: /proc 177 | name: proc 178 | --- 179 | # Source: metagpu-device-plugin/templates/svcmon.yml 180 | apiVersion: monitoring.coreos.com/v1 181 | kind: ServiceMonitor 182 | metadata: 183 | name: metagpu-exporter 184 | namespace: cnvrg 185 | labels: 186 | app: "metagpu-exporter" 187 | cnvrg-infra-prometheus: cnvrg-infra-cnvrg 188 | spec: 189 | selector: 190 | matchLabels: 191 | app: "metagpu-exporter" 192 | namespaceSelector: 193 | matchNames: 194 | - cnvrg 195 | endpoints: 196 | - port: "metrics" 197 | path: "/metrics" 198 | interval: "15s" 199 | -------------------------------------------------------------------------------- 
/pkg/plugin/server.go: -------------------------------------------------------------------------------- 1 | package plugin 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | log "github.com/sirupsen/logrus" 7 | "github.com/spf13/viper" 8 | "google.golang.org/grpc" 9 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 10 | "net" 11 | "os" 12 | "path" 13 | "sort" 14 | "strings" 15 | "time" 16 | ) 17 | 18 | func (p *MetaGpuDevicePlugin) dial(socket string, timeout time.Duration) (*grpc.ClientConn, error) { 19 | c, err := grpc.Dial(socket, grpc.WithInsecure(), grpc.WithBlock(), 20 | grpc.WithContextDialer(func(ctx context.Context, s string) (net.Conn, error) { 21 | return net.DialTimeout("unix", socket, timeout) 22 | }), 23 | ) 24 | 25 | if err != nil { 26 | return nil, err 27 | } 28 | 29 | return c, nil 30 | 31 | } 32 | 33 | func (p *MetaGpuDevicePlugin) Register() error { 34 | conn, err := p.dial(pluginapi.KubeletSocket, 5*time.Second) 35 | if err != nil { 36 | return err 37 | } 38 | defer conn.Close() 39 | client := pluginapi.NewRegistrationClient(conn) 40 | req := &pluginapi.RegisterRequest{ 41 | Version: pluginapi.Version, 42 | Endpoint: path.Base(p.socket), 43 | ResourceName: p.GetDeviceSharingConfig().ResourceName, 44 | Options: &pluginapi.DevicePluginOptions{ 45 | GetPreferredAllocationAvailable: true, 46 | }, 47 | } 48 | if _, err := client.Register(context.Background(), req); err != nil { 49 | return err 50 | } 51 | return nil 52 | } 53 | 54 | func (p *MetaGpuDevicePlugin) GetDevicePluginOptions(ctx context.Context, empty *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { 55 | return &pluginapi.DevicePluginOptions{GetPreferredAllocationAvailable: true}, nil 56 | } 57 | 58 | func (p *MetaGpuDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { 59 | 60 | if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: p.GetPluginDevices()}); err != nil { 61 | log.Error(err) 62 | } 63 | 64 | for { 65 | select { 66 
| case <-p.stop: 67 | return nil 68 | case <-p.MetaGpuRecalculation: 69 | if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: p.GetPluginDevices()}); err != nil { 70 | log.Error(err) 71 | } 72 | } 73 | } 74 | } 75 | 76 | func (p *MetaGpuDevicePlugin) GetPreferredAllocation(ctx context.Context, request *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { 77 | 78 | allocResponse := &pluginapi.PreferredAllocationResponse{} 79 | for _, req := range request.ContainerRequests { 80 | allocContainerResponse := &pluginapi.ContainerPreferredAllocationResponse{} 81 | allocContainerResponse.DeviceIDs, _ = p.MetagpuAllocation(int(req.AllocationSize), req.GetAvailableDeviceIDs()) 82 | log.Info("preferred devices ids:") 83 | for _, devId := range allocContainerResponse.DeviceIDs { 84 | log.Info(devId) 85 | } 86 | allocResponse.ContainerResponses = append(allocResponse.ContainerResponses, allocContainerResponse) 87 | } 88 | return allocResponse, nil 89 | 90 | } 91 | 92 | func (p *MetaGpuDevicePlugin) Allocate(ctx context.Context, request *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { 93 | allocResponse := &pluginapi.AllocateResponse{} 94 | for _, req := range request.ContainerRequests { 95 | response := pluginapi.ContainerAllocateResponse{} 96 | sort.Strings(req.DevicesIDs) 97 | log.Info("requested devices ids:") 98 | for _, dev := range req.DevicesIDs { 99 | log.Info(dev) 100 | } 101 | realDevices := p.ParseRealDeviceId(req.DevicesIDs) 102 | response.Envs = map[string]string{ 103 | "CNVRG_META_GPU_DEVICES": strings.Join(req.DevicesIDs, ","), 104 | "NVIDIA_VISIBLE_DEVICES": strings.Join(realDevices, ","), 105 | "METAGPU_MAX_MEM": fmt.Sprintf("%d", p.GetDeviceSharingConfig().GetShareSize()*len(req.DevicesIDs)), 106 | "MG_CTL_ADDR": fmt.Sprintf("%s:50052", os.Getenv("POD_IP")), 107 | "MG_CTL_TOKEN": viper.GetString("containerToken"), 108 | } 109 | allocResponse.ContainerResponses = append(allocResponse.ContainerResponses, 
&response) 110 | } 111 | return allocResponse, nil 112 | } 113 | 114 | func (p *MetaGpuDevicePlugin) PreStartContainer(ctx context.Context, request *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { 115 | return &pluginapi.PreStartContainerResponse{}, nil 116 | } 117 | 118 | func (p *MetaGpuDevicePlugin) Serve() error { 119 | _ = os.Remove(p.socket) 120 | 121 | sock, err := net.Listen("unix", p.socket) 122 | if err != nil { 123 | log.Error(err) 124 | } 125 | log.Infof("listening on %s", p.socket) 126 | pluginapi.RegisterDevicePluginServer(p.server, p) 127 | 128 | go func() { 129 | if err := p.server.Serve(sock); err != nil { 130 | log.Errorf("gRPC server craeshed, %s", err) 131 | } 132 | }() 133 | 134 | if conn, err := p.dial(p.socket, 3*time.Second); err != nil { 135 | log.Error(err) 136 | return err 137 | } else { 138 | _ = conn.Close() 139 | log.Info("gRPC server successfully started and ready accept new connections") 140 | } 141 | return nil 142 | 143 | } 144 | 145 | func (p *MetaGpuDevicePlugin) Start() { 146 | if err := p.Serve(); err != nil { 147 | log.Fatal(err) 148 | } 149 | 150 | if err := p.Register(); err != nil { 151 | log.Fatal(err) 152 | } 153 | 154 | } 155 | 156 | func (p *MetaGpuDevicePlugin) Stop() { 157 | log.Info("stopping GRPC server") 158 | if p != nil && p.server != nil { 159 | p.server.Stop() 160 | } 161 | log.Info("removing unix socket") 162 | _ = os.Remove(p.socket) 163 | log.Info("closing all channels") 164 | close(p.stop) 165 | close(p.MetaGpuRecalculation) 166 | } 167 | 168 | func NewMetaGpuDevicePlugin(metaGpuRecalculation chan bool, deviceMgr DeviceManager) *MetaGpuDevicePlugin { 169 | if viper.GetString("accelerator") != "nvidia" { 170 | log.Fatal("accelerator not supported, currently only nvidia is supported") 171 | } 172 | return &MetaGpuDevicePlugin{ 173 | server: grpc.NewServer([]grpc.ServerOption{}...), 174 | socket: fmt.Sprintf("%s%s", pluginapi.DevicePluginPath, deviceMgr.GetUnixSocket()), 175 
| DeviceManager: deviceMgr, 176 | stop: make(chan interface{}), 177 | MetaGpuRecalculation: metaGpuRecalculation, 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ 'main' ] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | with: 13 | fetch-depth: 0 14 | 15 | - name: Bump version and push tag 16 | uses: AccessibleAI/github-tag-action@1.0.0 17 | id: tag_bump 18 | env: 19 | MSG: ${{ github.event.inputs.msg }} 20 | GITHUB_TOKEN: ${{ secrets.CNVRG_GITHUB_TOKEN }} 21 | PRERELEASE_AUTOMATIC_BUMP: true 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: '3.9' 27 | architecture: x64 28 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 29 | 30 | - name: Generate Cnvrg Changelog 31 | uses: AccessibleAI/github-changelog-action@1.0.0 32 | id: cnvrg_changelog 33 | with: 34 | from_version: ${{ steps.tag_bump.outputs.tag }} 35 | to_version: ${{ steps.tag_bump.outputs.new_tag }} 36 | jira_token: ${{ secrets.JIRA_TOKEN }} 37 | slack_webhook_url: false 38 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 39 | 40 | - name: Generate Non Cnvrg Changelog 41 | id: changelog 42 | uses: metcalfc/changelog-generator@v3.0.0 43 | with: 44 | myToken: ${{ secrets.CNVRG_GITHUB_TOKEN }} 45 | head-ref: ${{ steps.tag_bump.outputs.tag }} 46 | base-ref: ${{ steps.tag_bump.outputs.new_tag }} 47 | if: ${{ steps.cnvrg_changelog.outputs.empty == 'true' && steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 48 | 49 | - name: Generate changelog 50 | id: changelog_final 51 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && 
steps.tag_bump.outputs.prerelease == 'false' }} 52 | run: | 53 | set -o noglob 54 | if ${{ steps.cnvrg_changelog.outputs.empty }}; then 55 | log=$(cat << "EOF" 56 | ${{ steps.changelog.outputs.changelog }} 57 | EOF 58 | ) 59 | else 60 | log=$(cat << "EOF" 61 | ${{ steps.cnvrg_changelog.outputs.changelog }} 62 | EOF 63 | ) 64 | fi 65 | log="${log//'%'/'%25'}" 66 | log="${log//$'\n'/'%0A'}" 67 | log="${log//$'\r'/'%0D'}" 68 | echo "::set-output name=changelog::$log" 69 | - name: Print the final changelog 70 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 71 | run: | 72 | cat << "EOF" 73 | ${{ steps.changelog_final.outputs.changelog }} 74 | EOF 75 | - name: Changelog Release 76 | uses: softprops/action-gh-release@v1 77 | with: 78 | body: ${{steps.changelog_final.outputs.changelog}} 79 | tag_name: ${{ steps.tag_bump.outputs.new_tag }} 80 | prerelease: ${{ steps.tag_bump.outputs.prerelease }} 81 | generate_release_notes: true 82 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 83 | 84 | - name: Extract repo/branch name 85 | shell: bash 86 | if: ${{ steps.tag_bump.outputs.bumped == 'true' }} 87 | run: | 88 | echo "##[set-output name=repo;]$(echo ${{github.event.repository.name}} | sed 's/cnvrg-//')" 89 | echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/*/})" 90 | echo "##[set-output name=head;]$(git rev-parse --short HEAD)" 91 | echo "##[set-output name=repo_url;]$(echo $GITHUB_SERVER_URL/$GITHUB_REPOSITORY)" 92 | id: extract_info 93 | 94 | - name: Login to Docker Hub 95 | uses: docker/login-action@v1 96 | if: ${{ steps.tag_bump.outputs.bumped == 'true' }} 97 | with: 98 | username: ${{ secrets.DOCKER_USER}} 99 | password: ${{ secrets.DOCKER_PASSWORD}} 100 | 101 | - name: Set up Docker Buildx 102 | id: buildx 103 | uses: docker/setup-buildx-action@v1 104 | if: ${{ steps.tag_bump.outputs.bumped == 'true' }} 105 | 106 | - name: Build and push main latest 107 | 
id: docker_build_main 108 | uses: docker/build-push-action@v2 109 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master') }} 110 | with: 111 | context: ./ 112 | file: ./Dockerfile 113 | push: true 114 | tags: cnvrg/metagpu-device-plugin:latest 115 | build-args: | 116 | BUILD_SHA=${{ steps.extract_info.outputs.head }} 117 | BUILD_VERSION=latest 118 | - name: Build and push tagged image 119 | id: docker_build_tag 120 | uses: docker/build-push-action@v2 121 | if: ${{ steps.tag_bump.outputs.bumped == 'true' }} 122 | with: 123 | context: ./ 124 | file: ./Dockerfile 125 | push: true 126 | tags: cnvrg/metagpu-device-plugin:${{ steps.tag_bump.outputs.new_tag }} 127 | build-args: | 128 | BUILD_SHA=${{ steps.extract_info.outputs.head }} 129 | BUILD_VERSION=${{ steps.tag_bump.outputs.new_tag }} 130 | - name: Slack Notification 131 | uses: rtCamp/action-slack-notify@v2 132 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 133 | env: 134 | SLACK_USERNAME: Github Actions 135 | SLACK_WEBHOOK: ${{ secrets.SLACK_GITHUB_APP_TOKEN }} 136 | SLACK_CHANNEL: "#release-notes-metacloud" 137 | SLACK_ICON: https://avatars.githubusercontent.com/u/44036562?s=48&v=4 138 | SLACK_COLOR: ${{ job.status }} 139 | SLACK_FOOTER: "" 140 | MSG_MINIMAL: true 141 | SLACK_TITLE: "Repo Name" 142 | SLACK_MESSAGE: | 143 | <${{ steps.extract_info.outputs.repo_url }}|${{github.event.repository.name}}> 144 | *Docker Image: cnvrg/${{ steps.extract_info.outputs.repo }}:${{ steps.tag_bump.outputs.new_tag }}* 145 | *Version: ${{ steps.tag_bump.outputs.new_tag }}* 146 | <${{ steps.extract_info.outputs.repo_url }}/releases|Release Notes:> 147 | ${{steps.changelog_final.outputs.changelog}} 148 | -------------------------------------------------------------------------------- /pkg/gpumgr/mgr.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | 
import ( 4 | "context" 5 | "fmt" 6 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/nvmlutils" 7 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/podexec" 8 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/sharecfg" 9 | log "github.com/sirupsen/logrus" 10 | "github.com/spf13/viper" 11 | v1core "k8s.io/api/core/v1" 12 | "os" 13 | "time" 14 | ) 15 | 16 | var MB uint64 = 1024 * 1024 17 | 18 | type GpuMgr struct { 19 | ContainerLevelVisibilityToken string 20 | DeviceLevelVisibilityToken string 21 | GpuDevices []*GpuDevice 22 | // list of gpu containers 23 | gpuContainers []*GpuContainer 24 | // collection of the gpu processes: the anonymouse and active running 25 | gpuContainersCollector []*GpuContainer 26 | } 27 | 28 | type GpuDeviceInfo struct { 29 | Node string 30 | Metadata map[string]string 31 | Devices []*GpuDevice 32 | } 33 | 34 | func (m *GpuMgr) startGpuStatusCache() { 35 | go func() { 36 | for { 37 | time.Sleep(5 * time.Second) 38 | // set gpu devices 39 | m.setGpuDevices() 40 | // set gpu containers 41 | m.discoverGpuContainers() 42 | // set active gpu processes 43 | m.enrichGpuContainer() 44 | // set final gpu containers list 45 | m.setGpuContainers() 46 | } 47 | }() 48 | } 49 | 50 | func (m *GpuMgr) setGpuDevices() { 51 | var gpuDevices []*GpuDevice 52 | for idx, device := range nvmlutils.GetDevices() { 53 | uuid, ret := device.GetUUID() 54 | nvmlutils.ErrorCheck(ret) 55 | deviceMemory, ret := device.GetMemoryInfo() 56 | nvmlutils.ErrorCheck(ret) 57 | utilization, ret := device.GetUtilizationRates() 58 | nvmlutils.ErrorCheck(ret) 59 | gpuDevices = append(gpuDevices, NewGpuDevice(uuid, idx, utilization, deviceMemory)) 60 | } 61 | m.GpuDevices = gpuDevices 62 | } 63 | 64 | func (m *GpuMgr) enrichGpuContainer() { 65 | for _, device := range m.GpuDevices { 66 | for _, nvmlProcessInfo := range nvmlutils.GetComputeRunningProcesses(device.Index) { 67 | stats := 
nvmlutils.GetAccountingStats(device.Index, nvmlProcessInfo.Pid) 68 | gpuProc := NewGpuProcess(nvmlProcessInfo.Pid, stats.GpuUtilization, nvmlProcessInfo.UsedGpuMemory/MB, device.UUID) 69 | for _, c := range m.gpuContainersCollector { 70 | if c.ContainerId == gpuProc.ContainerId { 71 | c.Processes = append(c.Processes, gpuProc) 72 | } 73 | } 74 | } 75 | } 76 | } 77 | 78 | func (m *GpuMgr) setGpuContainers() { 79 | m.gpuContainers = m.gpuContainersCollector 80 | log.Infof("discovered %d gpu containers", len(m.gpuContainers)) 81 | } 82 | 83 | func (m *GpuMgr) GetDeviceInfo() *GpuDeviceInfo { 84 | hostname, err := os.Hostname() 85 | if err != nil { 86 | log.Errorf("failed to detect hostname, err: %s", err) 87 | } 88 | info := make(map[string]string) 89 | cudaVersion := nvmlutils.SystemGetCudaDriverVersion() 90 | info["cudaVersion"] = fmt.Sprintf("%d", cudaVersion) 91 | driver := nvmlutils.SystemGetDriverVersion() 92 | info["driverVersion"] = driver 93 | return &GpuDeviceInfo{Node: hostname, Metadata: info, Devices: m.GpuDevices} 94 | } 95 | 96 | func (m *GpuMgr) discoverGpuContainers() { 97 | c, err := podexec.GetK8sClient() 98 | if err != nil { 99 | log.Error(err) 100 | return 101 | } 102 | pl := &v1core.PodList{} 103 | if err := c.List(context.Background(), pl); err != nil { 104 | log.Error(err) 105 | return 106 | } 107 | // reset gpu containers collector 108 | m.gpuContainersCollector = nil 109 | cfg := sharecfg.NewDeviceSharingConfig() 110 | for _, p := range pl.Items { 111 | for _, container := range p.Spec.Containers { 112 | for _, config := range cfg.Configs { 113 | resourceName := v1core.ResourceName(config.ResourceName) 114 | if quantity, ok := container.Resources.Limits[resourceName]; ok { 115 | 116 | if viper.GetString("nodename") == "" { 117 | m.gpuContainersCollector = append(m.gpuContainersCollector, 118 | NewGpuContainer( 119 | getContainerId(&p, container.Name), 120 | container.Name, 121 | p.Name, 122 | p.Namespace, 123 | config.ResourceName, 124 | 
p.Spec.NodeName, 125 | quantity.Value(), 126 | m.GpuDevices, 127 | ), 128 | ) 129 | continue 130 | } 131 | 132 | if viper.GetString("nodename") == p.Spec.NodeName { 133 | { 134 | m.gpuContainersCollector = append(m.gpuContainersCollector, 135 | NewGpuContainer( 136 | getContainerId(&p, container.Name), 137 | container.Name, 138 | p.Name, 139 | p.Namespace, 140 | config.ResourceName, 141 | p.Spec.NodeName, 142 | quantity.Value(), 143 | m.GpuDevices, 144 | ), 145 | ) 146 | } 147 | } 148 | 149 | } 150 | } 151 | } 152 | } 153 | } 154 | 155 | func (m *GpuMgr) GetProcesses(podId string) []*GpuContainer { 156 | // if podId is set, return single process 157 | if podId != "" { 158 | var gpuContainers []*GpuContainer 159 | for _, deviceProcess := range m.gpuContainers { 160 | if deviceProcess.PodId == podId { 161 | gpuContainers = append(gpuContainers, deviceProcess) 162 | } 163 | } 164 | return gpuContainers 165 | } 166 | // return all processes 167 | return m.gpuContainers 168 | } 169 | 170 | func (m *GpuMgr) GetMetaDevices() map[string]*GpuDevice { 171 | var deviceMap = make(map[string]*GpuDevice) 172 | for _, d := range m.GpuDevices { 173 | deviceMap[d.UUID] = d 174 | } 175 | return deviceMap 176 | } 177 | 178 | func (m *GpuMgr) KillGpuProcess(pid uint32) error { 179 | p := NewGpuProcess(pid, 0, 0, "") 180 | return p.Kill() 181 | } 182 | 183 | func (m *GpuMgr) SetDeviceLevelVisibilityToken(token string) { 184 | m.DeviceLevelVisibilityToken = token 185 | } 186 | 187 | func (m *GpuMgr) SetContainerLevelVisibilityToken(token string) { 188 | m.ContainerLevelVisibilityToken = token 189 | } 190 | 191 | func NewGpuManager() *GpuMgr { 192 | mgr := &GpuMgr{} 193 | // init gpu devices 194 | mgr.setGpuDevices() 195 | // init gpu containers 196 | mgr.discoverGpuContainers() 197 | // init active gpu processes 198 | mgr.enrichGpuContainer() 199 | // set gpu processes 200 | mgr.setGpuContainers() 201 | // start gpu devices and processes cache 202 | mgr.startGpuStatusCache() 203 | // 
start mem enforcer 204 | if viper.GetBool("memoryEnforcer") { 205 | mgr.StartMemoryEnforcer() 206 | } 207 | return mgr 208 | } 209 | -------------------------------------------------------------------------------- /pkg/mgsrv/deviceapi/device/v1/device.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "context" 5 | pb "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 6 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/gpumgr" 7 | log "github.com/sirupsen/logrus" 8 | "google.golang.org/grpc/codes" 9 | "google.golang.org/grpc/status" 10 | "time" 11 | ) 12 | 13 | type DeviceService struct { 14 | pb.UnimplementedDeviceServiceServer 15 | gpuMgr *gpumgr.GpuMgr 16 | vl string // visibility level 17 | cvl string // container visibility level ID 18 | dvl string // device visibility level ID 19 | } 20 | 21 | func (s *DeviceService) LoadContext(ctx context.Context) error { 22 | 23 | s.gpuMgr = ctx.Value("gpuMgr").(*gpumgr.GpuMgr) 24 | if s.gpuMgr == nil { 25 | log.Fatalf("gpuMgr instance not set in context") 26 | } 27 | s.vl = ctx.Value("visibilityLevel").(string) 28 | s.cvl = ctx.Value("containerVl").(string) 29 | s.dvl = ctx.Value("deviceVl").(string) 30 | // stop execution if visibility level is empty 31 | if s.vl == "" { 32 | return status.Errorf(codes.Aborted, "can't detect visibility level for request: %s", s.vl) 33 | } 34 | // stop executing if container or device visibility level is empty 35 | if s.cvl == "" || s.dvl == "" { 36 | return status.Error(codes.Aborted, "can't detect visibility levels") 37 | } 38 | return nil 39 | } 40 | 41 | func (s *DeviceService) GetGpuContainers(ctx context.Context, r *pb.GetGpuContainersRequest) (*pb.GetGpuContainersResponse, error) { 42 | 43 | if err := s.LoadContext(ctx); err != nil { 44 | return &pb.GetGpuContainersResponse{}, err 45 | } 46 | response := &pb.GetGpuContainersResponse{VisibilityLevel: 
s.vl} 47 | // stop execution if visibility level is container and pod id is not set (not enough permissions) 48 | if s.vl == s.cvl && r.PodId == "" { 49 | return response, status.Errorf(codes.PermissionDenied, "missing pod id and visibility level is to low (%s), can't proceed", s.vl) 50 | } 51 | if s.vl == s.dvl { 52 | r.PodId = "" // for deviceVisibilityLevel server should return all running process on all containers 53 | } 54 | response.GpuContainers = listDeviceProcesses(r.PodId, s.gpuMgr) 55 | return response, nil 56 | } 57 | 58 | func (s *DeviceService) StreamGpuContainers(r *pb.StreamGpuContainersRequest, stream pb.DeviceService_StreamGpuContainersServer) error { 59 | 60 | for { 61 | 62 | if err := s.LoadContext(stream.Context()); err != nil { 63 | return err 64 | } 65 | // stop execution if visibility level is container and pod id is not set (not enough permissions) 66 | if s.vl == s.cvl && r.PodId == "" { 67 | return status.Errorf(codes.PermissionDenied, "missing pod id and visibility level is to low (%s), can't proceed", s.vl) 68 | } 69 | if s.vl == s.dvl { 70 | r.PodId = "" // for deviceVisibilityLevel server should return all running process on all containers 71 | } 72 | response := &pb.StreamGpuContainersResponse{VisibilityLevel: s.vl} 73 | response.GpuContainers = listDeviceProcesses(r.PodId, s.gpuMgr) 74 | if err := stream.Send(response); err != nil { 75 | return err 76 | } 77 | 78 | time.Sleep(1 * time.Second) 79 | } 80 | 81 | } 82 | 83 | func (s *DeviceService) GetDevices(ctx context.Context, r *pb.GetDevicesRequest) (*pb.GetDevicesResponse, error) { 84 | response := &pb.GetDevicesResponse{} 85 | if err := s.LoadContext(ctx); err != nil { 86 | return response, err 87 | } 88 | response.Device = make(map[string]*pb.Device) 89 | for _, device := range s.gpuMgr.GetMetaDevices() { 90 | d := &pb.Device{ 91 | Uuid: device.UUID, 92 | Index: uint32(device.Index), 93 | Shares: uint32(device.Shares), 94 | GpuUtilization: device.Utilization.Gpu, 95 | 
MemoryUtilization: device.Utilization.Memory, 96 | MemoryShareSize: device.Memory.ShareSize, 97 | ResourceName: device.ResourceName, 98 | NodeName: device.Nodename, 99 | } 100 | if s.vl == s.dvl { 101 | d.MemoryTotal = device.Memory.Total 102 | d.MemoryFree = device.Memory.Free 103 | d.MemoryUsed = device.Memory.Used 104 | } 105 | response.Device[d.Uuid] = d 106 | } 107 | return response, nil 108 | } 109 | 110 | func (s *DeviceService) KillGpuProcess(ctx context.Context, r *pb.KillGpuProcessRequest) (*pb.KillGpuProcessResponse, error) { 111 | response := &pb.KillGpuProcessResponse{} 112 | if err := s.LoadContext(ctx); err != nil { 113 | return response, err 114 | } 115 | if err := s.gpuMgr.KillGpuProcess(r.Pid); err != nil { 116 | return response, status.Errorf(codes.Internal, "error killing GPU process, err: %s", err) 117 | } 118 | return response, nil 119 | } 120 | 121 | func (s *DeviceService) GetMetaDeviceInfo(ctx context.Context, r *pb.GetMetaDeviceInfoRequest) (*pb.GetMetaDeviceInfoResponse, error) { 122 | resp := &pb.GetMetaDeviceInfoResponse{} 123 | if err := s.LoadContext(ctx); err != nil { 124 | return resp, err 125 | } 126 | if s.vl != s.dvl { 127 | return resp, status.Errorf(codes.PermissionDenied, "wrong visibility level: %s", s.vl) 128 | } 129 | deviceInfo := s.gpuMgr.GetDeviceInfo() 130 | resp.Node = deviceInfo.Node 131 | resp.Metadata = deviceInfo.Metadata 132 | for _, device := range deviceInfo.Devices { 133 | resp.Devices = append(resp.Devices, &pb.Device{ 134 | Uuid: device.UUID, 135 | Index: uint32(device.Index), 136 | Shares: uint32(device.Shares), 137 | GpuUtilization: device.Utilization.Gpu, 138 | MemoryUtilization: device.Utilization.Memory, 139 | MemoryShareSize: device.Memory.ShareSize, 140 | MemoryTotal: device.Memory.Total, 141 | MemoryFree: device.Memory.Free, 142 | MemoryUsed: device.Memory.Used, 143 | ResourceName: device.ResourceName, 144 | NodeName: device.Nodename, 145 | }) 146 | } 147 | return resp, nil 148 | } 149 | 150 | func (s 
*DeviceService) PatchConfigs(ctx context.Context, r *pb.PatchConfigsRequest) (*pb.PatchConfigsResponse, error) { 151 | //if err := s.LoadContext(ctx); err != nil { 152 | // return &pb.PatchConfigsResponse{}, err 153 | //} 154 | //if s.vl != s.dvl { 155 | // return &pb.PatchConfigsResponse{}, status.Errorf(codes.PermissionDenied, "visibility level too high", s.vl) 156 | //} 157 | //deviceplugin.UpdatePersistentConfigs(r.MetagpusPerGpu) 158 | //viper.Set("metaGpus", r.MetagpusPerGpu) 159 | //s.gpuMgr.MetaGpuRecalculation <- true 160 | return &pb.PatchConfigsResponse{}, nil 161 | 162 | } 163 | 164 | func (s *DeviceService) PingServer(ctx context.Context, r *pb.PingServerRequest) (*pb.PingServerResponse, error) { 165 | return &pb.PingServerResponse{}, nil 166 | } 167 | -------------------------------------------------------------------------------- /pkg/allocator/allocator_test.go: -------------------------------------------------------------------------------- 1 | package allocator 2 | 3 | import ( 4 | "fmt" 5 | . "github.com/onsi/ginkgo" 6 | . 
"github.com/onsi/gomega" 7 | "testing" 8 | ) 9 | 10 | func TestAllocator(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "Allocation Suite") 13 | } 14 | 15 | var _ = Describe("Metagpu allocations", func() { 16 | 17 | Context("allocate", func() { 18 | 19 | It("10% gpu", func() { 20 | physDevs := 2 21 | allocationSize := 1 22 | sharesPerGpu := 10 23 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 24 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 25 | Expect(len(alloc.MetagpusAllocations)).To(Equal(1)) 26 | expectedDevices := []string{"cnvrg-meta-0-0-test-device-0"} 27 | Expect(alloc.MetagpusAllocations).To(Equal(expectedDevices)) 28 | }) 29 | 30 | It("50% gpu", func() { 31 | physDevs := 2 32 | sharesPerGpu := 10 33 | allocationSize := 5 34 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 35 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 36 | Expect(len(alloc.MetagpusAllocations)).To(Equal(5)) 37 | Expect(alloc.MetagpusAllocations).To(Equal(getTestDevicesIds(1, 5))) 38 | }) 39 | 40 | It("80% gpu", func() { 41 | physDevs := 2 42 | sharesPerGpu := 10 43 | allocationSize := 8 44 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 45 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 46 | Expect(len(alloc.MetagpusAllocations)).To(Equal(8)) 47 | Expect(alloc.MetagpusAllocations).To(Equal(getTestDevicesIds(1, 8))) 48 | }) 49 | 50 | It("100% gpu", func() { 51 | physDevs := 2 52 | allocationSize := 10 53 | sharesPerGpu := 10 54 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 55 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 56 | Expect(len(alloc.MetagpusAllocations)).To(Equal(10)) 57 | Expect(alloc.MetagpusAllocations).To(Equal(getTestDevicesIds(1, 10))) 58 | }) 59 | 60 | It("110% gpu", func() { 61 | physDevs := 2 62 | allocationSize := 12 63 | sharesPerGpu := 10 64 | testDevices 
:= getTestDevicesIds(physDevs, sharesPerGpu) 65 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 66 | Expect(len(alloc.MetagpusAllocations)).To(Equal(12)) 67 | expectedIds := getTestDevicesIds(1, 10) 68 | expectedIds = append(expectedIds, "cnvrg-meta-1-0-test-device-1") 69 | expectedIds = append(expectedIds, "cnvrg-meta-1-1-test-device-1") 70 | Expect(alloc.MetagpusAllocations).To(Equal(expectedIds)) 71 | }) 72 | 73 | It("200% gpu", func() { 74 | physDevs := 2 75 | allocationSize := 20 76 | sharesPerGpu := 10 77 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 78 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 79 | Expect(len(alloc.MetagpusAllocations)).To(Equal(20)) 80 | Expect(alloc.MetagpusAllocations).To(Equal(getTestDevicesIds(2, 10))) 81 | }) 82 | 83 | It("single GPU -> 50% after 50% has been taken", func() { 84 | devices := []string{ 85 | //"cnvrg-meta-0-0-test-device-0", -> already allocated by previous request 86 | //"cnvrg-meta-0-1-test-device-0", -> already allocated by previous request 87 | "cnvrg-meta-0-2-test-device-0", // -> this should be allocated now 88 | "cnvrg-meta-0-3-test-device-0", // -> this should be allocated now 89 | "cnvrg-meta-1-0-test-device-1", 90 | "cnvrg-meta-1-1-test-device-1", 91 | "cnvrg-meta-1-2-test-device-1", 92 | "cnvrg-meta-1-3-test-device-1", 93 | } 94 | physDevs := 2 95 | allocationSize := 2 96 | sharesPerGpu := 4 97 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, devices) 98 | Expect(len(alloc.MetagpusAllocations)).To(Equal(2)) 99 | Expect(alloc.MetagpusAllocations).To(Equal([]string{"cnvrg-meta-0-2-test-device-0", "cnvrg-meta-0-3-test-device-0"})) 100 | }) 101 | 102 | It("single GPU -> 60% after 50% has been taken (jump to next device)", func() { 103 | devices := []string{ 104 | //"cnvrg-meta-0-0-test-device-0", -> already allocated by previous request 105 | //"cnvrg-meta-0-1-test-device-0", -> already allocated by 
previous request 106 | "cnvrg-meta-0-2-test-device-0", 107 | "cnvrg-meta-0-3-test-device-0", 108 | "cnvrg-meta-1-0-test-device-1", // -> this should be allocated now 109 | "cnvrg-meta-1-1-test-device-1", // -> this should be allocated now 110 | "cnvrg-meta-1-2-test-device-1", // -> this should be allocated now 111 | "cnvrg-meta-1-3-test-device-1", 112 | } 113 | physDevs := 2 114 | allocationSize := 3 115 | sharesPerGpu := 4 116 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, devices) 117 | Expect(len(alloc.MetagpusAllocations)).To(Equal(3)) 118 | Expect(alloc.MetagpusAllocations).To(Equal([]string{ 119 | "cnvrg-meta-1-0-test-device-1", 120 | "cnvrg-meta-1-1-test-device-1", 121 | "cnvrg-meta-1-2-test-device-1", 122 | })) 123 | }) 124 | 125 | It("allocate fractions from 2 different physical gpus", func() { 126 | devices := []string{ 127 | //"cnvrg-meta-0-0-test-device-0", -> already allocated by previous request 128 | //"cnvrg-meta-0-1-test-device-0", -> already allocated by previous request 129 | "cnvrg-meta-0-2-test-device-0", // -> this should be allocated now 130 | "cnvrg-meta-0-3-test-device-0", // -> this should be allocated now 131 | //"cnvrg-meta-1-0-test-device-1", -> already allocated by previous request 132 | //"cnvrg-meta-1-1-test-device-1", -> already allocated by previous request 133 | "cnvrg-meta-1-2-test-device-1", // -> this should be allocated now 134 | "cnvrg-meta-1-3-test-device-1", 135 | } 136 | physDevs := 2 137 | allocationSize := 3 138 | sharesPerGpu := 4 139 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, devices) 140 | Expect(len(alloc.MetagpusAllocations)).To(Equal(3)) 141 | Expect(alloc.MetagpusAllocations).To(Equal([]string{ 142 | "cnvrg-meta-0-2-test-device-0", 143 | "cnvrg-meta-0-3-test-device-0", 144 | "cnvrg-meta-1-2-test-device-1", 145 | })) 146 | }) 147 | It("allocate full from 2 different physical gpus", func() { 148 | devices := []string{ 149 | //"cnvrg-meta-0-0-test-device-0", -> 
already allocated by previous request 150 | //"cnvrg-meta-0-1-test-device-0", -> already allocated by previous request 151 | "cnvrg-meta-0-2-test-device-0", // -> this should be allocated now 152 | "cnvrg-meta-0-3-test-device-0", // -> this should be allocated now 153 | //"cnvrg-meta-1-0-test-device-1", -> already allocated by previous request 154 | //"cnvrg-meta-1-1-test-device-1", -> already allocated by previous request 155 | "cnvrg-meta-1-2-test-device-1", // -> this should be allocated now 156 | "cnvrg-meta-1-3-test-device-1", // -> this should be allocated now 157 | } 158 | physDevs := 2 159 | allocationSize := 4 160 | sharesPerGpu := 4 161 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, devices) 162 | Expect(len(alloc.MetagpusAllocations)).To(Equal(4)) 163 | Expect(alloc.MetagpusAllocations).To(Equal([]string{ 164 | "cnvrg-meta-0-2-test-device-0", 165 | "cnvrg-meta-0-3-test-device-0", 166 | "cnvrg-meta-1-2-test-device-1", 167 | "cnvrg-meta-1-3-test-device-1", 168 | })) 169 | }) 170 | }) 171 | }) 172 | 173 | func getTestDevicesIds(physicalDevices, sharesPerGpu int) (metagpus []string) { 174 | for i := 0; i < physicalDevices; i++ { 175 | for j := 0; j < sharesPerGpu; j++ { 176 | metagpus = append(metagpus, fmt.Sprintf("cnvrg-meta-%d-%d-test-device-%d", i, j, i)) 177 | } 178 | } 179 | return 180 | } 181 | -------------------------------------------------------------------------------- /cmd/mgctl/get.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | pbdevice "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 6 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/ctlutils" 7 | "github.com/atomicgo/cursor" 8 | "github.com/jedib0t/go-pretty/v6/table" 9 | log "github.com/sirupsen/logrus" 10 | "github.com/spf13/cobra" 11 | "github.com/spf13/viper" 12 | "io" 13 | "os" 14 | "os/signal" 15 | "syscall" 16 
| "time" 17 | ) 18 | 19 | var getCmd = &cobra.Command{ 20 | Use: "get", 21 | Aliases: []string{"g"}, 22 | Short: "get resources", 23 | } 24 | 25 | var processGetParams = []param{ 26 | {name: "watch", shorthand: "w", value: false, usage: "watch for the changes"}, 27 | } 28 | 29 | var processesGetCmd = &cobra.Command{ 30 | Use: "processes", 31 | Aliases: []string{"p", "process"}, 32 | Short: "list gpu processes and processes metadata", 33 | Run: func(cmd *cobra.Command, args []string) { 34 | getDevicesProcesses() 35 | }, 36 | } 37 | 38 | var getDevicesCmd = &cobra.Command{ 39 | Use: "devices", 40 | Aliases: []string{"d", "device"}, 41 | Short: "get gpu devices", 42 | Run: func(cmd *cobra.Command, args []string) { 43 | getDevices() 44 | }, 45 | } 46 | 47 | func getDevices() { 48 | conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 49 | if conn == nil { 50 | log.Fatalf("can't initiate connection to metagpu server") 51 | } 52 | defer conn.Close() 53 | device := pbdevice.NewDeviceServiceClient(conn) 54 | resp, err := device.GetMetaDeviceInfo(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetMetaDeviceInfoRequest{}) 55 | if err != nil { 56 | log.Fatal(err) 57 | } 58 | to := &TableOutput{} 59 | to.header = table.Row{"Idx", "UUID", "Memory", "Shares", "Share size"} 60 | to.body, to.footer = buildDeviceInfoTableBody(resp.Devices) 61 | to.buildTable() 62 | to.print() 63 | 64 | } 65 | 66 | func getDevicesProcesses() { 67 | 68 | conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 69 | if conn == nil { 70 | log.Fatalf("can't initiate connection to metagpu server") 71 | } 72 | defer conn.Close() 73 | device := pbdevice.NewDeviceServiceClient(conn) 74 | hostname, err := os.Hostname() 75 | if err != nil { 76 | log.Errorf("faild to detect podId, err: %s", err) 77 | } 78 | 79 | to := &TableOutput{} 80 | to.header = table.Row{"Pod", "NS", "Device", "Node", "GPU", "Memory", "Pid", "Cmd", "Req"} 81 | 82 | if viper.GetBool("watch") 
{ 83 | request := &pbdevice.StreamGpuContainersRequest{PodId: hostname} 84 | stream, err := device.StreamGpuContainers(ctlutils.AuthenticatedContext(viper.GetString("token")), request) 85 | if err != nil { 86 | log.Fatal(err) 87 | } 88 | 89 | refreshCh := make(chan bool) 90 | sigCh := make(chan os.Signal, 1) 91 | signal.Notify(sigCh, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) 92 | 93 | go func() { 94 | for { 95 | time.Sleep(1 * time.Second) 96 | refreshCh <- true 97 | } 98 | }() 99 | 100 | for { 101 | select { 102 | case <-sigCh: 103 | cursor.ClearLine() 104 | log.Info("shutting down") 105 | os.Exit(0) 106 | case <-refreshCh: 107 | processResp, err := stream.Recv() 108 | if err == io.EOF { 109 | break 110 | } 111 | if err != nil { 112 | log.Fatalf("error watching gpu processes, err: %s", err) 113 | } 114 | deviceResp, err := device.GetDevices(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetDevicesRequest{}) 115 | if err != nil { 116 | log.Errorf("falid to list devices, err: %s ", err) 117 | return 118 | } 119 | to.body = buildDeviceProcessesTableBody(processResp.GpuContainers) 120 | to.footer = buildDeviceProcessesTableFooter(processResp.GpuContainers, deviceResp.Device, processResp.VisibilityLevel) 121 | to.buildTable() 122 | to.print() 123 | } 124 | } 125 | } else { 126 | processResp, err := device.GetGpuContainers(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetGpuContainersRequest{PodId: hostname}) 127 | if err != nil { 128 | log.Errorf("falid to list device processes, err: %s ", err) 129 | return 130 | } 131 | deviceResp, err := device.GetDevices(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetDevicesRequest{}) 132 | if err != nil { 133 | log.Errorf("falid to list devices, err: %s ", err) 134 | return 135 | } 136 | to.body = buildDeviceProcessesTableBody(processResp.GpuContainers) 137 | to.footer = buildDeviceProcessesTableFooter(processResp.GpuContainers, 
deviceResp.Device, processResp.VisibilityLevel) 138 | to.buildTable() 139 | to.print() 140 | } 141 | } 142 | 143 | func buildDeviceInfoTableBody(devices []*pbdevice.Device) (body []table.Row, footer table.Row) { 144 | var totMem uint64 145 | var shares uint32 146 | for _, d := range devices { 147 | shares = d.Shares 148 | totMem += d.MemoryTotal 149 | body = append(body, table.Row{ 150 | d.Index, 151 | d.Uuid, 152 | d.MemoryTotal, 153 | d.Shares, 154 | d.MemoryShareSize, 155 | }) 156 | } 157 | footer = table.Row{len(devices), "", fmt.Sprintf("%dMB", totMem), uint32(len(devices)) * shares, ""} 158 | return body, footer 159 | } 160 | 161 | func buildDeviceProcessesTableBody(containers []*pbdevice.GpuContainer) (body []table.Row) { 162 | 163 | for _, c := range containers { 164 | if len(c.ContainerDevices) > 0 { 165 | maxMem := int64(c.ContainerDevices[0].Device.MemoryShareSize * uint64(c.MetagpuRequests)) 166 | if len(c.DeviceProcesses) > 0 { 167 | for _, p := range c.DeviceProcesses { 168 | relativeGpuUsage := (p.GpuUtilization * 100) / (100 / c.ContainerDevices[0].Device.Shares * uint32(c.MetagpuRequests)) 169 | gpuUsage := fmt.Sprintf("\u001B[32m%d%%\u001B[0m", relativeGpuUsage) 170 | if relativeGpuUsage > 100 { 171 | gpuUsage = fmt.Sprintf("\u001B[31m%d%%\u001B[0m", relativeGpuUsage) 172 | } 173 | memUsage := fmt.Sprintf("\u001B[32m%d\u001B[0m/%d", p.Memory, maxMem) 174 | if int64(p.Memory) > maxMem { 175 | memUsage = fmt.Sprintf("\u001B[31m%d\u001B[0m/%d", p.Memory, maxMem) 176 | } 177 | body = append(body, table.Row{ 178 | c.PodId, 179 | c.PodNamespace, 180 | formatContainerDeviceIndexes(c), 181 | c.NodeName, 182 | gpuUsage, 183 | memUsage, 184 | p.Pid, 185 | p.Cmdline, 186 | c.MetagpuRequests, 187 | }) 188 | } 189 | } else { 190 | memUsage := fmt.Sprintf("\u001B[32m%d\u001B[0m/%d", 0, maxMem) 191 | body = append(body, table.Row{ 192 | c.PodId, 193 | c.PodNamespace, 194 | formatContainerDeviceIndexes(c), 195 | c.NodeName, 196 | "-", 197 | memUsage, 198 | "-", 
199 | "-", 200 | c.MetagpuRequests, 201 | }) 202 | } 203 | } else { 204 | body = append(body, table.Row{ 205 | c.PodId, 206 | c.PodNamespace, 207 | formatContainerDeviceIndexes(c), 208 | c.NodeName, 209 | "-", 210 | "-", 211 | "-", 212 | "-", 213 | c.MetagpuRequests, 214 | }) 215 | } 216 | 217 | } 218 | 219 | return 220 | } 221 | 222 | func buildDeviceProcessesTableFooter(containers []*pbdevice.GpuContainer, devices map[string]*pbdevice.Device, vl string) (footer table.Row) { 223 | metaGpuSummary := fmt.Sprintf("%d", getTotalRequests(containers)) 224 | // TODO: fix this, the vl should be taken from directly form the package 225 | // to problem is that package now includes the nvidia linux native stuff 226 | // and some package re-org is required 227 | //if vl == "l0" { // TODO: temporary disabled 228 | metaGpuSummary = fmt.Sprintf("%d/%d", getTotalShares(devices), getTotalRequests(containers)) 229 | //} 230 | usedMem := fmt.Sprintf("%dMb", getTotalMemoryUsedByProcesses(containers)) 231 | return table.Row{len(containers), "", "", "", "", usedMem, "", "", metaGpuSummary} 232 | } 233 | -------------------------------------------------------------------------------- /cmd/mgex/exporter.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | pbdevice "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 7 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/ctlutils" 8 | "github.com/prometheus/client_golang/prometheus" 9 | "github.com/prometheus/client_golang/prometheus/promhttp" 10 | log "github.com/sirupsen/logrus" 11 | "github.com/spf13/viper" 12 | "google.golang.org/grpc" 13 | "net" 14 | "net/http" 15 | "time" 16 | ) 17 | 18 | var ( 19 | conn *grpc.ClientConn 20 | devicesCache map[string]*pbdevice.Device 21 | 22 | deviceShares = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 23 | Namespace: "metagpu", 24 | Subsystem: 
"device",
		Name:      "shares",
		Help:      "total shares for single gpu unit",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceMemTotal = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "device",
		Name:      "memory_total",
		Help:      "total memory per device",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceMemFree = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "device",
		Name:      "memory_free",
		Help:      "free memory per device",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceMemUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "device",
		Name:      "memory_used",
		Help:      "used memory per device",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceMemShareSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "device",
		Name:      "memory_share_size",
		Help:      "metagpu memory share size",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceProcessAbsoluteGpuUtilization = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "absolute_gpu_utilization",
		Help:      "gpu process utilization in percentage",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMemoryUsage = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "memory_usage",
		Help:      "process gpu-memory usage",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMetagpuRequests = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "metagpu_requests",
		Help:      "total metagpu requests in deployment spec",
	}, []string{"pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMaxAllowedMetagpuGPUUtilization = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "max_allowed_metagpu_gpu_utilization",
		Help:      "max allowed metagpu gpu utilization",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMetagpuRelativeGPUUtilization = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "metagpu_relative_gpu_utilization",
		Help:      "relative to metagpu request gpu utilization",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMaxAllowedMetaGpuMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "max_allowed_metagpu_memory",
		Help:      "max allowed metagpu memory usage",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMetagpuRelativeMemoryUtilization = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "metagpu_relative_memory_utilization",
		Help:      "relative to metagpus request memory utilization",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})
)

// getGpuContainers fetches the gpu containers from the metagpu server.
func getGpuContainers() []*pbdevice.GpuContainer {
	devices := pbdevice.NewDeviceServiceClient(conn)
	req := &pbdevice.GetGpuContainersRequest{}
	ctx := ctlutils.AuthenticatedContext(viper.GetString("token"))
	resp, err := devices.GetGpuContainers(ctx, req)
	if err != nil {
		log.Error(err)
		return nil
	}
	return resp.GpuContainers
}

// getGpuDevicesInfo fetches full device details from the metagpu server.
func getGpuDevicesInfo() []*pbdevice.Device {
	devices := pbdevice.NewDeviceServiceClient(conn)
	req := &pbdevice.GetMetaDeviceInfoRequest{}
	ctx := ctlutils.AuthenticatedContext(viper.GetString("token"))
	resp, err := devices.GetMetaDeviceInfo(ctx, req)
	if err != nil {
		log.Error(err)
		return nil
	}
	return resp.Devices
}

// setGpuDevicesCache lazily fetches the device map and caches it until
// clearGpuDevicesCache is called.
func setGpuDevicesCache() map[string]*pbdevice.Device {
	if devicesCache != nil {
		return devicesCache
	}
	devices := pbdevice.NewDeviceServiceClient(conn)
	req := &pbdevice.GetDevicesRequest{}
	ctx := ctlutils.AuthenticatedContext(viper.GetString("token"))
	resp, err := devices.GetDevices(ctx, req)
	if err != nil {
		log.Error(err)
		// bug fix: the cache used to be pre-populated with an empty non-nil
		// map before the call, so an RPC error permanently poisoned it and
		// every later call returned the empty cache without retrying.
		return nil
	}
	devicesCache = resp.Device
	return devicesCache
}

// clearGpuDevicesCache drops the cached device map so the next call refetches.
func clearGpuDevicesCache() {
	devicesCache = nil
}

// setDevicesMetrics publishes per-device gauges.
func setDevicesMetrics() {
	// GPU device metrics
	for _, d := range getGpuDevicesInfo() {
		labels := []string{d.Uuid, fmt.Sprintf("%d", d.Index), d.ResourceName, d.NodeName}
		deviceShares.WithLabelValues(labels...).Set(float64(d.Shares))
		deviceMemTotal.WithLabelValues(labels...).Set(float64(d.MemoryTotal))
		deviceMemFree.WithLabelValues(labels...).Set(float64(d.MemoryFree))
		deviceMemUsed.WithLabelValues(labels...).Set(float64(d.MemoryUsed))
		deviceMemShareSize.WithLabelValues(labels...).Set(float64(d.MemoryShareSize))
	}
}

// resetProcessLevelMetrics clears every process-level gauge so stale label
// sets from exited processes are not reported again.
func resetProcessLevelMetrics() {
	deviceProcessAbsoluteGpuUtilization.Reset()
	deviceProcessMemoryUsage.Reset()
	deviceProcessMetagpuRequests.Reset()
	deviceProcessMaxAllowedMetagpuGPUUtilization.Reset()
	deviceProcessMetagpuRelativeGPUUtilization.Reset()
	deviceProcessMaxAllowedMetaGpuMemory.Reset()
	deviceProcessMetagpuRelativeMemoryUtilization.Reset()
}

// setProcessesMetrics publishes per-container and per-process gauges.
func setProcessesMetrics() {
	// reset metrics
	resetProcessLevelMetrics()
	// GPU processes metrics
	for _, c := range getGpuContainers() {
		// metagpu requests
		deviceProcessMetagpuRequests.WithLabelValues(
			c.PodId, c.PodNamespace, c.ResourceName, c.NodeName).Set(float64(c.MetagpuRequests))
		// if pod has processes expose process metrics
		if len(c.DeviceProcesses) > 0 {
			for _, p := range c.DeviceProcesses {
				// set labels for device process level metrics
				labels := []string{
					p.Uuid, fmt.Sprintf("%d", p.Pid), p.Cmdline, p.User, c.PodId, c.PodNamespace, c.ResourceName, c.NodeName}
				// absolute memory and gpu usage
				deviceProcessAbsoluteGpuUtilization.WithLabelValues(labels...).Set(float64(p.GpuUtilization))
				deviceProcessMemoryUsage.WithLabelValues(labels...).Set(float64(p.Memory))
				// max (relative to metagpus request) allowed gpu and memory utilization
				deviceProcessMaxAllowedMetagpuGPUUtilization.WithLabelValues(labels...).Set(getMaxAllowedMetagpuGPUUtilization(c))
				deviceProcessMaxAllowedMetaGpuMemory.WithLabelValues(labels...).Set(getMaxAllowedMetaGpuMemory(c))
				// relative gpu and memory utilization
				deviceProcessMetagpuRelativeGPUUtilization.WithLabelValues(labels...).Set(getRelativeGPUUtilization(c, p))
				deviceProcessMetagpuRelativeMemoryUtilization.WithLabelValues(labels...).Set(getRelativeMemoryUtilization(c, p))
			}
		} else { // pod doesn't have any processes, all the metrics should be set to 0
			labels := []string{
				"-", "-", "-", "-", c.PodId, c.PodNamespace, c.ResourceName, c.NodeName}
			// absolute memory and gpu usage
			deviceProcessAbsoluteGpuUtilization.WithLabelValues(labels...).Set(0)
			deviceProcessMemoryUsage.WithLabelValues(labels...).Set(0)
			// max (relative to metagpus
request) allowed gpu and memory utilization 205 | deviceProcessMaxAllowedMetagpuGPUUtilization.WithLabelValues(labels...).Set(0) 206 | deviceProcessMaxAllowedMetaGpuMemory.WithLabelValues(labels...).Set(0) 207 | // relative gpu and memory utilization 208 | deviceProcessMetagpuRelativeGPUUtilization.WithLabelValues(labels...).Set(0) 209 | deviceProcessMetagpuRelativeMemoryUtilization.WithLabelValues(labels...).Set(0) 210 | } 211 | } 212 | } 213 | 214 | func getMaxAllowedMetagpuGPUUtilization(c *pbdevice.GpuContainer) float64 { 215 | l := log.WithField("pod", c.PodId) 216 | d, err := getFirstContainerDevice(c) 217 | if err != nil { 218 | l.Error(err) 219 | return 0 220 | } 221 | return float64((100 / d.Device.Shares) * uint32(c.MetagpuRequests)) 222 | } 223 | 224 | func getMaxAllowedMetaGpuMemory(c *pbdevice.GpuContainer) float64 { 225 | l := log.WithField("pod", c.PodId) 226 | d, err := getFirstContainerDevice(c) 227 | if err != nil { 228 | l.Error(err) 229 | return 0 230 | } 231 | return float64(uint64(c.MetagpuRequests) * d.Device.MemoryShareSize) 232 | 233 | } 234 | 235 | func getFirstContainerDevice(c *pbdevice.GpuContainer) (*pbdevice.ContainerDevice, error) { 236 | if len(c.ContainerDevices) == 0 { 237 | return nil, errors.New("no allocated gpus found") 238 | } 239 | return c.ContainerDevices[0], nil 240 | } 241 | 242 | func getRelativeGPUUtilization(c *pbdevice.GpuContainer, p *pbdevice.DeviceProcess) float64 { 243 | l := log.WithField("pod", c.PodId) 244 | d, err := getFirstContainerDevice(c) 245 | if err != nil { 246 | l.Error(err) 247 | return 0 248 | } 249 | maxMetaGpuUtilization := (100 / d.Device.Shares) * uint32(c.MetagpuRequests) 250 | metaGpuUtilization := 0 251 | if p.GpuUtilization > 0 && maxMetaGpuUtilization > 0 { 252 | metaGpuUtilization = int((p.GpuUtilization * 100) / maxMetaGpuUtilization) 253 | } 254 | return float64(metaGpuUtilization) 255 | } 256 | 257 | func getRelativeMemoryUtilization(c *pbdevice.GpuContainer, p *pbdevice.DeviceProcess) 
float64 { 258 | l := log.WithField("pod", c.PodId) 259 | d, err := getFirstContainerDevice(c) 260 | if err != nil { 261 | l.Error(err) 262 | return 0 263 | } 264 | maxMetaMemory := int(uint64(c.MetagpuRequests) * d.Device.MemoryShareSize) 265 | metaMemUtilization := 0 266 | if maxMetaMemory > 0 { 267 | metaMemUtilization = (int(p.Memory) * 100) / maxMetaMemory 268 | } 269 | return float64(metaMemUtilization) 270 | } 271 | 272 | func recordMetrics() { 273 | go func() { 274 | for { 275 | conn = ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("mgsrv")) 276 | if conn == nil { 277 | log.Fatal("connection is nil, can't continue") 278 | continue 279 | } 280 | // load devices cache 281 | setGpuDevicesCache() 282 | // set devices level metrics 283 | setDevicesMetrics() 284 | // set processes level metrics 285 | setProcessesMetrics() 286 | // close grcp connections 287 | conn.Close() 288 | // clear the cache 289 | clearGpuDevicesCache() 290 | time.Sleep(15 * time.Second) 291 | } 292 | }() 293 | } 294 | 295 | func startExporter() { 296 | 297 | log.Info("starting metagpu metrics exporter") 298 | prometheus.MustRegister(deviceShares) 299 | prometheus.MustRegister(deviceMemTotal) 300 | prometheus.MustRegister(deviceMemFree) 301 | prometheus.MustRegister(deviceMemUsed) 302 | prometheus.MustRegister(deviceMemShareSize) 303 | prometheus.MustRegister(deviceProcessAbsoluteGpuUtilization) 304 | prometheus.MustRegister(deviceProcessMemoryUsage) 305 | prometheus.MustRegister(deviceProcessMetagpuRequests) 306 | prometheus.MustRegister(deviceProcessMaxAllowedMetagpuGPUUtilization) 307 | prometheus.MustRegister(deviceProcessMetagpuRelativeGPUUtilization) 308 | prometheus.MustRegister(deviceProcessMaxAllowedMetaGpuMemory) 309 | prometheus.MustRegister(deviceProcessMetagpuRelativeMemoryUtilization) 310 | recordMetrics() 311 | addr := viper.GetString("metrics-addr") 312 | http.Handle("/metrics", promhttp.Handler()) 313 | l, err := net.Listen("tcp", addr) 314 | if err != nil { 315 | 
log.Error(err) 316 | return 317 | } 318 | log.Infof("metrics serving on http://%s/metrics", addr) 319 | if err := http.Serve(l, nil); err != nil { 320 | log.Error(err) 321 | return 322 | } 323 | } 324 | -------------------------------------------------------------------------------- /gen/proto/go/device/v1/device_grpc.pb.go: -------------------------------------------------------------------------------- 1 | // Code generated by protoc-gen-go-grpc. DO NOT EDIT. 2 | 3 | package devicev1 4 | 5 | import ( 6 | context "context" 7 | grpc "google.golang.org/grpc" 8 | codes "google.golang.org/grpc/codes" 9 | status "google.golang.org/grpc/status" 10 | ) 11 | 12 | // This is a compile-time assertion to ensure that this generated file 13 | // is compatible with the grpc package it is being compiled against. 14 | // Requires gRPC-Go v1.32.0 or later. 15 | const _ = grpc.SupportPackageIsVersion7 16 | 17 | // DeviceServiceClient is the client API for DeviceService service. 18 | // 19 | // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. 
type DeviceServiceClient interface {
	GetGpuContainers(ctx context.Context, in *GetGpuContainersRequest, opts ...grpc.CallOption) (*GetGpuContainersResponse, error)
	StreamGpuContainers(ctx context.Context, in *StreamGpuContainersRequest, opts ...grpc.CallOption) (DeviceService_StreamGpuContainersClient, error)
	GetDevices(ctx context.Context, in *GetDevicesRequest, opts ...grpc.CallOption) (*GetDevicesResponse, error)
	KillGpuProcess(ctx context.Context, in *KillGpuProcessRequest, opts ...grpc.CallOption) (*KillGpuProcessResponse, error)
	PatchConfigs(ctx context.Context, in *PatchConfigsRequest, opts ...grpc.CallOption) (*PatchConfigsResponse, error)
	GetMetaDeviceInfo(ctx context.Context, in *GetMetaDeviceInfoRequest, opts ...grpc.CallOption) (*GetMetaDeviceInfoResponse, error)
	PingServer(ctx context.Context, in *PingServerRequest, opts ...grpc.CallOption) (*PingServerResponse, error)
}

type deviceServiceClient struct {
	cc grpc.ClientConnInterface
}

func NewDeviceServiceClient(cc grpc.ClientConnInterface) DeviceServiceClient {
	return &deviceServiceClient{cc}
}

func (c *deviceServiceClient) GetGpuContainers(ctx context.Context, in *GetGpuContainersRequest, opts ...grpc.CallOption) (*GetGpuContainersResponse, error) {
	out := new(GetGpuContainersResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/GetGpuContainers", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) StreamGpuContainers(ctx context.Context, in *StreamGpuContainersRequest, opts ...grpc.CallOption) (DeviceService_StreamGpuContainersClient, error) {
	stream, err := c.cc.NewStream(ctx, &DeviceService_ServiceDesc.Streams[0], "/device.v1.DeviceService/StreamGpuContainers", opts...)
	if err != nil {
		return nil, err
	}
	x := &deviceServiceStreamGpuContainersClient{stream}
	if err := x.ClientStream.SendMsg(in); err != nil {
		return nil, err
	}
	if err := x.ClientStream.CloseSend(); err != nil {
		return nil, err
	}
	return x, nil
}

type DeviceService_StreamGpuContainersClient interface {
	Recv() (*StreamGpuContainersResponse, error)
	grpc.ClientStream
}

type deviceServiceStreamGpuContainersClient struct {
	grpc.ClientStream
}

func (x *deviceServiceStreamGpuContainersClient) Recv() (*StreamGpuContainersResponse, error) {
	m := new(StreamGpuContainersResponse)
	if err := x.ClientStream.RecvMsg(m); err != nil {
		return nil, err
	}
	return m, nil
}

func (c *deviceServiceClient) GetDevices(ctx context.Context, in *GetDevicesRequest, opts ...grpc.CallOption) (*GetDevicesResponse, error) {
	out := new(GetDevicesResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/GetDevices", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) KillGpuProcess(ctx context.Context, in *KillGpuProcessRequest, opts ...grpc.CallOption) (*KillGpuProcessResponse, error) {
	out := new(KillGpuProcessResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/KillGpuProcess", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) PatchConfigs(ctx context.Context, in *PatchConfigsRequest, opts ...grpc.CallOption) (*PatchConfigsResponse, error) {
	out := new(PatchConfigsResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/PatchConfigs", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) GetMetaDeviceInfo(ctx context.Context, in *GetMetaDeviceInfoRequest, opts ...grpc.CallOption) (*GetMetaDeviceInfoResponse, error) {
	out := new(GetMetaDeviceInfoResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/GetMetaDeviceInfo", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) PingServer(ctx context.Context, in *PingServerRequest, opts ...grpc.CallOption) (*PingServerResponse, error) {
	out := new(PingServerResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/PingServer", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

// DeviceServiceServer is the server API for DeviceService service.
// All implementations must embed UnimplementedDeviceServiceServer
// for forward compatibility
type DeviceServiceServer interface {
	GetGpuContainers(context.Context, *GetGpuContainersRequest) (*GetGpuContainersResponse, error)
	StreamGpuContainers(*StreamGpuContainersRequest, DeviceService_StreamGpuContainersServer) error
	GetDevices(context.Context, *GetDevicesRequest) (*GetDevicesResponse, error)
	KillGpuProcess(context.Context, *KillGpuProcessRequest) (*KillGpuProcessResponse, error)
	PatchConfigs(context.Context, *PatchConfigsRequest) (*PatchConfigsResponse, error)
	GetMetaDeviceInfo(context.Context, *GetMetaDeviceInfoRequest) (*GetMetaDeviceInfoResponse, error)
	PingServer(context.Context, *PingServerRequest) (*PingServerResponse, error)
	mustEmbedUnimplementedDeviceServiceServer()
}

// UnimplementedDeviceServiceServer must be embedded to have forward compatible implementations.
type UnimplementedDeviceServiceServer struct {
}

func (UnimplementedDeviceServiceServer) GetGpuContainers(context.Context, *GetGpuContainersRequest) (*GetGpuContainersResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method GetGpuContainers not implemented")
}
func (UnimplementedDeviceServiceServer) StreamGpuContainers(*StreamGpuContainersRequest, DeviceService_StreamGpuContainersServer) error {
	return status.Errorf(codes.Unimplemented, "method StreamGpuContainers not implemented")
}
func (UnimplementedDeviceServiceServer) GetDevices(context.Context, *GetDevicesRequest) (*GetDevicesResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method GetDevices not implemented")
}
func (UnimplementedDeviceServiceServer) KillGpuProcess(context.Context, *KillGpuProcessRequest) (*KillGpuProcessResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method KillGpuProcess not implemented")
}
func (UnimplementedDeviceServiceServer) PatchConfigs(context.Context, *PatchConfigsRequest) (*PatchConfigsResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method PatchConfigs not implemented")
}
func (UnimplementedDeviceServiceServer) GetMetaDeviceInfo(context.Context, *GetMetaDeviceInfoRequest) (*GetMetaDeviceInfoResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method GetMetaDeviceInfo not implemented")
}
func (UnimplementedDeviceServiceServer) PingServer(context.Context, *PingServerRequest) (*PingServerResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method PingServer not implemented")
}
func (UnimplementedDeviceServiceServer) mustEmbedUnimplementedDeviceServiceServer() {}

// UnsafeDeviceServiceServer may be embedded to opt out of forward compatibility for this service.
// Use of this interface is not recommended, as added methods to DeviceServiceServer will
// result in compilation errors.
type UnsafeDeviceServiceServer interface {
	mustEmbedUnimplementedDeviceServiceServer()
}

func RegisterDeviceServiceServer(s grpc.ServiceRegistrar, srv DeviceServiceServer) {
	s.RegisterService(&DeviceService_ServiceDesc, srv)
}

func _DeviceService_GetGpuContainers_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(GetGpuContainersRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).GetGpuContainers(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/GetGpuContainers",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).GetGpuContainers(ctx, req.(*GetGpuContainersRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_StreamGpuContainers_Handler(srv interface{}, stream grpc.ServerStream) error {
	m := new(StreamGpuContainersRequest)
	if err := stream.RecvMsg(m); err != nil {
		return err
	}
	return srv.(DeviceServiceServer).StreamGpuContainers(m, &deviceServiceStreamGpuContainersServer{stream})
}

type DeviceService_StreamGpuContainersServer interface {
	Send(*StreamGpuContainersResponse) error
	grpc.ServerStream
}

type deviceServiceStreamGpuContainersServer struct {
	grpc.ServerStream
}

func (x *deviceServiceStreamGpuContainersServer) Send(m *StreamGpuContainersResponse) error {
	return x.ServerStream.SendMsg(m)
}

func _DeviceService_GetDevices_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(GetDevicesRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).GetDevices(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/GetDevices",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).GetDevices(ctx, req.(*GetDevicesRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_KillGpuProcess_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(KillGpuProcessRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).KillGpuProcess(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/KillGpuProcess",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).KillGpuProcess(ctx, req.(*KillGpuProcessRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_PatchConfigs_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(PatchConfigsRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).PatchConfigs(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/PatchConfigs",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).PatchConfigs(ctx, req.(*PatchConfigsRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_GetMetaDeviceInfo_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(GetMetaDeviceInfoRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).GetMetaDeviceInfo(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/GetMetaDeviceInfo",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).GetMetaDeviceInfo(ctx, req.(*GetMetaDeviceInfoRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_PingServer_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(PingServerRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).PingServer(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/PingServer",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).PingServer(ctx, req.(*PingServerRequest))
	}
	return interceptor(ctx, in, info, handler)
}

// DeviceService_ServiceDesc is the grpc.ServiceDesc for DeviceService service.
// It's only intended for direct use with grpc.RegisterService,
// and not to be introspected or modified (even as a copy)
var DeviceService_ServiceDesc = grpc.ServiceDesc{
	ServiceName: "device.v1.DeviceService",
	HandlerType: (*DeviceServiceServer)(nil),
	Methods: []grpc.MethodDesc{
		{
			MethodName: "GetGpuContainers",
			Handler:    _DeviceService_GetGpuContainers_Handler,
		},
		{
			MethodName: "GetDevices",
			Handler:    _DeviceService_GetDevices_Handler,
		},
		{
			MethodName: "KillGpuProcess",
			Handler:    _DeviceService_KillGpuProcess_Handler,
		},
		{
			MethodName: "PatchConfigs",
			Handler:    _DeviceService_PatchConfigs_Handler,
		},
		{
			MethodName: "GetMetaDeviceInfo",
			Handler:    _DeviceService_GetMetaDeviceInfo_Handler,
		},
		{
			MethodName: "PingServer",
			Handler:    _DeviceService_PingServer_Handler,
		},
	},
	Streams: []grpc.StreamDesc{
		{
			StreamName:    "StreamGpuContainers",
			Handler:       _DeviceService_StreamGpuContainers_Handler,
			ServerStreams: true,
		},
	},
	Metadata: "device/v1/device.proto",
}
--------------------------------------------------------------------------------