├── .gitignore ├── config ├── crd │ ├── metagpu.yaml │ └── bases │ │ └── mlops.cnvrg.io_metagpus.yaml └── config.yaml ├── chart ├── values.yaml ├── Chart.yaml └── templates │ ├── svc.yml │ ├── svcmon.yml │ ├── scc.yml │ ├── cm.yml │ ├── rbac.yml │ └── ds.yml ├── buf.work.yaml ├── cmd ├── mgctl │ ├── create.go │ ├── ping.go │ ├── config.go │ ├── utils.go │ ├── kill.go │ ├── main.go │ ├── enforce.go │ └── get.go ├── mgex │ ├── main.go │ ├── readme.md │ └── exporter.go └── mgdp │ └── main.go ├── hack ├── scripts │ ├── test2.py │ ├── test3.py │ └── test.py ├── dp.yaml ├── dep-gpu.yaml └── remote-dev.yaml ├── pkg ├── mgsrv │ ├── deviceapi │ │ ├── buf.yaml │ │ ├── buf.lock │ │ └── device │ │ │ └── v1 │ │ │ ├── utils.go │ │ │ ├── device.proto │ │ │ └── device.go │ ├── interceptor.go │ └── server.go ├── allocator │ ├── types.go │ ├── allocator.go │ └── allocator_test.go ├── podexec │ ├── types.go │ ├── copycache.go │ └── podexec.go ├── gpumgr │ ├── enforcer.go │ ├── device.go │ ├── enforcer_test.go │ ├── process.go │ ├── container.go │ └── mgr.go ├── plugin │ ├── types.go │ ├── nvidia.go │ └── server.go ├── ctlutils │ └── utils.go ├── nvmlutils │ └── utils.go └── sharecfg │ └── sharecfg.go ├── buf.gen.yaml ├── Dockerfile.dev ├── LICENSE ├── Dockerfile ├── Makefile ├── pkged.go ├── README.md ├── go.mod ├── deploy └── static.yaml ├── .github └── workflows │ └── docker-image.yml └── gen └── proto └── go └── device └── v1 └── device_grpc.pb.go /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | bin -------------------------------------------------------------------------------- /config/crd/metagpu.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chart/values.yaml: -------------------------------------------------------------------------------- 1 | tag: latest 2 | ocp: false 
-------------------------------------------------------------------------------- /buf.work.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | directories: 3 | - pkg/mgsrv/deviceapi -------------------------------------------------------------------------------- /cmd/mgctl/create.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | // TODO: create logic for container and device visibility level tokens generations 4 | -------------------------------------------------------------------------------- /hack/scripts/test2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | tf.get_logger().setLevel('INFO') 3 | gpus = tf.config.list_physical_devices('GPU') 4 | print(gpus) -------------------------------------------------------------------------------- /pkg/mgsrv/deviceapi/buf.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | deps: 3 | - buf.build/googleapis/googleapis 4 | lint: 5 | use: 6 | - DEFAULT 7 | breaking: 8 | use: 9 | - FILE -------------------------------------------------------------------------------- /chart/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: metagpu-device-plugin 3 | description: Metagpu device plugin 4 | type: application 5 | version: 1.0.0 6 | appVersion: 1.0.0 7 | -------------------------------------------------------------------------------- /hack/scripts/test3.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | tf.get_logger().setLevel('INFO') 3 | print("===============================") 4 | print(tf.config.list_physical_devices('GPU')) 5 | print("===============================") -------------------------------------------------------------------------------- 
/pkg/mgsrv/deviceapi/buf.lock: -------------------------------------------------------------------------------- 1 | # Generated by buf. DO NOT EDIT. 2 | version: v1 3 | deps: 4 | - remote: buf.build 5 | owner: googleapis 6 | repository: googleapis 7 | commit: f78a83d0b4bd469fa605445ada4c6249 8 | -------------------------------------------------------------------------------- /pkg/allocator/types.go: -------------------------------------------------------------------------------- 1 | package allocator 2 | 3 | type DeviceLoad struct { 4 | Metagpus []string 5 | } 6 | 7 | type DeviceAllocation struct { 8 | LoadMap []*DeviceLoad 9 | AvailableDevIds []string 10 | AllocationSize int 11 | TotalSharesPerGpu int 12 | MetagpusAllocations []string 13 | } 14 | -------------------------------------------------------------------------------- /pkg/podexec/types.go: -------------------------------------------------------------------------------- 1 | package podexec 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "sync" 7 | ) 8 | 9 | type mgctlCopyCache struct { 10 | mu sync.Mutex 11 | cache map[string]bool 12 | } 13 | 14 | type podExec struct { 15 | podName string 16 | podNs string 17 | containerName string 18 | cmd []string 19 | stdin io.Reader 20 | stdout *bytes.Buffer 21 | } 22 | -------------------------------------------------------------------------------- /chart/templates/svc.yml: -------------------------------------------------------------------------------- 1 | --- 2 | kind: Service 3 | apiVersion: v1 4 | metadata: 5 | name: metagpu-device-plugin 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | app: "metagpu-exporter" 9 | spec: 10 | selector: 11 | name: metagpu-device-plugin 12 | ports: 13 | - protocol: TCP 14 | port: 50052 15 | name: grcp 16 | - protocol: TCP 17 | port: 2112 18 | name: metrics -------------------------------------------------------------------------------- /buf.gen.yaml: -------------------------------------------------------------------------------- 1 
| version: v1 2 | managed: 3 | enabled: true 4 | optimize_for: CODE_SIZE 5 | go_package_prefix: 6 | default: github.com/AccessibleAI/metagpu-device-plugin/gen/proto/go 7 | except: 8 | - buf.build/googleapis/googleapis 9 | plugins: 10 | - name: go 11 | opt: paths=source_relative 12 | out: gen/proto/go 13 | - name: go-grpc 14 | opt: paths=source_relative 15 | out: gen/proto/go -------------------------------------------------------------------------------- /pkg/podexec/copycache.go: -------------------------------------------------------------------------------- 1 | package podexec 2 | 3 | func NewMgctlCopyCache() *mgctlCopyCache { 4 | return &mgctlCopyCache{cache: make(map[string]bool)} 5 | } 6 | 7 | func (c *mgctlCopyCache) setCache(podId string) { 8 | c.mu.Lock() 9 | defer c.mu.Unlock() 10 | c.cache[podId] = true 11 | } 12 | 13 | func (c *mgctlCopyCache) isCached(podId string) bool { 14 | c.mu.Lock() 15 | defer c.mu.Unlock() 16 | _, cached := c.cache[podId] 17 | return cached 18 | } 19 | -------------------------------------------------------------------------------- /chart/templates/svcmon.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | name: metagpu-exporter 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | app: "metagpu-exporter" 8 | cnvrg-infra-prometheus: cnvrg-infra-cnvrg 9 | spec: 10 | selector: 11 | matchLabels: 12 | app: "metagpu-exporter" 13 | namespaceSelector: 14 | matchNames: 15 | - {{ .Release.Namespace }} 16 | endpoints: 17 | - port: "metrics" 18 | path: "/metrics" 19 | interval: "15s" -------------------------------------------------------------------------------- /cmd/mgctl/ping.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/ctlutils" 5 | log "github.com/sirupsen/logrus" 6 | 
"github.com/spf13/cobra" 7 | "github.com/spf13/viper" 8 | ) 9 | 10 | var pingCmd = &cobra.Command{ 11 | Use: "ping", 12 | Short: "ping server to check connectivity", 13 | Run: func(cmd *cobra.Command, args []string) { 14 | conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 15 | if conn == nil { 16 | log.Fatalf("can't initiate connection to metagpu server") 17 | } 18 | defer conn.Close() 19 | }, 20 | } 21 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | accelerator: nvidia 2 | processesDiscoveryPeriod: 5 3 | deviceCacheTTL: 3600 4 | jwtSecret: topSecret 5 | mgctlTar: /tmp/mgctl 6 | mgctlAutoInject: true 7 | serverAddr: 0.0.0.0:50052 8 | memoryEnforcer: true 9 | deviceToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 10 | containerToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMSJ9.o5v6Zdi1FKXQevRjuSbABBX1vIRYgN3Daz9iXabuFFA 11 | nodename: "" 12 | deviceSharing: 13 | - resourceName: cnvrg.io/metagpu 14 | autoReshare: true 15 | metaGpus: 2 16 | uuid: [ "*" ] 17 | -------------------------------------------------------------------------------- /Dockerfile.dev: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:11.6.0-base-ubuntu20.04 2 | 3 | ENV NVIDIA_DISABLE_REQUIRE="true" 4 | ENV NVIDIA_VISIBLE_DEVICES=all 5 | ENV NVIDIA_DRIVER_CAPABILITIES=utility 6 | 7 | ENV PATH=${PATH}:/usr/local/go/bin:/opt/workdir/.go/bin 8 | ENV GOPATH=/opt/workdir/.go 9 | ENV GOCACHE=/opt/workdir/.go/.cache 10 | 11 | WORKDIR /opt/workdir 12 | RUN apt update -y && apt install curl wget vim git gcc make -y 13 | RUN wget https://go.dev/dl/go1.17.11.linux-amd64.tar.gz 14 | RUN rm -rf /usr/local/go \ 15 | && tar -C /usr/local 
-xzf go1.17.11.linux-amd64.tar.gz \ 16 | && mkdir -p /opt/workdir/.go/github.com 17 | RUN go install github.com/go-delve/delve/cmd/dlv@latest 18 | CMD ["/bin/bash", "-c", "sleep inf"] 19 | -------------------------------------------------------------------------------- /hack/scripts/test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | tf.get_logger().setLevel('INFO') 3 | gpus = tf.config.list_physical_devices('GPU') 4 | if gpus: 5 | # Restrict TensorFlow to only allocate 1GB of memory on the first GPU 6 | try: 7 | tf.config.set_logical_device_configuration(gpus[0],[tf.config.LogicalDeviceConfiguration(memory_limit=1024)]) 8 | logical_gpus = tf.config.list_logical_devices('GPU') 9 | print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") 10 | except RuntimeError as e: 11 | # Virtual devices must be set before GPUs have been initialized 12 | print(e) 13 | print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))) 14 | while True: 15 | print(tf.reduce_sum(tf.random.normal([1000, 1000]))) -------------------------------------------------------------------------------- /chart/templates/scc.yml: -------------------------------------------------------------------------------- 1 | {{- if eq .Values.ocp true }} 2 | kind: SecurityContextConstraints 3 | apiVersion: security.openshift.io/v1 4 | metadata: 5 | annotations: 6 | mlops.cnvrg.io/default-loader: "false" 7 | mlops.cnvrg.io/own: "false" 8 | mlops.cnvrg.io/updatable: "false" 9 | name: metagpu-device-plugin 10 | allowHostDirVolumePlugin: true 11 | allowHostIPC: true 12 | allowHostNetwork: true 13 | allowHostPID: true 14 | allowHostPorts: true 15 | allowPrivilegeEscalation: true 16 | allowPrivilegedContainer: true 17 | readOnlyRootFilesystem: false 18 | requiredDropCapabilities: null 19 | allowedCapabilities: 20 | - '*' 21 | allowedUnsafeSysctls: 22 | - '*' 23 | fsGroup: 24 | type: RunAsAny 25 | runAsUser: 26 | type: 
RunAsAny 27 | seLinuxContext: 28 | type: RunAsAny 29 | seccompProfiles: 30 | - '*' 31 | supplementalGroups: 32 | type: RunAsAny 33 | users: 34 | - system:serviceaccount:{{ .Release.Namespace }}:metagpu-device-plugin 35 | volumes: 36 | - '*' 37 | {{- end }} -------------------------------------------------------------------------------- /chart/templates/cm.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: metagpu-device-plugin-config 5 | namespace: {{ .Release.Namespace }} 6 | data: 7 | config.yaml: | 8 | accelerator: nvidia 9 | processesDiscoveryPeriod: 5 10 | deviceCacheTTL: 3600 11 | jwtSecret: topSecret 12 | mgctlTar: /tmp/mgctl 13 | mgctlAutoInject: true 14 | serverAddr: 0.0.0.0:50052 15 | memoryEnforcer: true 16 | deviceToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 17 | containerToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMSJ9.o5v6Zdi1FKXQevRjuSbABBX1vIRYgN3Daz9iXabuFFA 18 | deviceSharing: 19 | - resourceName: cnvrg.io/metagpu 20 | autoReshare: true 21 | metaGpus: 2 22 | uuid: [ "*" ] 23 | --- 24 | apiVersion: v1 25 | kind: ConfigMap 26 | metadata: 27 | name: metagpu-presence 28 | namespace: {{ .Release.Namespace }} 29 | data: 30 | enabled: "true" 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 cnvrg.io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pkg/gpumgr/enforcer.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | log "github.com/sirupsen/logrus" 5 | "time" 6 | ) 7 | 8 | func (m *GpuMgr) StartMemoryEnforcer() { 9 | log.Info("starting gpu memory enforcer") 10 | go func() { 11 | for { 12 | for _, p := range m.enforce() { 13 | p.Kill() 14 | } 15 | time.Sleep(5 * time.Second) 16 | } 17 | }() 18 | } 19 | 20 | func (m *GpuMgr) enforce() (gpuProcForKill []*GpuProcess) { 21 | for _, c := range m.gpuContainers { 22 | for _, p := range c.Processes { 23 | if d := m.getGpuDeviceByUuid(p.DeviceUuid); d != nil { 24 | maxAllowedMem := d.Memory.ShareSize * uint64(c.PodMetagpuRequest) 25 | if p.GpuMemory > maxAllowedMem && p.Pid != 0 && maxAllowedMem > 0 { 26 | log.Infof("out of memory: %dMB/%dMB, pod: %s going to be terminated", p.GpuMemory, maxAllowedMem, c.PodId) 27 | gpuProcForKill = append(gpuProcForKill, p) 28 | } 29 | } 30 | } 31 | } 32 | return 33 | } 34 | 35 | func (m *GpuMgr) getGpuDeviceByUuid(uuid string) *GpuDevice { 36 | for _, d := range m.GpuDevices { 37 | if d.UUID == uuid { 38 | 
return d 39 | } 40 | } 41 | return nil 42 | } 43 | -------------------------------------------------------------------------------- /pkg/plugin/types.go: -------------------------------------------------------------------------------- 1 | package plugin 2 | 3 | import ( 4 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/sharecfg" 5 | "google.golang.org/grpc" 6 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 7 | "time" 8 | ) 9 | 10 | type DeviceManager interface { 11 | GetPluginDevices() []*pluginapi.Device 12 | GetDeviceSharingConfig() *sharecfg.DeviceSharingConfig 13 | GetUnixSocket() string 14 | ParseRealDeviceId(metaDevicesIds []string) (realDeviceId []string) 15 | MetagpuAllocation(allocationSize int, availableDevIds []string) ([]string, error) 16 | } 17 | 18 | type DeviceUuid string 19 | 20 | type MetaGpuDevicePlugin struct { 21 | DeviceManager 22 | server *grpc.Server 23 | socket string 24 | stop chan interface{} 25 | MetaGpuRecalculation chan bool 26 | } 27 | 28 | type NvidiaDeviceManager struct { 29 | Devices []*MetaDevice 30 | cacheTTL time.Duration 31 | processesDiscoveryPeriod time.Duration 32 | shareCfg *sharecfg.DeviceSharingConfig 33 | } 34 | 35 | type MetaDevice struct { 36 | UUID string 37 | Index int 38 | } 39 | -------------------------------------------------------------------------------- /chart/templates/rbac.yml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: metagpu-device-plugin 5 | namespace: {{ .Release.Namespace }} 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - pods 11 | verbs: 12 | - list 13 | - get 14 | - create 15 | - apiGroups: 16 | - "" 17 | resources: 18 | - pods/exec 19 | verbs: 20 | - create 21 | - apiGroups: 22 | - "" 23 | resources: 24 | - configmaps 25 | resourceNames: 26 | - metagpu-device-plugin-config 27 | verbs: 28 | - get 29 | - update 30 | --- 31 | 
apiVersion: v1 32 | kind: ServiceAccount 33 | metadata: 34 | name: metagpu-device-plugin 35 | namespace: {{ .Release.Namespace }} 36 | --- 37 | apiVersion: rbac.authorization.k8s.io/v1 38 | kind: ClusterRoleBinding 39 | metadata: 40 | name: metagpu-device-plugin 41 | namespace: {{ .Release.Namespace }} 42 | roleRef: 43 | apiGroup: rbac.authorization.k8s.io 44 | kind: ClusterRole 45 | name: metagpu-device-plugin 46 | subjects: 47 | - kind: ServiceAccount 48 | name: metagpu-device-plugin 49 | namespace: {{ .Release.Namespace }} -------------------------------------------------------------------------------- /hack/dp.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: nvidia-device-plugin-daemonset 5 | spec: 6 | selector: 7 | matchLabels: 8 | name: nvidia-device-plugin-ds 9 | template: 10 | metadata: 11 | labels: 12 | name: nvidia-device-plugin-ds 13 | spec: 14 | nodeSelector: 15 | accelerator: nvidia 16 | tolerations: 17 | - operator: Exists 18 | priorityClassName: "system-node-critical" 19 | containers: 20 | - image: nvcr.io/nvidia/k8s-device-plugin:v0.9.0 21 | name: nvidia-device-plugin-ctr 22 | args: ["--fail-on-init-error=true"] 23 | resources: 24 | requests: 25 | cpu: 100m 26 | memory: 100Mi 27 | limits: 28 | cpu: 500m 29 | memory: 500Mi 30 | securityContext: 31 | allowPrivilegeEscalation: false 32 | capabilities: 33 | drop: ["ALL"] 34 | volumeMounts: 35 | - name: device-plugin 36 | mountPath: /var/lib/kubelet/device-plugins 37 | volumes: 38 | - name: device-plugin 39 | hostPath: 40 | path: /var/lib/kubelet/device-plugins -------------------------------------------------------------------------------- /pkg/ctlutils/utils.go: -------------------------------------------------------------------------------- 1 | package ctlutils 2 | 3 | import ( 4 | "context" 5 | "net" 6 | "time" 7 | 8 | log "github.com/sirupsen/logrus" 9 | "google.golang.org/grpc" 10 | 
"google.golang.org/grpc/metadata" 11 | "os" 12 | ) 13 | 14 | func GetGrpcMetaGpuSrvClientConn(address string) *grpc.ClientConn { 15 | log.Debugf("initiating gRPC connection to %s", address) 16 | 17 | c, err := dial(address, 3*time.Second) 18 | if err != nil { 19 | log.Errorf("failed to connect to server 🙀, err: %s", err) 20 | os.Exit(1) 21 | } 22 | log.Debugf("connected to %s", address) 23 | return c 24 | } 25 | 26 | func AuthenticatedContext(token string) context.Context { 27 | ctx := context.Background() 28 | md := metadata.Pairs("Authorization", token) 29 | return metadata.NewOutgoingContext(ctx, md) 30 | } 31 | 32 | func dial(socket string, timeout time.Duration) (*grpc.ClientConn, error) { 33 | opts := []grpc.DialOption{ 34 | grpc.WithInsecure(), 35 | grpc.WithBlock(), 36 | grpc.WithContextDialer(func(ctx context.Context, s string) (net.Conn, error) { 37 | c, e := net.DialTimeout("tcp", socket, timeout) 38 | if e != nil { 39 | log.Fatalf("error connecting to the server, e: %s", e) 40 | } 41 | return c, e 42 | }), 43 | } 44 | c, err := grpc.Dial(socket, opts...) 
45 | if err != nil { 46 | return nil, err 47 | } 48 | 49 | return c, nil 50 | } 51 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.17.3 as builder 2 | ARG BUILD_SHA 3 | ARG BUILD_VERSION 4 | WORKDIR /root/.go/src/metagpu 5 | COPY go.mod go.mod 6 | COPY go.sum go.sum 7 | RUN go mod download 8 | COPY cmd cmd 9 | COPY pkg pkg 10 | COPY gen gen 11 | RUN go mod tidy 12 | RUN go build \ 13 | -ldflags="-extldflags=-Wl,-z,lazy -s -w -X 'main.Build=${BUILD_SHA}' -X 'main.Version=${BUILD_VERSION}'" \ 14 | -o mgdp cmd/mgdp/main.go 15 | RUN go build \ 16 | -ldflags="-X 'main.Build=${BUILD_SHA}' -X 'main.Version=${BUILD_VERSION}'" \ 17 | -o mgctl cmd/mgctl/*.go 18 | RUN go build \ 19 | -ldflags="-X 'main.Build=${BUILD_SHA}' -X 'main.Version=${BUILD_VERSION}'" \ 20 | -o mgex cmd/mgex/*.go 21 | 22 | FROM nvidia/cuda:11.6.0-base-ubuntu20.04 23 | 24 | ENV NVIDIA_DISABLE_REQUIRE="true" 25 | ENV NVIDIA_VISIBLE_DEVICES=all 26 | ENV NVIDIA_DRIVER_CAPABILITIES=utility 27 | 28 | LABEL io.k8s.display-name="cnvrg.io Meta GPU Device Plugin" 29 | LABEL name="cnvrg.io MetaGPU Device Plugin" 30 | LABEL vendor="cnvrg.io" 31 | LABEL version="N/A" 32 | LABEL release="N/A" 33 | LABEL summary="cnvrg.io MetaGPU device plugin for Kubernetes" 34 | LABEL description="See summary" 35 | COPY --from=builder /root/.go/src/metagpu/mgdp /usr/bin/mgdp 36 | COPY --from=builder /root/.go/src/metagpu/mgctl /usr/bin/mgctl 37 | COPY --from=builder /root/.go/src/metagpu/mgex /usr/bin/mgex 38 | RUN cp /usr/bin/mgctl /tmp -------------------------------------------------------------------------------- /cmd/mgctl/config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | pbdevice "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 5 | 
"github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/ctlutils" 6 | log "github.com/sirupsen/logrus" 7 | "github.com/spf13/cobra" 8 | "github.com/spf13/viper" 9 | ) 10 | 11 | var ( 12 | configCmdParams = []param{ 13 | {name: "metagpu", shorthand: "m", value: 0, usage: "set metagpus quantity (gpu shares)"}, 14 | {name: "auto", shorthand: "a", value: false, usage: "automatically configure GPU shares"}, 15 | } 16 | ) 17 | 18 | var configCmd = &cobra.Command{ 19 | Use: "config", 20 | Short: "change configs on running metagpu device plugin instance", 21 | Run: func(cmd *cobra.Command, args []string) { 22 | patchConfigs() 23 | }, 24 | } 25 | 26 | func patchConfigs() { 27 | if viper.GetInt32("metagpu") != 0 { 28 | metaGpus := viper.GetInt32("metagpu") 29 | log.Info(metaGpus) 30 | conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 31 | if conn == nil { 32 | log.Fatalf("can't initiate connection to metagpu server") 33 | } 34 | defer conn.Close() 35 | device := pbdevice.NewDeviceServiceClient(conn) 36 | 37 | request := &pbdevice.PatchConfigsRequest{MetaGpus: metaGpus} 38 | if _, err := device.PatchConfigs(ctlutils.AuthenticatedContext(viper.GetString("token")), request); err != nil { 39 | log.Error(err) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /hack/dep-gpu.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: gpu-test-with-gpu 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: gpu-test-with-gpu 9 | replicas: 1 10 | template: 11 | metadata: 12 | labels: 13 | app: gpu-test-with-gpu 14 | spec: 15 | hostPID: true 16 | tolerations: 17 | - operator: "Exists" 18 | containers: 19 | - name: gpu-test-with-gpu 20 | image: tensorflow/tensorflow:latest-gpu 21 | command: 22 | - /usr/local/bin/python 23 | - -c 24 | - | 25 | import tensorflow as tf 26 | tf.get_logger().setLevel('INFO') 27 
| gpus = tf.config.list_physical_devices('GPU') 28 | if gpus: 29 | # Restrict TensorFlow to only allocate 1GB of memory on the first GPU 30 | try: 31 | tf.config.set_logical_device_configuration(gpus[0],[tf.config.LogicalDeviceConfiguration(memory_limit=1024)]) 32 | logical_gpus = tf.config.list_logical_devices('GPU') 33 | print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") 34 | except RuntimeError as e: 35 | # Virtual devices must be set before GPUs have been initialized 36 | print(e) 37 | print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))) 38 | while True: 39 | print(tf.reduce_sum(tf.random.normal([1000, 1000]))) 40 | resources: 41 | limits: 42 | nvidia.com/gpu: "1" -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #rsync -r /Users/dima/.go/src/github.com/AccessibleAI/metagpu-device-plugin/docs/* rancher@212.199.86.38:/tmp/docs 2 | 3 | build: 4 | go build -ldflags="-X 'main.Build=$$(git rev-parse --short HEAD)' -X 'main.Version=1.0.0'" -v -o bin/mgdp cmd/mgdp/main.go 5 | 6 | build-exporter: 7 | go build -ldflags="-X 'main.Build=$$(git rev-parse --short HEAD)' -X 'main.Version=1.0.0'" -v -o bin/mgex cmd/mgex/*.go 8 | 9 | remote-sync: 10 | kubectl cp ./ $(shell kubectl get pods -lapp=dev-metagpu -A -ojson | jq -r '.items[] | .metadata.namespace + "/" + .metadata.name'):/opt/workdir/.go/github.com/metagpu 11 | 12 | remote-debug: 13 | dlv debug --headless --listen=:2345 --api-version=2 --accept-multiclient ./cmd/mgdp/main.go -- start 14 | 15 | docker-dev-build: 16 | docker buildx build --platform linux/amd64 --push -t cnvrg/golang-cuda11-6-dvl:latest -f Dockerfile.dev . 
17 | 18 | docker-build: build-proto 19 | docker build \ 20 | --platform linux/x86_64 \ 21 | --build-arg BUILD_SHA=$(shell git rev-parse --short HEAD) \ 22 | --build-arg BUILD_VERSION=1.0.0 \ 23 | -t docker.io/cnvrg/metagpu-device-plugin:$(shell git rev-parse --abbrev-ref HEAD) . 24 | 25 | build-mgctl: 26 | go build -ldflags="-X 'main.Build=$$(git rev-parse --short HEAD)' -X 'main.Version=1.0.0'" -v -o bin/mgctl cmd/mgctl/*.go 27 | 28 | docker-push: 29 | docker push docker.io/cnvrg/metagpu-device-plugin:$(shell git rev-parse --abbrev-ref HEAD) 30 | 31 | build-proto: 32 | buf mod update pkg/mgsrv/deviceapi 33 | buf lint 34 | buf build 35 | buf generate 36 | 37 | generate-manifests: 38 | helm template chart/ -n cnvrg --set tag=$(shell git rev-parse --abbrev-ref HEAD) > deploy/static.yaml 39 | 40 | .PHONY: deploy 41 | deploy: 42 | helm template chart/ --set tag=$(shell git rev-parse --abbrev-ref HEAD) | kubectl apply -f - 43 | 44 | test: 45 | go test ./pkg/... -v 46 | 47 | test-allocator: 48 | go test ./pkg/allocator/... -v 49 | 50 | test-gpumgr: 51 | go test ./pkg/gpumgr/... -v -------------------------------------------------------------------------------- /config/crd/bases/mlops.cnvrg.io_metagpus.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | apiVersion: apiextensions.k8s.io/v1 3 | kind: CustomResourceDefinition 4 | metadata: 5 | annotations: 6 | controller-gen.kubebuilder.io/version: (devel) 7 | creationTimestamp: null 8 | name: metagpus.mlops.cnvrg.io 9 | spec: 10 | group: mlops.cnvrg.io 11 | names: 12 | kind: MetaGpu 13 | listKind: MetaGpuList 14 | plural: metagpus 15 | singular: metagpu 16 | scope: Namespaced 17 | versions: 18 | - additionalPrinterColumns: 19 | - jsonPath: .spec.foo 20 | name: Foo 21 | type: string 22 | name: v1 23 | schema: 24 | openAPIV3Schema: 25 | properties: 26 | apiVersion: 27 | description: 'APIVersion defines the versioned schema of this representation 28 | of an object. 
Servers should convert recognized schemas to the latest 29 | internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' 30 | type: string 31 | kind: 32 | description: 'Kind is a string value representing the REST resource this 33 | object represents. Servers may infer this from the endpoint the client 34 | submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' 35 | type: string 36 | metadata: 37 | type: object 38 | spec: 39 | properties: 40 | foo: 41 | type: string 42 | type: object 43 | status: 44 | properties: 45 | message: 46 | type: string 47 | type: object 48 | type: object 49 | served: true 50 | storage: true 51 | subresources: 52 | status: {} 53 | status: 54 | acceptedNames: 55 | kind: "" 56 | plural: "" 57 | conditions: [] 58 | storedVersions: [] 59 | -------------------------------------------------------------------------------- /pkg/mgsrv/interceptor.go: -------------------------------------------------------------------------------- 1 | package mgsrv 2 | 3 | import ( 4 | "context" 5 | log "github.com/sirupsen/logrus" 6 | "github.com/spf13/viper" 7 | "google.golang.org/grpc" 8 | "time" 9 | ) 10 | 11 | type MetaGpuServerStream struct { 12 | grpc.ServerStream 13 | ctx context.Context 14 | } 15 | 16 | func (s *MetaGpuServerStream) Context() context.Context { 17 | return s.ctx 18 | } 19 | 20 | func (s *MetaGpuServer) streamServerInterceptor() grpc.StreamServerInterceptor { 21 | return func(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { 22 | wrapper := &MetaGpuServerStream{ServerStream: ss} 23 | if !s.IsMethodPublic(info.FullMethod) { 24 | visibility, err := authorize(ss.Context()) 25 | if err != nil { 26 | return err 27 | } 28 | wrapper.ctx = context.WithValue(ss.Context(), TokenVisibilityClaimName, 
visibility) 29 | wrapper.ctx = context.WithValue(wrapper.ctx, "containerVl", string(ContainerVisibility)) 30 | wrapper.ctx = context.WithValue(wrapper.ctx, "deviceVl", string(DeviceVisibility)) 31 | wrapper.ctx = context.WithValue(wrapper.ctx, "gpuMgr", s.gpuMgr) 32 | 33 | } 34 | return handler(srv, wrapper) 35 | } 36 | } 37 | 38 | func (s *MetaGpuServer) unaryServerInterceptor() grpc.UnaryServerInterceptor { 39 | return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { 40 | start := time.Now() 41 | 42 | if !s.IsMethodPublic(info.FullMethod) { 43 | visibility, err := authorize(ctx) 44 | if err != nil { 45 | return nil, err 46 | } 47 | ctx = context.WithValue(ctx, TokenVisibilityClaimName, visibility) 48 | ctx = context.WithValue(ctx, "containerVl", string(ContainerVisibility)) 49 | ctx = context.WithValue(ctx, "deviceVl", string(DeviceVisibility)) 50 | } 51 | ctx = context.WithValue(ctx, "gpuMgr", s.gpuMgr) 52 | h, err := handler(ctx, req) 53 | if viper.GetBool("verbose") { 54 | log.Infof("[method: %s duration: %s]", info.FullMethod, time.Since(start)) 55 | } 56 | return h, err 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /pkg/gpumgr/device.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/sharecfg" 5 | "github.com/NVIDIA/go-nvml/pkg/nvml" 6 | "os" 7 | 8 | //"github.com/NVIDIA/go-nvml/pkg/nvml" 9 | log "github.com/sirupsen/logrus" 10 | ) 11 | 12 | type DeviceMemory struct { 13 | Total uint64 14 | Free uint64 15 | Used uint64 16 | ShareSize uint64 17 | } 18 | 19 | type DeviceUtilization struct { 20 | Gpu uint32 21 | Memory uint32 22 | } 23 | 24 | type GpuDevice struct { 25 | UUID string 26 | Index int 27 | Shares int 28 | ResourceName string 29 | Utilization *DeviceUtilization 30 | Memory 
*DeviceMemory 31 | Nodename string 32 | } 33 | 34 | func NewGpuDevice(uuid string, index int, utilization nvml.Utilization, memory nvml.Memory) *GpuDevice { 35 | d := &GpuDevice{ 36 | UUID: uuid, 37 | Index: index, 38 | Utilization: &DeviceUtilization{Gpu: utilization.Gpu, Memory: utilization.Memory / uint32(MB)}, 39 | } 40 | 41 | // set gpu share configs 42 | d.setGpuShareConfigs() 43 | // set nodename 44 | d.setNodename() 45 | // set gpu memory usage 46 | d.setGpuMemoryUsage(memory) 47 | return d 48 | } 49 | 50 | func (d *GpuDevice) setNodename() { 51 | hostname, err := os.Hostname() 52 | if err != nil { 53 | log.Errorf("failed to detect hostname, err: %s", err) 54 | } 55 | d.Nodename = hostname 56 | } 57 | 58 | func (d *GpuDevice) setGpuShareConfigs() { 59 | deviceSharingConfigs := sharecfg.NewDeviceSharingConfig() 60 | if deviceSharing, err := deviceSharingConfigs.GetDeviceSharingConfigs(d.UUID); err != nil { 61 | log.Fatalf("bad configs, unable to find sharing configs for device: %s", d.UUID) 62 | } else { 63 | d.Shares = deviceSharing.MetagpusPerGpu 64 | d.ResourceName = deviceSharing.ResourceName 65 | } 66 | } 67 | 68 | func (d *GpuDevice) setGpuMemoryUsage(memory nvml.Memory) { 69 | d.Memory = &DeviceMemory{ 70 | Total: memory.Total / MB, 71 | Free: memory.Free / MB, 72 | Used: memory.Used / MB, 73 | ShareSize: memory.Total / uint64(d.Shares) / MB, 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /cmd/mgctl/utils.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | pbdevice "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 6 | "github.com/atomicgo/cursor" 7 | "github.com/jedib0t/go-pretty/v6/table" 8 | "strings" 9 | ) 10 | 11 | type TableOutput struct { 12 | data []byte 13 | header table.Row 14 | footer table.Row 15 | body []table.Row 16 | lastPosition int 17 | } 18 | 19 | func (o 
*TableOutput) rowsCount() int { 20 | return 2 + len(o.body) 21 | } 22 | 23 | func (o *TableOutput) Write(data []byte) (n int, err error) { 24 | o.data = append(o.data, data...) 25 | return len(data), nil 26 | } 27 | 28 | func (o *TableOutput) print() { 29 | if o.lastPosition > 0 { 30 | cursor.ClearLinesUp(o.lastPosition) 31 | } 32 | fmt.Printf("%s", o.data) 33 | o.lastPosition = o.rowsCount() 34 | } 35 | 36 | func (o *TableOutput) buildTable() { 37 | o.data = nil 38 | rowConfigAutoMerge := table.RowConfig{AutoMerge: true} 39 | t := table.NewWriter() 40 | t.SetOutputMirror(o) 41 | t.AppendHeader(o.header, rowConfigAutoMerge) 42 | t.AppendRows(o.body) 43 | t.SetStyle(table.StyleColoredGreenWhiteOnBlack) 44 | t.AppendFooter(o.footer) 45 | t.Render() 46 | } 47 | 48 | func getTotalRequests(containers []*pbdevice.GpuContainer) (totalRequest int) { 49 | for _, c := range containers { 50 | totalRequest += int(c.MetagpuRequests) 51 | } 52 | return 53 | } 54 | 55 | func getTotalShares(devices map[string]*pbdevice.Device) (totalShares int) { 56 | for _, d := range devices { 57 | totalShares += int(d.Shares) 58 | } 59 | return 60 | } 61 | 62 | func getTotalMemoryUsedByProcesses(containers []*pbdevice.GpuContainer) (totalUsedMem int) { 63 | for _, c := range containers { 64 | for _, p := range c.DeviceProcesses { 65 | totalUsedMem += int(p.Memory) 66 | } 67 | } 68 | return 69 | } 70 | 71 | func formatContainerDeviceIndexes(container *pbdevice.GpuContainer) string { 72 | var devIdxs []string 73 | for _, d := range container.ContainerDevices { 74 | devIdxs = append(devIdxs, fmt.Sprintf("%d", d.Device.Index)) 75 | } 76 | if len(devIdxs) > 0 { 77 | return strings.Join(devIdxs, ":") 78 | } 79 | return "-" 80 | } 81 | -------------------------------------------------------------------------------- /pkg/mgsrv/deviceapi/device/v1/utils.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | pb 
"github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 5 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/gpumgr" 6 | ) 7 | 8 | func listDeviceProcesses(podId string, gpuMgr *gpumgr.GpuMgr) (containers []*pb.GpuContainer) { 9 | 10 | for _, container := range gpuMgr.GetProcesses(podId) { 11 | var gpuProcesses []*pb.DeviceProcess 12 | 13 | for _, p := range container.Processes { 14 | gpuProcesses = append(gpuProcesses, &pb.DeviceProcess{ 15 | Uuid: p.DeviceUuid, 16 | Pid: p.Pid, 17 | Memory: p.GpuMemory, 18 | Cmdline: p.GetShortCmdLine(), 19 | User: p.User, 20 | ContainerId: p.ContainerId, 21 | GpuUtilization: p.GpuUtilization, 22 | }) 23 | } 24 | var gpuDevices []*pb.ContainerDevice 25 | for _, device := range container.Devices { 26 | gpuDevices = append(gpuDevices, &pb.ContainerDevice{ 27 | Device: &pb.Device{ 28 | Uuid: device.GpuDevice.UUID, 29 | Index: uint32(device.GpuDevice.Index), 30 | Shares: uint32(device.GpuDevice.Shares), 31 | GpuUtilization: device.GpuDevice.Utilization.Gpu, 32 | MemoryUtilization: device.GpuDevice.Utilization.Memory, 33 | MemoryTotal: device.GpuDevice.Memory.Total, 34 | MemoryFree: device.GpuDevice.Memory.Free, 35 | MemoryUsed: device.GpuDevice.Memory.Used, 36 | MemoryShareSize: device.GpuDevice.Memory.ShareSize, 37 | ResourceName: device.GpuDevice.ResourceName, 38 | NodeName: device.GpuDevice.Nodename, 39 | }, 40 | AllocatedShares: device.AllocatedShares, 41 | }) 42 | } 43 | containers = append(containers, &pb.GpuContainer{ 44 | ContainerId: container.ContainerId, 45 | ContainerName: container.ContainerName, 46 | PodId: container.PodId, 47 | PodNamespace: container.PodNamespace, 48 | MetagpuRequests: container.PodMetagpuRequest, 49 | ResourceName: container.ResourceName, 50 | NodeName: container.Nodename, 51 | ContainerDevices: gpuDevices, 52 | DeviceProcesses: gpuProcesses, 53 | }) 54 | } 55 | return 56 | } 57 | 
-------------------------------------------------------------------------------- /pkg/gpumgr/enforcer_test.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | . "github.com/onsi/ginkgo" 5 | . "github.com/onsi/gomega" 6 | log "github.com/sirupsen/logrus" 7 | "github.com/spf13/viper" 8 | "testing" 9 | ) 10 | 11 | func TestAllocator(t *testing.T) { 12 | viper.SetConfigName("config") 13 | viper.SetConfigType("yaml") 14 | viper.AddConfigPath("../../config/") 15 | 16 | if err := viper.ReadInConfig(); err != nil { 17 | log.Fatalf("config file not found, err: %s", err) 18 | } 19 | RegisterFailHandler(Fail) 20 | RunSpecs(t, "Enforcer Suite") 21 | } 22 | 23 | var _ = Describe("enforcer", func() { 24 | 25 | Context("enforce", func() { 26 | 27 | It("not oom", func() { 28 | mgr := &GpuMgr{} 29 | mgr.setGpuDevices() 30 | if len(mgr.GpuDevices) < 0 { 31 | log.Fatalf("no gpu devices detected, can't continue unit testing") 32 | } 33 | mgr.gpuContainers = []*GpuContainer{{ 34 | PodMetagpuRequest: 1, 35 | Processes: []*GpuProcess{{ 36 | Pid: 100, 37 | DeviceUuid: mgr.GpuDevices[0].UUID, 38 | GpuUtilization: 0, 39 | GpuMemory: mgr.GpuDevices[0].Memory.ShareSize, 40 | }}, 41 | }} 42 | 43 | res := mgr.enforce() 44 | Expect(len(res)).To(Equal(0)) 45 | }) 46 | 47 | It("oom", func() { 48 | 49 | mgr := &GpuMgr{} 50 | mgr.setGpuDevices() 51 | if len(mgr.GpuDevices) < 0 { 52 | log.Fatalf("no gpu devices detected, can't continue unit testing") 53 | } 54 | mgr.gpuContainers = []*GpuContainer{{ 55 | PodMetagpuRequest: 1, 56 | Processes: []*GpuProcess{{ 57 | Pid: 100, 58 | DeviceUuid: mgr.GpuDevices[0].UUID, 59 | GpuUtilization: 0, 60 | GpuMemory: mgr.GpuDevices[0].Memory.ShareSize + 1, 61 | }}, 62 | }} 63 | 64 | res := mgr.enforce() 65 | Expect(len(res)).To(Equal(1)) 66 | }) 67 | 68 | It("false positive oom", func() { 69 | 70 | mgr := &GpuMgr{} 71 | mgr.setGpuDevices() 72 | if len(mgr.GpuDevices) < 0 { 73 | log.Fatalf("no 
gpu devices detected, can't continue unit testing") 74 | } 75 | mgr.gpuContainers = []*GpuContainer{{ 76 | PodMetagpuRequest: 1, 77 | Processes: []*GpuProcess{{ 78 | Pid: 100, 79 | DeviceUuid: mgr.GpuDevices[0].UUID, 80 | GpuMemory: 0, 81 | }}, 82 | }} 83 | res := mgr.enforce() 84 | Expect(len(res)).To(Equal(0)) 85 | }) 86 | }) 87 | }) 88 | -------------------------------------------------------------------------------- /pkg/nvmlutils/utils.go: -------------------------------------------------------------------------------- 1 | package nvmlutils 2 | 3 | import ( 4 | "github.com/NVIDIA/go-nvml/pkg/nvml" 5 | log "github.com/sirupsen/logrus" 6 | ) 7 | 8 | func init() { 9 | ret := nvml.Init() 10 | ErrorCheck(ret) 11 | } 12 | 13 | func GetDevices() (devices []*nvml.Device) { 14 | 15 | for i := 0; i < GetTotalDevices(); i++ { 16 | device, ret := nvml.DeviceGetHandleByIndex(i) 17 | ErrorCheck(ret) 18 | devices = append(devices, &device) 19 | } 20 | return 21 | } 22 | 23 | func GetTotalDevices() int { 24 | count, ret := nvml.DeviceGetCount() 25 | ErrorCheck(ret) 26 | return count 27 | } 28 | 29 | func GetComputeRunningProcesses(deviceIdx int) []nvml.ProcessInfo { 30 | processes, ret := getDeviceByIdx(deviceIdx).GetComputeRunningProcesses() 31 | ErrorCheck(ret) 32 | return processes 33 | } 34 | 35 | func GetAccountingStats(deviceIdx int, pid uint32) *nvml.AccountingStats { 36 | stats, ret := getDeviceByIdx(deviceIdx).GetAccountingStats(pid) 37 | ErrorCheck(ret) 38 | return &stats 39 | } 40 | 41 | func SystemGetCudaDriverVersion() int { 42 | cudaVersion, ret := nvml.SystemGetCudaDriverVersion() 43 | ErrorCheck(ret) 44 | return cudaVersion 45 | } 46 | 47 | func SystemGetDriverVersion() string { 48 | driver, ret := nvml.SystemGetDriverVersion() 49 | ErrorCheck(ret) 50 | return driver 51 | } 52 | 53 | func GetDeviceMemory(device *nvml.Device) *nvml.Memory { 54 | memInfo, ret := device.GetMemoryInfo() 55 | ErrorCheck(ret) 56 | return &memInfo 57 | } 58 | 59 | func 
GetDeviceByUUID(uuid string) *nvml.Device { 60 | for _, device := range GetDevices() { 61 | devUuid, ret := device.GetUUID() 62 | ErrorCheck(ret) 63 | if devUuid == uuid { 64 | return device 65 | } 66 | } 67 | return nil 68 | } 69 | 70 | func GetDeviceUUID(device *nvml.Device) string { 71 | uuid, ret := device.GetUUID() 72 | ErrorCheck(ret) 73 | return uuid 74 | } 75 | 76 | func ErrorCheck(ret nvml.Return) { 77 | if ret == nvml.ERROR_NOT_FOUND { 78 | log.Warnf("nvml error: ERROR_NOT_FOUND: [a query to find an object was unsuccessful]") 79 | return 80 | } 81 | if ret == nvml.ERROR_NOT_SUPPORTED { 82 | log.Warnf("nvml error: ERROR_NOT_SUPPORTED: [device doesn't support this feature]") 83 | return 84 | } 85 | if ret == nvml.ERROR_NO_PERMISSION { 86 | log.Warnf("nvml error: ERROR_NO_PERMISSION: [user doesn't have permission to perform this operation]") 87 | return 88 | } 89 | if ret != nvml.SUCCESS { 90 | log.Fatalf("fatal error during nvml operation: %s", nvml.ErrorString(ret)) 91 | } 92 | } 93 | 94 | func getDeviceByIdx(deviceIdx int) *nvml.Device { 95 | device, ret := nvml.DeviceGetHandleByIndex(deviceIdx) 96 | ErrorCheck(ret) 97 | return &device 98 | } 99 | -------------------------------------------------------------------------------- /pkg/gpumgr/process.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | "github.com/prometheus/procfs" 5 | "github.com/shirou/gopsutil/v3/process" 6 | log "github.com/sirupsen/logrus" 7 | "path/filepath" 8 | ) 9 | 10 | type GpuProcess struct { 11 | Pid uint32 12 | DeviceUuid string 13 | GpuUtilization uint32 14 | GpuMemory uint64 15 | Cmdline []string 16 | User string 17 | ContainerId string 18 | } 19 | 20 | func (p *GpuProcess) SetProcessCmdline() { 21 | if pr, err := process.NewProcess(int32(p.Pid)); err == nil { 22 | var e error 23 | p.Cmdline, e = pr.CmdlineSlice() 24 | if e != nil { 25 | log.Error(e) 26 | } 27 | } else { 28 | log.Error(err) 29 | } 30 | } 
31 | 32 | func (p *GpuProcess) SetProcessUsername() { 33 | if pr, err := process.NewProcess(int32(p.Pid)); err == nil { 34 | var e error 35 | p.User, e = pr.Username() 36 | if e != nil { 37 | log.Error(e) 38 | } 39 | } else { 40 | log.Error(err) 41 | } 42 | } 43 | 44 | func (p *GpuProcess) Kill() error { 45 | if pr, err := process.NewProcess(int32(p.Pid)); err == nil { 46 | return pr.Kill() 47 | } else { 48 | return err 49 | } 50 | } 51 | 52 | func (p *GpuProcess) SetProcessContainerId() { 53 | if proc, err := procfs.NewProc(int(p.Pid)); err == nil { 54 | var e error 55 | var cgroups []procfs.Cgroup 56 | cgroups, e = proc.Cgroups() 57 | if e != nil { 58 | log.Error(e) 59 | } 60 | if len(cgroups) == 0 { 61 | log.Errorf("cgroups list for %d is empty", p.Pid) 62 | } 63 | ExitContainerIdSet: 64 | if p.ContainerId == "" { 65 | for _, g := range cgroups { 66 | for _, c := range g.Controllers { 67 | if c == "memory" { 68 | p.ContainerId = filepath.Base(g.Path) 69 | goto ExitContainerIdSet 70 | } 71 | } 72 | } 73 | log.Warnf("unable to set containerId for pid: %d", p.Pid) 74 | } 75 | } 76 | } 77 | 78 | func (p *GpuProcess) GetShortCmdLine() string { 79 | if len(p.Cmdline) == 0 { 80 | return "-" 81 | } 82 | return p.Cmdline[0] 83 | } 84 | 85 | func (p *GpuProcess) GetDevice(devices []*GpuDevice) *GpuDevice { 86 | for _, device := range devices { 87 | if device.UUID == p.DeviceUuid { 88 | return device 89 | } 90 | } 91 | return nil 92 | } 93 | 94 | func NewGpuProcess(pid, gpuUtil uint32, gpuMem uint64, devUuid string) *GpuProcess { 95 | p := &GpuProcess{ 96 | Pid: pid, 97 | GpuUtilization: gpuUtil, 98 | GpuMemory: gpuMem, 99 | DeviceUuid: devUuid, 100 | } 101 | p.SetProcessUsername() 102 | p.SetProcessCmdline() 103 | p.SetProcessContainerId() 104 | return p 105 | } 106 | -------------------------------------------------------------------------------- /hack/remote-dev.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 
apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: dev-metagpu 5 | namespace: default 6 | spec: 7 | selector: 8 | matchLabels: 9 | app: dev-metagpu 10 | template: 11 | metadata: 12 | labels: 13 | app: dev-metagpu 14 | spec: 15 | nodeSelector: 16 | accelerator: nvidia 17 | tolerations: 18 | - operator: Exists 19 | hostPID: true 20 | hostNetwork: true 21 | serviceAccountName: metagpu-device-plugin 22 | containers: 23 | - name: dev-metagpu 24 | imagePullPolicy: Always 25 | image: cnvrg/golang-cuda11-6-dvl:latest 26 | ports: 27 | - containerPort: 2345 28 | - containerPort: 50052 29 | securityContext: 30 | privileged: true 31 | volumeMounts: 32 | - name: device-plugin 33 | mountPath: /var/lib/kubelet/device-plugins 34 | - mountPath: /host/proc 35 | mountPropagation: HostToContainer 36 | name: proc 37 | readOnly: true 38 | volumes: 39 | - name: device-plugin 40 | hostPath: 41 | path: /var/lib/kubelet/device-plugins 42 | - hostPath: 43 | path: /proc 44 | name: proc 45 | --- 46 | apiVersion: v1 47 | kind: Service 48 | metadata: 49 | name: dev-metagpu 50 | namespace: default 51 | spec: 52 | ports: 53 | - name: tcp 54 | port: 2345 55 | selector: 56 | app: dev-metagpu 57 | --- 58 | apiVersion: rbac.authorization.k8s.io/v1 59 | kind: ClusterRole 60 | metadata: 61 | name: metagpu-device-plugin 62 | namespace: default 63 | rules: 64 | - apiGroups: 65 | - "" 66 | resources: 67 | - pods 68 | verbs: 69 | - list 70 | - get 71 | - create 72 | - apiGroups: 73 | - "" 74 | resources: 75 | - pods/exec 76 | verbs: 77 | - create 78 | - apiGroups: 79 | - "" 80 | resources: 81 | - configmaps 82 | resourceNames: 83 | - metagpu-device-plugin-config 84 | verbs: 85 | - get 86 | - update 87 | --- 88 | apiVersion: v1 89 | kind: ServiceAccount 90 | metadata: 91 | name: metagpu-device-plugin 92 | namespace: default 93 | --- 94 | apiVersion: rbac.authorization.k8s.io/v1 95 | kind: ClusterRoleBinding 96 | metadata: 97 | name: metagpu-device-plugin 98 | namespace: default 99 | roleRef: 100 | 
apiGroup: rbac.authorization.k8s.io 101 | kind: ClusterRole 102 | name: metagpu-device-plugin 103 | subjects: 104 | - kind: ServiceAccount 105 | name: metagpu-device-plugin 106 | namespace: default -------------------------------------------------------------------------------- /pkged.go: -------------------------------------------------------------------------------- 1 | // Code generated by pkger; DO NOT EDIT. 2 | 3 | // +build !skippkger 4 | 5 | package main 6 | 7 | import ( 8 | "github.com/markbates/pkger" 9 | "github.com/markbates/pkger/pkging/mem" 10 | ) 11 | 12 | var _ = pkger.Apply(mem.UnmarshalEmbed([]byte(`1f8b08000000000000ffec584d8fdb3613fe2b01cf5a33de64f3bef12d40d020872d8a34eda5c8614c8da4c9921c96a4bc7617fbdf0b52b22dadbdb6036c801e74b0c967e611bf861c0ee74190ad3888c583a82936ed72a6d8c80f4a6108b4d4f8e1b35476e5ebabca838ac416f41528851a3d44f65725ae48e195d36d4d36b5f291bc5808f947401f644906e4ac6619bc922fd07e213e1bc73efe06b1118b971871217e058362210c64f4919558085188afe06b8c47a6b224fb631d7c613ed6ce8b2dc92d44d588c55f6226be15e2f7081ac522fa167bf00521b0150b11127a55a2435ba2559bc5ab9a35d87ac6be966b193641928de82d68d9da00153608257a51884ffc0b690ca997b450b39a535f9d31b2b4325114438304f2ad0b68a5e6dab7e189d255f33752f1d2c331c58a5cee95d3672ed9ba10beb5910c8a4284e815db551ac04774b9f7655b118b422c3711d3378a8df31882ac34441c0aea7fc8656c2390452f3585d80b709d6b7ee322ef2a12ba163ba0c83579683d2e87ca32c01ea01ac3f2fae666fefe40b05f712cefc197e14768b222d4e580ac35b948ea50b2ffbe22d8cf52360606ec1dc9832ddb48fa882ab4cba871af30e5cd1ea4ef0648bd1d80e16a8406e623747df36e846fe6d703fca4cba8078bbebe79fd7e8cf66335a0f80955ba3b5a8b42a0555c92ad075509c1ce87780901dfbd1d49c882df0c252aac86b0c161e3f27b3a7803ecd024e83dfb34ae4a431ac0c1d9a982e548d5665719ab1b080d29f64e364a9f50e51f84788ee2c087bca94fb37c5ed773b4a0c0da0b68fd213e438b7c87f61429adf00513c8b40b869679473a355093620be4a5f3ecd047c2273ecd50540d6add48032e44dfaad87a1c731c6a8d91d0cb9aaf221bfdc33e132af4fcac429abcc10e3dedc12e38e381bfdf838ee8ef116283de803d4672fdfe7dd67f0f15ed9203
45963547ecec7ee9dd73406c6d3ec2237972de327ab0a1626f8e6a5b4b8a4b94764b7077f58cac244bb3d57c28d980d1b355f2406977f485545ebdc9b298ed960a19d1b8fe8ad90d1f963484e94e025d0f45cab543589918d8c7a1a8665c3bf464d08ee4141986d8628c1ed4a87f0ed9c10c458eb51ee2a79f78ac34aaa8693c9340b6d65869aa9bd128c22628d05ae21a5567cc03556fa39d3c62889a47abf0c4ccf902279655e84ae2fe06ea3e335d18900ab9a47a5f8d615bef6f1fd38509a990a6d5911ce4c5cd82bf5b8e58665f06cb7ca9588cddbf6c6274836afedbaeee4eb81dfe319984a0887a4dda73ce730e25126ebd7e12d2a44256a4711be27456c8b51ad76e5791616323ac4751505f93aae601da8f24b2c981c081c68ce22939b45c0aaeba8db877cf217ab275566dacea8b7df3bdbd4521ba1308cb7b4fdd35d109f60764843b9f9da4dd74fac3b9afc93656f37763fcff0cd3ae118558a12d3939d2c139df46690da806ae5f5fc672ac37f337af6fceb073910ef3a5bc6d3c718adcfa156e23be13bce6aeac4e330ee3b313e433334ebbb5b421fd0c8600f573cd8d4e4adde6937896e73caf376788d7b271a0ee4eb0a8b47042edb97dd650e90ae93cf0316ddea50155eb512ea924df3ebb9a0777ceb3a4ed1e4e0d5ec2cb57d4b7427cc510774f2fdb6add8976efae4e74cb651ae4e241bce4bbf8363d88fb77e44f7ed47fe25b2e7f5a07b2e699e132f7f327fa40f9313c9fcdff271e1f1f0b5175cbfbf05888063d4e498c2989312531a624c694c49892185312634a624c498c2989312531a624c694c49892185312634a62fc179318ff020000ffff010000ffffa4fc0ea4c6210000`))) 13 | -------------------------------------------------------------------------------- /cmd/mgctl/kill.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/spf13/cobra" 5 | ) 6 | 7 | var killCmd = &cobra.Command{ 8 | Use: "kill", 9 | Short: "kill process", 10 | Run: func(cmd *cobra.Command, args []string) { 11 | //killGpuProcess() 12 | }, 13 | } 14 | 15 | //func killGpuProcess() { 16 | // conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 17 | // if conn == nil { 18 | // log.Fatalf("can't initiate connection to metagpu server") 19 | // } 20 | // defer conn.Close() 21 | // device := pbdevice.NewDeviceServiceClient(conn) 22 | // hostname, err := 
os.Hostname() 23 | // if err != nil { 24 | // log.Errorf("faild to detect podId, err: %s", err) 25 | // } 26 | // ldr := &pbdevice.GetGpuContainersRequest{PodId: hostname} 27 | // resp, err := device.GetGpuContainers(ctlutils.AuthenticatedContext(viper.GetString("token")), ldr) 28 | // if err != nil { 29 | // log.Errorf("falid to list device processes, err: %s ", err) 30 | // return 31 | // } 32 | // 33 | // killProcessTemplate := &promptui.SelectTemplates{ 34 | // Label: "{{ . }}?", 35 | // Active: `> {{ printf "[Pid:%d] %s" .Pid .Uuid | cyan }}`, 36 | // Inactive: ` {{ printf "[Pid:%d] %s" .Pid .Uuid | faint }}`, 37 | // Selected: `> {{ printf "[Pid:%d] %s" .Pid .Uuid | cyan }}`, 38 | // Details: ` 39 | //--------- Kill GPU process ---------- 40 | //{{ "Cmd:" | faint }} {{ .Cmdline }} 41 | //{{ "GpuMemory:" | faint }} {{ .Memory }}MB 42 | //{{ "Pod name:" | faint }} {{ .PodName }} 43 | //{{ "Pod namespace:" | faint }} {{ .PodNamespace }}`, 44 | // } 45 | // 46 | // killProcessSelect := promptui.Select{ 47 | // Label: "Select a process", 48 | // Items: resp.DevicesProcesses, 49 | // Size: 10, 50 | // Templates: killProcessTemplate, 51 | // } 52 | // idx, _, err := killProcessSelect.Run() 53 | // if err != nil { 54 | // log.Error(err) 55 | // return 56 | // } 57 | // process := resp.DevicesProcesses[idx] 58 | // var confirmTemplate = &promptui.SelectTemplates{ 59 | // Label: `{{ . }}?`, 60 | // Active: `> {{ . | red}}`, 61 | // Inactive: ` {{ . | faint}} `, 62 | // Selected: `> {{ . 
| red }}`, 63 | // } 64 | // confirmDelete := promptui.Select{ 65 | // Label: fmt.Sprintf("Killing PID: %d on device: %s, are you sure?", process.Pid, process.Uuid), 66 | // Items: []string{"No", "Yes"}, 67 | // Templates: confirmTemplate, 68 | // } 69 | // _, confirm, err := confirmDelete.Run() 70 | // if err != nil { 71 | // log.Error(err) 72 | // return 73 | // } 74 | // 75 | // if confirm == "Yes" { 76 | // killRequest := &pbdevice.KillGpuProcessRequest{Pid: process.Pid} 77 | // if _, err := device.KillGpuProcess(ctlutils.AuthenticatedContext(viper.GetString("token")), killRequest); err != nil { 78 | // log.Fatalf("error killing process, err: %s", err) 79 | // } else { 80 | // log.Infof("%d killed", process.Pid) 81 | // } 82 | // } 83 | //} 84 | -------------------------------------------------------------------------------- /pkg/mgsrv/deviceapi/device/v1/device.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package device.v1; 4 | 5 | service DeviceService{ 6 | rpc GetGpuContainers (GetGpuContainersRequest) returns (GetGpuContainersResponse){} 7 | rpc StreamGpuContainers (StreamGpuContainersRequest) returns (stream StreamGpuContainersResponse){} 8 | rpc GetDevices(GetDevicesRequest) returns (GetDevicesResponse){} 9 | rpc KillGpuProcess(KillGpuProcessRequest) returns (KillGpuProcessResponse){} 10 | rpc PatchConfigs(PatchConfigsRequest) returns (PatchConfigsResponse){} 11 | rpc GetMetaDeviceInfo(GetMetaDeviceInfoRequest) returns (GetMetaDeviceInfoResponse){} 12 | 13 | rpc PingServer(PingServerRequest) returns (PingServerResponse){} 14 | 15 | } 16 | 17 | 18 | 19 | message DeviceProcess{ 20 | string uuid = 1; 21 | uint32 pid = 2; 22 | uint64 memory = 3; 23 | string cmdline = 4; 24 | string user = 5; 25 | string container_id = 6; 26 | uint32 gpu_utilization = 10; 27 | 28 | } 29 | 30 | message ContainerDevice{ 31 | Device device = 1; 32 | int32 allocated_shares = 2; 33 | } 34 | 35 | message 
GpuContainer{ 36 | string container_id = 1; 37 | string container_name = 2; 38 | string pod_id = 3; 39 | string pod_namespace = 4; 40 | int64 metagpu_requests = 5; 41 | string resource_name = 6; 42 | string node_name = 7; 43 | repeated DeviceProcess device_processes = 8; 44 | repeated ContainerDevice container_devices = 9; 45 | } 46 | 47 | message Device{ 48 | string uuid = 1; 49 | uint32 index = 2; 50 | uint32 shares = 3; 51 | uint32 gpu_utilization = 4; 52 | uint32 memory_utilization = 5; 53 | uint64 memory_total = 6; 54 | uint64 memory_free = 7; 55 | uint64 memory_used = 8; 56 | uint64 memory_share_size = 9; 57 | string resource_name = 10; 58 | string node_name = 11; 59 | } 60 | 61 | message StreamGpuContainersRequest{ 62 | string pod_id = 1; 63 | } 64 | message StreamGpuContainersResponse{ 65 | string visibility_level = 1; 66 | repeated GpuContainer gpu_containers = 2; 67 | } 68 | 69 | message GetGpuContainersRequest{ 70 | string pod_id = 1; 71 | } 72 | message GetGpuContainersResponse{ 73 | string visibility_level = 1; 74 | repeated GpuContainer gpu_containers = 2; 75 | } 76 | 77 | message KillGpuProcessRequest{ 78 | uint32 pid = 1; 79 | } 80 | message KillGpuProcessResponse{} 81 | 82 | message PatchConfigsRequest{ 83 | int32 meta_gpus = 1; 84 | } 85 | message PatchConfigsResponse{} 86 | 87 | message GetDevicesRequest{} 88 | message GetDevicesResponse{ 89 | map device = 1; 90 | } 91 | 92 | message GetMetaDeviceInfoRequest{} 93 | message GetMetaDeviceInfoResponse{ 94 | string node = 1; 95 | map metadata = 2; 96 | repeated Device devices = 3; 97 | } 98 | 99 | message PingServerRequest{} 100 | message PingServerResponse{} 101 | -------------------------------------------------------------------------------- /chart/templates/ds.yml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: metagpu-device-plugin 5 | namespace: {{ .Release.Namespace }} 6 | spec: 7 | selector: 
8 | matchLabels: 9 | name: metagpu-device-plugin 10 | template: 11 | metadata: 12 | annotations: 13 | scheduler.alpha.kubernetes.io/critical-pod: "" 14 | labels: 15 | name: metagpu-device-plugin 16 | spec: 17 | tolerations: 18 | - key: CriticalAddonsOnly 19 | operator: Exists 20 | - key: nvidia.com/gpu 21 | operator: Exists 22 | effect: NoSchedule 23 | hostPID: true 24 | hostNetwork: true 25 | serviceAccountName: metagpu-device-plugin 26 | nodeSelector: 27 | accelerator: nvidia 28 | containers: 29 | - name: metagpu-device-plugin 30 | image: "docker.io/cnvrg/metagpu-device-plugin:{{ .Values.tag }}" 31 | imagePullPolicy: Always 32 | command: 33 | - /usr/bin/mgdp 34 | - start 35 | - -c 36 | - /etc/metagpu-device-plugin 37 | ports: 38 | - containerPort: 50052 39 | securityContext: 40 | privileged: true 41 | env: 42 | - name: METAGPU_DEVICE_PLUGIN_NODENAME 43 | valueFrom: 44 | fieldRef: 45 | fieldPath: spec.nodeName 46 | - name: POD_IP 47 | valueFrom: 48 | fieldRef: 49 | fieldPath: status.podIP 50 | - name: MG_CTL_TOKEN 51 | value: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 52 | volumeMounts: 53 | - name: device-plugin 54 | mountPath: /var/lib/kubelet/device-plugins 55 | - name: config 56 | mountPath: /etc/metagpu-device-plugin 57 | - mountPath: /host/proc 58 | mountPropagation: HostToContainer 59 | name: proc 60 | readOnly: true 61 | - name: metagpu-exporter 62 | image: "docker.io/cnvrg/metagpu-device-plugin:{{ .Values.tag }}" 63 | imagePullPolicy: Always 64 | command: 65 | - /usr/bin/mgex 66 | - start 67 | - -t 68 | - eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 69 | ports: 70 | - containerPort: 2112 71 | volumes: 72 | - name: device-plugin 73 | hostPath: 74 | path: /var/lib/kubelet/device-plugins 75 | - name: config 76 | configMap: 77 | name: 
metagpu-device-plugin-config 78 | - hostPath: 79 | path: /proc 80 | name: proc -------------------------------------------------------------------------------- /pkg/gpumgr/container.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | import ( 4 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/podexec" 5 | log "github.com/sirupsen/logrus" 6 | "github.com/spf13/viper" 7 | v1core "k8s.io/api/core/v1" 8 | "regexp" 9 | "strings" 10 | ) 11 | 12 | type ContainerDevice struct { 13 | GpuDevice *GpuDevice 14 | AllocatedShares int32 15 | } 16 | 17 | type GpuContainer struct { 18 | ContainerId string 19 | ContainerName string 20 | PodId string 21 | PodNamespace string 22 | PodMetagpuRequest int64 23 | ResourceName string 24 | Nodename string 25 | Processes []*GpuProcess 26 | Devices []*ContainerDevice 27 | } 28 | 29 | func getContainerId(pod *v1core.Pod, containerName string) (containerId string) { 30 | for _, status := range pod.Status.ContainerStatuses { 31 | if status.Name == containerName { 32 | idx := strings.Index(status.ContainerID, "//") 33 | if idx != -1 { 34 | return status.ContainerID[idx+2:] 35 | } else { 36 | log.WithField("pod", pod.Name).Error("can't extract container id") 37 | } 38 | } 39 | } 40 | return 41 | } 42 | 43 | func (c *GpuContainer) setAllocatedGpus(gpuDevices []*GpuDevice) { 44 | l := log.WithField("pod", c.PodId) 45 | pe, err := podexec.NewPodExec(c.ContainerName, c.PodId, c.PodNamespace) 46 | if err != nil { 47 | l.Error(err) 48 | return 49 | } 50 | output, err := pe.RunCommand([]string{"printenv", "CNVRG_META_GPU_DEVICES"}) 51 | if err != nil { 52 | l.Error(err) 53 | return 54 | } 55 | var gpuAllocationMap = make(map[string]int32) 56 | for _, metaDeviceId := range strings.Split(output, ",") { 57 | r, _ := regexp.Compile("cnvrg-meta-\\d+-\\d+-") 58 | deviceUuid := strings.TrimSuffix(r.ReplaceAllString(metaDeviceId, ""), "\n") 59 | if _, ok := 
gpuAllocationMap[deviceUuid]; ok { 60 | gpuAllocationMap[deviceUuid] = gpuAllocationMap[deviceUuid] + 1 61 | } else { 62 | gpuAllocationMap[deviceUuid] = 0 63 | } 64 | } 65 | 66 | for uuid, allocatedShares := range gpuAllocationMap { 67 | for _, device := range gpuDevices { 68 | if device.UUID == uuid { 69 | c.Devices = append(c.Devices, &ContainerDevice{ 70 | GpuDevice: device, 71 | AllocatedShares: allocatedShares, 72 | }) 73 | } 74 | } 75 | } 76 | } 77 | 78 | func NewGpuContainer(containerId, containerName, podId, ns, resourceName, nodename string, metagpuRequests int64, gpuDevices []*GpuDevice) *GpuContainer { 79 | p := &GpuContainer{ 80 | ContainerId: containerId, 81 | PodId: podId, 82 | ContainerName: containerName, 83 | PodNamespace: ns, 84 | PodMetagpuRequest: metagpuRequests, 85 | ResourceName: resourceName, 86 | Nodename: nodename, 87 | } 88 | // discover allocated GPUs 89 | p.setAllocatedGpus(gpuDevices) 90 | // inject mgctl bin 91 | if viper.GetBool("mgctlAutoInject") { 92 | podexec.CopymgctlToContainer(p.ContainerName, p.PodId, p.PodNamespace) 93 | } 94 | return p 95 | } 96 | -------------------------------------------------------------------------------- /cmd/mgex/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | log "github.com/sirupsen/logrus" 6 | "github.com/spf13/cobra" 7 | "github.com/spf13/viper" 8 | "os" 9 | "path" 10 | "runtime" 11 | "strconv" 12 | "strings" 13 | ) 14 | 15 | type param struct { 16 | name string 17 | shorthand string 18 | value interface{} 19 | usage string 20 | required bool 21 | } 22 | 23 | var ( 24 | Version string 25 | Build string 26 | rootCmd = &cobra.Command{ 27 | Use: "mgexporter", 28 | Short: "mgexporter - Metagpu metrics exporter", 29 | } 30 | version = &cobra.Command{ 31 | Use: "version", 32 | Short: "Print metagpu metric exporter version and build sha", 33 | Run: func(cmd *cobra.Command, args []string) { 34 | fmt.Printf("🐾 
version: %s build: %s \n", Version, Build) 35 | }, 36 | } 37 | startParams = []param{ 38 | {name: "metrics-addr", shorthand: "a", value: "0.0.0.0:2112", usage: "listen address"}, 39 | {name: "mgsrv", shorthand: "s", value: "127.0.0.1:50052", usage: "metagpu device plugin gRPC server address"}, 40 | {name: "token", shorthand: "t", value: "", usage: "metagpu server authenticate token"}, 41 | } 42 | start = &cobra.Command{ 43 | Use: "start", 44 | Short: "start metagpu metrics exporter", 45 | Run: func(cmd *cobra.Command, args []string) { 46 | startExporter() 47 | }, 48 | } 49 | ) 50 | 51 | func init() { 52 | cobra.OnInitialize(initConfig) 53 | setParams(startParams, start) 54 | rootCmd.AddCommand(version) 55 | rootCmd.AddCommand(start) 56 | } 57 | 58 | func initConfig() { 59 | viper.AutomaticEnv() 60 | viper.SetEnvPrefix("MG_EX") 61 | viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_")) 62 | setupLogging() 63 | } 64 | 65 | func setParams(params []param, command *cobra.Command) { 66 | for _, param := range params { 67 | switch v := param.value.(type) { 68 | case int: 69 | command.PersistentFlags().IntP(param.name, param.shorthand, v, param.usage) 70 | case string: 71 | command.PersistentFlags().StringP(param.name, param.shorthand, v, param.usage) 72 | case bool: 73 | command.PersistentFlags().BoolP(param.name, param.shorthand, v, param.usage) 74 | } 75 | if err := viper.BindPFlag(param.name, command.PersistentFlags().Lookup(param.name)); err != nil { 76 | panic(err) 77 | } 78 | } 79 | } 80 | 81 | func setupLogging() { 82 | 83 | // Set log verbosity 84 | if viper.GetBool("verbose") { 85 | log.SetLevel(log.DebugLevel) 86 | log.SetReportCaller(true) 87 | log.SetFormatter(&log.TextFormatter{ 88 | FullTimestamp: true, 89 | CallerPrettyfier: func(frame *runtime.Frame) (function string, file string) { 90 | fileName := fmt.Sprintf(" [%s]", path.Base(frame.Function)+":"+strconv.Itoa(frame.Line)) 91 | return "", fileName 92 | }, 93 | }) 94 | } else { 95 | 
log.SetLevel(log.InfoLevel) 96 | log.SetFormatter(&log.TextFormatter{FullTimestamp: true}) 97 | } 98 | // Set log format 99 | if viper.GetBool("json-log") { 100 | log.SetFormatter(&log.JSONFormatter{}) 101 | } 102 | // Logs are always goes to STDOUT 103 | log.SetOutput(os.Stdout) 104 | } 105 | 106 | func main() { 107 | 108 | if err := rootCmd.Execute(); err != nil { 109 | fmt.Println(err) 110 | os.Exit(1) 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /cmd/mgctl/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | log "github.com/sirupsen/logrus" 6 | "github.com/spf13/cobra" 7 | "github.com/spf13/viper" 8 | "os" 9 | "path" 10 | "runtime" 11 | "strconv" 12 | "strings" 13 | ) 14 | 15 | type param struct { 16 | name string 17 | shorthand string 18 | value interface{} 19 | usage string 20 | required bool 21 | } 22 | 23 | var ( 24 | Version string 25 | Build string 26 | rootParams = []param{ 27 | {name: "json-log", shorthand: "", value: false, usage: "output logs in json format"}, 28 | {name: "verbose", shorthand: "", value: false, usage: "enable verbose logs"}, 29 | {name: "addr", shorthand: "s", value: "localhost:50052", usage: "address to access the metagpu server"}, 30 | {name: "token", shorthand: "t", value: "", usage: "authentication token"}, 31 | {name: "output", shorthand: "o", value: "table", usage: "output format, one of: table|json|raw"}, 32 | } 33 | ) 34 | 35 | var metaGpuCtlVersion = &cobra.Command{ 36 | Use: "version", 37 | Short: "Print metagpuctl version and build sha", 38 | Run: func(cmd *cobra.Command, args []string) { 39 | fmt.Printf("🐾 version: %s build: %s \n", Version, Build) 40 | }, 41 | } 42 | 43 | var rootCmd = &cobra.Command{ 44 | Use: "mgctl", 45 | Short: "mgctl - cli client for metagpu management and monitoring", 46 | } 47 | 48 | func init() { 49 | cobra.OnInitialize(initConfig) 50 | 
setParams(configCmdParams, configCmd) 51 | setParams(processGetParams, processesGetCmd) 52 | setParams(rootParams, rootCmd) 53 | // processes 54 | getCmd.AddCommand(processesGetCmd) 55 | getCmd.AddCommand(getDevicesCmd) 56 | // root commands 57 | rootCmd.AddCommand(configCmd) 58 | rootCmd.AddCommand(enforceCmd) 59 | rootCmd.AddCommand(killCmd) 60 | rootCmd.AddCommand(getCmd) 61 | rootCmd.AddCommand(pingCmd) 62 | rootCmd.AddCommand(metaGpuCtlVersion) 63 | 64 | } 65 | 66 | func initConfig() { 67 | viper.AutomaticEnv() 68 | viper.SetEnvPrefix("MG_CTL") 69 | viper.SetEnvKeyReplacer(strings.NewReplacer("-", "_")) 70 | setupLogging() 71 | } 72 | 73 | func setParams(params []param, command *cobra.Command) { 74 | for _, param := range params { 75 | switch v := param.value.(type) { 76 | case int: 77 | command.PersistentFlags().IntP(param.name, param.shorthand, v, param.usage) 78 | case string: 79 | command.PersistentFlags().StringP(param.name, param.shorthand, v, param.usage) 80 | case bool: 81 | command.PersistentFlags().BoolP(param.name, param.shorthand, v, param.usage) 82 | } 83 | if err := viper.BindPFlag(param.name, command.PersistentFlags().Lookup(param.name)); err != nil { 84 | panic(err) 85 | } 86 | } 87 | } 88 | 89 | func setupLogging() { 90 | 91 | // Set log verbosity 92 | if viper.GetBool("verbose") { 93 | log.SetLevel(log.DebugLevel) 94 | log.SetFormatter(&log.TextFormatter{ 95 | FullTimestamp: true, 96 | CallerPrettyfier: func(frame *runtime.Frame) (function string, file string) { 97 | fileName := fmt.Sprintf(" [%s]", path.Base(frame.Function)+":"+strconv.Itoa(frame.Line)) 98 | return "", fileName 99 | }, 100 | }) 101 | } else { 102 | log.SetLevel(log.InfoLevel) 103 | log.SetFormatter(&log.TextFormatter{FullTimestamp: true}) 104 | } 105 | 106 | // Set log format 107 | if viper.GetBool("json-log") { 108 | log.SetFormatter(&log.JSONFormatter{}) 109 | } 110 | 111 | // Logs are always goes to STDOUT 112 | log.SetOutput(os.Stdout) 113 | } 114 | 115 | func main() { 
116 | 117 | if err := rootCmd.Execute(); err != nil { 118 | fmt.Println(err) 119 | os.Exit(1) 120 | } 121 | 122 | } 123 | -------------------------------------------------------------------------------- /cmd/mgctl/enforce.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/spf13/cobra" 5 | ) 6 | 7 | var enforceCmd = &cobra.Command{ 8 | Use: "enforce", 9 | Short: "enforce memory limits", 10 | Run: func(cmd *cobra.Command, args []string) { 11 | //enforceMemoryLimits() 12 | }, 13 | } 14 | 15 | //// 16 | ////func enforceMemoryLimits() { 17 | //// conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 18 | //// if conn == nil { 19 | //// log.Fatalf("can't initiate connection to metagpu server") 20 | //// } 21 | //// defer conn.Close() 22 | //// device := pbdevice.NewDeviceServiceClient(conn) 23 | //// hostname, err := os.Hostname() 24 | //// if err != nil { 25 | //// log.Errorf("faild to detect podId, err: %s", err) 26 | //// } 27 | //// request := &pbdevice.StreamProcessesRequest{PodId: hostname} 28 | //// stream, err := device.StreamProcesses(ctlutils.AuthenticatedContext(viper.GetString("token")), request) 29 | //// if err != nil { 30 | //// log.Fatal(err) 31 | //// } 32 | //// 33 | //// refreshCh := make(chan bool) 34 | //// sigCh := make(chan os.Signal, 1) 35 | //// signal.Notify(sigCh, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) 36 | //// 37 | //// to := &TableOutput{} 38 | //// to.header = table.Row{"Idx", "Pod", "Used Mem", "Meta Mem"} 39 | //// 40 | //// go func() { 41 | //// for { 42 | //// time.Sleep(1 * time.Second) 43 | //// refreshCh <- true 44 | //// } 45 | //// }() 46 | //// 47 | //// for { 48 | //// select { 49 | //// case <-sigCh: 50 | //// cursor.ClearLine() 51 | //// log.Info("shutting down") 52 | //// os.Exit(0) 53 | //// case <-refreshCh: 54 | //// processResp, err := stream.Recv() 55 | //// if err == io.EOF { 56 | // break 57 | 
// } 58 | // if err != nil { 59 | // log.Fatalf("error watching gpu processes, err: %s", err) 60 | // } 61 | // deviceResp, err := device.GetDevices(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetDevicesRequest{}) 62 | // if err != nil { 63 | // log.Errorf("falid to list devices, err: %s ", err) 64 | // return 65 | // } 66 | // to.body, to.footer = composeMemEnforceListAndFooter(processResp.DevicesProcesses, deviceResp.Device) 67 | // to.buildTable() 68 | // to.print() 69 | // 70 | // for _, p := range processResp.DevicesProcesses { 71 | // d := deviceResp.Device[p.Uuid] 72 | // if p.Memory > d.MemoryShareSize*uint64(p.MetagpuRequests) { 73 | // killRequest := &pbdevice.KillGpuProcessRequest{Pid: p.Pid} 74 | // _, _ = device.KillGpuProcess(ctlutils.AuthenticatedContext(viper.GetString("token")), killRequest) 75 | // } 76 | // } 77 | // } 78 | // } 79 | //} 80 | // 81 | //func composeMemEnforceListAndFooter(processes []*pbdevice.DeviceProcess, devices map[string]*pbdevice.Device) (body []table.Row, footer table.Row) { 82 | // 83 | // type enforceObj struct { 84 | // uuid string 85 | // podName string 86 | // memUsed uint64 87 | // maxMem uint64 88 | // } 89 | // 90 | // var el = make(map[string]*enforceObj) 91 | // 92 | // for _, p := range processes { 93 | // d := devices[p.Uuid] 94 | // el[p.PodName] = &enforceObj{ 95 | // uuid: p.Uuid, 96 | // podName: p.PodName, 97 | // memUsed: p.Memory, 98 | // maxMem: d.MemoryShareSize * uint64(p.MetagpuRequests), 99 | // } 100 | // } 101 | // 102 | // for _, eObj := range el { 103 | // podName := fmt.Sprintf("\033[32m%s\033[0m", eObj.podName) 104 | // if eObj.memUsed > eObj.maxMem { 105 | // podName = fmt.Sprintf("\033[31m%s\033[0m", eObj.podName) 106 | // } 107 | // body = append(body, table.Row{eObj.uuid, podName, eObj.memUsed, eObj.maxMem}) 108 | // } 109 | // 110 | // footer = table.Row{"", "", "", ""} 111 | // 112 | // return 113 | // 114 | //} 115 | 
-------------------------------------------------------------------------------- /pkg/sharecfg/sharecfg.go: -------------------------------------------------------------------------------- 1 | package sharecfg 2 | 3 | import ( 4 | "fmt" 5 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/nvmlutils" 6 | "github.com/NVIDIA/go-nvml/pkg/nvml" 7 | log "github.com/sirupsen/logrus" 8 | "github.com/spf13/viper" 9 | ) 10 | 11 | type DeviceSharingConfig struct { 12 | Uuid []string 13 | ResourceName string 14 | MetagpusPerGpu int 15 | AutoReshare bool 16 | } 17 | 18 | type DevicesSharingConfigs struct { 19 | Configs []*DeviceSharingConfig 20 | } 21 | 22 | var shareCfg *DevicesSharingConfigs 23 | 24 | func NewDeviceSharingConfig() *DevicesSharingConfigs { 25 | if shareCfg != nil { 26 | return shareCfg 27 | } 28 | var cfg []*DeviceSharingConfig 29 | if err := viper.UnmarshalKey("deviceSharing", &cfg); err != nil { 30 | log.Fatal(err) 31 | } 32 | shareCfg = &DevicesSharingConfigs{Configs: cfg} 33 | shareCfg.ValidateSharingConfiguration() 34 | shareCfg.AutoReshare() 35 | return shareCfg 36 | } 37 | 38 | func (c *DevicesSharingConfigs) ValidateSharingConfiguration() { 39 | if len(c.Configs) == 0 { 40 | log.Fatalf("mission gpu sharing configuration, can't proceed") 41 | } 42 | if len(c.Configs) > 1 { 43 | for _, devCfg := range c.Configs { 44 | for _, uuid := range devCfg.Uuid { 45 | if uuid == "*" { 46 | log.Fatalf("wrong gpu sharing configuration, "+ 47 | "'deviceSharing' with uuid: [ * ] must have sinlge (1) entry, but have: %d", len(c.Configs)) 48 | } 49 | } 50 | } 51 | } 52 | } 53 | 54 | func (c *DevicesSharingConfigs) AutoReshare() { 55 | for _, cfg := range c.Configs { 56 | if cfg.AutoReshare { 57 | cfg.GpuAutoResharing() 58 | continue 59 | } 60 | log.Infof("autoReshare disabled for: %s, skipping re-configuration", cfg.ResourceName) 61 | } 62 | } 63 | 64 | func (c *DevicesSharingConfigs) GetDeviceSharingConfigs(devUuid string) (*DeviceSharingConfig, 
error) {
	for _, devCfg := range c.Configs {
		for _, uuid := range devCfg.Uuid {
			if uuid == devUuid || uuid == "*" {
				return devCfg, nil
			}
		}
	}
	return nil, fmt.Errorf("device uuid: %s not found in sharing configs", devUuid)
}

// GpuAutoResharing re-configures the number of shares per GPU.
// Currently it pins the share count to a fixed 100 metagpus per GPU.
func (c *DeviceSharingConfig) GpuAutoResharing() {
	log.Info("autoResharing enabled, re-configuring gpu shares")
	c.MetagpusPerGpu = 100
	// the following code is sharing GPU by memory,
	// currently we are not using it, and I don't think we ever will
	// but, never say never, thus it's here

	//nvmlDevice := c.getFirstDevice()
	//if nvmlDevice != nil {
	//	mem := nvmlutils.GetDeviceMemory(nvmlDevice)
	//	if mem.Total > 0 {
	//		c.MetagpusPerGpu = int((mem.Total / (1024 * 1024)) / 1024)
	//	}
	//}

	// TODO: make sharing configurations persistent
}

// GetShareSize returns the size of a single GPU share in MB, computed from
// the total memory of the first matching device. It returns 0 when no
// device is available or when MetagpusPerGpu is not a positive number.
func (c *DeviceSharingConfig) GetShareSize() int {
	// BUG FIX: guard MetagpusPerGpu > 0 to avoid a division-by-zero panic
	// when the config entry omits metagpusPerGpu.
	if c.MetagpusPerGpu <= 0 {
		return 0
	}
	nvmlDevice := c.getFirstDevice()
	if nvmlDevice != nil {
		mem := nvmlutils.GetDeviceMemory(nvmlDevice)
		if mem.Total > 0 {
			return int((mem.Total / (1024 * 1024)) / uint64(c.MetagpusPerGpu))
		}
	}
	return 0
}

// getFirstDevice resolves the first physical device covered by this config:
// the first device on the node for a wildcard config, otherwise the device
// matching the first configured uuid. Returns nil when nothing is found.
func (c *DeviceSharingConfig) getFirstDevice() *nvml.Device {
	if c.isWildcardSharing() {
		devices := nvmlutils.GetDevices()
		// BUG FIX: was `len(devices) < 0`, which is never true (len is
		// non-negative), so the guard never fired and devices[0] could
		// panic with an index-out-of-range on GPU-less nodes.
		if len(devices) == 0 {
			log.Error("can't execute autoReshare, the devices list is empty")
			return nil
		}
		return devices[0]
	}
	// BUG FIX: was `len(c.Uuid) < 0` — same always-false comparison;
	// also fixed the message typo "es empty" -> "is empty".
	if len(c.Uuid) == 0 {
		log.Error("can't execute autoReshare, uuid config list is empty")
		return nil
	}
	return nvmlutils.GetDeviceByUUID(c.Uuid[0])
}

// isWildcardSharing reports whether this config applies to all devices,
// i.e. its uuid list contains the "*" wildcard.
func (c *DeviceSharingConfig) isWildcardSharing() bool {
	for _, uuid := range c.Uuid {
		if uuid == "*" {
			return true
		}
	}
	return false
}

-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MetaGPU Device Plugin for Kubernetes 2 | 3 | The metagpu device plugin (`mgdp`) allows you to share one or more Nvidia GPUs between 4 | different K8s workloads. 5 | 6 | ### Motivation 7 | K8s doesn't provide a support for the GPU sharing. 8 | Meaning user must allocate entire GPU to his workload, even if the actual GPU usage 9 | is much bellow of 100%. 10 | This project will help to improve the GPU utilization by allowing GPU sharing between 11 | multiple K8s workloads. 12 | 13 | 14 | ### How it works 15 | The `mgdp` is based on [Nvidia Container Runtime](https://github.com/NVIDIA/nvidia-container-runtime) 16 | and on [go-nvml](https://github.com/NVIDIA/go-nvml) 17 | One for the features the nvidia container runtime providers, is an ability 18 | to specify the visible GPU devices Ids by using env vars `NVIDIA_VISIBLE_DEVICES`. 19 | 20 | The most short & simple explanation of the `mgdp` logic is: 21 | 1. `mgdp` detects all the GPU devices Ids 22 | 2. From the real GPU deices Ids, it's generates a meta-devices Ids 23 | 3. `mgdp` advertise these meta-devices Ids to the K8s 24 | 4. Once a user requests for a gpu fraction, for example 0.5 GPU, `mgdp` will allocate 50 meta-devices IDs 25 | 5. The 50 meta-gpus are bounded to 1 real device id, this real device ID will be injected to the container 26 | 27 | In addition, each metagpu container will have `mgctl` binary. 28 | The `mgctl` is an alternative for `nvidia-smi`. 29 | The `mgctl` improves security and provides better K8s integration. 30 | 31 | ### The sharing configurations 32 | By default, `mgdp` will share each of your GPU devices to 100 meta-gpus. 33 | For example, if you've a machine with 2 GPUs, `mgdp` will generate 200 metagpus. 
34 | Requesting for 50 metagpus, will give you 0.5 GPU, requesting 150 metagpus, 35 | will give you 1.5 metagpus. 36 | 37 | 38 | ### [MetaGPU demo from Cnvrg's MLCon 2.0](https://www.youtube.com/watch?v=hsP9GXUtNNs) 39 | 40 | ### Deployment 41 | 1. clone the repo 42 | 2. use helm chart to install or dump manifest and install manually 43 | 44 | ### Install with helm chart 45 | ```bash 46 | # cd into cloned directory and run 47 | # for openshift set ocp=true 48 | helm install chart --set ocp=false -ncnvrg 49 | ``` 50 | 51 | ### Install with raw K8s manifests 52 | ```bash 53 | # cd into cloned directory and run 54 | # for openshift set ocp=true 55 | helm template chart --set ocp=false -ncnvrg > meatgpu.yaml 56 | kubectl apply -f meatgpu.yaml 57 | ``` 58 | 59 | 60 | ### Test the Metagpu 61 | ```bash 62 | cat < 0 { 86 | for _, devLoad := range a.LoadMap { 87 | if devLoad == nil { 88 | continue 89 | } 90 | if devLoad.getFreeShares() >= gpuFractionsRequest { 91 | var devicesToAdd []string 92 | for i, device := range devLoad.Metagpus { 93 | if i == gpuFractionsRequest { 94 | break 95 | } 96 | devicesToAdd = append(devicesToAdd, device) 97 | } 98 | a.MetagpusAllocations = append(a.MetagpusAllocations, devicesToAdd...) 
99 | devLoad.removeDevices(devicesToAdd) 100 | break 101 | } 102 | } 103 | } 104 | // if still missing allocations, 105 | // meaning wasn't able to allocate required fractions from the same GPU 106 | // will try to allocate a fractions from different GPUs 107 | if len(a.MetagpusAllocations) != a.AllocationSize { 108 | allocationsLeft := a.AllocationSize 109 | ExitMultiGpuFractionAlloc: 110 | if allocationsLeft > 0 { 111 | for _, devLoad := range a.LoadMap { 112 | if devLoad == nil { 113 | continue 114 | } 115 | for _, device := range devLoad.Metagpus { 116 | a.MetagpusAllocations = append(a.MetagpusAllocations, device) 117 | allocationsLeft-- 118 | if allocationsLeft == 0 { 119 | goto ExitMultiGpuFractionAlloc 120 | } 121 | } 122 | } 123 | } 124 | } 125 | if len(a.MetagpusAllocations) != a.AllocationSize { 126 | log.Errorf("error during allocation, the allocationSize: %d doesn't match total allocated devices: %d", a.AllocationSize, len(a.MetagpusAllocations)) 127 | } 128 | } 129 | 130 | func (l *DeviceLoad) getFreeShares() int { 131 | return len(l.Metagpus) 132 | } 133 | 134 | func (l *DeviceLoad) removeDevices(devIds []string) { 135 | for _, devId := range devIds { 136 | for i, v := range l.Metagpus { 137 | if v == devId { 138 | l.Metagpus = append(l.Metagpus[:i], l.Metagpus[i+1:]...) 
139 | } 140 | } 141 | } 142 | } 143 | 144 | func metaDeviceIdToDeviceIndex(metaDeviceId string) (deviceIndex int) { 145 | r, _ := regexp.Compile("-\\d+-") 146 | s := strings.ReplaceAll(r.FindString(metaDeviceId), "-", "") 147 | idx, err := strconv.Atoi(s) 148 | if err != nil { 149 | log.Errorf("can't detect physical device ID from meta device id, err: %s", err) 150 | } 151 | return idx 152 | 153 | } 154 | -------------------------------------------------------------------------------- /deploy/static.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Source: metagpu-device-plugin/templates/rbac.yml 3 | apiVersion: v1 4 | kind: ServiceAccount 5 | metadata: 6 | name: metagpu-device-plugin 7 | namespace: cnvrg 8 | --- 9 | # Source: metagpu-device-plugin/templates/cm.yml 10 | apiVersion: v1 11 | kind: ConfigMap 12 | metadata: 13 | name: metagpu-device-plugin-config 14 | namespace: cnvrg 15 | data: 16 | config.yaml: | 17 | accelerator: nvidia 18 | processesDiscoveryPeriod: 5 19 | deviceCacheTTL: 3600 20 | jwtSecret: topSecret 21 | mgctlTar: /tmp/mgctl 22 | mgctlAutoInject: true 23 | serverAddr: 0.0.0.0:50052 24 | memoryEnforcer: true 25 | deviceToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 26 | containerToken: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMSJ9.o5v6Zdi1FKXQevRjuSbABBX1vIRYgN3Daz9iXabuFFA 27 | deviceSharing: 28 | - resourceName: cnvrg.io/metagpu 29 | autoReshare: true 30 | metaGpus: 2 31 | uuid: [ "*" ] 32 | --- 33 | # Source: metagpu-device-plugin/templates/rbac.yml 34 | apiVersion: rbac.authorization.k8s.io/v1 35 | kind: ClusterRole 36 | metadata: 37 | name: metagpu-device-plugin 38 | namespace: cnvrg 39 | rules: 40 | - apiGroups: 41 | - "" 42 | resources: 43 | - pods 44 | verbs: 45 | - list 46 | - get 47 | - create 48 | - apiGroups: 49 | - 
"" 50 | resources: 51 | - pods/exec 52 | verbs: 53 | - create 54 | - apiGroups: 55 | - "" 56 | resources: 57 | - configmaps 58 | resourceNames: 59 | - metagpu-device-plugin-config 60 | verbs: 61 | - get 62 | - update 63 | --- 64 | # Source: metagpu-device-plugin/templates/rbac.yml 65 | apiVersion: rbac.authorization.k8s.io/v1 66 | kind: ClusterRoleBinding 67 | metadata: 68 | name: metagpu-device-plugin 69 | namespace: cnvrg 70 | roleRef: 71 | apiGroup: rbac.authorization.k8s.io 72 | kind: ClusterRole 73 | name: metagpu-device-plugin 74 | subjects: 75 | - kind: ServiceAccount 76 | name: metagpu-device-plugin 77 | namespace: cnvrg 78 | --- 79 | # Source: metagpu-device-plugin/templates/svc.yml 80 | kind: Service 81 | apiVersion: v1 82 | metadata: 83 | name: metagpu-device-plugin 84 | namespace: cnvrg 85 | labels: 86 | app: "metagpu-exporter" 87 | spec: 88 | selector: 89 | name: metagpu-device-plugin 90 | ports: 91 | - protocol: TCP 92 | port: 50052 93 | name: grcp 94 | - protocol: TCP 95 | port: 2112 96 | name: metrics 97 | --- 98 | # Source: metagpu-device-plugin/templates/ds.yml 99 | apiVersion: apps/v1 100 | kind: DaemonSet 101 | metadata: 102 | name: metagpu-device-plugin 103 | namespace: cnvrg 104 | spec: 105 | selector: 106 | matchLabels: 107 | name: metagpu-device-plugin 108 | template: 109 | metadata: 110 | annotations: 111 | scheduler.alpha.kubernetes.io/critical-pod: "" 112 | labels: 113 | name: metagpu-device-plugin 114 | spec: 115 | tolerations: 116 | - key: CriticalAddonsOnly 117 | operator: Exists 118 | - key: nvidia.com/gpu 119 | operator: Exists 120 | effect: NoSchedule 121 | priorityClassName: "system-node-critical" 122 | imagePullSecrets: 123 | - name: regcred 124 | hostPID: true 125 | hostNetwork: true 126 | serviceAccountName: metagpu-device-plugin 127 | nodeSelector: 128 | accelerator: nvidia 129 | containers: 130 | - name: metagpu-device-plugin 131 | image: "docker.io/cnvrg/metagpu-device-plugin:DEV-13690-tot-mem-cmd-line" 132 | imagePullPolicy: 
Always 133 | command: 134 | - /usr/bin/mgdp 135 | - start 136 | - -c 137 | - /etc/metagpu-device-plugin 138 | ports: 139 | - containerPort: 50052 140 | securityContext: 141 | privileged: true 142 | env: 143 | - name: POD_IP 144 | valueFrom: 145 | fieldRef: 146 | fieldPath: status.podIP 147 | - name: MG_CTL_TOKEN 148 | value: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 149 | volumeMounts: 150 | - name: device-plugin 151 | mountPath: /var/lib/kubelet/device-plugins 152 | - name: config 153 | mountPath: /etc/metagpu-device-plugin 154 | - mountPath: /host/proc 155 | mountPropagation: HostToContainer 156 | name: proc 157 | readOnly: true 158 | - name: metagpu-exporter 159 | image: "docker.io/cnvrg/metagpu-device-plugin:DEV-13690-tot-mem-cmd-line" 160 | imagePullPolicy: Always 161 | command: 162 | - /usr/bin/mgex 163 | - start 164 | - -t 165 | - eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im1ldGFncHVAaW5zdGFuY2UiLCJ2aXNpYmlsaXR5TGV2ZWwiOiJsMCJ9.2rHykHFcHoIr-OCoPA5Am4ubf31-RJcayZnOTK6db94 166 | ports: 167 | - containerPort: 2112 168 | volumes: 169 | - name: device-plugin 170 | hostPath: 171 | path: /var/lib/kubelet/device-plugins 172 | - name: config 173 | configMap: 174 | name: metagpu-device-plugin-config 175 | - hostPath: 176 | path: /proc 177 | name: proc 178 | --- 179 | # Source: metagpu-device-plugin/templates/svcmon.yml 180 | apiVersion: monitoring.coreos.com/v1 181 | kind: ServiceMonitor 182 | metadata: 183 | name: metagpu-exporter 184 | namespace: cnvrg 185 | labels: 186 | app: "metagpu-exporter" 187 | cnvrg-infra-prometheus: cnvrg-infra-cnvrg 188 | spec: 189 | selector: 190 | matchLabels: 191 | app: "metagpu-exporter" 192 | namespaceSelector: 193 | matchNames: 194 | - cnvrg 195 | endpoints: 196 | - port: "metrics" 197 | path: "/metrics" 198 | interval: "15s" 199 | -------------------------------------------------------------------------------- 
/pkg/plugin/server.go: -------------------------------------------------------------------------------- 1 | package plugin 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | log "github.com/sirupsen/logrus" 7 | "github.com/spf13/viper" 8 | "google.golang.org/grpc" 9 | pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" 10 | "net" 11 | "os" 12 | "path" 13 | "sort" 14 | "strings" 15 | "time" 16 | ) 17 | 18 | func (p *MetaGpuDevicePlugin) dial(socket string, timeout time.Duration) (*grpc.ClientConn, error) { 19 | c, err := grpc.Dial(socket, grpc.WithInsecure(), grpc.WithBlock(), 20 | grpc.WithContextDialer(func(ctx context.Context, s string) (net.Conn, error) { 21 | return net.DialTimeout("unix", socket, timeout) 22 | }), 23 | ) 24 | 25 | if err != nil { 26 | return nil, err 27 | } 28 | 29 | return c, nil 30 | 31 | } 32 | 33 | func (p *MetaGpuDevicePlugin) Register() error { 34 | conn, err := p.dial(pluginapi.KubeletSocket, 5*time.Second) 35 | if err != nil { 36 | return err 37 | } 38 | defer conn.Close() 39 | client := pluginapi.NewRegistrationClient(conn) 40 | req := &pluginapi.RegisterRequest{ 41 | Version: pluginapi.Version, 42 | Endpoint: path.Base(p.socket), 43 | ResourceName: p.GetDeviceSharingConfig().ResourceName, 44 | Options: &pluginapi.DevicePluginOptions{ 45 | GetPreferredAllocationAvailable: true, 46 | }, 47 | } 48 | if _, err := client.Register(context.Background(), req); err != nil { 49 | return err 50 | } 51 | return nil 52 | } 53 | 54 | func (p *MetaGpuDevicePlugin) GetDevicePluginOptions(ctx context.Context, empty *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) { 55 | return &pluginapi.DevicePluginOptions{GetPreferredAllocationAvailable: true}, nil 56 | } 57 | 58 | func (p *MetaGpuDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error { 59 | 60 | if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: p.GetPluginDevices()}); err != nil { 61 | log.Error(err) 62 | } 63 | 64 | for { 65 | select { 66 
| case <-p.stop: 67 | return nil 68 | case <-p.MetaGpuRecalculation: 69 | if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: p.GetPluginDevices()}); err != nil { 70 | log.Error(err) 71 | } 72 | } 73 | } 74 | } 75 | 76 | func (p *MetaGpuDevicePlugin) GetPreferredAllocation(ctx context.Context, request *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { 77 | 78 | allocResponse := &pluginapi.PreferredAllocationResponse{} 79 | for _, req := range request.ContainerRequests { 80 | allocContainerResponse := &pluginapi.ContainerPreferredAllocationResponse{} 81 | allocContainerResponse.DeviceIDs, _ = p.MetagpuAllocation(int(req.AllocationSize), req.GetAvailableDeviceIDs()) 82 | log.Info("preferred devices ids:") 83 | for _, devId := range allocContainerResponse.DeviceIDs { 84 | log.Info(devId) 85 | } 86 | allocResponse.ContainerResponses = append(allocResponse.ContainerResponses, allocContainerResponse) 87 | } 88 | return allocResponse, nil 89 | 90 | } 91 | 92 | func (p *MetaGpuDevicePlugin) Allocate(ctx context.Context, request *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { 93 | allocResponse := &pluginapi.AllocateResponse{} 94 | for _, req := range request.ContainerRequests { 95 | response := pluginapi.ContainerAllocateResponse{} 96 | sort.Strings(req.DevicesIDs) 97 | log.Info("requested devices ids:") 98 | for _, dev := range req.DevicesIDs { 99 | log.Info(dev) 100 | } 101 | realDevices := p.ParseRealDeviceId(req.DevicesIDs) 102 | response.Envs = map[string]string{ 103 | "CNVRG_META_GPU_DEVICES": strings.Join(req.DevicesIDs, ","), 104 | "NVIDIA_VISIBLE_DEVICES": strings.Join(realDevices, ","), 105 | "METAGPU_MAX_MEM": fmt.Sprintf("%d", p.GetDeviceSharingConfig().GetShareSize()*len(req.DevicesIDs)), 106 | "MG_CTL_ADDR": fmt.Sprintf("%s:50052", os.Getenv("POD_IP")), 107 | "MG_CTL_TOKEN": viper.GetString("containerToken"), 108 | } 109 | allocResponse.ContainerResponses = append(allocResponse.ContainerResponses, 
&response) 110 | } 111 | return allocResponse, nil 112 | } 113 | 114 | func (p *MetaGpuDevicePlugin) PreStartContainer(ctx context.Context, request *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) { 115 | return &pluginapi.PreStartContainerResponse{}, nil 116 | } 117 | 118 | func (p *MetaGpuDevicePlugin) Serve() error { 119 | _ = os.Remove(p.socket) 120 | 121 | sock, err := net.Listen("unix", p.socket) 122 | if err != nil { 123 | log.Error(err) 124 | } 125 | log.Infof("listening on %s", p.socket) 126 | pluginapi.RegisterDevicePluginServer(p.server, p) 127 | 128 | go func() { 129 | if err := p.server.Serve(sock); err != nil { 130 | log.Errorf("gRPC server craeshed, %s", err) 131 | } 132 | }() 133 | 134 | if conn, err := p.dial(p.socket, 3*time.Second); err != nil { 135 | log.Error(err) 136 | return err 137 | } else { 138 | _ = conn.Close() 139 | log.Info("gRPC server successfully started and ready accept new connections") 140 | } 141 | return nil 142 | 143 | } 144 | 145 | func (p *MetaGpuDevicePlugin) Start() { 146 | if err := p.Serve(); err != nil { 147 | log.Fatal(err) 148 | } 149 | 150 | if err := p.Register(); err != nil { 151 | log.Fatal(err) 152 | } 153 | 154 | } 155 | 156 | func (p *MetaGpuDevicePlugin) Stop() { 157 | log.Info("stopping GRPC server") 158 | if p != nil && p.server != nil { 159 | p.server.Stop() 160 | } 161 | log.Info("removing unix socket") 162 | _ = os.Remove(p.socket) 163 | log.Info("closing all channels") 164 | close(p.stop) 165 | close(p.MetaGpuRecalculation) 166 | } 167 | 168 | func NewMetaGpuDevicePlugin(metaGpuRecalculation chan bool, deviceMgr DeviceManager) *MetaGpuDevicePlugin { 169 | if viper.GetString("accelerator") != "nvidia" { 170 | log.Fatal("accelerator not supported, currently only nvidia is supported") 171 | } 172 | return &MetaGpuDevicePlugin{ 173 | server: grpc.NewServer([]grpc.ServerOption{}...), 174 | socket: fmt.Sprintf("%s%s", pluginapi.DevicePluginPath, deviceMgr.GetUnixSocket()), 175 
| DeviceManager: deviceMgr, 176 | stop: make(chan interface{}), 177 | MetaGpuRecalculation: metaGpuRecalculation, 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /.github/workflows/docker-image.yml: -------------------------------------------------------------------------------- 1 | name: Docker Image CI 2 | 3 | on: 4 | push: 5 | branches: [ 'main' ] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | with: 13 | fetch-depth: 0 14 | 15 | - name: Bump version and push tag 16 | uses: AccessibleAI/github-tag-action@1.0.0 17 | id: tag_bump 18 | env: 19 | MSG: ${{ github.event.inputs.msg }} 20 | GITHUB_TOKEN: ${{ secrets.CNVRG_GITHUB_TOKEN }} 21 | PRERELEASE_AUTOMATIC_BUMP: true 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: '3.9' 27 | architecture: x64 28 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 29 | 30 | - name: Generate Cnvrg Changelog 31 | uses: AccessibleAI/github-changelog-action@1.0.0 32 | id: cnvrg_changelog 33 | with: 34 | from_version: ${{ steps.tag_bump.outputs.tag }} 35 | to_version: ${{ steps.tag_bump.outputs.new_tag }} 36 | jira_token: ${{ secrets.JIRA_TOKEN }} 37 | slack_webhook_url: false 38 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 39 | 40 | - name: Generate Non Cnvrg Changelog 41 | id: changelog 42 | uses: metcalfc/changelog-generator@v3.0.0 43 | with: 44 | myToken: ${{ secrets.CNVRG_GITHUB_TOKEN }} 45 | head-ref: ${{ steps.tag_bump.outputs.tag }} 46 | base-ref: ${{ steps.tag_bump.outputs.new_tag }} 47 | if: ${{ steps.cnvrg_changelog.outputs.empty == 'true' && steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 48 | 49 | - name: Generate changelog 50 | id: changelog_final 51 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && 
steps.tag_bump.outputs.prerelease == 'false' }} 52 | run: | 53 | set -o noglob 54 | if ${{ steps.cnvrg_changelog.outputs.empty }}; then 55 | log=$(cat << "EOF" 56 | ${{ steps.changelog.outputs.changelog }} 57 | EOF 58 | ) 59 | else 60 | log=$(cat << "EOF" 61 | ${{ steps.cnvrg_changelog.outputs.changelog }} 62 | EOF 63 | ) 64 | fi 65 | log="${log//'%'/'%25'}" 66 | log="${log//$'\n'/'%0A'}" 67 | log="${log//$'\r'/'%0D'}" 68 | echo "::set-output name=changelog::$log" 69 | - name: Print the final changelog 70 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 71 | run: | 72 | cat << "EOF" 73 | ${{ steps.changelog_final.outputs.changelog }} 74 | EOF 75 | - name: Changelog Release 76 | uses: softprops/action-gh-release@v1 77 | with: 78 | body: ${{steps.changelog_final.outputs.changelog}} 79 | tag_name: ${{ steps.tag_bump.outputs.new_tag }} 80 | prerelease: ${{ steps.tag_bump.outputs.prerelease }} 81 | generate_release_notes: true 82 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 83 | 84 | - name: Extract repo/branch name 85 | shell: bash 86 | if: ${{ steps.tag_bump.outputs.bumped == 'true' }} 87 | run: | 88 | echo "##[set-output name=repo;]$(echo ${{github.event.repository.name}} | sed 's/cnvrg-//')" 89 | echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/*/})" 90 | echo "##[set-output name=head;]$(git rev-parse --short HEAD)" 91 | echo "##[set-output name=repo_url;]$(echo $GITHUB_SERVER_URL/$GITHUB_REPOSITORY)" 92 | id: extract_info 93 | 94 | - name: Login to Docker Hub 95 | uses: docker/login-action@v1 96 | if: ${{ steps.tag_bump.outputs.bumped == 'true' }} 97 | with: 98 | username: ${{ secrets.DOCKER_USER}} 99 | password: ${{ secrets.DOCKER_PASSWORD}} 100 | 101 | - name: Set up Docker Buildx 102 | id: buildx 103 | uses: docker/setup-buildx-action@v1 104 | if: ${{ steps.tag_bump.outputs.bumped == 'true' }} 105 | 106 | - name: Build and push main latest 107 | 
id: docker_build_main 108 | uses: docker/build-push-action@v2 109 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/master') }} 110 | with: 111 | context: ./ 112 | file: ./Dockerfile 113 | push: true 114 | tags: cnvrg/metagpu-device-plugin:latest 115 | build-args: | 116 | BUILD_SHA=${{ steps.extract_info.outputs.head }} 117 | BUILD_VERSION=latest 118 | - name: Build and push tagged image 119 | id: docker_build_tag 120 | uses: docker/build-push-action@v2 121 | if: ${{ steps.tag_bump.outputs.bumped == 'true' }} 122 | with: 123 | context: ./ 124 | file: ./Dockerfile 125 | push: true 126 | tags: cnvrg/metagpu-device-plugin:${{ steps.tag_bump.outputs.new_tag }} 127 | build-args: | 128 | BUILD_SHA=${{ steps.extract_info.outputs.head }} 129 | BUILD_VERSION=${{ steps.tag_bump.outputs.new_tag }} 130 | - name: Slack Notification 131 | uses: rtCamp/action-slack-notify@v2 132 | if: ${{ steps.tag_bump.outputs.bumped == 'true' && steps.tag_bump.outputs.prerelease == 'false' }} 133 | env: 134 | SLACK_USERNAME: Github Actions 135 | SLACK_WEBHOOK: ${{ secrets.SLACK_GITHUB_APP_TOKEN }} 136 | SLACK_CHANNEL: "#release-notes-metacloud" 137 | SLACK_ICON: https://avatars.githubusercontent.com/u/44036562?s=48&v=4 138 | SLACK_COLOR: ${{ job.status }} 139 | SLACK_FOOTER: "" 140 | MSG_MINIMAL: true 141 | SLACK_TITLE: "Repo Name" 142 | SLACK_MESSAGE: | 143 | <${{ steps.extract_info.outputs.repo_url }}|${{github.event.repository.name}}> 144 | *Docker Image: cnvrg/${{ steps.extract_info.outputs.repo }}:${{ steps.tag_bump.outputs.new_tag }}* 145 | *Version: ${{ steps.tag_bump.outputs.new_tag }}* 146 | <${{ steps.extract_info.outputs.repo_url }}/releases|Release Notes:> 147 | ${{steps.changelog_final.outputs.changelog}} 148 | -------------------------------------------------------------------------------- /pkg/gpumgr/mgr.go: -------------------------------------------------------------------------------- 1 | package gpumgr 2 | 3 | 
import ( 4 | "context" 5 | "fmt" 6 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/nvmlutils" 7 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/podexec" 8 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/sharecfg" 9 | log "github.com/sirupsen/logrus" 10 | "github.com/spf13/viper" 11 | v1core "k8s.io/api/core/v1" 12 | "os" 13 | "time" 14 | ) 15 | 16 | var MB uint64 = 1024 * 1024 17 | 18 | type GpuMgr struct { 19 | ContainerLevelVisibilityToken string 20 | DeviceLevelVisibilityToken string 21 | GpuDevices []*GpuDevice 22 | // list of gpu containers 23 | gpuContainers []*GpuContainer 24 | // collection of the gpu processes: the anonymouse and active running 25 | gpuContainersCollector []*GpuContainer 26 | } 27 | 28 | type GpuDeviceInfo struct { 29 | Node string 30 | Metadata map[string]string 31 | Devices []*GpuDevice 32 | } 33 | 34 | func (m *GpuMgr) startGpuStatusCache() { 35 | go func() { 36 | for { 37 | time.Sleep(5 * time.Second) 38 | // set gpu devices 39 | m.setGpuDevices() 40 | // set gpu containers 41 | m.discoverGpuContainers() 42 | // set active gpu processes 43 | m.enrichGpuContainer() 44 | // set final gpu containers list 45 | m.setGpuContainers() 46 | } 47 | }() 48 | } 49 | 50 | func (m *GpuMgr) setGpuDevices() { 51 | var gpuDevices []*GpuDevice 52 | for idx, device := range nvmlutils.GetDevices() { 53 | uuid, ret := device.GetUUID() 54 | nvmlutils.ErrorCheck(ret) 55 | deviceMemory, ret := device.GetMemoryInfo() 56 | nvmlutils.ErrorCheck(ret) 57 | utilization, ret := device.GetUtilizationRates() 58 | nvmlutils.ErrorCheck(ret) 59 | gpuDevices = append(gpuDevices, NewGpuDevice(uuid, idx, utilization, deviceMemory)) 60 | } 61 | m.GpuDevices = gpuDevices 62 | } 63 | 64 | func (m *GpuMgr) enrichGpuContainer() { 65 | for _, device := range m.GpuDevices { 66 | for _, nvmlProcessInfo := range nvmlutils.GetComputeRunningProcesses(device.Index) { 67 | stats := 
nvmlutils.GetAccountingStats(device.Index, nvmlProcessInfo.Pid) 68 | gpuProc := NewGpuProcess(nvmlProcessInfo.Pid, stats.GpuUtilization, nvmlProcessInfo.UsedGpuMemory/MB, device.UUID) 69 | for _, c := range m.gpuContainersCollector { 70 | if c.ContainerId == gpuProc.ContainerId { 71 | c.Processes = append(c.Processes, gpuProc) 72 | } 73 | } 74 | } 75 | } 76 | } 77 | 78 | func (m *GpuMgr) setGpuContainers() { 79 | m.gpuContainers = m.gpuContainersCollector 80 | log.Infof("discovered %d gpu containers", len(m.gpuContainers)) 81 | } 82 | 83 | func (m *GpuMgr) GetDeviceInfo() *GpuDeviceInfo { 84 | hostname, err := os.Hostname() 85 | if err != nil { 86 | log.Errorf("failed to detect hostname, err: %s", err) 87 | } 88 | info := make(map[string]string) 89 | cudaVersion := nvmlutils.SystemGetCudaDriverVersion() 90 | info["cudaVersion"] = fmt.Sprintf("%d", cudaVersion) 91 | driver := nvmlutils.SystemGetDriverVersion() 92 | info["driverVersion"] = driver 93 | return &GpuDeviceInfo{Node: hostname, Metadata: info, Devices: m.GpuDevices} 94 | } 95 | 96 | func (m *GpuMgr) discoverGpuContainers() { 97 | c, err := podexec.GetK8sClient() 98 | if err != nil { 99 | log.Error(err) 100 | return 101 | } 102 | pl := &v1core.PodList{} 103 | if err := c.List(context.Background(), pl); err != nil { 104 | log.Error(err) 105 | return 106 | } 107 | // reset gpu containers collector 108 | m.gpuContainersCollector = nil 109 | cfg := sharecfg.NewDeviceSharingConfig() 110 | for _, p := range pl.Items { 111 | for _, container := range p.Spec.Containers { 112 | for _, config := range cfg.Configs { 113 | resourceName := v1core.ResourceName(config.ResourceName) 114 | if quantity, ok := container.Resources.Limits[resourceName]; ok { 115 | 116 | if viper.GetString("nodename") == "" { 117 | m.gpuContainersCollector = append(m.gpuContainersCollector, 118 | NewGpuContainer( 119 | getContainerId(&p, container.Name), 120 | container.Name, 121 | p.Name, 122 | p.Namespace, 123 | config.ResourceName, 124 | 
p.Spec.NodeName, 125 | quantity.Value(), 126 | m.GpuDevices, 127 | ), 128 | ) 129 | continue 130 | } 131 | 132 | if viper.GetString("nodename") == p.Spec.NodeName { 133 | { 134 | m.gpuContainersCollector = append(m.gpuContainersCollector, 135 | NewGpuContainer( 136 | getContainerId(&p, container.Name), 137 | container.Name, 138 | p.Name, 139 | p.Namespace, 140 | config.ResourceName, 141 | p.Spec.NodeName, 142 | quantity.Value(), 143 | m.GpuDevices, 144 | ), 145 | ) 146 | } 147 | } 148 | 149 | } 150 | } 151 | } 152 | } 153 | } 154 | 155 | func (m *GpuMgr) GetProcesses(podId string) []*GpuContainer { 156 | // if podId is set, return single process 157 | if podId != "" { 158 | var gpuContainers []*GpuContainer 159 | for _, deviceProcess := range m.gpuContainers { 160 | if deviceProcess.PodId == podId { 161 | gpuContainers = append(gpuContainers, deviceProcess) 162 | } 163 | } 164 | return gpuContainers 165 | } 166 | // return all processes 167 | return m.gpuContainers 168 | } 169 | 170 | func (m *GpuMgr) GetMetaDevices() map[string]*GpuDevice { 171 | var deviceMap = make(map[string]*GpuDevice) 172 | for _, d := range m.GpuDevices { 173 | deviceMap[d.UUID] = d 174 | } 175 | return deviceMap 176 | } 177 | 178 | func (m *GpuMgr) KillGpuProcess(pid uint32) error { 179 | p := NewGpuProcess(pid, 0, 0, "") 180 | return p.Kill() 181 | } 182 | 183 | func (m *GpuMgr) SetDeviceLevelVisibilityToken(token string) { 184 | m.DeviceLevelVisibilityToken = token 185 | } 186 | 187 | func (m *GpuMgr) SetContainerLevelVisibilityToken(token string) { 188 | m.ContainerLevelVisibilityToken = token 189 | } 190 | 191 | func NewGpuManager() *GpuMgr { 192 | mgr := &GpuMgr{} 193 | // init gpu devices 194 | mgr.setGpuDevices() 195 | // init gpu containers 196 | mgr.discoverGpuContainers() 197 | // init active gpu processes 198 | mgr.enrichGpuContainer() 199 | // set gpu processes 200 | mgr.setGpuContainers() 201 | // start gpu devices and processes cache 202 | mgr.startGpuStatusCache() 203 | // 
start mem enforcer 204 | if viper.GetBool("memoryEnforcer") { 205 | mgr.StartMemoryEnforcer() 206 | } 207 | return mgr 208 | } 209 | -------------------------------------------------------------------------------- /pkg/mgsrv/deviceapi/device/v1/device.go: -------------------------------------------------------------------------------- 1 | package v1 2 | 3 | import ( 4 | "context" 5 | pb "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 6 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/gpumgr" 7 | log "github.com/sirupsen/logrus" 8 | "google.golang.org/grpc/codes" 9 | "google.golang.org/grpc/status" 10 | "time" 11 | ) 12 | 13 | type DeviceService struct { 14 | pb.UnimplementedDeviceServiceServer 15 | gpuMgr *gpumgr.GpuMgr 16 | vl string // visibility level 17 | cvl string // container visibility level ID 18 | dvl string // device visibility level ID 19 | } 20 | 21 | func (s *DeviceService) LoadContext(ctx context.Context) error { 22 | 23 | s.gpuMgr = ctx.Value("gpuMgr").(*gpumgr.GpuMgr) 24 | if s.gpuMgr == nil { 25 | log.Fatalf("gpuMgr instance not set in context") 26 | } 27 | s.vl = ctx.Value("visibilityLevel").(string) 28 | s.cvl = ctx.Value("containerVl").(string) 29 | s.dvl = ctx.Value("deviceVl").(string) 30 | // stop execution if visibility level is empty 31 | if s.vl == "" { 32 | return status.Errorf(codes.Aborted, "can't detect visibility level for request: %s", s.vl) 33 | } 34 | // stop executing if container or device visibility level is empty 35 | if s.cvl == "" || s.dvl == "" { 36 | return status.Error(codes.Aborted, "can't detect visibility levels") 37 | } 38 | return nil 39 | } 40 | 41 | func (s *DeviceService) GetGpuContainers(ctx context.Context, r *pb.GetGpuContainersRequest) (*pb.GetGpuContainersResponse, error) { 42 | 43 | if err := s.LoadContext(ctx); err != nil { 44 | return &pb.GetGpuContainersResponse{}, err 45 | } 46 | response := &pb.GetGpuContainersResponse{VisibilityLevel: 
s.vl} 47 | // stop execution if visibility level is container and pod id is not set (not enough permissions) 48 | if s.vl == s.cvl && r.PodId == "" { 49 | return response, status.Errorf(codes.PermissionDenied, "missing pod id and visibility level is to low (%s), can't proceed", s.vl) 50 | } 51 | if s.vl == s.dvl { 52 | r.PodId = "" // for deviceVisibilityLevel server should return all running process on all containers 53 | } 54 | response.GpuContainers = listDeviceProcesses(r.PodId, s.gpuMgr) 55 | return response, nil 56 | } 57 | 58 | func (s *DeviceService) StreamGpuContainers(r *pb.StreamGpuContainersRequest, stream pb.DeviceService_StreamGpuContainersServer) error { 59 | 60 | for { 61 | 62 | if err := s.LoadContext(stream.Context()); err != nil { 63 | return err 64 | } 65 | // stop execution if visibility level is container and pod id is not set (not enough permissions) 66 | if s.vl == s.cvl && r.PodId == "" { 67 | return status.Errorf(codes.PermissionDenied, "missing pod id and visibility level is to low (%s), can't proceed", s.vl) 68 | } 69 | if s.vl == s.dvl { 70 | r.PodId = "" // for deviceVisibilityLevel server should return all running process on all containers 71 | } 72 | response := &pb.StreamGpuContainersResponse{VisibilityLevel: s.vl} 73 | response.GpuContainers = listDeviceProcesses(r.PodId, s.gpuMgr) 74 | if err := stream.Send(response); err != nil { 75 | return err 76 | } 77 | 78 | time.Sleep(1 * time.Second) 79 | } 80 | 81 | } 82 | 83 | func (s *DeviceService) GetDevices(ctx context.Context, r *pb.GetDevicesRequest) (*pb.GetDevicesResponse, error) { 84 | response := &pb.GetDevicesResponse{} 85 | if err := s.LoadContext(ctx); err != nil { 86 | return response, err 87 | } 88 | response.Device = make(map[string]*pb.Device) 89 | for _, device := range s.gpuMgr.GetMetaDevices() { 90 | d := &pb.Device{ 91 | Uuid: device.UUID, 92 | Index: uint32(device.Index), 93 | Shares: uint32(device.Shares), 94 | GpuUtilization: device.Utilization.Gpu, 95 | 
MemoryUtilization: device.Utilization.Memory, 96 | MemoryShareSize: device.Memory.ShareSize, 97 | ResourceName: device.ResourceName, 98 | NodeName: device.Nodename, 99 | } 100 | if s.vl == s.dvl { 101 | d.MemoryTotal = device.Memory.Total 102 | d.MemoryFree = device.Memory.Free 103 | d.MemoryUsed = device.Memory.Used 104 | } 105 | response.Device[d.Uuid] = d 106 | } 107 | return response, nil 108 | } 109 | 110 | func (s *DeviceService) KillGpuProcess(ctx context.Context, r *pb.KillGpuProcessRequest) (*pb.KillGpuProcessResponse, error) { 111 | response := &pb.KillGpuProcessResponse{} 112 | if err := s.LoadContext(ctx); err != nil { 113 | return response, err 114 | } 115 | if err := s.gpuMgr.KillGpuProcess(r.Pid); err != nil { 116 | return response, status.Errorf(codes.Internal, "error killing GPU process, err: %s", err) 117 | } 118 | return response, nil 119 | } 120 | 121 | func (s *DeviceService) GetMetaDeviceInfo(ctx context.Context, r *pb.GetMetaDeviceInfoRequest) (*pb.GetMetaDeviceInfoResponse, error) { 122 | resp := &pb.GetMetaDeviceInfoResponse{} 123 | if err := s.LoadContext(ctx); err != nil { 124 | return resp, err 125 | } 126 | if s.vl != s.dvl { 127 | return resp, status.Errorf(codes.PermissionDenied, "wrong visibility level: %s", s.vl) 128 | } 129 | deviceInfo := s.gpuMgr.GetDeviceInfo() 130 | resp.Node = deviceInfo.Node 131 | resp.Metadata = deviceInfo.Metadata 132 | for _, device := range deviceInfo.Devices { 133 | resp.Devices = append(resp.Devices, &pb.Device{ 134 | Uuid: device.UUID, 135 | Index: uint32(device.Index), 136 | Shares: uint32(device.Shares), 137 | GpuUtilization: device.Utilization.Gpu, 138 | MemoryUtilization: device.Utilization.Memory, 139 | MemoryShareSize: device.Memory.ShareSize, 140 | MemoryTotal: device.Memory.Total, 141 | MemoryFree: device.Memory.Free, 142 | MemoryUsed: device.Memory.Used, 143 | ResourceName: device.ResourceName, 144 | NodeName: device.Nodename, 145 | }) 146 | } 147 | return resp, nil 148 | } 149 | 150 | func (s 
*DeviceService) PatchConfigs(ctx context.Context, r *pb.PatchConfigsRequest) (*pb.PatchConfigsResponse, error) { 151 | //if err := s.LoadContext(ctx); err != nil { 152 | // return &pb.PatchConfigsResponse{}, err 153 | //} 154 | //if s.vl != s.dvl { 155 | // return &pb.PatchConfigsResponse{}, status.Errorf(codes.PermissionDenied, "visibility level too high", s.vl) 156 | //} 157 | //deviceplugin.UpdatePersistentConfigs(r.MetagpusPerGpu) 158 | //viper.Set("metaGpus", r.MetagpusPerGpu) 159 | //s.gpuMgr.MetaGpuRecalculation <- true 160 | return &pb.PatchConfigsResponse{}, nil 161 | 162 | } 163 | 164 | func (s *DeviceService) PingServer(ctx context.Context, r *pb.PingServerRequest) (*pb.PingServerResponse, error) { 165 | return &pb.PingServerResponse{}, nil 166 | } 167 | -------------------------------------------------------------------------------- /pkg/allocator/allocator_test.go: -------------------------------------------------------------------------------- 1 | package allocator 2 | 3 | import ( 4 | "fmt" 5 | . "github.com/onsi/ginkgo" 6 | . 
"github.com/onsi/gomega" 7 | "testing" 8 | ) 9 | 10 | func TestAllocator(t *testing.T) { 11 | RegisterFailHandler(Fail) 12 | RunSpecs(t, "Allocation Suite") 13 | } 14 | 15 | var _ = Describe("Metagpu allocations", func() { 16 | 17 | Context("allocate", func() { 18 | 19 | It("10% gpu", func() { 20 | physDevs := 2 21 | allocationSize := 1 22 | sharesPerGpu := 10 23 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 24 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 25 | Expect(len(alloc.MetagpusAllocations)).To(Equal(1)) 26 | expectedDevices := []string{"cnvrg-meta-0-0-test-device-0"} 27 | Expect(alloc.MetagpusAllocations).To(Equal(expectedDevices)) 28 | }) 29 | 30 | It("50% gpu", func() { 31 | physDevs := 2 32 | sharesPerGpu := 10 33 | allocationSize := 5 34 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 35 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 36 | Expect(len(alloc.MetagpusAllocations)).To(Equal(5)) 37 | Expect(alloc.MetagpusAllocations).To(Equal(getTestDevicesIds(1, 5))) 38 | }) 39 | 40 | It("80% gpu", func() { 41 | physDevs := 2 42 | sharesPerGpu := 10 43 | allocationSize := 8 44 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 45 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 46 | Expect(len(alloc.MetagpusAllocations)).To(Equal(8)) 47 | Expect(alloc.MetagpusAllocations).To(Equal(getTestDevicesIds(1, 8))) 48 | }) 49 | 50 | It("100% gpu", func() { 51 | physDevs := 2 52 | allocationSize := 10 53 | sharesPerGpu := 10 54 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 55 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 56 | Expect(len(alloc.MetagpusAllocations)).To(Equal(10)) 57 | Expect(alloc.MetagpusAllocations).To(Equal(getTestDevicesIds(1, 10))) 58 | }) 59 | 60 | It("110% gpu", func() { 61 | physDevs := 2 62 | allocationSize := 12 63 | sharesPerGpu := 10 64 | testDevices 
:= getTestDevicesIds(physDevs, sharesPerGpu) 65 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 66 | Expect(len(alloc.MetagpusAllocations)).To(Equal(12)) 67 | expectedIds := getTestDevicesIds(1, 10) 68 | expectedIds = append(expectedIds, "cnvrg-meta-1-0-test-device-1") 69 | expectedIds = append(expectedIds, "cnvrg-meta-1-1-test-device-1") 70 | Expect(alloc.MetagpusAllocations).To(Equal(expectedIds)) 71 | }) 72 | 73 | It("200% gpu", func() { 74 | physDevs := 2 75 | allocationSize := 20 76 | sharesPerGpu := 10 77 | testDevices := getTestDevicesIds(physDevs, sharesPerGpu) 78 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, testDevices) 79 | Expect(len(alloc.MetagpusAllocations)).To(Equal(20)) 80 | Expect(alloc.MetagpusAllocations).To(Equal(getTestDevicesIds(2, 10))) 81 | }) 82 | 83 | It("single GPU -> 50% after 50% has been taken", func() { 84 | devices := []string{ 85 | //"cnvrg-meta-0-0-test-device-0", -> already allocated by previous request 86 | //"cnvrg-meta-0-1-test-device-0", -> already allocated by previous request 87 | "cnvrg-meta-0-2-test-device-0", // -> this should be allocated now 88 | "cnvrg-meta-0-3-test-device-0", // -> this should be allocated now 89 | "cnvrg-meta-1-0-test-device-1", 90 | "cnvrg-meta-1-1-test-device-1", 91 | "cnvrg-meta-1-2-test-device-1", 92 | "cnvrg-meta-1-3-test-device-1", 93 | } 94 | physDevs := 2 95 | allocationSize := 2 96 | sharesPerGpu := 4 97 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, devices) 98 | Expect(len(alloc.MetagpusAllocations)).To(Equal(2)) 99 | Expect(alloc.MetagpusAllocations).To(Equal([]string{"cnvrg-meta-0-2-test-device-0", "cnvrg-meta-0-3-test-device-0"})) 100 | }) 101 | 102 | It("single GPU -> 60% after 50% has been taken (jump to next device)", func() { 103 | devices := []string{ 104 | //"cnvrg-meta-0-0-test-device-0", -> already allocated by previous request 105 | //"cnvrg-meta-0-1-test-device-0", -> already allocated by 
previous request 106 | "cnvrg-meta-0-2-test-device-0", 107 | "cnvrg-meta-0-3-test-device-0", 108 | "cnvrg-meta-1-0-test-device-1", // -> this should be allocated now 109 | "cnvrg-meta-1-1-test-device-1", // -> this should be allocated now 110 | "cnvrg-meta-1-2-test-device-1", // -> this should be allocated now 111 | "cnvrg-meta-1-3-test-device-1", 112 | } 113 | physDevs := 2 114 | allocationSize := 3 115 | sharesPerGpu := 4 116 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, devices) 117 | Expect(len(alloc.MetagpusAllocations)).To(Equal(3)) 118 | Expect(alloc.MetagpusAllocations).To(Equal([]string{ 119 | "cnvrg-meta-1-0-test-device-1", 120 | "cnvrg-meta-1-1-test-device-1", 121 | "cnvrg-meta-1-2-test-device-1", 122 | })) 123 | }) 124 | 125 | It("allocate fractions from 2 different physical gpus", func() { 126 | devices := []string{ 127 | //"cnvrg-meta-0-0-test-device-0", -> already allocated by previous request 128 | //"cnvrg-meta-0-1-test-device-0", -> already allocated by previous request 129 | "cnvrg-meta-0-2-test-device-0", // -> this should be allocated now 130 | "cnvrg-meta-0-3-test-device-0", // -> this should be allocated now 131 | //"cnvrg-meta-1-0-test-device-1", -> already allocated by previous request 132 | //"cnvrg-meta-1-1-test-device-1", -> already allocated by previous request 133 | "cnvrg-meta-1-2-test-device-1", // -> this should be allocated now 134 | "cnvrg-meta-1-3-test-device-1", 135 | } 136 | physDevs := 2 137 | allocationSize := 3 138 | sharesPerGpu := 4 139 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, devices) 140 | Expect(len(alloc.MetagpusAllocations)).To(Equal(3)) 141 | Expect(alloc.MetagpusAllocations).To(Equal([]string{ 142 | "cnvrg-meta-0-2-test-device-0", 143 | "cnvrg-meta-0-3-test-device-0", 144 | "cnvrg-meta-1-2-test-device-1", 145 | })) 146 | }) 147 | It("allocate full from 2 different physical gpus", func() { 148 | devices := []string{ 149 | //"cnvrg-meta-0-0-test-device-0", -> 
already allocated by previous request 150 | //"cnvrg-meta-0-1-test-device-0", -> already allocated by previous request 151 | "cnvrg-meta-0-2-test-device-0", // -> this should be allocated now 152 | "cnvrg-meta-0-3-test-device-0", // -> this should be allocated now 153 | //"cnvrg-meta-1-0-test-device-1", -> already allocated by previous request 154 | //"cnvrg-meta-1-1-test-device-1", -> already allocated by previous request 155 | "cnvrg-meta-1-2-test-device-1", // -> this should be allocated now 156 | "cnvrg-meta-1-3-test-device-1", // -> this should be allocated now 157 | } 158 | physDevs := 2 159 | allocationSize := 4 160 | sharesPerGpu := 4 161 | alloc := NewDeviceAllocation(physDevs, allocationSize, sharesPerGpu, devices) 162 | Expect(len(alloc.MetagpusAllocations)).To(Equal(4)) 163 | Expect(alloc.MetagpusAllocations).To(Equal([]string{ 164 | "cnvrg-meta-0-2-test-device-0", 165 | "cnvrg-meta-0-3-test-device-0", 166 | "cnvrg-meta-1-2-test-device-1", 167 | "cnvrg-meta-1-3-test-device-1", 168 | })) 169 | }) 170 | }) 171 | }) 172 | 173 | func getTestDevicesIds(physicalDevices, sharesPerGpu int) (metagpus []string) { 174 | for i := 0; i < physicalDevices; i++ { 175 | for j := 0; j < sharesPerGpu; j++ { 176 | metagpus = append(metagpus, fmt.Sprintf("cnvrg-meta-%d-%d-test-device-%d", i, j, i)) 177 | } 178 | } 179 | return 180 | } 181 | -------------------------------------------------------------------------------- /cmd/mgctl/get.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | pbdevice "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 6 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/ctlutils" 7 | "github.com/atomicgo/cursor" 8 | "github.com/jedib0t/go-pretty/v6/table" 9 | log "github.com/sirupsen/logrus" 10 | "github.com/spf13/cobra" 11 | "github.com/spf13/viper" 12 | "io" 13 | "os" 14 | "os/signal" 15 | "syscall" 16 
| "time" 17 | ) 18 | 19 | var getCmd = &cobra.Command{ 20 | Use: "get", 21 | Aliases: []string{"g"}, 22 | Short: "get resources", 23 | } 24 | 25 | var processGetParams = []param{ 26 | {name: "watch", shorthand: "w", value: false, usage: "watch for the changes"}, 27 | } 28 | 29 | var processesGetCmd = &cobra.Command{ 30 | Use: "processes", 31 | Aliases: []string{"p", "process"}, 32 | Short: "list gpu processes and processes metadata", 33 | Run: func(cmd *cobra.Command, args []string) { 34 | getDevicesProcesses() 35 | }, 36 | } 37 | 38 | var getDevicesCmd = &cobra.Command{ 39 | Use: "devices", 40 | Aliases: []string{"d", "device"}, 41 | Short: "get gpu devices", 42 | Run: func(cmd *cobra.Command, args []string) { 43 | getDevices() 44 | }, 45 | } 46 | 47 | func getDevices() { 48 | conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 49 | if conn == nil { 50 | log.Fatalf("can't initiate connection to metagpu server") 51 | } 52 | defer conn.Close() 53 | device := pbdevice.NewDeviceServiceClient(conn) 54 | resp, err := device.GetMetaDeviceInfo(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetMetaDeviceInfoRequest{}) 55 | if err != nil { 56 | log.Fatal(err) 57 | } 58 | to := &TableOutput{} 59 | to.header = table.Row{"Idx", "UUID", "Memory", "Shares", "Share size"} 60 | to.body, to.footer = buildDeviceInfoTableBody(resp.Devices) 61 | to.buildTable() 62 | to.print() 63 | 64 | } 65 | 66 | func getDevicesProcesses() { 67 | 68 | conn := ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("addr")) 69 | if conn == nil { 70 | log.Fatalf("can't initiate connection to metagpu server") 71 | } 72 | defer conn.Close() 73 | device := pbdevice.NewDeviceServiceClient(conn) 74 | hostname, err := os.Hostname() 75 | if err != nil { 76 | log.Errorf("faild to detect podId, err: %s", err) 77 | } 78 | 79 | to := &TableOutput{} 80 | to.header = table.Row{"Pod", "NS", "Device", "Node", "GPU", "Memory", "Pid", "Cmd", "Req"} 81 | 82 | if viper.GetBool("watch") 
{ 83 | request := &pbdevice.StreamGpuContainersRequest{PodId: hostname} 84 | stream, err := device.StreamGpuContainers(ctlutils.AuthenticatedContext(viper.GetString("token")), request) 85 | if err != nil { 86 | log.Fatal(err) 87 | } 88 | 89 | refreshCh := make(chan bool) 90 | sigCh := make(chan os.Signal, 1) 91 | signal.Notify(sigCh, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) 92 | 93 | go func() { 94 | for { 95 | time.Sleep(1 * time.Second) 96 | refreshCh <- true 97 | } 98 | }() 99 | 100 | for { 101 | select { 102 | case <-sigCh: 103 | cursor.ClearLine() 104 | log.Info("shutting down") 105 | os.Exit(0) 106 | case <-refreshCh: 107 | processResp, err := stream.Recv() 108 | if err == io.EOF { 109 | break 110 | } 111 | if err != nil { 112 | log.Fatalf("error watching gpu processes, err: %s", err) 113 | } 114 | deviceResp, err := device.GetDevices(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetDevicesRequest{}) 115 | if err != nil { 116 | log.Errorf("falid to list devices, err: %s ", err) 117 | return 118 | } 119 | to.body = buildDeviceProcessesTableBody(processResp.GpuContainers) 120 | to.footer = buildDeviceProcessesTableFooter(processResp.GpuContainers, deviceResp.Device, processResp.VisibilityLevel) 121 | to.buildTable() 122 | to.print() 123 | } 124 | } 125 | } else { 126 | processResp, err := device.GetGpuContainers(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetGpuContainersRequest{PodId: hostname}) 127 | if err != nil { 128 | log.Errorf("falid to list device processes, err: %s ", err) 129 | return 130 | } 131 | deviceResp, err := device.GetDevices(ctlutils.AuthenticatedContext(viper.GetString("token")), &pbdevice.GetDevicesRequest{}) 132 | if err != nil { 133 | log.Errorf("falid to list devices, err: %s ", err) 134 | return 135 | } 136 | to.body = buildDeviceProcessesTableBody(processResp.GpuContainers) 137 | to.footer = buildDeviceProcessesTableFooter(processResp.GpuContainers, 
deviceResp.Device, processResp.VisibilityLevel) 138 | to.buildTable() 139 | to.print() 140 | } 141 | } 142 | 143 | func buildDeviceInfoTableBody(devices []*pbdevice.Device) (body []table.Row, footer table.Row) { 144 | var totMem uint64 145 | var shares uint32 146 | for _, d := range devices { 147 | shares = d.Shares 148 | totMem += d.MemoryTotal 149 | body = append(body, table.Row{ 150 | d.Index, 151 | d.Uuid, 152 | d.MemoryTotal, 153 | d.Shares, 154 | d.MemoryShareSize, 155 | }) 156 | } 157 | footer = table.Row{len(devices), "", fmt.Sprintf("%dMB", totMem), uint32(len(devices)) * shares, ""} 158 | return body, footer 159 | } 160 | 161 | func buildDeviceProcessesTableBody(containers []*pbdevice.GpuContainer) (body []table.Row) { 162 | 163 | for _, c := range containers { 164 | if len(c.ContainerDevices) > 0 { 165 | maxMem := int64(c.ContainerDevices[0].Device.MemoryShareSize * uint64(c.MetagpuRequests)) 166 | if len(c.DeviceProcesses) > 0 { 167 | for _, p := range c.DeviceProcesses { 168 | relativeGpuUsage := (p.GpuUtilization * 100) / (100 / c.ContainerDevices[0].Device.Shares * uint32(c.MetagpuRequests)) 169 | gpuUsage := fmt.Sprintf("\u001B[32m%d%%\u001B[0m", relativeGpuUsage) 170 | if relativeGpuUsage > 100 { 171 | gpuUsage = fmt.Sprintf("\u001B[31m%d%%\u001B[0m", relativeGpuUsage) 172 | } 173 | memUsage := fmt.Sprintf("\u001B[32m%d\u001B[0m/%d", p.Memory, maxMem) 174 | if int64(p.Memory) > maxMem { 175 | memUsage = fmt.Sprintf("\u001B[31m%d\u001B[0m/%d", p.Memory, maxMem) 176 | } 177 | body = append(body, table.Row{ 178 | c.PodId, 179 | c.PodNamespace, 180 | formatContainerDeviceIndexes(c), 181 | c.NodeName, 182 | gpuUsage, 183 | memUsage, 184 | p.Pid, 185 | p.Cmdline, 186 | c.MetagpuRequests, 187 | }) 188 | } 189 | } else { 190 | memUsage := fmt.Sprintf("\u001B[32m%d\u001B[0m/%d", 0, maxMem) 191 | body = append(body, table.Row{ 192 | c.PodId, 193 | c.PodNamespace, 194 | formatContainerDeviceIndexes(c), 195 | c.NodeName, 196 | "-", 197 | memUsage, 198 | "-", 
199 | "-", 200 | c.MetagpuRequests, 201 | }) 202 | } 203 | } else { 204 | body = append(body, table.Row{ 205 | c.PodId, 206 | c.PodNamespace, 207 | formatContainerDeviceIndexes(c), 208 | c.NodeName, 209 | "-", 210 | "-", 211 | "-", 212 | "-", 213 | c.MetagpuRequests, 214 | }) 215 | } 216 | 217 | } 218 | 219 | return 220 | } 221 | 222 | func buildDeviceProcessesTableFooter(containers []*pbdevice.GpuContainer, devices map[string]*pbdevice.Device, vl string) (footer table.Row) { 223 | metaGpuSummary := fmt.Sprintf("%d", getTotalRequests(containers)) 224 | // TODO: fix this, the vl should be taken from directly form the package 225 | // to problem is that package now includes the nvidia linux native stuff 226 | // and some package re-org is required 227 | //if vl == "l0" { // TODO: temporary disabled 228 | metaGpuSummary = fmt.Sprintf("%d/%d", getTotalShares(devices), getTotalRequests(containers)) 229 | //} 230 | usedMem := fmt.Sprintf("%dMb", getTotalMemoryUsedByProcesses(containers)) 231 | return table.Row{len(containers), "", "", "", "", usedMem, "", "", metaGpuSummary} 232 | } 233 | -------------------------------------------------------------------------------- /cmd/mgex/exporter.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | pbdevice "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/gen/proto/go/device/v1" 7 | "github.com/AccessibleAI/cnvrg-fractional-accelerator-device-plugin/pkg/ctlutils" 8 | "github.com/prometheus/client_golang/prometheus" 9 | "github.com/prometheus/client_golang/prometheus/promhttp" 10 | log "github.com/sirupsen/logrus" 11 | "github.com/spf13/viper" 12 | "google.golang.org/grpc" 13 | "net" 14 | "net/http" 15 | "time" 16 | ) 17 | 18 | var ( 19 | conn *grpc.ClientConn 20 | devicesCache map[string]*pbdevice.Device 21 | 22 | deviceShares = prometheus.NewGaugeVec(prometheus.GaugeOpts{ 23 | Namespace: "metagpu", 24 | Subsystem: 
"device",
		Name:      "shares",
		Help:      "total shares for single gpu unit",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceMemTotal = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "device",
		Name:      "memory_total",
		Help:      "total memory per device",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceMemFree = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "device",
		Name:      "memory_free",
		Help:      "free memory per device",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceMemUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "device",
		Name:      "memory_used",
		Help:      "used memory per device",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceMemShareSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "device",
		Name:      "memory_share_size",
		Help:      "metagpu memory share size",
	}, []string{"device_uuid", "device_index", "resource_name", "node_name"})

	deviceProcessAbsoluteGpuUtilization = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "absolute_gpu_utilization",
		Help:      "gpu process utilization in percentage",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMemoryUsage = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "memory_usage",
		Help:      "process gpu-memory usage",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMetagpuRequests = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "metagpu_requests",
		Help:      "total metagpu requests in deployment spec",
	}, []string{"pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMaxAllowedMetagpuGPUUtilization = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "max_allowed_metagpu_gpu_utilization",
		Help:      "max allowed metagpu gpu utilization",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMetagpuRelativeGPUUtilization = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "metagpu_relative_gpu_utilization",
		Help:      "relative to metagpu request gpu utilization",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMaxAllowedMetaGpuMemory = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "max_allowed_metagpu_memory",
		Help:      "max allowed metagpu memory usage",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})

	deviceProcessMetagpuRelativeMemoryUtilization = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "metagpu",
		Subsystem: "process",
		Name:      "metagpu_relative_memory_utilization",
		Help:      "relative to metagpus request memory utilization",
	}, []string{"uuid", "pid", "cmdline", "user", "pod_name", "pod_namespace", "resource_name", "node_name"})
)

// getGpuContainers fetches the gpu containers from the metagpu server.
func getGpuContainers() []*pbdevice.GpuContainer {
	devices := pbdevice.NewDeviceServiceClient(conn)
	req := &pbdevice.GetGpuContainersRequest{}
	ctx := ctlutils.AuthenticatedContext(viper.GetString("token"))
	resp, err := devices.GetGpuContainers(ctx, req)
	if err != nil {
		log.Error(err)
		return nil
	}
	return resp.GpuContainers
}

// getGpuDevicesInfo fetches full device details from the metagpu server.
func getGpuDevicesInfo() []*pbdevice.Device {
	devices := pbdevice.NewDeviceServiceClient(conn)
	req := &pbdevice.GetMetaDeviceInfoRequest{}
	ctx := ctlutils.AuthenticatedContext(viper.GetString("token"))
	resp, err := devices.GetMetaDeviceInfo(ctx, req)
	if err != nil {
		log.Error(err)
		return nil
	}
	return resp.Devices
}

// setGpuDevicesCache lazily fetches the device map and caches it until
// clearGpuDevicesCache is called.
func setGpuDevicesCache() map[string]*pbdevice.Device {
	if devicesCache != nil {
		return devicesCache
	}
	devices := pbdevice.NewDeviceServiceClient(conn)
	req := &pbdevice.GetDevicesRequest{}
	ctx := ctlutils.AuthenticatedContext(viper.GetString("token"))
	resp, err := devices.GetDevices(ctx, req)
	if err != nil {
		log.Error(err)
		// bug fix: the cache used to be pre-populated with an empty non-nil
		// map before the call, so an RPC error permanently poisoned it and
		// every later call returned the empty cache without retrying.
		return nil
	}
	devicesCache = resp.Device
	return devicesCache
}

// clearGpuDevicesCache drops the cached device map so the next call refetches.
func clearGpuDevicesCache() {
	devicesCache = nil
}

// setDevicesMetrics publishes per-device gauges.
func setDevicesMetrics() {
	// GPU device metrics
	for _, d := range getGpuDevicesInfo() {
		labels := []string{d.Uuid, fmt.Sprintf("%d", d.Index), d.ResourceName, d.NodeName}
		deviceShares.WithLabelValues(labels...).Set(float64(d.Shares))
		deviceMemTotal.WithLabelValues(labels...).Set(float64(d.MemoryTotal))
		deviceMemFree.WithLabelValues(labels...).Set(float64(d.MemoryFree))
		deviceMemUsed.WithLabelValues(labels...).Set(float64(d.MemoryUsed))
		deviceMemShareSize.WithLabelValues(labels...).Set(float64(d.MemoryShareSize))
	}
}

// resetProcessLevelMetrics clears every process-level gauge so stale label
// sets from exited processes are not reported again.
func resetProcessLevelMetrics() {
	deviceProcessAbsoluteGpuUtilization.Reset()
	deviceProcessMemoryUsage.Reset()
	deviceProcessMetagpuRequests.Reset()
	deviceProcessMaxAllowedMetagpuGPUUtilization.Reset()
	deviceProcessMetagpuRelativeGPUUtilization.Reset()
	deviceProcessMaxAllowedMetaGpuMemory.Reset()
	deviceProcessMetagpuRelativeMemoryUtilization.Reset()
}

// setProcessesMetrics publishes per-container and per-process gauges.
func setProcessesMetrics() {
	// reset metrics
	resetProcessLevelMetrics()
	// GPU processes metrics
	for _, c := range getGpuContainers() {
		// metagpu requests
		deviceProcessMetagpuRequests.WithLabelValues(
			c.PodId, c.PodNamespace, c.ResourceName, c.NodeName).Set(float64(c.MetagpuRequests))
		// if pod has processes expose process metrics
		if len(c.DeviceProcesses) > 0 {
			for _, p := range c.DeviceProcesses {
				// set labels for device process level metrics
				labels := []string{
					p.Uuid, fmt.Sprintf("%d", p.Pid), p.Cmdline, p.User, c.PodId, c.PodNamespace, c.ResourceName, c.NodeName}
				// absolute memory and gpu usage
				deviceProcessAbsoluteGpuUtilization.WithLabelValues(labels...).Set(float64(p.GpuUtilization))
				deviceProcessMemoryUsage.WithLabelValues(labels...).Set(float64(p.Memory))
				// max (relative to metagpus request) allowed gpu and memory utilization
				deviceProcessMaxAllowedMetagpuGPUUtilization.WithLabelValues(labels...).Set(getMaxAllowedMetagpuGPUUtilization(c))
				deviceProcessMaxAllowedMetaGpuMemory.WithLabelValues(labels...).Set(getMaxAllowedMetaGpuMemory(c))
				// relative gpu and memory utilization
				deviceProcessMetagpuRelativeGPUUtilization.WithLabelValues(labels...).Set(getRelativeGPUUtilization(c, p))
				deviceProcessMetagpuRelativeMemoryUtilization.WithLabelValues(labels...).Set(getRelativeMemoryUtilization(c, p))
			}
		} else { // pod doesn't have any processes, all the metrics should be set to 0
			labels := []string{
				"-", "-", "-", "-", c.PodId, c.PodNamespace, c.ResourceName, c.NodeName}
			// absolute memory and gpu usage
			deviceProcessAbsoluteGpuUtilization.WithLabelValues(labels...).Set(0)
			deviceProcessMemoryUsage.WithLabelValues(labels...).Set(0)
			// max (relative to metagpus
request) allowed gpu and memory utilization 205 | deviceProcessMaxAllowedMetagpuGPUUtilization.WithLabelValues(labels...).Set(0) 206 | deviceProcessMaxAllowedMetaGpuMemory.WithLabelValues(labels...).Set(0) 207 | // relative gpu and memory utilization 208 | deviceProcessMetagpuRelativeGPUUtilization.WithLabelValues(labels...).Set(0) 209 | deviceProcessMetagpuRelativeMemoryUtilization.WithLabelValues(labels...).Set(0) 210 | } 211 | } 212 | } 213 | 214 | func getMaxAllowedMetagpuGPUUtilization(c *pbdevice.GpuContainer) float64 { 215 | l := log.WithField("pod", c.PodId) 216 | d, err := getFirstContainerDevice(c) 217 | if err != nil { 218 | l.Error(err) 219 | return 0 220 | } 221 | return float64((100 / d.Device.Shares) * uint32(c.MetagpuRequests)) 222 | } 223 | 224 | func getMaxAllowedMetaGpuMemory(c *pbdevice.GpuContainer) float64 { 225 | l := log.WithField("pod", c.PodId) 226 | d, err := getFirstContainerDevice(c) 227 | if err != nil { 228 | l.Error(err) 229 | return 0 230 | } 231 | return float64(uint64(c.MetagpuRequests) * d.Device.MemoryShareSize) 232 | 233 | } 234 | 235 | func getFirstContainerDevice(c *pbdevice.GpuContainer) (*pbdevice.ContainerDevice, error) { 236 | if len(c.ContainerDevices) == 0 { 237 | return nil, errors.New("no allocated gpus found") 238 | } 239 | return c.ContainerDevices[0], nil 240 | } 241 | 242 | func getRelativeGPUUtilization(c *pbdevice.GpuContainer, p *pbdevice.DeviceProcess) float64 { 243 | l := log.WithField("pod", c.PodId) 244 | d, err := getFirstContainerDevice(c) 245 | if err != nil { 246 | l.Error(err) 247 | return 0 248 | } 249 | maxMetaGpuUtilization := (100 / d.Device.Shares) * uint32(c.MetagpuRequests) 250 | metaGpuUtilization := 0 251 | if p.GpuUtilization > 0 && maxMetaGpuUtilization > 0 { 252 | metaGpuUtilization = int((p.GpuUtilization * 100) / maxMetaGpuUtilization) 253 | } 254 | return float64(metaGpuUtilization) 255 | } 256 | 257 | func getRelativeMemoryUtilization(c *pbdevice.GpuContainer, p *pbdevice.DeviceProcess) 
float64 { 258 | l := log.WithField("pod", c.PodId) 259 | d, err := getFirstContainerDevice(c) 260 | if err != nil { 261 | l.Error(err) 262 | return 0 263 | } 264 | maxMetaMemory := int(uint64(c.MetagpuRequests) * d.Device.MemoryShareSize) 265 | metaMemUtilization := 0 266 | if maxMetaMemory > 0 { 267 | metaMemUtilization = (int(p.Memory) * 100) / maxMetaMemory 268 | } 269 | return float64(metaMemUtilization) 270 | } 271 | 272 | func recordMetrics() { 273 | go func() { 274 | for { 275 | conn = ctlutils.GetGrpcMetaGpuSrvClientConn(viper.GetString("mgsrv")) 276 | if conn == nil { 277 | log.Fatal("connection is nil, can't continue") 278 | continue 279 | } 280 | // load devices cache 281 | setGpuDevicesCache() 282 | // set devices level metrics 283 | setDevicesMetrics() 284 | // set processes level metrics 285 | setProcessesMetrics() 286 | // close grcp connections 287 | conn.Close() 288 | // clear the cache 289 | clearGpuDevicesCache() 290 | time.Sleep(15 * time.Second) 291 | } 292 | }() 293 | } 294 | 295 | func startExporter() { 296 | 297 | log.Info("starting metagpu metrics exporter") 298 | prometheus.MustRegister(deviceShares) 299 | prometheus.MustRegister(deviceMemTotal) 300 | prometheus.MustRegister(deviceMemFree) 301 | prometheus.MustRegister(deviceMemUsed) 302 | prometheus.MustRegister(deviceMemShareSize) 303 | prometheus.MustRegister(deviceProcessAbsoluteGpuUtilization) 304 | prometheus.MustRegister(deviceProcessMemoryUsage) 305 | prometheus.MustRegister(deviceProcessMetagpuRequests) 306 | prometheus.MustRegister(deviceProcessMaxAllowedMetagpuGPUUtilization) 307 | prometheus.MustRegister(deviceProcessMetagpuRelativeGPUUtilization) 308 | prometheus.MustRegister(deviceProcessMaxAllowedMetaGpuMemory) 309 | prometheus.MustRegister(deviceProcessMetagpuRelativeMemoryUtilization) 310 | recordMetrics() 311 | addr := viper.GetString("metrics-addr") 312 | http.Handle("/metrics", promhttp.Handler()) 313 | l, err := net.Listen("tcp", addr) 314 | if err != nil { 315 | 
log.Error(err) 316 | return 317 | } 318 | log.Infof("metrics serving on http://%s/metrics", addr) 319 | if err := http.Serve(l, nil); err != nil { 320 | log.Error(err) 321 | return 322 | } 323 | } 324 | -------------------------------------------------------------------------------- /gen/proto/go/device/v1/device_grpc.pb.go: -------------------------------------------------------------------------------- 1 | // Code generated by protoc-gen-go-grpc. DO NOT EDIT. 2 | 3 | package devicev1 4 | 5 | import ( 6 | context "context" 7 | grpc "google.golang.org/grpc" 8 | codes "google.golang.org/grpc/codes" 9 | status "google.golang.org/grpc/status" 10 | ) 11 | 12 | // This is a compile-time assertion to ensure that this generated file 13 | // is compatible with the grpc package it is being compiled against. 14 | // Requires gRPC-Go v1.32.0 or later. 15 | const _ = grpc.SupportPackageIsVersion7 16 | 17 | // DeviceServiceClient is the client API for DeviceService service. 18 | // 19 | // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. 
type DeviceServiceClient interface {
	GetGpuContainers(ctx context.Context, in *GetGpuContainersRequest, opts ...grpc.CallOption) (*GetGpuContainersResponse, error)
	StreamGpuContainers(ctx context.Context, in *StreamGpuContainersRequest, opts ...grpc.CallOption) (DeviceService_StreamGpuContainersClient, error)
	GetDevices(ctx context.Context, in *GetDevicesRequest, opts ...grpc.CallOption) (*GetDevicesResponse, error)
	KillGpuProcess(ctx context.Context, in *KillGpuProcessRequest, opts ...grpc.CallOption) (*KillGpuProcessResponse, error)
	PatchConfigs(ctx context.Context, in *PatchConfigsRequest, opts ...grpc.CallOption) (*PatchConfigsResponse, error)
	GetMetaDeviceInfo(ctx context.Context, in *GetMetaDeviceInfoRequest, opts ...grpc.CallOption) (*GetMetaDeviceInfoResponse, error)
	PingServer(ctx context.Context, in *PingServerRequest, opts ...grpc.CallOption) (*PingServerResponse, error)
}

type deviceServiceClient struct {
	cc grpc.ClientConnInterface
}

func NewDeviceServiceClient(cc grpc.ClientConnInterface) DeviceServiceClient {
	return &deviceServiceClient{cc}
}

func (c *deviceServiceClient) GetGpuContainers(ctx context.Context, in *GetGpuContainersRequest, opts ...grpc.CallOption) (*GetGpuContainersResponse, error) {
	out := new(GetGpuContainersResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/GetGpuContainers", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) StreamGpuContainers(ctx context.Context, in *StreamGpuContainersRequest, opts ...grpc.CallOption) (DeviceService_StreamGpuContainersClient, error) {
	stream, err := c.cc.NewStream(ctx, &DeviceService_ServiceDesc.Streams[0], "/device.v1.DeviceService/StreamGpuContainers", opts...)
	if err != nil {
		return nil, err
	}
	x := &deviceServiceStreamGpuContainersClient{stream}
	if err := x.ClientStream.SendMsg(in); err != nil {
		return nil, err
	}
	if err := x.ClientStream.CloseSend(); err != nil {
		return nil, err
	}
	return x, nil
}

type DeviceService_StreamGpuContainersClient interface {
	Recv() (*StreamGpuContainersResponse, error)
	grpc.ClientStream
}

type deviceServiceStreamGpuContainersClient struct {
	grpc.ClientStream
}

func (x *deviceServiceStreamGpuContainersClient) Recv() (*StreamGpuContainersResponse, error) {
	m := new(StreamGpuContainersResponse)
	if err := x.ClientStream.RecvMsg(m); err != nil {
		return nil, err
	}
	return m, nil
}

func (c *deviceServiceClient) GetDevices(ctx context.Context, in *GetDevicesRequest, opts ...grpc.CallOption) (*GetDevicesResponse, error) {
	out := new(GetDevicesResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/GetDevices", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) KillGpuProcess(ctx context.Context, in *KillGpuProcessRequest, opts ...grpc.CallOption) (*KillGpuProcessResponse, error) {
	out := new(KillGpuProcessResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/KillGpuProcess", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) PatchConfigs(ctx context.Context, in *PatchConfigsRequest, opts ...grpc.CallOption) (*PatchConfigsResponse, error) {
	out := new(PatchConfigsResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/PatchConfigs", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) GetMetaDeviceInfo(ctx context.Context, in *GetMetaDeviceInfoRequest, opts ...grpc.CallOption) (*GetMetaDeviceInfoResponse, error) {
	out := new(GetMetaDeviceInfoResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/GetMetaDeviceInfo", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

func (c *deviceServiceClient) PingServer(ctx context.Context, in *PingServerRequest, opts ...grpc.CallOption) (*PingServerResponse, error) {
	out := new(PingServerResponse)
	err := c.cc.Invoke(ctx, "/device.v1.DeviceService/PingServer", in, out, opts...)
	if err != nil {
		return nil, err
	}
	return out, nil
}

// DeviceServiceServer is the server API for DeviceService service.
// All implementations must embed UnimplementedDeviceServiceServer
// for forward compatibility
type DeviceServiceServer interface {
	GetGpuContainers(context.Context, *GetGpuContainersRequest) (*GetGpuContainersResponse, error)
	StreamGpuContainers(*StreamGpuContainersRequest, DeviceService_StreamGpuContainersServer) error
	GetDevices(context.Context, *GetDevicesRequest) (*GetDevicesResponse, error)
	KillGpuProcess(context.Context, *KillGpuProcessRequest) (*KillGpuProcessResponse, error)
	PatchConfigs(context.Context, *PatchConfigsRequest) (*PatchConfigsResponse, error)
	GetMetaDeviceInfo(context.Context, *GetMetaDeviceInfoRequest) (*GetMetaDeviceInfoResponse, error)
	PingServer(context.Context, *PingServerRequest) (*PingServerResponse, error)
	mustEmbedUnimplementedDeviceServiceServer()
}

// UnimplementedDeviceServiceServer must be embedded to have forward compatible implementations.
type UnimplementedDeviceServiceServer struct {
}

func (UnimplementedDeviceServiceServer) GetGpuContainers(context.Context, *GetGpuContainersRequest) (*GetGpuContainersResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method GetGpuContainers not implemented")
}
func (UnimplementedDeviceServiceServer) StreamGpuContainers(*StreamGpuContainersRequest, DeviceService_StreamGpuContainersServer) error {
	return status.Errorf(codes.Unimplemented, "method StreamGpuContainers not implemented")
}
func (UnimplementedDeviceServiceServer) GetDevices(context.Context, *GetDevicesRequest) (*GetDevicesResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method GetDevices not implemented")
}
func (UnimplementedDeviceServiceServer) KillGpuProcess(context.Context, *KillGpuProcessRequest) (*KillGpuProcessResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method KillGpuProcess not implemented")
}
func (UnimplementedDeviceServiceServer) PatchConfigs(context.Context, *PatchConfigsRequest) (*PatchConfigsResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method PatchConfigs not implemented")
}
func (UnimplementedDeviceServiceServer) GetMetaDeviceInfo(context.Context, *GetMetaDeviceInfoRequest) (*GetMetaDeviceInfoResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method GetMetaDeviceInfo not implemented")
}
func (UnimplementedDeviceServiceServer) PingServer(context.Context, *PingServerRequest) (*PingServerResponse, error) {
	return nil, status.Errorf(codes.Unimplemented, "method PingServer not implemented")
}
func (UnimplementedDeviceServiceServer) mustEmbedUnimplementedDeviceServiceServer() {}

// UnsafeDeviceServiceServer may be embedded to opt out of forward compatibility for this service.
// Use of this interface is not recommended, as added methods to DeviceServiceServer will
// result in compilation errors.
type UnsafeDeviceServiceServer interface {
	mustEmbedUnimplementedDeviceServiceServer()
}

func RegisterDeviceServiceServer(s grpc.ServiceRegistrar, srv DeviceServiceServer) {
	s.RegisterService(&DeviceService_ServiceDesc, srv)
}

func _DeviceService_GetGpuContainers_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(GetGpuContainersRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).GetGpuContainers(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/GetGpuContainers",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).GetGpuContainers(ctx, req.(*GetGpuContainersRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_StreamGpuContainers_Handler(srv interface{}, stream grpc.ServerStream) error {
	m := new(StreamGpuContainersRequest)
	if err := stream.RecvMsg(m); err != nil {
		return err
	}
	return srv.(DeviceServiceServer).StreamGpuContainers(m, &deviceServiceStreamGpuContainersServer{stream})
}

type DeviceService_StreamGpuContainersServer interface {
	Send(*StreamGpuContainersResponse) error
	grpc.ServerStream
}

type deviceServiceStreamGpuContainersServer struct {
	grpc.ServerStream
}

func (x *deviceServiceStreamGpuContainersServer) Send(m *StreamGpuContainersResponse) error {
	return x.ServerStream.SendMsg(m)
}

func _DeviceService_GetDevices_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(GetDevicesRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).GetDevices(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/GetDevices",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).GetDevices(ctx, req.(*GetDevicesRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_KillGpuProcess_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(KillGpuProcessRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).KillGpuProcess(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/KillGpuProcess",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).KillGpuProcess(ctx, req.(*KillGpuProcessRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_PatchConfigs_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(PatchConfigsRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).PatchConfigs(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/PatchConfigs",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).PatchConfigs(ctx, req.(*PatchConfigsRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_GetMetaDeviceInfo_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(GetMetaDeviceInfoRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).GetMetaDeviceInfo(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/GetMetaDeviceInfo",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).GetMetaDeviceInfo(ctx, req.(*GetMetaDeviceInfoRequest))
	}
	return interceptor(ctx, in, info, handler)
}

func _DeviceService_PingServer_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
	in := new(PingServerRequest)
	if err := dec(in); err != nil {
		return nil, err
	}
	if interceptor == nil {
		return srv.(DeviceServiceServer).PingServer(ctx, in)
	}
	info := &grpc.UnaryServerInfo{
		Server:     srv,
		FullMethod: "/device.v1.DeviceService/PingServer",
	}
	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
		return srv.(DeviceServiceServer).PingServer(ctx, req.(*PingServerRequest))
	}
	return interceptor(ctx, in, info, handler)
}

// DeviceService_ServiceDesc is the grpc.ServiceDesc for DeviceService service.
// It's only intended for direct use with grpc.RegisterService,
// and not to be introspected or modified (even as a copy)
var DeviceService_ServiceDesc = grpc.ServiceDesc{
	ServiceName: "device.v1.DeviceService",
	HandlerType: (*DeviceServiceServer)(nil),
	Methods: []grpc.MethodDesc{
		{
			MethodName: "GetGpuContainers",
			Handler:    _DeviceService_GetGpuContainers_Handler,
		},
		{
			MethodName: "GetDevices",
			Handler:    _DeviceService_GetDevices_Handler,
		},
		{
			MethodName: "KillGpuProcess",
			Handler:    _DeviceService_KillGpuProcess_Handler,
		},
		{
			MethodName: "PatchConfigs",
			Handler:    _DeviceService_PatchConfigs_Handler,
		},
		{
			MethodName: "GetMetaDeviceInfo",
			Handler:    _DeviceService_GetMetaDeviceInfo_Handler,
		},
		{
			MethodName: "PingServer",
			Handler:    _DeviceService_PingServer_Handler,
		},
	},
	Streams: []grpc.StreamDesc{
		{
			StreamName:    "StreamGpuContainers",
			Handler:       _DeviceService_StreamGpuContainers_Handler,
			ServerStreams: true,
		},
	},
	Metadata: "device/v1/device.proto",
}
--------------------------------------------------------------------------------