├── .github
    └── PR_TEMPLATE.md
├── .gitignore
├── .gitlab-ci.yml
├── .gitmodules
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── RELEASE.md
├── bindings
    └── go
    │   ├── dcgm
    │       ├── admin.go
    │       ├── api.go
    │       ├── bcast.go
    │       ├── callback.c
    │       ├── const.go
    │       ├── dcgm_agent.h
    │       ├── dcgm_errors.h
    │       ├── dcgm_fields.h
    │       ├── dcgm_structs.h
    │       ├── dcgm_test.go
    │       ├── device_info.go
    │       ├── device_status.go
    │       ├── fields.go
    │       ├── go.mod
    │       ├── gpu_group.go
    │       ├── health.go
    │       ├── hostengine_status.go
    │       ├── mig.go
    │       ├── policy.go
    │       ├── process_info.go
    │       ├── profile.go
    │       ├── topology.go
    │       └── utils.go
    │   ├── nvml
    │       ├── bindings.go
    │       ├── mig.go
    │       ├── mig_test.go
    │       ├── nvml.go
    │       ├── nvml.h
    │       ├── nvml_dl.go
    │       ├── nvml_dl_windows.go
    │       ├── nvml_test.go
    │       └── nvsmi
    │       │   └── nvsmi.go
    │   └── samples
    │       ├── dcgm
    │           ├── README.md
    │           ├── deviceInfo
    │           │   └── main.go
    │           ├── dmon
    │           │   └── main.go
    │           ├── health
    │           │   └── main.go
    │           ├── hostengineStatus
    │           │   └── main.go
    │           ├── policy
    │           │   └── main.go
    │           ├── processInfo
    │           │   └── main.go
    │           ├── restApi
    │           │   ├── README.md
    │           │   ├── handlers
    │           │   │   ├── byIds.go
    │           │   │   ├── byUuids.go
    │           │   │   ├── dcgm.go
    │           │   │   └── utils.go
    │           │   ├── main.go
    │           │   └── server.go
    │           └── topology
    │           │   └── main.go
    │       └── nvml
    │           ├── README.md
    │           ├── deviceInfo
    │               └── main.go
    │           ├── dmon
    │               └── main.go
    │           └── processInfo
    │               └── main.go
├── dcgm-exporter.yaml
├── deployment
    └── dcgm-exporter
    │   ├── .helmignore
    │   ├── Chart.yaml
    │   ├── templates
    │       ├── NOTES.txt
    │       ├── _helpers.tpl
    │       ├── daemonset.yaml
    │       ├── service-monitor.yaml
    │       ├── service.yaml
    │       └── serviceaccount.yaml
    │   └── values.yaml
├── docker
    ├── Dockerfile.ubi8
    ├── Dockerfile.ubuntu18.04
    ├── Dockerfile.ubuntu20.04
    └── dcgm-exporter-entrypoint.sh
├── etc
    └── dcgm-exporter
    │   ├── 1.x-compatibility-metrics.csv
    │   ├── dcp-metrics-included.csv
    │   └── default-counters.csv
├── go.mod
├── go.sum
├── grafana
    └── dcgm-exporter-dashboard.json
├── pkg
    ├── Dockerfile
    ├── dcgm.go
    ├── go.mod
    ├── go.sum
    ├── gpu_collector.go
    ├── gpu_collector_test.go
    ├── kubernetes.go
    ├── kubernetes_test.go
    ├── main.go
    ├── parser.go
    ├── pipeline.go
    ├── pipeline_test.go
    ├── server.go
    ├── system_info.go
    ├── system_info_test.go
    ├── types.go
    └── utils.go
├── service-monitor.yaml
├── tests
    ├── ci-run-e2e.sh
    ├── common.sh
    ├── gpu-pod.yaml
    ├── metrics.sh
    └── variables.tfvars
└── vendor
    ├── github.com
        ├── Masterminds
        │   └── semver
        │   │   ├── .travis.yml
        │   │   ├── CHANGELOG.md
        │   │   ├── LICENSE.txt
        │   │   ├── Makefile
        │   │   ├── README.md
        │   │   ├── appveyor.yml
        │   │   ├── collection.go
        │   │   ├── constraints.go
        │   │   ├── doc.go
        │   │   ├── version.go
        │   │   └── version_fuzz.go
        ├── NVIDIA
        │   └── gpu-monitoring-tools
        │   │   └── bindings
        │   │       └── go
        │   │           └── dcgm
        │   │               ├── admin.go
        │   │               ├── api.go
        │   │               ├── bcast.go
        │   │               ├── callback.c
        │   │               ├── const.go
        │   │               ├── dcgm_agent.h
        │   │               ├── dcgm_errors.h
        │   │               ├── dcgm_fields.h
        │   │               ├── dcgm_structs.h
        │   │               ├── device_info.go
        │   │               ├── device_status.go
        │   │               ├── fields.go
        │   │               ├── go.mod
        │   │               ├── gpu_group.go
        │   │               ├── health.go
        │   │               ├── hostengine_status.go
        │   │               ├── mig.go
        │   │               ├── policy.go
        │   │               ├── process_info.go
        │   │               ├── profile.go
        │   │               ├── topology.go
        │   │               └── utils.go
        └── gorilla
        │   └── mux
        │       ├── AUTHORS
        │       ├── LICENSE
        │       ├── README.md
        │       ├── doc.go
        │       ├── go.mod
        │       ├── middleware.go
        │       ├── mux.go
        │       ├── regexp.go
        │       ├── route.go
        │       └── test_helpers.go
    └── modules.txt


/.github/PR_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | **Please open your pull requests on the [gitlab repository](https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools.git)**
 2 | 
 3 | _Make sure to complete the following items:_
 4 | 
 5 | - _A reference to a related issue._
 6 | - _A small description of the changes proposed in the pull request._
 7 | - _One commit per change and descriptive commit messages._
 8 | - _Sign-off your work following these [guidelines](https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools/blob/master/CONTRIBUTING.md)._
 9 | - _Test run of your changes._
10 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.swo
3 | dcgm-exporter
4 | !etc/dcgm-exporter/
5 | !deployment/dcgm-exporter/
6 | tags
7 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
 1 | image: docker:latest
 2 | services:
 3 |   - docker:dind
 4 | 
 5 | stages:
 6 |   - aws_kube_setup
 7 |   - e2e_tests
 8 |   - aws_kube_clean
 9 | 
10 | variables:
11 |   GIT_SUBMODULE_STRATEGY: recursive
12 |   TF_VAR_FILE: "$CI_PROJECT_DIR/tests/variables.tfvars"
13 | 
14 | build:
15 |   stage: aws_kube_setup
16 |   script:
17 |     - apk add make
18 |     - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
19 | 
20 |     - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" all
21 |     - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push
22 |     - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push-short
23 |     - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push-latest
24 |     - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push-ci
25 | 
26 | aws_kube_setup:
27 |   extends: .aws_kube_setup
28 |   only:
29 |     - master
30 |     - tags
31 | 
32 | e2e:
33 |   stage: e2e_tests
34 |   only:
35 |     - master
36 |   script:
37 |     - source aws-kube-ci/hostname
38 |     - apk add --no-cache openssh-client rsync
39 |     - rsync -e "ssh -i aws-kube-ci/key -o StrictHostKeyChecking=no" -av --exclude="vendor/" "${CI_PROJECT_DIR}" "${instance_hostname}:~/"
40 |     - rc=0
41 |     - ssh -i aws-kube-ci/key ${instance_hostname} \
42 |       "export CI_COMMIT_SHORT_SHA=${CI_COMMIT_SHORT_SHA} &&
43 |        export CI_REGISTRY_IMAGE=${CI_REGISTRY_IMAGE} &&
44 |        cd ~/gpu-monitoring-tools && sudo -E ./tests/ci-run-e2e.sh"
45 | 
46 | aws_kube_clean:
47 |   extends: .aws_kube_clean
48 |   only:
49 |     - master
50 |     - tags
51 | 
52 | release:
53 |   stage: aws_kube_clean
54 |   only:
55 |     - tags
56 |   script:
57 |     - apk add make
58 |     - docker login -u "${REGISTRY_USER}" -p "${REGISTRY_TOKEN}"
59 | 
60 |     - make VERSION="${CI_COMMIT_TAG}" all
61 |     - make VERSION="${CI_COMMIT_TAG}" push
62 |     - make VERSION="${CI_COMMIT_TAG}" push-short
63 |     - make VERSION="${CI_COMMIT_TAG}" push-latest
64 | 
65 | include:
66 |   project: nvidia/container-infrastructure/aws-kube-ci
67 |   file: aws-kube-ci.yml
68 |   ref: 21.02.23
69 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "aws-kube-ci"]
2 | 	path = aws-kube-ci
3 | 	url = https://gitlab.com/nvidia/container-infrastructure/aws-kube-ci.git
4 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contribute to the GPU Operator Project
 2 | 
 3 | Want to hack on the NVIDIA Container Toolkit Project? Awesome!
 4 | We only require you to sign your work; the section below describes how.
 5 | 
 6 | ## Sign your work
 7 | 
 8 | The sign-off is a simple line at the end of the explanation for the patch. Your
 9 | signature certifies that you wrote the patch or otherwise have the right to pass
10 | it on as an open-source patch. The rules are pretty simple: if you can certify
11 | the below (from [developercertificate.org](http://developercertificate.org/)):
12 | 
13 | ```
14 | Developer Certificate of Origin
15 | Version 1.1
16 | 
17 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
18 | 1 Letterman Drive
19 | Suite D4700
20 | San Francisco, CA, 94129
21 | 
22 | Everyone is permitted to copy and distribute verbatim copies of this
23 | license document, but changing it is not allowed.
24 | 
25 | Developer's Certificate of Origin 1.1
26 | 
27 | By making a contribution to this project, I certify that:
28 | 
29 | (a) The contribution was created in whole or in part by me and I
30 |     have the right to submit it under the open source license
31 |     indicated in the file; or
32 | 
33 | (b) The contribution is based upon previous work that, to the best
34 |     of my knowledge, is covered under an appropriate open source
35 |     license and I have the right under that license to submit that
36 |     work with modifications, whether created in whole or in part
37 |     by me, under the same open source license (unless I am
38 |     permitted to submit under a different license), as indicated
39 |     in the file; or
40 | 
41 | (c) The contribution was provided directly to me by some other
42 |     person who certified (a), (b) or (c) and I have not modified
43 |     it.
44 | 
45 | (d) I understand and agree that this project and the contribution
46 |     are public and that a record of the contribution (including all
47 |     personal information I submit with it, including my sign-off) is
48 |     maintained indefinitely and may be redistributed consistent with
49 |     this project or the open source license(s) involved.
50 | ```
51 | 
52 | Then you just add a line to every git commit message:
53 | 
54 |     Signed-off-by: Joe Smith <joe.smith@email.com>
55 | 
56 | Use your real name (sorry, no pseudonyms or anonymous contributions.)
57 | 
58 | If you set your `user.name` and `user.email` git configs, you can sign your
59 | commit automatically with `git commit -s`.
60 | 
61 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
# Tool and registry settings. They are assigned with ?= so a value already
# present in the environment or given on the make command line is kept.
DOCKER   ?= docker
MKDIR    ?= mkdir
REGISTRY ?= nvidia

# Version components. FULL_VERSION ($(DCGM_VERSION)-$(VERSION)) is the prefix
# of every platform-specific image tag built and pushed below.
DCGM_VERSION   := 2.2.9
GOLANG_VERSION := 1.14.2
VERSION        := 2.4.0
FULL_VERSION   := $(DCGM_VERSION)-$(VERSION)

# Source files listed as prerequisites of the test-main target.
NON_TEST_FILES  := pkg/dcgm.go pkg/gpu_collector.go pkg/parser.go pkg/pipeline.go pkg/server.go pkg/system_info.go pkg/types.go pkg/utils.go pkg/kubernetes.go pkg/main.go
MAIN_TEST_FILES := pkg/system_info_test.go
26 | 
# None of these targets produces a file named after itself, so all of them are
# declared phony; otherwise a stray file or directory called e.g. "push" or
# "ubi8" would make the target look up to date and silently skip its recipe.
.PHONY: all binary test-main install check-format \
        push push-short push-ci push-latest \
        ubuntu18.04 ubuntu20.04 ubi8

# Default target: build the container image for every supported platform.
all: ubuntu18.04 ubuntu20.04 ubi8
29 | 
# Build the exporter binary inside pkg/.
# "&&" (rather than ";") stops the recipe if the directory change fails, so
# the go command is never run from the wrong directory.
binary:
	cd pkg && go build

# Run the Go tests of the pkg/ module.
test-main: $(NON_TEST_FILES) $(MAIN_TEST_FILES)
	cd pkg && go test
35 | 
# Install the exporter binary and its default counter definitions.
# The binary is installed 755 (owner-writable, executable by everyone) and the
# CSV files 644 (plain data). The previous mode 557 made all three files
# writable by any local user and not writable by their owner.
install: binary
	install -m 755 pkg/dcgm-exporter /usr/bin/dcgm-exporter
	install -m 644 -D ./etc/dcgm-exporter/default-counters.csv /etc/dcgm-exporter/default-counters.csv
	install -m 644 -D ./etc/dcgm-exporter/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv
40 | 
# Fail when gofmt reports any unformatted file under pkg/ or bindings/.
# "gofmt -l" prints the offending file names; tee copies them to stderr so they
# stay visible, and wc -l counts them for the comparison with 0.
check-format:
	test $$(gofmt -l pkg bindings | tee /dev/stderr | wc -l) -eq 0
43 | 
# Push the three platform-specific images tagged $(FULL_VERSION)-<platform>.
push:
	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04"
	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04"
	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8"

# Re-tag the ubuntu18.04 image as $(DCGM_VERSION) and push that tag.
push-short:
	$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(DCGM_VERSION)"
	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(DCGM_VERSION)"

# Re-tag the ubuntu18.04 image as $(VERSION) and push that tag.
push-ci:
	$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(VERSION)"
	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)"

# Re-tag the ubuntu18.04 image as "latest" and push that tag.
push-latest:
	$(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:latest"
	$(DOCKER) push "$(REGISTRY)/dcgm-exporter:latest"
60 | 
# Build the Ubuntu 20.04 image from docker/Dockerfile.ubuntu20.04.
# --pull asks docker to fetch the base image again instead of using a cached one.
ubuntu20.04:
	$(DOCKER) build --pull \
		--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
		--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
		--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" \
		--file docker/Dockerfile.ubuntu20.04 .

# Build the Ubuntu 18.04 image from docker/Dockerfile.ubuntu18.04.
ubuntu18.04:
	$(DOCKER) build --pull \
		--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
		--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
		--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" \
		--file docker/Dockerfile.ubuntu18.04 .

# Build the UBI 8 image from docker/Dockerfile.ubi8.
# Unlike the Ubuntu targets, this one also passes VERSION=$(FULL_VERSION) as a
# build argument.
ubi8:
	$(DOCKER) build --pull \
		--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
		--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
		--build-arg "VERSION=$(FULL_VERSION)" \
		--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8" \
		--file docker/Dockerfile.ubi8 .
82 | 
83 | 


--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
 1 | # Release
 2 | 
 3 | This document describes the release process as well as the versioning strategy for the DCGM exporter.
 4 | In the future this document will also contain information about the go bindings.
 5 | 
 6 | ## Versioning
 7 | 
 8 | The DCGM container has three major components:
 9 | - The DCGM Version (e.g: 1.17.3)
10 | - The Exporter Version (e.g: 2.0.0)
11 | - The platform of the container (e.g: ubuntu18.04)
12 | 
13 | The overall version of the DCGM container has four forms:
14 | - The long form: `${DCGM_VERSION}-${EXPORTER_VERSION}-${PLATFORM}`
15 | - The short form: `${DCGM_VERSION}`
16 | - The latest tag: `latest`
17 | - The commit form: `${CI_COMMIT_SHORT_SHA}` only available on the gitlab registry
18 | 
19 | The long form is a unique tag that once pushed will always refer to the same container.
20 | This means that no updates will be made to that tag and it will always point to the same container.
21 | 
22 | The short form refers to the latest EXPORTER_VERSION with the platform fixed to ubuntu18.04.
23 | The latest tag refers to the latest short form (i.e: latest DCGM_VERSION and EXPORTER_VERSION).
24 | 
25 | Note: We do not maintain multiple version branches.
26 | 
27 | ## Releases
28 | 
29 | Release of newer versions is done on demand and does not follow DCGM's release cadence.
30 | Though it is very likely that when a new version of DCGM comes out a new version of the exporter will be released.
31 | 
32 | Every commit to the master branch generates an image on the gitlab registry.
33 | Tagging a version will push an image to the nvidia/dcgm-exporter repository on Docker Hub.
34 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/api.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os"
  6 | 	"sync"
  7 | )
  8 | 
var (
	// dcgmInitCounter is the number of Init calls not yet matched by a
	// Shutdown call. It becomes negative when Shutdown has been called more
	// often than Init.
	dcgmInitCounter int
	// mux guards dcgmInitCounter; Init and Shutdown hold it while they read
	// and update the counter.
	mux             sync.Mutex
)
 13 | 
 14 | // Init starts DCGM, based on the user selected mode
 15 | // DCGM can be started in 3 differengt modes:
 16 | // 1. Embedded: Start hostengine within this process
 17 | // 2. Standalone: Connect to an already running nv-hostengine at the specified address
 18 | // Connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket"
 19 | // 3. StartHostengine: Open an Unix socket to start and connect to the nv-hostengine and terminate before exiting
 20 | func Init(m mode, args ...string) (cleanup func(), err error) {
 21 | 	mux.Lock()
 22 | 	if dcgmInitCounter < 0 {
 23 | 		count := fmt.Sprintf("%d", dcgmInitCounter)
 24 | 		err = fmt.Errorf("Shutdown() is called %s times, before Init()", count[1:])
 25 | 	}
 26 | 	if dcgmInitCounter == 0 {
 27 | 		err = initDcgm(m, args...)
 28 | 	}
 29 | 	dcgmInitCounter += 1
 30 | 	mux.Unlock()
 31 | 
 32 | 	return func() {
 33 | 		if err := Shutdown(); err != nil {
 34 | 			fmt.Fprintf(os.Stderr, "Failed to shutdown DCGM with error: `%v`", err)
 35 | 		}
 36 | 	}, err
 37 | }
 38 | 
 39 | // Shutdown stops DCGM and destroy all connections
 40 | func Shutdown() (err error) {
 41 | 	mux.Lock()
 42 | 	if dcgmInitCounter <= 0 {
 43 | 		err = fmt.Errorf("Init() needs to be called before Shutdown()")
 44 | 	}
 45 | 	if dcgmInitCounter == 1 {
 46 | 		err = shutdown()
 47 | 	}
 48 | 	dcgmInitCounter -= 1
 49 | 	mux.Unlock()
 50 | 
 51 | 	return
 52 | }
 53 | 
// GetAllDeviceCount counts all GPUs on the system.
// It delegates to the unexported getAllDeviceCount.
func GetAllDeviceCount() (uint, error) {
	return getAllDeviceCount()
}

// GetSupportedDevices returns only DCGM supported GPUs.
// It delegates to the unexported getSupportedDevices.
func GetSupportedDevices() ([]uint, error) {
	return getSupportedDevices()
}

// GetDeviceInfo describes the given device.
// It delegates to the unexported getDeviceInfo.
func GetDeviceInfo(gpuId uint) (Device, error) {
	return getDeviceInfo(gpuId)
}

// GetDeviceStatus monitors GPU status including its power, memory and GPU utilization.
// It delegates to latestValuesForDevice.
func GetDeviceStatus(gpuId uint) (DeviceStatus, error) {
	return latestValuesForDevice(gpuId)
}

// GetDeviceTopology returns device topology corresponding to the gpuId.
// It delegates to the unexported getDeviceTopology.
func GetDeviceTopology(gpuId uint) ([]P2PLink, error) {
	return getDeviceTopology(gpuId)
}

// WatchPidFields lets DCGM start recording stats for GPU process
// It needs to be called before calling GetProcessInfo
// The returned GroupHandle is the value GetProcessInfo expects as its first
// argument.
func WatchPidFields() (GroupHandle, error) {
	return watchPidFields()
}

// GetProcessInfo provides detailed per GPU stats for this process
// group is a handle as returned by WatchPidFields; pid identifies the process.
func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error) {
	return getProcessInfo(group, pid)
}

// HealthCheckByGpuId monitors GPU health for any errors/failures/warnings.
// It delegates to the unexported healthCheckByGpuId.
func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) {
	return healthCheckByGpuId(gpuId)
}

// Policy sets GPU usage and error policies and notifies in case of any violations via callback functions
// Violations are delivered on the returned receive-only channel. typ lists the
// policy conditions to register and is passed through to registerPolicy.
func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) {
	return registerPolicy(gpuId, typ...)
}

// Introspect returns DCGM hostengine memory and CPU usage.
// It delegates to the unexported introspect.
func Introspect() (DcgmStatus, error) {
	return introspect()
}

// GetSupportedMetricGroups gets all of the profiling metric groups for a
// given GPU group.
func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error) {
	return getSupportedMetricGroups(grpid)
}
109 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/bcast.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"sync"
 6 | )
 7 | 
// publisher fans values out to a dynamic set of subscribers.
type publisher struct {
	// publish carries the values handed to send; broadcast receives from it.
	publish        chan interface{}
	// close stops the broadcast loop; closePublisher sends on it.
	close          chan bool
	// subscribers is the current subscriber set, guarded by subscriberLock.
	subscribers    []*subscriber
	// subscriberLock is held by add, remove and subscriberList.
	subscriberLock sync.Mutex
}

// subscriber is one receiving end registered with a publisher.
type subscriber struct {
	// read receives every value broadcast while the subscriber is registered.
	read  chan interface{}
	// close receives true once the subscriber has been removed from its
	// publisher (see remove).
	close chan bool
}
19 | 
20 | func newPublisher() *publisher {
21 | 	pub := &publisher{
22 | 		publish: make(chan interface{}),
23 | 		close:   make(chan bool),
24 | 	}
25 | 	return pub
26 | }
27 | 
28 | func (p *publisher) subscriberList() []*subscriber {
29 | 	p.subscriberLock.Lock()
30 | 	defer p.subscriberLock.Unlock()
31 | 	return p.subscribers[:]
32 | }
33 | 
34 | func (p *publisher) add() *subscriber {
35 | 	p.subscriberLock.Lock()
36 | 	defer p.subscriberLock.Unlock()
37 | 	newSub := &subscriber{
38 | 		read:  make(chan interface{}),
39 | 		close: make(chan bool),
40 | 	}
41 | 	p.subscribers = append(p.subscribers, newSub)
42 | 	return newSub
43 | }
44 | 
45 | func (p *publisher) remove(leaving *subscriber) error {
46 | 	p.subscriberLock.Lock()
47 | 	defer p.subscriberLock.Unlock()
48 | 	subscriberIndex := -1
49 | 	for i, sub := range p.subscribers {
50 | 		if sub == leaving {
51 | 			subscriberIndex = i
52 | 			break
53 | 		}
54 | 	}
55 | 	if subscriberIndex == -1 {
56 | 		return fmt.Errorf("Could not find subscriber")
57 | 	}
58 | 	go func() { leaving.close <- true }()
59 | 	p.subscribers = append(p.subscribers[:subscriberIndex], p.subscribers[subscriberIndex+1:]...)
60 | 	return nil
61 | }
62 | 
// send hands val to the broadcast loop.
// The publish channel is created unbuffered by newPublisher, so this blocks
// until broadcast receives the value.
func (p *publisher) send(val interface{}) {
	p.publish <- val
}
66 | 
67 | func (p *publisher) broadcast() {
68 | 	for {
69 | 		select {
70 | 		case publishing := <-p.publish:
71 | 			for _, sub := range p.subscriberList() {
72 | 				go func(s *subscriber, val interface{}) {
73 | 					s.read <- val
74 | 				}(sub, publishing)
75 | 			}
76 | 		case <-p.close:
77 | 			return
78 | 		}
79 | 	}
80 | }
81 | 
// closePublisher asks the broadcast loop to return by sending on p.close.
// The close channel is created unbuffered by newPublisher, so this blocks
// until broadcast receives the signal.
func (p *publisher) closePublisher() {
	p.close <- true
}
85 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/callback.c:
--------------------------------------------------------------------------------
/* Defined outside this file; only its prototype is needed here. */
extern int ViolationRegistration(void *);

/*
 * violationNotify forwards the opaque pointer p unchanged to
 * ViolationRegistration and returns that function's result.
 */
int violationNotify(void *p)
{
    return ViolationRegistration(p);
}
5 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/dcgm_test.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"strconv"
  6 | 	"strings"
  7 | 	"testing"
  8 | 
  9 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvsmi"
 10 | )
 11 | 
 12 | func check(err error, t *testing.T) {
 13 | 	if err != nil {
 14 | 		t.Errorf("%v\n", err)
 15 | 	}
 16 | }
 17 | 
 18 | func TestDeviceCount(t *testing.T) {
 19 | 	cleanup, err := Init(Embedded)
 20 | 	check(err, t)
 21 | 	defer cleanup()
 22 | 
 23 | 	count, err := GetAllDeviceCount()
 24 | 	check(err, t)
 25 | 
 26 | 	query := "count"
 27 | 	c := nvsmi.DeviceCount(query)
 28 | 
 29 | 	if c != count {
 30 | 		t.Errorf("Device Count from dcgm is wrong, got %d, want: %d", count, c)
 31 | 	}
 32 | }
 33 | 
 34 | func BenchmarkDeviceCount1(b *testing.B) {
 35 | 	Init(Embedded)
 36 | 
 37 | 	b.StartTimer()
 38 | 	for n := 0; n < b.N; n++ {
 39 | 		GetAllDeviceCount()
 40 | 	}
 41 | 	b.StopTimer()
 42 | 
 43 | 	Shutdown()
 44 | }
 45 | 
 46 | func TestDeviceInfo(t *testing.T) {
 47 | 	cleanup, err := Init(Embedded)
 48 | 	check(err, t)
 49 | 	defer cleanup()
 50 | 
 51 | 	fields := []string{
 52 | 		"driver_version",
 53 | 		"name",
 54 | 		"serial",
 55 | 		"uuid",
 56 | 		"pci.bus_id",
 57 | 		"vbios_version",
 58 | 		"inforom.img",
 59 | 		"power.limit",
 60 | 	}
 61 | 
 62 | 	gpus, err := GetSupportedDevices()
 63 | 	check(err, t)
 64 | 
 65 | 	for _, gpu := range gpus {
 66 | 		info, err := GetDeviceInfo(gpu)
 67 | 		check(err, t)
 68 | 
 69 | 		id := strconv.FormatUint(uint64(gpu), 10)
 70 | 
 71 | 		for _, val := range fields {
 72 | 			var msg, output string
 73 | 			res := nvsmi.Query(id, val)
 74 | 
 75 | 			switch val {
 76 | 			case "driver_version":
 77 | 				msg = "Driver version"
 78 | 				output = info.Identifiers.DriverVersion
 79 | 			case "name":
 80 | 				msg = "Device name"
 81 | 				output = info.Identifiers.Model
 82 | 			case "serial":
 83 | 				msg = "Device Serial number"
 84 | 				output = info.Identifiers.Serial
 85 | 			case "uuid":
 86 | 				msg = "Device UUID"
 87 | 				output = info.UUID
 88 | 			case "pci.bus_id":
 89 | 				msg = "Device PCI busId"
 90 | 				output = info.PCI.BusID
 91 | 			case "vbios_version":
 92 | 				msg = "Device vbios version"
 93 | 				output = info.Identifiers.Vbios
 94 | 			case "inforom.img":
 95 | 				msg = "Device inforom image"
 96 | 				output = info.Identifiers.InforomImageVersion
 97 | 			case "power.limit":
 98 | 				msg = "Device power limit"
 99 | 				output = strconv.FormatUint(uint64(info.Power), 10)
100 | 				power, err := strconv.ParseFloat(res, 64)
101 | 				check(err, t)
102 | 				res = strconv.FormatUint(uint64(math.Round(power)), 10)
103 | 			}
104 | 
105 | 			if strings.Compare(res, output) != 0 {
106 | 				if strings.Contains(output, "NOT_SUPPORTED") {
107 | 					continue
108 | 				}
109 | 
110 | 				t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res)
111 | 			}
112 | 		}
113 | 	}
114 | }
115 | 
116 | func BenchmarkDeviceInfo1(b *testing.B) {
117 | 	Init(Embedded)
118 | 
119 | 	b.StartTimer()
120 | 	for n := 0; n < b.N; n++ {
121 | 		// assuming there will be atleast 1 GPU attached
122 | 		GetDeviceInfo(uint(0))
123 | 	}
124 | 	b.StopTimer()
125 | 
126 | 	Shutdown()
127 | }
128 | 
129 | func TestDeviceStatus(t *testing.T) {
130 | 	cleanup, err := Init(Embedded)
131 | 	check(err, t)
132 | 	defer cleanup()
133 | 
134 | 	gpus, err := GetSupportedDevices()
135 | 	check(err, t)
136 | 
137 | 	fields := []string{
138 | 		"power.draw",
139 | 		"temperature.gpu",
140 | 		"utilization.gpu",
141 | 		"utilization.memory",
142 | 		"encoder.stats.averageFps",
143 | 		"clocks.current.sm",
144 | 		"clocks.current.memory",
145 | 	}
146 | 
147 | 	for _, gpu := range gpus {
148 | 		status, err := GetDeviceStatus(gpu)
149 | 		check(err, t)
150 | 
151 | 		id := strconv.FormatUint(uint64(gpu), 10)
152 | 
153 | 		for _, val := range fields {
154 | 			var msg, output string
155 | 			res := nvsmi.Query(id, val)
156 | 
157 | 			switch val {
158 | 			case "power.draw":
159 | 				msg = "Device power utilization"
160 | 				output = strconv.FormatUint(uint64(math.Round(status.Power)), 10)
161 | 				power, err := strconv.ParseFloat(res, 64)
162 | 				check(err, t)
163 | 				res = strconv.FormatUint(uint64(math.Round(power)), 10)
164 | 			case "temperature.gpu":
165 | 				msg = "Device temperature"
166 | 				output = strconv.FormatUint(uint64(status.Temperature), 10)
167 | 			case "utilization.gpu":
168 | 				msg = "Device gpu utilization"
169 | 				output = strconv.FormatUint(uint64(status.Utilization.GPU), 10)
170 | 			case "utilization.memory":
171 | 				msg = "Device memory utilization"
172 | 				output = strconv.FormatUint(uint64(status.Utilization.Memory), 10)
173 | 			case "encoder.stats.averageFps":
174 | 				msg = "Device encoder utilization"
175 | 				output = strconv.FormatUint(uint64(status.Utilization.Encoder), 10)
176 | 			case "clocks.current.sm":
177 | 				msg = "Device sm clock"
178 | 				output = strconv.FormatUint(uint64(status.Clocks.Cores), 10)
179 | 			case "clocks.current.memory":
180 | 				msg = "Device mem clock"
181 | 				output = strconv.FormatUint(uint64(status.Clocks.Memory), 10)
182 | 			}
183 | 
184 | 			if strings.Compare(res, output) != 0 {
185 | 				t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res)
186 | 			}
187 | 		}
188 | 	}
189 | }
190 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/device_status.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "./dcgm_agent.h"
  5 | #include "./dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | import (
  9 | 	"fmt"
 10 | 	"math/rand"
 11 | )
 12 | 
 13 | type PerfState uint
 14 | 
 15 | const (
 16 | 	PerfStateMax     = 0
 17 | 	PerfStateMin     = 15
 18 | 	PerfStateUnknown = 32
 19 | )
 20 | 
 21 | func (p PerfState) String() string {
 22 | 	if p >= PerfStateMax && p <= PerfStateMin {
 23 | 		return fmt.Sprintf("P%d", p)
 24 | 	}
 25 | 	return "Unknown"
 26 | }
 27 | 
// UtilizationInfo holds the utilization figures of one GPU.
// latestValuesForDevice fills the fields from DCGM_FI_DEV_GPU_UTIL,
// DCGM_FI_DEV_MEM_COPY_UTIL, DCGM_FI_DEV_ENC_UTIL and DCGM_FI_DEV_DEC_UTIL
// respectively.
type UtilizationInfo struct {
	GPU     int64 // %
	Memory  int64 // %
	Encoder int64 // %
	Decoder int64 // %
}

// ECCErrorsInfo holds ECC error counters.
// latestValuesForDevice fills them from DCGM_FI_DEV_ECC_SBE_AGG_TOTAL and
// DCGM_FI_DEV_ECC_DBE_AGG_TOTAL.
type ECCErrorsInfo struct {
	SingleBit int64
	DoubleBit int64
}

// MemoryInfo holds memory related status values.
// NOTE(review): latestValuesForDevice only sets ECCErrors when it builds this
// struct; confirm whether GlobalUsed is populated anywhere.
type MemoryInfo struct {
	GlobalUsed int64
	ECCErrors  ECCErrorsInfo
}

// ClockInfo holds the current clock frequencies, read from
// DCGM_FI_DEV_SM_CLOCK and DCGM_FI_DEV_MEM_CLOCK.
type ClockInfo struct {
	Cores  int64 // MHz
	Memory int64 // MHz
}

// PCIThroughputInfo holds PCIe traffic figures, read from
// DCGM_FI_DEV_PCIE_RX_THROUGHPUT, DCGM_FI_DEV_PCIE_TX_THROUGHPUT and
// DCGM_FI_DEV_PCIE_REPLAY_COUNTER.
type PCIThroughputInfo struct {
	Rx      int64 // MB
	Tx      int64 // MB
	Replays int64
}

// PCIStatusInfo groups PCI related status values. BAR1Used is read from
// DCGM_FI_DEV_BAR1_USED and FBUsed from DCGM_FI_DEV_FB_USED.
type PCIStatusInfo struct {
	BAR1Used   int64 // MB
	Throughput PCIThroughputInfo
	FBUsed     int64
}

// DeviceStatus is the status snapshot of one GPU returned by
// latestValuesForDevice. Power is read from DCGM_FI_DEV_POWER_USAGE and
// Temperature from DCGM_FI_DEV_GPU_TEMP.
type DeviceStatus struct {
	Power       float64 // W
	Temperature int64   // °C
	Utilization UtilizationInfo
	Memory      MemoryInfo
	Clocks      ClockInfo
	PCI         PCIStatusInfo
	Performance PerfState
	FanSpeed    int64 // %
}
 72 | 
 73 | func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) {
 74 | 	const (
 75 | 		pwr int = iota
 76 | 		temp
 77 | 		sm
 78 | 		mem
 79 | 		enc
 80 | 		dec
 81 | 		smClock
 82 | 		memClock
 83 | 		bar1Used
 84 | 		pcieRxThroughput
 85 | 		pcieTxThroughput
 86 | 		pcieReplay
 87 | 		fbUsed
 88 | 		sbe
 89 | 		dbe
 90 | 		pstate
 91 | 		fanSpeed
 92 | 		fieldsCount
 93 | 	)
 94 | 
 95 | 	deviceFields := make([]Short, fieldsCount)
 96 | 	deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE
 97 | 	deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP
 98 | 	deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL
 99 | 	deviceFields[mem] = C.DCGM_FI_DEV_MEM_COPY_UTIL
100 | 	deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL
101 | 	deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL
102 | 	deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK
103 | 	deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK
104 | 	deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED
105 | 	deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT
106 | 	deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT
107 | 	deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
108 | 	deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED
109 | 	deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL
110 | 	deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL
111 | 	deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE
112 | 	deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED
113 | 
114 | 	fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64())
115 | 	fieldsId, err := FieldGroupCreate(fieldsName, deviceFields)
116 | 	if err != nil {
117 | 		return
118 | 	}
119 | 
120 | 	groupName := fmt.Sprintf("devStatus%d", rand.Uint64())
121 | 	groupId, err := WatchFields(gpuId, fieldsId, groupName)
122 | 	if err != nil {
123 | 		_ = FieldGroupDestroy(fieldsId)
124 | 		return
125 | 	}
126 | 
127 | 	values, err := GetLatestValuesForFields(gpuId, deviceFields)
128 | 	if err != nil {
129 | 		_ = FieldGroupDestroy(fieldsId)
130 | 		_ = DestroyGroup(groupId)
131 | 		return status, fmt.Errorf("Error getting device status: %s", err)
132 | 	}
133 | 
134 | 	power := values[pwr].Float64()
135 | 
136 | 	gpuUtil := UtilizationInfo{
137 | 		GPU:     values[sm].Int64(),
138 | 		Memory:  values[mem].Int64(),
139 | 		Encoder: values[enc].Int64(),
140 | 		Decoder: values[dec].Int64(),
141 | 	}
142 | 
143 | 	memory := MemoryInfo{
144 | 		ECCErrors: ECCErrorsInfo{
145 | 			SingleBit: values[sbe].Int64(),
146 | 			DoubleBit: values[dbe].Int64(),
147 | 		},
148 | 	}
149 | 
150 | 	clocks := ClockInfo{
151 | 		Cores:  values[smClock].Int64(),
152 | 		Memory: values[memClock].Int64(),
153 | 	}
154 | 
155 | 	pci := PCIStatusInfo{
156 | 		BAR1Used: values[bar1Used].Int64(),
157 | 		Throughput: PCIThroughputInfo{
158 | 			Rx:      values[pcieRxThroughput].Int64(),
159 | 			Tx:      values[pcieTxThroughput].Int64(),
160 | 			Replays: values[pcieReplay].Int64(),
161 | 		},
162 | 		FBUsed: values[fbUsed].Int64(),
163 | 	}
164 | 
165 | 	status = DeviceStatus{
166 | 		Power:       power,
167 | 		Temperature: values[temp].Int64(),
168 | 		Utilization: gpuUtil,
169 | 		Memory:      memory,
170 | 		Clocks:      clocks,
171 | 		PCI:         pci,
172 | 		Performance: PerfState(values[pstate].Int64()),
173 | 		FanSpeed:    values[fanSpeed].Int64(),
174 | 	}
175 | 
176 | 	_ = FieldGroupDestroy(fieldsId)
177 | 	_ = DestroyGroup(groupId)
178 | 	return
179 | }
180 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm
2 | 
3 | go 1.14
4 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/gpu_group.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "dcgm_agent.h"
 5 | #include "dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | import (
 9 | 	"fmt"
10 | )
11 | 
// GroupHandle wraps the C dcgmGpuGrp_t identifier of a DCGM group.
type GroupHandle struct{ handle C.dcgmGpuGrp_t }
13 | 
14 | func CreateGroup(groupName string) (goGroupId GroupHandle, err error) {
15 | 	var cGroupId C.dcgmGpuGrp_t
16 | 	cname := C.CString(groupName)
17 | 	defer freeCString(cname)
18 | 
19 | 	result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_EMPTY, cname, &cGroupId)
20 | 	if err = errorString(result); err != nil {
21 | 		return goGroupId, fmt.Errorf("Error creating group: %s", err)
22 | 	}
23 | 
24 | 	goGroupId = GroupHandle{cGroupId}
25 | 	return
26 | }
27 | 
28 | func NewDefaultGroup(groupName string) (GroupHandle, error) {
29 | 	var cGroupId C.dcgmGpuGrp_t
30 | 
31 | 	cname := C.CString(groupName)
32 | 	defer freeCString(cname)
33 | 
34 | 	result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_DEFAULT, cname, &cGroupId)
35 | 	if err := errorString(result); err != nil {
36 | 		return GroupHandle{}, fmt.Errorf("Error creating group: %s", err)
37 | 	}
38 | 
39 | 	return GroupHandle{cGroupId}, nil
40 | }
41 | 
42 | func AddToGroup(groupId GroupHandle, gpuId uint) (err error) {
43 | 	result := C.dcgmGroupAddDevice(handle.handle, groupId.handle, C.uint(gpuId))
44 | 	if err = errorString(result); err != nil {
45 | 		return fmt.Errorf("Error adding GPU %v to group: %s", gpuId, err)
46 | 	}
47 | 
48 | 	return
49 | }
50 | 
51 | func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error) {
52 | 	result := C.dcgmGroupAddEntity(handle.handle, groupId.handle, C.dcgm_field_entity_group_t(entityGroupId), C.uint(entityId))
53 | 	if err = errorString(result); err != nil {
54 | 		return fmt.Errorf("Error adding entity group type %v, entity %v to group: %s", entityGroupId, entityId, err)
55 | 	}
56 | 
57 | 	return
58 | }
59 | 
60 | func DestroyGroup(groupId GroupHandle) (err error) {
61 | 	result := C.dcgmGroupDestroy(handle.handle, groupId.handle)
62 | 	if err = errorString(result); err != nil {
63 | 		return fmt.Errorf("Error destroying group: %s", err)
64 | 	}
65 | 
66 | 	return
67 | }
68 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/health.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "dcgm_agent.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | import (
  9 | 	"fmt"
 10 | 	"math/rand"
 11 | 	"unsafe"
 12 | )
 13 | 
// SystemWatch describes one health incident reported by a health check.
type SystemWatch struct {
	Type   string // label produced by systemWatch for the incident's system
	Status string // label produced by healthStatus for the incident's health
	Error  string // message text copied from the incident's error
}
 19 | 
// DeviceHealth is the result of a health check for a single GPU.
type DeviceHealth struct {
	GPU     uint          // id of the GPU that was checked
	Status  string        // overall health label produced by healthStatus
	Watches []SystemWatch // one entry per reported incident
}
 25 | 
 26 | func setHealthWatches(groupId GroupHandle) (err error) {
 27 | 	result := C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL)
 28 | 	if err = errorString(result); err != nil {
 29 | 		return fmt.Errorf("Error setting health watches: %s", err)
 30 | 	}
 31 | 	return
 32 | }
 33 | 
 34 | func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
 35 | 	name := fmt.Sprintf("health%d", rand.Uint64())
 36 | 	groupId, err := CreateGroup(name)
 37 | 	if err != nil {
 38 | 		return
 39 | 	}
 40 | 
 41 | 	err = AddToGroup(groupId, gpuId)
 42 | 	if err != nil {
 43 | 		return
 44 | 	}
 45 | 
 46 | 	err = setHealthWatches(groupId)
 47 | 	if err != nil {
 48 | 		return
 49 | 	}
 50 | 
 51 | 	var healthResults C.dcgmHealthResponse_v4
 52 | 	healthResults.version = makeVersion2(unsafe.Sizeof(healthResults))
 53 | 
 54 | 	result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults)))
 55 | 
 56 | 	if err = errorString(result); err != nil {
 57 | 		return deviceHealth, fmt.Errorf("Error checking GPU health: %s", err)
 58 | 	}
 59 | 
 60 | 	status := healthStatus(int8(healthResults.overallHealth))
 61 | 	watches := []SystemWatch{}
 62 | 
 63 | 	// number of watches that encountred error/warning
 64 | 	incidents := uint(healthResults.incidentCount)
 65 | 
 66 | 	for j := uint(0); j < incidents; j++ {
 67 | 		watch := SystemWatch{
 68 | 			Type:   systemWatch(int(healthResults.incidents[j].system)),
 69 | 			Status: healthStatus(int8(healthResults.incidents[j].health)),
 70 | 
 71 | 			Error: *stringPtr(&healthResults.incidents[j].error.msg[0]),
 72 | 		}
 73 | 		watches = append(watches, watch)
 74 | 	}
 75 | 
 76 | 	deviceHealth = DeviceHealth{
 77 | 		GPU:     gpuId,
 78 | 		Status:  status,
 79 | 		Watches: watches,
 80 | 	}
 81 | 	_ = DestroyGroup(groupId)
 82 | 	return
 83 | }
 84 | 
 85 | func healthStatus(status int8) string {
 86 | 	switch status {
 87 | 	case 0:
 88 | 		return "Healthy"
 89 | 	case 10:
 90 | 		return "Warning"
 91 | 	case 20:
 92 | 		return "Failure"
 93 | 	}
 94 | 	return "N/A"
 95 | }
 96 | 
 97 | func systemWatch(watch int) string {
 98 | 	switch watch {
 99 | 	case 1:
100 | 		return "PCIe watches"
101 | 	case 2:
102 | 		return "NVLINK watches"
103 | 	case 4:
104 | 		return "Power Managemnt unit watches"
105 | 	case 8:
106 | 		return "Microcontroller unit watches"
107 | 	case 16:
108 | 		return "Memory watches"
109 | 	case 32:
110 | 		return "Streaming Multiprocessor watches"
111 | 	case 64:
112 | 		return "Inforom watches"
113 | 	case 128:
114 | 		return "Temperature watches"
115 | 	case 256:
116 | 		return "Power watches"
117 | 	case 512:
118 | 		return "Driver-related watches"
119 | 	}
120 | 	return "N/A"
121 | }
122 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/hostengine_status.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "dcgm_agent.h"
 5 | #include "dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | import (
 9 | 	"fmt"
10 | 	"unsafe"
11 | )
12 | 
// DcgmStatus reports the resource usage of the DCGM host engine.
type DcgmStatus struct {
	Memory int64   // introspect stores bytesUsed divided by 1024 here
	CPU    float64 // introspect stores the reported total multiplied by 100 here
}
17 | 
18 | func introspect() (engine DcgmStatus, err error) {
19 | 	enableIntrospect := C.dcgmIntrospectState_t(1)
20 | 	result := C.dcgmIntrospectToggleState(handle.handle, enableIntrospect)
21 | 
22 | 	if err = errorString(result); err != nil {
23 | 		return engine, fmt.Errorf("Error enabling DCGM introspection: %s", err)
24 | 	}
25 | 
26 | 	var memory C.dcgmIntrospectMemory_t
27 | 	memory.version = makeVersion2(unsafe.Sizeof(memory))
28 | 	waitIfNoData := 1
29 | 	result = C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData))
30 | 
31 | 	if err = errorString(result); err != nil {
32 | 		return engine, fmt.Errorf("Error getting memory usage of hostengine: %s", err)
33 | 	}
34 | 
35 | 	var cpu C.dcgmIntrospectCpuUtil_t
36 | 
37 | 	cpu.version = makeVersion2(unsafe.Sizeof(cpu))
38 | 	result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData))
39 | 
40 | 	if err = errorString(result); err != nil {
41 | 		return engine, fmt.Errorf("Error getting cpu usage of hostengine: %s", err)
42 | 	}
43 | 
44 | 	engine = DcgmStatus{
45 | 		Memory: toInt64(memory.bytesUsed) / 1024,
46 | 		CPU:    *dblToFloat(cpu.total) * 100,
47 | 	}
48 | 	return
49 | }
50 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/mig.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "./dcgm_agent.h"
 5 | #include "./dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | import (
 9 | 	"fmt"
10 | 	"unsafe"
11 | )
12 | 
// Field_Entity_Group identifies the kind of entity a DCGM field applies to.
// Values are converted to C.dcgm_field_entity_group_t when passed to DCGM.
type Field_Entity_Group uint

// Entity group kinds, numbered consecutively from 0.
// NOTE(review): the meanings below are inferred from the identifiers —
// confirm against dcgm_fields.h.
const (
	FE_NONE   Field_Entity_Group = iota // presumably "no entity group"
	FE_GPU                              // presumably a whole GPU
	FE_VGPU                             // presumably a virtual GPU
	FE_SWITCH                           // presumably a switch
	FE_GPU_I                            // presumably a GPU instance
	FE_GPU_CI                           // presumably a compute instance
	FE_COUNT                            // number of values declared above
)
24 | 
// GroupEntityPair identifies one entity by its entity group kind and its id
// within that kind.
type GroupEntityPair struct {
	EntityGroupId Field_Entity_Group
	EntityId      uint
}
29 | 
// MigEntityInfo carries the per-entity details copied from the `info` member
// of a C hierarchy entry by toMigHierarchy.
type MigEntityInfo struct {
	GpuUuid               string
	NvmlGpuIndex          uint
	NvmlInstanceId        uint
	NvmlComputeInstanceId uint
	NvmlMigProfileId      uint
	NvmlProfileSlices     uint
}
38 | 
// MigHierarchyInfo_v2 is one entry of the MIG hierarchy: an entity, its
// parent entity, and the details attached to the entity.
type MigHierarchyInfo_v2 struct {
	Entity GroupEntityPair
	Parent GroupEntityPair
	Info   MigEntityInfo
}
44 | 
// Go-visible copies of the DCGM capacity limits defined in the C headers.
const (
	MAX_NUM_DEVICES    uint = C.DCGM_MAX_NUM_DEVICES
	MAX_HIERARCHY_INFO uint = C.DCGM_MAX_HIERARCHY_INFO
)
49 | 
// MigHierarchy_v2 is the Go form of C.dcgmMigHierarchy_v2. Only the first
// Count entries of EntityList are filled in by toMigHierarchy.
type MigHierarchy_v2 struct {
	Version    uint
	Count      uint
	EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2
}
55 | 
56 | func GetGpuInstanceHierarchy() (hierarchy MigHierarchy_v2, err error) {
57 | 	var c_hierarchy C.dcgmMigHierarchy_v2
58 | 	c_hierarchy.version = C.dcgmMigHierarchy_version2
59 | 	ptr_hierarchy := (*C.dcgmMigHierarchy_v2)(unsafe.Pointer(&c_hierarchy))
60 | 	result := C.dcgmGetGpuInstanceHierarchy(handle.handle, ptr_hierarchy)
61 | 
62 | 	if err = errorString(result); err != nil {
63 | 		return toMigHierarchy(c_hierarchy), fmt.Errorf("Error retrieving DCGM MIG hierarchy: %s", err)
64 | 	}
65 | 
66 | 	return toMigHierarchy(c_hierarchy), nil
67 | }
68 | 
69 | func toMigHierarchy(c_hierarchy C.dcgmMigHierarchy_v2) MigHierarchy_v2 {
70 | 	var hierarchy MigHierarchy_v2
71 | 	hierarchy.Version = uint(c_hierarchy.version)
72 | 	hierarchy.Count = uint(c_hierarchy.count)
73 | 	for i := uint(0); i < hierarchy.Count; i++ {
74 | 		hierarchy.EntityList[i] = MigHierarchyInfo_v2{
75 | 			Entity: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].entity.entityGroupId), uint(c_hierarchy.entityList[i].entity.entityId)},
76 | 			Parent: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].parent.entityGroupId), uint(c_hierarchy.entityList[i].parent.entityId)},
77 | 			Info: MigEntityInfo{
78 | 				GpuUuid:               *stringPtr(&c_hierarchy.entityList[i].info.gpuUuid[0]),
79 | 				NvmlGpuIndex:          uint(c_hierarchy.entityList[i].info.nvmlGpuIndex),
80 | 				NvmlInstanceId:        uint(c_hierarchy.entityList[i].info.nvmlInstanceId),
81 | 				NvmlComputeInstanceId: uint(c_hierarchy.entityList[i].info.nvmlComputeInstanceId),
82 | 				NvmlMigProfileId:      uint(c_hierarchy.entityList[i].info.nvmlMigProfileId),
83 | 				NvmlProfileSlices:     uint(c_hierarchy.entityList[i].info.nvmlProfileSlices),
84 | 			},
85 | 		}
86 | 	}
87 | 
88 | 	return hierarchy
89 | }
90 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/profile.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "dcgm_agent.h"
 5 | #include "dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | import (
 9 | 	"fmt"
10 | 	"unsafe"
11 | )
12 | 
// MetricGroup describes one profiling metric group as returned by
// getSupportedMetricGroups.
type MetricGroup struct {
	major    uint   // copied from the C entry's majorId
	minor    uint   // copied from the C entry's minorId
	fieldIds []uint // the first numFieldIds field ids of the C entry
}
18 | 
19 | func getSupportedMetricGroups(grpid uint) (groups []MetricGroup, err error) {
20 | 
21 | 	var groupInfo C.dcgmProfGetMetricGroups_t
22 | 	groupInfo.version = makeVersion2(unsafe.Sizeof(groupInfo))
23 | 	groupInfo.groupId = C.ulong(grpid)
24 | 
25 | 	result := C.dcgmProfGetSupportedMetricGroups(handle.handle, &groupInfo)
26 | 
27 | 	if err = errorString(result); err != nil {
28 | 		return groups, fmt.Errorf("Error getting supported metrics: %s", err)
29 | 	}
30 | 
31 | 	var count = uint(groupInfo.numMetricGroups)
32 | 
33 | 	for i := uint(0); i < count; i++ {
34 | 		var group MetricGroup
35 | 		group.major = uint(groupInfo.metricGroups[i].majorId)
36 | 		group.minor = uint(groupInfo.metricGroups[i].minorId)
37 | 
38 | 		var fieldCount = uint(groupInfo.metricGroups[i].numFieldIds)
39 | 
40 | 		for j := uint(0); j < fieldCount; j++ {
41 | 			group.fieldIds = append(group.fieldIds, uint(groupInfo.metricGroups[i].fieldIds[j]))
42 | 		}
43 | 		groups = append(groups, group)
44 | 	}
45 | 
46 | 	return groups, nil
47 | }
48 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/topology.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "dcgm_agent.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | import (
  9 | 	"fmt"
 10 | 	"io/ioutil"
 11 | 	"strings"
 12 | 	"unsafe"
 13 | )
 14 | 
 15 | type P2PLinkType uint
 16 | 
 17 | const (
 18 | 	P2PLinkUnknown P2PLinkType = iota
 19 | 	P2PLinkCrossCPU
 20 | 	P2PLinkSameCPU
 21 | 	P2PLinkHostBridge
 22 | 	P2PLinkMultiSwitch
 23 | 	P2PLinkSingleSwitch
 24 | 	P2PLinkSameBoard
 25 | 	SingleNVLINKLink
 26 | 	TwoNVLINKLinks
 27 | 	ThreeNVLINKLinks
 28 | 	FourNVLINKLinks
 29 | )
 30 | 
 31 | func (l P2PLinkType) PCIPaths() string {
 32 | 	switch l {
 33 | 	case P2PLinkSameBoard:
 34 | 		return "PSB"
 35 | 	case P2PLinkSingleSwitch:
 36 | 		return "PIX"
 37 | 	case P2PLinkMultiSwitch:
 38 | 		return "PXB"
 39 | 	case P2PLinkHostBridge:
 40 | 		return "PHB"
 41 | 	case P2PLinkSameCPU:
 42 | 		return "NODE"
 43 | 	case P2PLinkCrossCPU:
 44 | 		return "SYS"
 45 | 	case SingleNVLINKLink:
 46 | 		return "NV1"
 47 | 	case TwoNVLINKLinks:
 48 | 		return "NV2"
 49 | 	case ThreeNVLINKLinks:
 50 | 		return "NV3"
 51 | 	case FourNVLINKLinks:
 52 | 		return "NV4"
 53 | 	case P2PLinkUnknown:
 54 | 	}
 55 | 	return "N/A"
 56 | }
 57 | 
// P2PLink describes the connection from a queried GPU to one peer GPU.
type P2PLink struct {
	GPU uint // id of the peer GPU taken from the topology entry
	// NOTE(review): getDeviceTopology fills BusID with the bus id of the
	// queried GPU, not of the peer in GPU — confirm this is intended.
	BusID string
	Link  P2PLinkType
}
 63 | 
 64 | func getP2PLink(path uint) P2PLinkType {
 65 | 	switch path {
 66 | 	case C.DCGM_TOPOLOGY_BOARD:
 67 | 		return P2PLinkSameBoard
 68 | 	case C.DCGM_TOPOLOGY_SINGLE:
 69 | 		return P2PLinkSingleSwitch
 70 | 	case C.DCGM_TOPOLOGY_MULTIPLE:
 71 | 		return P2PLinkMultiSwitch
 72 | 	case C.DCGM_TOPOLOGY_HOSTBRIDGE:
 73 | 		return P2PLinkHostBridge
 74 | 	case C.DCGM_TOPOLOGY_CPU:
 75 | 		return P2PLinkSameCPU
 76 | 	case C.DCGM_TOPOLOGY_SYSTEM:
 77 | 		return P2PLinkCrossCPU
 78 | 	case C.DCGM_TOPOLOGY_NVLINK1:
 79 | 		return SingleNVLINKLink
 80 | 	case C.DCGM_TOPOLOGY_NVLINK2:
 81 | 		return TwoNVLINKLinks
 82 | 	case C.DCGM_TOPOLOGY_NVLINK3:
 83 | 		return ThreeNVLINKLinks
 84 | 	case C.DCGM_TOPOLOGY_NVLINK4:
 85 | 		return FourNVLINKLinks
 86 | 	}
 87 | 	return P2PLinkUnknown
 88 | }
 89 | 
 90 | func getCPUAffinity(busid string) (string, error) {
 91 | 	b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/local_cpulist", strings.ToLower(busid[4:])))
 92 | 	if err != nil {
 93 | 		return "", fmt.Errorf("Error getting device cpu affinity: %v", err)
 94 | 	}
 95 | 	return strings.TrimSuffix(string(b), "\n"), nil
 96 | }
 97 | 
 98 | func getBusid(gpuid uint) (string, error) {
 99 | 	var device C.dcgmDeviceAttributes_t
100 | 	device.version = makeVersion2(unsafe.Sizeof(device))
101 | 
102 | 	result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device)
103 | 	if err := errorString(result); err != nil {
104 | 		return "", fmt.Errorf("Error getting device busid: %s", err)
105 | 	}
106 | 	return *stringPtr(&device.identifiers.pciBusId[0]), nil
107 | }
108 | 
109 | func getDeviceTopology(gpuid uint) (links []P2PLink, err error) {
110 | 	var topology C.dcgmDeviceTopology_t
111 | 	topology.version = makeVersion2(unsafe.Sizeof(topology))
112 | 
113 | 	result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology)
114 | 	if result == C.DCGM_ST_NOT_SUPPORTED {
115 | 		return links, nil
116 | 	}
117 | 	if result != C.DCGM_ST_OK {
118 | 		return links, fmt.Errorf("Error getting device topology: %s", errorString(result))
119 | 	}
120 | 
121 | 	busid, err := getBusid(gpuid)
122 | 	if err != nil {
123 | 		return
124 | 	}
125 | 
126 | 	for i := uint(0); i < uint(topology.numGpus); i++ {
127 | 		gpu := topology.gpuPaths[i].gpuId
128 | 		p2pLink := P2PLink{
129 | 			GPU:   uint(gpu),
130 | 			BusID: busid,
131 | 			Link:  getP2PLink(uint(topology.gpuPaths[i].path)),
132 | 		}
133 | 		links = append(links, p2pLink)
134 | 	}
135 | 	return
136 | }
137 | 


--------------------------------------------------------------------------------
/bindings/go/dcgm/utils.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "stdlib.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | 	"math"
 12 | 	"unsafe"
 13 | )
 14 | 
// Thresholds used by IsInt32Blank and IsInt64Blank: a value greater than or
// equal to the threshold is treated as "blank".
const (
	dcgmInt32Blank = 0x7ffffff0         // 2147483632
	dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792
)
 19 | 
 20 | func uintPtr(c C.uint) *uint {
 21 | 	i := uint(c)
 22 | 	return &i
 23 | }
 24 | 
 25 | func uintPtrInt(c C.int) *uint {
 26 | 	i := uint(c)
 27 | 	return &i
 28 | }
 29 | 
 30 | func uintPtrUnsafe(p unsafe.Pointer) *uint {
 31 | 	if p == nil {
 32 | 		return nil
 33 | 	}
 34 | 	uintP := (*uint)(unsafe.Pointer(p))
 35 | 	val := *uintP
 36 | 	return &val
 37 | }
 38 | 
 39 | func uint64Ptr(c C.longlong) *uint64 {
 40 | 	i := uint64(c)
 41 | 	return &i
 42 | }
 43 | 
 44 | func int64Ptr(c C.longlong) *int64 {
 45 | 	i := int64(c)
 46 | 	return &i
 47 | }
 48 | 
 49 | func uint64PtrUint(c C.uint) *uint64 {
 50 | 	i := uint64(c)
 51 | 	return &i
 52 | }
 53 | 
 54 | func uint64PtrUnsafe(p unsafe.Pointer) *uint64 {
 55 | 	if p == nil {
 56 | 		return nil
 57 | 	}
 58 | 	uintP := (*uint64)(unsafe.Pointer(p))
 59 | 	val := *uintP
 60 | 	return &val
 61 | }
 62 | 
 63 | func toInt64(c C.longlong) int64 {
 64 | 	i := int64(c)
 65 | 	return i
 66 | }
 67 | 
 68 | func dblToUint(val C.double) *uint {
 69 | 	i := uint(val)
 70 | 	return &i
 71 | }
 72 | 
 73 | func dblToFloat(val C.double) *float64 {
 74 | 	i := float64(val)
 75 | 	return &i
 76 | }
 77 | 
 78 | func dblToFloatUnsafe(val unsafe.Pointer) *float64 {
 79 | 	if val == nil {
 80 | 		return nil
 81 | 	}
 82 | 	dblP := (*C.double)(unsafe.Pointer(val))
 83 | 	floatP := float64(*dblP)
 84 | 	return &floatP
 85 | }
 86 | 
 87 | func stringPtr(c *C.char) *string {
 88 | 	s := C.GoString(c)
 89 | 	return &s
 90 | }
 91 | 
 92 | func errorString(result C.dcgmReturn_t) error {
 93 | 	if result == C.DCGM_ST_OK {
 94 | 		return nil
 95 | 	}
 96 | 	err := C.GoString(C.errorString(result))
 97 | 	return fmt.Errorf("%v", err)
 98 | }
 99 | 
// freeCString releases a C string with C.free. It is used with defer after
// C.CString allocations elsewhere in this package.
func freeCString(cStr *C.char) {
	C.free(unsafe.Pointer(cStr))
}
103 | 
104 | func IsInt32Blank(value int) bool {
105 | 	if value >= dcgmInt32Blank {
106 | 		return true
107 | 	}
108 | 	return false
109 | }
110 | 
111 | func IsInt64Blank(value int64) bool {
112 | 	if value >= dcgmInt64Blank {
113 | 		return true
114 | 	}
115 | 	return false
116 | }
117 | 
118 | func blank64(val *int64) *int64 {
119 | 	if val != nil && IsInt64Blank(*val) {
120 | 		return nil
121 | 	}
122 | 	return val
123 | }
124 | 
125 | func blank32(val *uint) *uint {
126 | 	if val != nil && IsInt32Blank(int(*val)) {
127 | 		return nil
128 | 	}
129 | 	return val
130 | }
131 | 
132 | func makeVersion1(struct_type uintptr) C.uint {
133 | 	version := C.uint(struct_type | 1<<24)
134 | 	return version
135 | }
136 | 
137 | func makeVersion2(struct_type uintptr) C.uint {
138 | 	version := C.uint(struct_type | 2<<24)
139 | 	return version
140 | }
141 | 
// roundFloat returns a pointer to *f rounded to the nearest integer with
// math.Round. A nil f yields a pointer to 0, never nil.
func roundFloat(f *float64) *float64 {
	rounded := new(float64)
	if f == nil {
		return rounded
	}
	*rounded = math.Round(*f)
	return rounded
}
149 | 


--------------------------------------------------------------------------------
/bindings/go/nvml/mig_test.go:
--------------------------------------------------------------------------------
  1 | package nvml
  2 | 
  3 | import (
  4 | 	"testing"
  5 | )
  6 | 
  7 | func TestSetMigMode(t *testing.T) {
  8 | 	// Initialize NVML
  9 | 	err := Init()
 10 | 	if err != nil {
 11 | 		t.Errorf("%v", err)
 12 | 	}
 13 | 	defer Shutdown()
 14 | 
 15 | 	// Grab a reference to our first device
 16 | 	device, err := NewDevice(0)
 17 | 	if err != nil {
 18 | 		t.Errorf("%v", err)
 19 | 	}
 20 | 
 21 | 	// Disable MIG on the device
 22 | 	_, err = device.SetMigMode(DEVICE_MIG_DISABLE)
 23 | 	if err != nil {
 24 | 		t.Errorf("error enabling MIG mode on Device: %v", err)
 25 | 	}
 26 | 
 27 | 	// Ensure MIG Mode is disabled on the device
 28 | 	current, pending, err := device.GetMigMode()
 29 | 	if err != nil {
 30 | 		t.Errorf("error getting MIG mode on Device: %v", err)
 31 | 	}
 32 | 	if current != pending || current != DEVICE_MIG_DISABLE {
 33 | 		t.Errorf("Expected MIG mode on Device to be DEVICE_MIG_DISABLE, got (current %v, pending %v)", current, pending)
 34 | 	}
 35 | 
 36 | 	// Enable MIG on the device
 37 | 	_, err = device.SetMigMode(DEVICE_MIG_ENABLE)
 38 | 	if err != nil {
 39 | 		t.Errorf("error enabling MIG mode on Device: %v", err)
 40 | 	}
 41 | 
 42 | 	// Ensure MIG Mode is enabled on the device
 43 | 	current, pending, err = device.GetMigMode()
 44 | 	if err != nil {
 45 | 		t.Errorf("error getting MIG mode on Device: %v", err)
 46 | 	}
 47 | 	if current != pending || current != DEVICE_MIG_ENABLE {
 48 | 		t.Errorf("Expected MIG mode on Device to be DEVICE_MIG_ENABLE, got (current %v, pending %v)", current, pending)
 49 | 	}
 50 | 
 51 | 	// Disable MIG on the device
 52 | 	_, err = device.SetMigMode(DEVICE_MIG_DISABLE)
 53 | 	if err != nil {
 54 | 		t.Errorf("error enabling MIG mode on Device: %v", err)
 55 | 	}
 56 | 
 57 | 	// Ensure MIG Mode is disabled on the device
 58 | 	current, pending, err = device.GetMigMode()
 59 | 	if err != nil {
 60 | 		t.Errorf("error getting MIG mode on Device: %v", err)
 61 | 	}
 62 | 	if current != pending || current != DEVICE_MIG_DISABLE {
 63 | 		t.Errorf("Expected MIG mode on Device to be DEVICE_MIG_DISABLE, got (current %v, pending %v)", current, pending)
 64 | 	}
 65 | }
 66 | 
 67 | func TestParseMigDeviceUUID(t *testing.T) {
 68 | 	tests := []struct {
 69 | 		name          string
 70 | 		uuid          string
 71 | 		expectedGPU   string
 72 | 		expectedGi    uint
 73 | 		expectedCi    uint
 74 | 		expectedError bool
 75 | 	}{
 76 | 		{
 77 | 			name:        "Successfull Parsing",
 78 | 			uuid:        "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5",
 79 | 			expectedGPU: "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5",
 80 | 			expectedGi:  1,
 81 | 			expectedCi:  5,
 82 | 		},
 83 | 		{
 84 | 			name:          "Fail, Missing MIG at the beginning of UUID",
 85 | 			uuid:          "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5",
 86 | 			expectedError: true,
 87 | 		},
 88 | 		{
 89 | 			name:          "Fail, Missing GPU at the beginning of GPU UUID",
 90 | 			uuid:          "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5",
 91 | 			expectedError: true,
 92 | 		},
 93 | 		{
 94 | 			name:          "Fail, GI not parsable",
 95 | 			uuid:          "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/xx/5",
 96 | 			expectedError: true,
 97 | 		},
 98 | 		{
 99 | 			name:          "Fail, CI not a parsable",
100 | 			uuid:          "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/xx",
101 | 			expectedError: true,
102 | 		},
103 | 	}
104 | 
105 | 	for _, tc := range tests {
106 | 		t.Run(tc.name, func(t *testing.T) {
107 | 			gpu, gi, ci, err := ParseMigDeviceUUID(tc.uuid)
108 | 			if tc.expectedError && err != nil {
109 | 				return
110 | 			}
111 | 			if tc.expectedError && err == nil {
112 | 				t.Fatalf("Expected an error, but didn't get one: uuid: %v, (gpu: %v, gi: %v, ci: %v)", tc.uuid, gpu, gi, ci)
113 | 			}
114 | 			if !tc.expectedError && err != nil {
115 | 				t.Fatalf("Unexpected error: %v, uuid: %v, (gpu: %v, gi: %v, ci: %v)", err, tc.uuid, gpu, gi, ci)
116 | 			}
117 | 			if gpu != tc.expectedGPU || gi != tc.expectedGi || ci != tc.expectedCi {
118 | 				t.Fatalf("MIG UUID parsed incorrectly: uuid: %v, (gpu: %v, gi: %v, ci: %v)", tc.uuid, gpu, gi, ci)
119 | 			}
120 | 		})
121 | 	}
122 | }
123 | 


--------------------------------------------------------------------------------
/bindings/go/nvml/nvml_dl.go:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | // +build linux darwin
 4 | 
 5 | package nvml
 6 | 
 7 | import (
 8 | 	"unsafe"
 9 | )
10 | 
/*
#include <dlfcn.h>
#include <stdlib.h>
#include "nvml.h"
14 | 
15 | // We wrap the call to nvmlInit() here to ensure that we pick up the correct
16 | // version of this call. The macro magic in nvml.h that #defines the symbol
17 | // 'nvmlInit' to 'nvmlInit_v2' is unfortunately lost on cgo.
18 | static nvmlReturn_t nvmlInit_dl(void) {
19 | 	return nvmlInit();
20 | }
21 | */
22 | import "C"
23 | 
// dlhandles keeps the handles returned by dlopen so they can be searched for
// symbols and closed again at shutdown.
type dlhandles struct{ handles []unsafe.Pointer }

// dl is the package-wide set of open library handles.
var dl dlhandles
27 | 
28 | // Initialize NVML, opening a dynamic reference to the NVML library in the process.
29 | func (dl *dlhandles) nvmlInit() C.nvmlReturn_t {
30 | 	handle := C.dlopen(C.CString("libnvidia-ml.so.1"), C.RTLD_LAZY|C.RTLD_GLOBAL)
31 | 	if handle == C.NULL {
32 | 		return C.NVML_ERROR_LIBRARY_NOT_FOUND
33 | 	}
34 | 	dl.handles = append(dl.handles, handle)
35 | 	return C.nvmlInit_dl()
36 | }
37 | 
38 | // Shutdown NVML, closing our dynamic reference to the NVML library in the process.
39 | func (dl *dlhandles) nvmlShutdown() C.nvmlReturn_t {
40 | 	ret := C.nvmlShutdown()
41 | 	if ret != C.NVML_SUCCESS {
42 | 		return ret
43 | 	}
44 | 
45 | 	for _, handle := range dl.handles {
46 | 		err := C.dlclose(handle)
47 | 		if err != 0 {
48 | 			return C.NVML_ERROR_UNKNOWN
49 | 		}
50 | 	}
51 | 
52 | 	return C.NVML_SUCCESS
53 | }
54 | 
55 | // Check to see if a specific symbol is present in the NVML library.
56 | func (dl *dlhandles) lookupSymbol(symbol string) C.nvmlReturn_t {
57 | 	for _, handle := range dl.handles {
58 | 		C.dlerror()
59 | 		C.dlsym(handle, C.CString(symbol))
60 | 		if unsafe.Pointer(C.dlerror()) == C.NULL {
61 | 			return C.NVML_SUCCESS
62 | 		}
63 | 	}
64 | 	return C.NVML_ERROR_FUNCTION_NOT_FOUND
65 | }
66 | 


--------------------------------------------------------------------------------
/bindings/go/nvml/nvml_dl_windows.go:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
 2 | 
 3 | // +build windows
 4 | 
 5 | package nvml
 6 | 
 7 | import (
 8 | 	"syscall"
 9 | )
10 | 
11 | /*
12 | #include "nvml.h"
13 | 
14 | // We wrap the call to nvmlInit() here to ensure that we pick up the correct
15 | // version of this call. The macro magic in nvml.h that #defines the symbol
16 | // 'nvmlInit' to 'nvmlInit_v2' is unfortunately lost on cgo.
17 | static nvmlReturn_t nvmlInit_dl(void) {
18 | 	return nvmlInit();
19 | }
20 | */
21 | import "C"
22 | 
// dlhandles keeps the lazily-loaded DLL references used for symbol lookups.
type dlhandles struct{ handles []*syscall.LazyDLL }

// dl is the package-wide set of library references.
var dl dlhandles
26 | 
27 | // Initialize NVML, opening a dynamic reference to the NVML library in the process.
28 | func (dl *dlhandles) nvmlInit() C.nvmlReturn_t {
29 | 	handle := syscall.NewLazyDLL("nvml.dll")
30 | 	if handle == nil {
31 | 		return C.NVML_ERROR_LIBRARY_NOT_FOUND
32 | 	}
33 | 	dl.handles = append(dl.handles, handle)
34 | 	return C.nvmlInit_dl()
35 | }
36 | 
37 | // Shutdown NVML, closing our dynamic reference to the NVML library in the process.
38 | func (dl *dlhandles) nvmlShutdown() C.nvmlReturn_t {
39 | 	ret := C.nvmlShutdown()
40 | 	if ret != C.NVML_SUCCESS {
41 | 		return ret
42 | 	}
43 | 
44 | 	dl.handles = dl.handles[:0]
45 | 
46 | 	return C.NVML_SUCCESS
47 | }
48 | 
49 | // Check to see if a specific symbol is present in the NVML library.
50 | func (dl *dlhandles) lookupSymbol(symbol string) C.nvmlReturn_t {
51 | 	for _, handle := range dl.handles {
52 | 		if proc := handle.NewProc(symbol); proc != nil {
53 | 			return C.NVML_SUCCESS
54 | 		}
55 | 	}
56 | 	return C.NVML_ERROR_FUNCTION_NOT_FOUND
57 | }
58 | 


--------------------------------------------------------------------------------
/bindings/go/nvml/nvsmi/nvsmi.go:
--------------------------------------------------------------------------------
 1 | package nvsmi
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"fmt"
 6 | 	"os/exec"
 7 | 	"strings"
 8 | )
 9 | 
// Command name and argument fragments used to build nvidia-smi invocations.
const (
	bin       = "nvidia-smi"
	gpuArg    = "--id="        // the GPU id is appended by Query
	queryArg  = "--query-gpu=" // the query string is appended by the caller's functions
	formatArg = "--format=csv,noheader,nounits"
)
16 | 
17 | func Query(id string, query string) string {
18 | 	var out bytes.Buffer
19 | 
20 | 	cmd := exec.Command(bin, gpuArg+id, queryArg+query, formatArg)
21 | 	cmd.Stdout = &out
22 | 
23 | 	err := cmd.Run()
24 | 	if err != nil {
25 | 		fmt.Errorf("nvsmi exec error: %v\n", err)
26 | 	}
27 | 	return strings.TrimSpace(out.String())
28 | }
29 | 
30 | func DeviceCount(query string) uint {
31 | 	var out bytes.Buffer
32 | 
33 | 	cmd := exec.Command(bin, queryArg+query, formatArg)
34 | 	cmd.Stdout = &out
35 | 
36 | 	err := cmd.Run()
37 | 	if err != nil {
38 | 		fmt.Errorf("nvsmi exec error: %v\n", err)
39 | 	}
40 | 
41 | 	nvSmi := strings.Split(strings.TrimSuffix(out.String(), "\n"), "\n")
42 | 	return uint(len(nvSmi))
43 | }
44 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/deviceInfo/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"flag"
 5 | 	"log"
 6 | 	"os"
 7 | 	"text/template"
 8 | 
 9 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
10 | )
11 | 
const (
	// deviceInfo is the text/template that main executes once per device.
	// Every label is padded with spaces only, so the colons line up no matter
	// how wide the output device renders a tab.
	deviceInfo = `Driver Version         : {{.Identifiers.DriverVersion}}
GPU                    : {{.GPU}}
DCGMSupported          : {{.DCGMSupported}}
UUID                   : {{.UUID}}
Brand                  : {{.Identifiers.Brand}}
Model                  : {{.Identifiers.Model}}
Serial Number          : {{.Identifiers.Serial}}
Vbios                  : {{or .Identifiers.Vbios "N/A"}}
InforomImage Version   : {{.Identifiers.InforomImageVersion}}
Bus ID                 : {{.PCI.BusID}}
BAR1 (MB)              : {{or .PCI.BAR1 "N/A"}}
FrameBuffer Memory (MB): {{or .PCI.FBTotal "N/A"}}
Bandwidth (MB/s)       : {{or .PCI.Bandwidth "N/A"}}
Cores (MHz)            : {{or .Clocks.Cores "N/A"}}
Memory (MHz)           : {{or .Clocks.Memory "N/A"}}
Power (W)              : {{or .Power "N/A"}}
CPUAffinity            : {{or .CPUAffinity "N/A"}}
P2P Available          : {{if not .Topology}}None{{else}}{{range .Topology}}
    GPU{{.GPU}} - (BusID){{.BusID}} - {{.Link.PCIPaths}}{{end}}{{end}}
---------------------------------------------------------------------
`
)
35 | 
var (
	// connectAddr holds the -connect flag (default "localhost"); main passes
	// it to dcgm.Init as the hostengine address.
	connectAddr = flag.String("connect", "localhost", "Provide nv-hostengine connection address.")
	// isSocket holds the -socket flag as a string (default "0"); main passes
	// it to dcgm.Init unchanged.
	isSocket    = flag.String("socket", "0", "Connecting to Unix socket?")
)
40 | 
41 | // mini version of nvidia-smi -q
42 | // dcgmi discovery -i apc
43 | func main() {
44 | 	// choose dcgm hostengine running mode
45 | 	// 1. dcgm.Embedded
46 | 	// 2. dcgm.Standalone -connect "addr", -socket "isSocket"
47 | 	// 3. dcgm.StartHostengine
48 | 	flag.Parse()
49 | 	cleanup, err := dcgm.Init(dcgm.Standalone, *connectAddr, *isSocket)
50 | 	if err != nil {
51 | 		log.Panicln(err)
52 | 	}
53 | 	defer cleanup()
54 | 
55 | 	defer func() {
56 | 		if err := dcgm.Shutdown(); err != nil {
57 | 			log.Panicln(err)
58 | 		}
59 | 	}()
60 | 
61 | 	count, err := dcgm.GetAllDeviceCount()
62 | 	if err != nil {
63 | 		log.Panicln(err)
64 | 	}
65 | 
66 | 	t := template.Must(template.New("Device").Parse(deviceInfo))
67 | 
68 | 	for i := uint(0); i < count; i++ {
69 | 		deviceInfo, err := dcgm.GetDeviceInfo(i)
70 | 		if err != nil {
71 | 			log.Panicln(err)
72 | 		}
73 | 
74 | 		if err = t.Execute(os.Stdout, deviceInfo); err != nil {
75 | 			log.Panicln("Template error:", err)
76 | 		}
77 | 	}
78 | }
79 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/dmon/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 	"os"
 7 | 	"os/signal"
 8 | 	"syscall"
 9 | 	"time"
10 | 
11 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
12 | )
13 | 
const (
	// header is the two-line column heading that main prints once before the
	// per-GPU rows.
	header = `# gpu   pwr  temp    sm   mem   enc   dec  mclk  pclk
# Idx     W     C     %     %     %     %   MHz   MHz`
)
18 | 
19 | // modelled on nvidia-smi dmon
20 | // dcgmi dmon -e 155,150,203,204,206,207,100,101
21 | func main() {
22 | 	sigs := make(chan os.Signal, 1)
23 | 	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
24 | 
25 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
26 | 	if err != nil {
27 | 		log.Panicln(err)
28 | 	}
29 | 	defer cleanup()
30 | 
31 | 	gpus, err := dcgm.GetSupportedDevices()
32 | 	if err != nil {
33 | 		log.Panicln(err)
34 | 	}
35 | 
36 | 	ticker := time.NewTicker(time.Second * 1)
37 | 	defer ticker.Stop()
38 | 
39 | 	fmt.Println(header)
40 | 	for {
41 | 		select {
42 | 		case <-ticker.C:
43 | 			for _, gpu := range gpus {
44 | 				st, err := dcgm.GetDeviceStatus(gpu)
45 | 				if err != nil {
46 | 					log.Panicln(err)
47 | 				}
48 | 				fmt.Printf("%5d %5d %5d %5d %5d %5d %5d %5d %5d\n",
49 | 					gpu, int64(st.Power), st.Temperature, st.Utilization.GPU, st.Utilization.Memory,
50 | 					st.Utilization.Encoder, st.Utilization.Decoder, st.Clocks.Memory, st.Clocks.Cores)
51 | 			}
52 | 
53 | 		case <-sigs:
54 | 			return
55 | 		}
56 | 	}
57 | }
58 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/health/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"os"
 6 | 	"os/signal"
 7 | 	"syscall"
 8 | 	"text/template"
 9 | 	"time"
10 | 
11 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
12 | )
13 | 
const (
	// healthStatus is the text/template that main executes for each health
	// result: the GPU and its overall status, followed by the type, status
	// and error of every entry in Watches.
	healthStatus = `GPU                : {{.GPU}}
Status             : {{.Status}}
{{range .Watches}}
Type               : {{.Type}}
Status             : {{.Status}}
Error              : {{.Error}}
{{end}}
`
)
24 | 
25 | // create group: dcgmi group -c "name" --default
26 | // enable watches: dcgmi health -s a
27 | // check: dcgmi health -g 1 -c
28 | func main() {
29 | 	sigs := make(chan os.Signal, 1)
30 | 	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
31 | 
32 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
33 | 	if err != nil {
34 | 		log.Panicln(err)
35 | 	}
36 | 	defer cleanup()
37 | 
38 | 	gpus, err := dcgm.GetSupportedDevices()
39 | 	if err != nil {
40 | 		log.Panicln(err)
41 | 	}
42 | 
43 | 	ticker := time.NewTicker(time.Second * 1)
44 | 	defer ticker.Stop()
45 | 
46 | 	t := template.Must(template.New("Health").Parse(healthStatus))
47 | 	for {
48 | 		select {
49 | 		case <-ticker.C:
50 | 			for _, gpu := range gpus {
51 | 				h, err := dcgm.HealthCheckByGpuId(gpu)
52 | 				if err != nil {
53 | 					log.Panicln(err)
54 | 				}
55 | 
56 | 				if err = t.Execute(os.Stdout, h); err != nil {
57 | 					log.Panicln("Template error:", err)
58 | 				}
59 | 			}
60 | 		case <-sigs:
61 | 			return
62 | 		}
63 | 	}
64 | }
65 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/hostengineStatus/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 
 7 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 8 | )
 9 | 
10 | // dcgmi introspect --enable
11 | // dcgmi introspect -s -H
12 | func main() {
13 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
14 | 	if err != nil {
15 | 		log.Panicln(err)
16 | 	}
17 | 	defer cleanup()
18 | 
19 | 	st, err := dcgm.Introspect()
20 | 	if err != nil {
21 | 		log.Panicln(err)
22 | 	}
23 | 
24 | 	fmt.Printf("Memory %2s %v KB\nCPU %5s %.2f %s\n", ":", st.Memory, ":", st.CPU, "%")
25 | }
26 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/policy/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 
 7 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 8 | )
 9 | 
10 | // dcgmi group -c "name" --default
11 | // dcgmi policy -g GROUPID --set 0,0 -x -n -p -e -P 250 -T 100 -M 10
12 | // dcgmi policy -g GROUPID --reg
13 | func main() {
14 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
15 | 	if err != nil {
16 | 		log.Panicln(err)
17 | 	}
18 | 	defer cleanup()
19 | 
20 | 	gpus, err := dcgm.GetSupportedDevices()
21 | 	if err != nil {
22 | 		log.Panicln(err)
23 | 	}
24 | 
25 | 	// Choose policy conditions to register violation callback.
26 | 	// Note: Need to be root for some options
27 | 	// Available options are:
28 | 	// 1. dcgm.DbePolicy
29 | 	// 2. dcgm.PCIePolicy
30 | 	// 3. dcgm.MaxRtPgPolicy
31 | 	// 4. dcgm.ThermalPolicy
32 | 	// 5. dcgm.PowerPolicy
33 | 	// 6. dcgm.NvlinkPolicy
34 | 	// 7. dcgm.XidPolicy
35 | 	for _, gpu := range gpus {
36 | 		c, err := dcgm.Policy(gpu, dcgm.XidPolicy)
37 | 		if err != nil {
38 | 			log.Panicln(err)
39 | 		}
40 | 
41 | 		pe := <-c
42 | 		fmt.Printf("GPU %8s %v\nError %6s %v\nTimestamp %2s %v\nData %7s %v\n",
43 | 			":", gpu, ":", pe.Condition, ":", pe.Timestamp, ":", pe.Data)
44 | 	}
45 | }
46 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/processInfo/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"flag"
 5 | 	"log"
 6 | 	"os"
 7 | 	"text/template"
 8 | 	"time"
 9 | 
10 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
11 | )
12 | 
const (
	// processInfo is the text/template that main executes once per entry
	// returned by dcgm.GetProcessInfo.
	//
	// "Avg SM Utilization" reads .GpuUtilization.GPU and "Avg Memory
	// Utilization" reads .GpuUtilization.Memory; the two fields used to be
	// swapped. Labels are padded with spaces only so the colons line up
	// regardless of tab width.
	processInfo = `----------------------------------------------------------------------
GPU ID                       : {{.GPU}}
----------Execution Stats---------------------------------------------
PID                          : {{.PID}}
Name                         : {{or .Name "N/A"}}
Start Time                   : {{.ProcessUtilization.StartTime.String}}
End Time                     : {{.ProcessUtilization.EndTime.String}}
----------Performance Stats-------------------------------------------
Energy Consumed (Joules)     : {{or .ProcessUtilization.EnergyConsumed "N/A"}}
Max GPU Memory Used (bytes)  : {{or .Memory.GlobalUsed "N/A"}}
Avg SM Clock (MHz)           : {{or .Clocks.Cores "N/A"}}
Avg Memory Clock (MHz)       : {{or .Clocks.Memory "N/A"}}
Avg SM Utilization (%)       : {{or .GpuUtilization.GPU "N/A"}}
Avg Memory Utilization (%)   : {{or .GpuUtilization.Memory "N/A"}}
Avg PCIe Rx Bandwidth (MB)   : {{or .PCI.Throughput.Rx "N/A"}}
Avg PCIe Tx Bandwidth (MB)   : {{or .PCI.Throughput.Tx "N/A"}}
----------Event Stats-------------------------------------------------
Single Bit ECC Errors        : {{or .Memory.ECCErrors.SingleBit "N/A"}}
Double Bit ECC Errors        : {{or .Memory.ECCErrors.DoubleBit "N/A"}}
Critical XID Errors          : {{.XIDErrors.NumErrors}}
----------Slowdown Stats----------------------------------------------
Due to - Power (%)           : {{or .Violations.Power "N/A"}}
       - Thermal (%)         : {{or .Violations.Thermal "N/A"}}
       - Reliability (%)     : {{or .Violations.Reliability "N/A"}}
       - Board Limit (%)     : {{or .Violations.BoardLimit "N/A"}}
       - Low Utilization (%) : {{or .Violations.LowUtilization "N/A"}}
       - Sync Boost (%)      : {{or .Violations.SyncBoost "N/A"}}
----------Process Utilization-----------------------------------------
Avg SM Utilization (%)       : {{or .ProcessUtilization.SmUtil "N/A"}}
Avg Memory Utilization (%)   : {{or .ProcessUtilization.MemUtil "N/A"}}
----------------------------------------------------------------------
`
)
47 | 
// process holds the -pid flag (default 0): the pid whose GPU statistics main
// requests from dcgm.GetProcessInfo.
var process = flag.Uint("pid", 0, "Provide pid to get this process information.")
49 | 
50 | // run as root, for enabling health watches
51 | // dcgmi stats -e
52 | // dcgmi stats --pid ENTERPID -v
53 | // sample: sudo ./processInfo -pid PID
54 | func main() {
55 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
56 | 	if err != nil {
57 | 		log.Panicln(err)
58 | 	}
59 | 	defer cleanup()
60 | 
61 | 	// Request DCGM to start recording stats for GPU process fields
62 | 	group, err := dcgm.WatchPidFields()
63 | 	if err != nil {
64 | 		log.Panicln(err)
65 | 	}
66 | 
67 | 	// Before retrieving process stats, wait few seconds for watches to be enabled and collect data
68 | 	log.Println("Enabling DCGM watches to start collecting process stats. This may take a few seconds....")
69 | 	time.Sleep(3000 * time.Millisecond)
70 | 
71 | 	flag.Parse()
72 | 	pidInfo, err := dcgm.GetProcessInfo(group, *process)
73 | 	if err != nil {
74 | 		log.Panicln(err)
75 | 	}
76 | 
77 | 	t := template.Must(template.New("Process").Parse(processInfo))
78 | 	for _, gpu := range pidInfo {
79 | 
80 | 		if err = t.Execute(os.Stdout, gpu); err != nil {
81 | 			log.Panicln("Template error:", err)
82 | 		}
83 | 	}
84 | }
85 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/restApi/README.md:
--------------------------------------------------------------------------------
  1 | ## DCGM REST API
  2 | 
  3 | A sample REST API is provided, demonstrating various endpoints for getting GPU metrics via DCGM.
  4 | 
  5 | 
  6 | ```
  7 | # Start the http server
  8 | # By default the http server is started at localhost:8070
  9 | 
 10 | $ go build && ./restApi
 11 | 
 12 | # Query GPU 0 info
 13 | $ GPUID=0
 14 | $ curl localhost:8070/dcgm/device/info/id/$GPUID
 15 | 
 16 | # sample output
 17 | 
 18 | Driver Version         : 384.130
 19 | GPU                    : 0
 20 | DCGMSupported          : Yes
 21 | UUID                   : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51
 22 | Brand                  : GeForce
 23 | Model                  : GeForce GTX 980
 24 | Serial Number          : 0324414056639
 25 | Vbios                  : 84.04.1F.00.02
 26 | InforomImage Version   : G001.0000.01.03
 27 | Bus ID                 : 00000000:01:00.0
 28 | BAR1 (MB)              : 256
 29 | FrameBuffer Memory (MB): 4036
 30 | Bandwidth (MB/s)       : 15760
 31 | Cores (MHz)            : 1392
 32 | Memory (MHz)           : 3505
 33 | Power (W)              : 180
 34 | CPUAffinity            : 0-11
 35 | P2P Available          : None
 36 | ---------------------------------------------------------------------
 37 | 
 38 | $ curl localhost:8070/dcgm/device/info/id/$GPUID/json
 39 | 
 40 | # Query GPU info using its UUID
 41 | 
 42 | $ UUID=$(curl -s localhost:8070/dcgm/device/info/id/$GPUID | grep -i uuid | cut -d ":" -f2 )
 43 | $ curl localhost:8070/dcgm/device/info/uuid/$UUID
 44 | $ curl localhost:8070/dcgm/device/info/uuid/$UUID/json
 45 | 
 46 | # sample output
 47 | 
 48 | {"GPU":0,"DCGMSupported":"Yes","UUID":"GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51","Power":180,"PCI":{"BusID":"00000000:01:00.0","BAR1":256,"FBTotal":4036,"Bandwidth":15760},"Clocks":{"Cores":1392,"Memory":3505},"Identifiers":{"Brand":"GeForce","Model":"GeForce GTX 980","Serial":"0324414056639","Vbios":"84.04.1F.00.02","InforomImageVersion":"G001.0000.01.03","DriverVersion":"384.130"},"Topology":null,"CPUAffinity":"0-11"}
 49 | 
 50 | # Query GPU status
 51 | 
 52 | $ curl localhost:8070/dcgm/device/status/id/$GPUID
 53 | $ curl localhost:8070/dcgm/device/status/id/$GPUID/json
 54 | 
 55 | # sample output
 56 | 
 57 | Power (W)               : 20.985
 58 | Temperature (°C)        : 47
 59 | Sm Utilization (%)      : 2
 60 | Memory Utilization (%)  : 8
 61 | Encoder Utilization (%) : 0
 62 | Decoder Utilization (%) : 0
 63 | Memory Clock (MHz       : 324
 64 | SM Clock (MHz)          : 135
 65 | 
 66 | $ curl localhost:8070/dcgm/device/status/uuid/$UUID
 67 | 
 68 | # sample output
 69 | 
 70 | {"Power":20.793,"Temperature":43,"Utilization":{"GPU":0,"Memory":8,"Encoder":0,"Decoder":0},"Memory":{"GlobalUsed":null,"ECCErrors":{"SingleBit":9223372036854775794,"DoubleBit":9223372036854775794}},"Clocks":{"Cores":135,"Memory":324},"PCI":{"BAR1Used":9,"Throughput":{"Rx":129,"Tx":47,"Replays":0},"FBUsed":423},"Performance":8,"FanSpeed":29}
 71 | 
 72 | $ curl localhost:8070/dcgm/device/status/uuid/$UUID/json
 73 | 
 74 | # Query GPU process info
 75 | 
 76 | # Run CUDA nbody sample and get its PID
 77 | $ PID=$(pgrep nbody)
 78 | 
 79 | $ curl localhost:8070/dcgm/process/info/pid/$PID
 80 | $ curl localhost:8070/dcgm/process/info/pid/$PID/json
 81 | 
 82 | # sample output
 83 | 
 84 | {"GPU":0,"PID":19132,"Name":"nbody","ProcessUtilization":{"StartTime":1529980640,"EndTime":0,"EnergyConsumed":1346,"SmUtil":0,"MemUtil":0},"PCI":{"BAR1Used":null,"Throughput":{"Rx":null,"Tx":null,"Replays":0},"FBUsed":null},"Memory":{"GlobalUsed":84279296,"ECCErrors":{"SingleBit":0,"DoubleBit":0}},"GpuUtilization":{"GPU":null,"Memory":null,"Encoder":null,"Decoder":null},"Clocks":{"Cores":null,"Memory":null},"Violations":{"Power":0,"Thermal":0,"Reliability":0,"BoardLimit":0,"LowUtilization":0,"SyncBoost":0},"XIDErrors":{"NumErrors":0,"TimeStamp":[]}}
 85 | 
 86 | # Query GPU health
 87 | 
 88 | $ curl localhost:8070/dcgm/health/id/$GPUID
 89 | $ curl localhost:8070/dcgm/health/id/$GPUID/json
 90 | $ curl localhost:8070/dcgm/health/uuid/$UUID
 91 | $ curl localhost:8070/dcgm/health/uuid/$UUID/json
 92 | 
 93 | # sample output
 94 | 
 95 | {"GPU":0,"Status":"Healthy","Watches":[]}
 96 | 
 97 | # Query DCGM hostengine memory and CPU usage
 98 | 
 99 | $ curl localhost:8070/dcgm/status
100 | $ curl localhost:8070/dcgm/status/json
101 | 
102 | # sample output
103 | 
104 | {"Memory":18380,"CPU":0.16482222745467387}
105 | 
106 | ```


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/restApi/handlers/byIds.go:
--------------------------------------------------------------------------------
 1 | package handlers
 2 | 
 3 | import (
 4 | 	"net/http"
 5 | )
 6 | 
 7 | func DeviceInfo(resp http.ResponseWriter, req *http.Request) {
 8 | 	device := getDeviceInfo(resp, req)
 9 | 	if device == nil {
10 | 		return
11 | 	}
12 | 	if isJson(req) {
13 | 		encode(resp, req, device)
14 | 		return
15 | 	}
16 | 	print(resp, req, device, deviceInfo)
17 | }
18 | 
19 | func DeviceStatus(resp http.ResponseWriter, req *http.Request) {
20 | 	st := getDeviceStatus(resp, req)
21 | 	if st == nil {
22 | 		return
23 | 	}
24 | 	if isJson(req) {
25 | 		encode(resp, req, st)
26 | 		return
27 | 	}
28 | 	print(resp, req, st, deviceStatus)
29 | }
30 | 
31 | func ProcessInfo(resp http.ResponseWriter, req *http.Request) {
32 | 	pInfo := getProcessInfo(resp, req)
33 | 	if len(pInfo) == 0 {
34 | 		return
35 | 	}
36 | 	if isJson(req) {
37 | 		encode(resp, req, pInfo)
38 | 		return
39 | 	}
40 | 	processPrint(resp, req, pInfo)
41 | }
42 | 
43 | func Health(resp http.ResponseWriter, req *http.Request) {
44 | 	h := getHealth(resp, req)
45 | 	if h == nil {
46 | 		return
47 | 	}
48 | 	if isJson(req) {
49 | 		encode(resp, req, h)
50 | 		return
51 | 	}
52 | 	print(resp, req, h, healthStatus)
53 | }
54 | 
55 | func DcgmStatus(resp http.ResponseWriter, req *http.Request) {
56 | 	st := getDcgmStatus(resp, req)
57 | 	if st == nil {
58 | 		return
59 | 	}
60 | 	if isJson(req) {
61 | 		encode(resp, req, st)
62 | 		return
63 | 	}
64 | 	print(resp, req, st, hostengine)
65 | }
66 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/restApi/handlers/byUuids.go:
--------------------------------------------------------------------------------
 1 | package handlers
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"net/http"
 6 | 
 7 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 8 | )
 9 | 
// uuids maps a device UUID to its device id. It is (re)created and filled by
// DevicesUuids.
var uuids map[string]uint
12 | 
13 | func DevicesUuids() {
14 | 	uuids = make(map[string]uint)
15 | 	count, err := dcgm.GetAllDeviceCount()
16 | 	if err != nil {
17 | 		log.Printf("(DCGM) Error getting devices: %s", err)
18 | 		return
19 | 	}
20 | 
21 | 	for i := uint(0); i < count; i++ {
22 | 		deviceInfo, err := dcgm.GetDeviceInfo(i)
23 | 		if err != nil {
24 | 			log.Printf("(DCGM) Error getting device information: %s", err)
25 | 			return
26 | 		}
27 | 		uuids[deviceInfo.UUID] = i
28 | 	}
29 | }
30 | 
31 | func DeviceInfoByUuid(resp http.ResponseWriter, req *http.Request) {
32 | 	device := getDeviceInfo(resp, req)
33 | 	if device == nil {
34 | 		return
35 | 	}
36 | 	if isJson(req) {
37 | 		encode(resp, req, device)
38 | 		return
39 | 	}
40 | 	print(resp, req, device, deviceInfo)
41 | }
42 | 
43 | func DeviceStatusByUuid(resp http.ResponseWriter, req *http.Request) {
44 | 	st := getDeviceStatus(resp, req)
45 | 	if st == nil {
46 | 		return
47 | 	}
48 | 	if isJson(req) {
49 | 		encode(resp, req, st)
50 | 		return
51 | 	}
52 | 	print(resp, req, st, deviceStatus)
53 | }
54 | 
55 | func HealthByUuid(resp http.ResponseWriter, req *http.Request) {
56 | 	h := getHealth(resp, req)
57 | 	if h == nil {
58 | 		return
59 | 	}
60 | 	if isJson(req) {
61 | 		encode(resp, req, h)
62 | 		return
63 | 	}
64 | 	print(resp, req, h, healthStatus)
65 | }
66 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/restApi/handlers/dcgm.go:
--------------------------------------------------------------------------------
  1 | package handlers
  2 | 
  3 | import (
  4 | 	"log"
  5 | 	"math"
  6 | 	"net/http"
  7 | 	"time"
  8 | 
  9 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 10 | 	"github.com/gorilla/mux"
 11 | )
 12 | 
 13 | func getDcgmStatus(resp http.ResponseWriter, req *http.Request) (status *dcgm.DcgmStatus) {
 14 | 	st, err := dcgm.Introspect()
 15 | 	if err != nil {
 16 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
 17 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
 18 | 		return
 19 | 	}
 20 | 	return &st
 21 | 
 22 | }
 23 | 
 24 | func getDeviceInfo(resp http.ResponseWriter, req *http.Request) (device *dcgm.Device) {
 25 | 	var id uint
 26 | 	params := mux.Vars(req)
 27 | 	for k, v := range params {
 28 | 		switch k {
 29 | 		case "id":
 30 | 			id = getId(resp, req, v)
 31 | 		case "uuid":
 32 | 			id = getIdByUuid(resp, req, v)
 33 | 		}
 34 | 	}
 35 | 
 36 | 	if id == math.MaxUint32 {
 37 | 		return
 38 | 	}
 39 | 
 40 | 	if !isValidId(id, resp, req) {
 41 | 		return
 42 | 	}
 43 | 	d, err := dcgm.GetDeviceInfo(id)
 44 | 	if err != nil {
 45 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
 46 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
 47 | 		return
 48 | 	}
 49 | 	return &d
 50 | }
 51 | 
 52 | func getDeviceStatus(resp http.ResponseWriter, req *http.Request) (status *dcgm.DeviceStatus) {
 53 | 	var id uint
 54 | 	params := mux.Vars(req)
 55 | 	for k, v := range params {
 56 | 		switch k {
 57 | 		case "id":
 58 | 			id = getId(resp, req, v)
 59 | 		case "uuid":
 60 | 			id = getIdByUuid(resp, req, v)
 61 | 		}
 62 | 	}
 63 | 
 64 | 	if id == math.MaxUint32 {
 65 | 		return
 66 | 	}
 67 | 
 68 | 	if !isValidId(id, resp, req) {
 69 | 		return
 70 | 	}
 71 | 
 72 | 	if !isDcgmSupported(id, resp, req) {
 73 | 		return
 74 | 	}
 75 | 
 76 | 	st, err := dcgm.GetDeviceStatus(id)
 77 | 	if err != nil {
 78 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
 79 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
 80 | 		return
 81 | 	}
 82 | 	return &st
 83 | }
 84 | 
 85 | func getHealth(resp http.ResponseWriter, req *http.Request) (health *dcgm.DeviceHealth) {
 86 | 	var id uint
 87 | 	params := mux.Vars(req)
 88 | 	for k, v := range params {
 89 | 		switch k {
 90 | 		case "id":
 91 | 			id = getId(resp, req, v)
 92 | 		case "uuid":
 93 | 			id = getIdByUuid(resp, req, v)
 94 | 		}
 95 | 	}
 96 | 
 97 | 	if id == math.MaxUint32 {
 98 | 		return
 99 | 	}
100 | 
101 | 	if !isValidId(id, resp, req) {
102 | 		return
103 | 	}
104 | 
105 | 	h, err := dcgm.HealthCheckByGpuId(id)
106 | 	if err != nil {
107 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
108 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
109 | 		return
110 | 	}
111 | 	return &h
112 | }
113 | 
114 | func getProcessInfo(resp http.ResponseWriter, req *http.Request) (pInfo []dcgm.ProcessInfo) {
115 | 	params := mux.Vars(req)
116 | 	pid := getId(resp, req, params["pid"])
117 | 	if pid == math.MaxUint32 {
118 | 		return
119 | 	}
120 | 	group, err := dcgm.WatchPidFields()
121 | 	if err != nil {
122 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
123 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
124 | 		return
125 | 	}
126 | 
127 | 	// wait for watches to be enabled
128 | 	log.Printf("Enabling DCGM watches to start collecting process stats. This may take a few seconds....")
129 | 	time.Sleep(3000 * time.Millisecond)
130 | 	pInfo, err = dcgm.GetProcessInfo(group, pid)
131 | 	if err != nil {
132 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
133 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
134 | 	}
135 | 	return
136 | }
137 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/restApi/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"os"
 6 | 	"os/signal"
 7 | 	"syscall"
 8 | 
 9 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
10 | )
11 | 
12 | // res: curl localhost:8070/dcgm/device/info/id/0
13 | 
14 | func main() {
15 | 	stopSig := make(chan os.Signal, 1)
16 | 	signal.Notify(stopSig, syscall.SIGINT, syscall.SIGTERM)
17 | 
18 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
19 | 	if err != nil {
20 | 		log.Panicln(err)
21 | 	}
22 | 	defer cleanup()
23 | 
24 | 	addr := ":8070"
25 | 	server := newHttpServer(addr)
26 | 
27 | 	go func() {
28 | 		log.Printf("Running http server on localhost%s", addr)
29 | 		server.serve()
30 | 	}()
31 | 	defer server.stop()
32 | 
33 | 	<-stopSig
34 | 	return
35 | }
36 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/restApi/server.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"log"
 6 | 	"net/http"
 7 | 	"time"
 8 | 
 9 | 	h "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/samples/dcgm/restApi/handlers"
10 | 	"github.com/gorilla/mux"
11 | )
12 | 
// timeout is used as the HTTP server's read and write timeout and as the
// deadline given to the shutdown in stop.
const timeout = 5 * time.Second
14 | 
// httpServer bundles the route table with the HTTP server that serves it.
type httpServer struct {
	// router is the mux router on which handler registers the endpoints.
	router *mux.Router
	// server is the underlying HTTP server; newHttpServer sets router as its Handler.
	server *http.Server
}
19 | 
20 | func newHttpServer(addr string) *httpServer {
21 | 	r := mux.NewRouter()
22 | 
23 | 	s := &httpServer{
24 | 		router: r,
25 | 		server: &http.Server{
26 | 			Addr:         addr,
27 | 			Handler:      r,
28 | 			ReadTimeout:  timeout,
29 | 			WriteTimeout: timeout,
30 | 		},
31 | 	}
32 | 
33 | 	// make a global map of device uuids and ids
34 | 	h.DevicesUuids()
35 | 
36 | 	s.handler()
37 | 	return s
38 | }
39 | 
40 | func (s *httpServer) handler() {
41 | 	deviceInfo := "/dcgm/device/info"
42 | 	subrouter := s.router.PathPrefix(deviceInfo).Subrouter()
43 | 	subrouter.HandleFunc("/id/{id}", h.DeviceInfo).Methods("GET")
44 | 	subrouter.HandleFunc("/id/{id}/json", h.DeviceInfo).Methods("GET")
45 | 	subrouter.HandleFunc("/uuid/{uuid}", h.DeviceInfoByUuid).Methods("GET")
46 | 	subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceInfoByUuid).Methods("GET")
47 | 
48 | 	deviceStatus := "/dcgm/device/status"
49 | 	subrouter = s.router.PathPrefix(deviceStatus).Subrouter()
50 | 	subrouter.HandleFunc("/id/{id}", h.DeviceStatus).Methods("GET")
51 | 	subrouter.HandleFunc("/id/{id}/json", h.DeviceStatus).Methods("GET")
52 | 	subrouter.HandleFunc("/uuid/{uuid}", h.DeviceStatusByUuid).Methods("GET")
53 | 	subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceStatusByUuid).Methods("GET")
54 | 
55 | 	processInfo := "/dcgm/process/info/pid/{pid}"
56 | 	subrouter = s.router.PathPrefix(processInfo).Subrouter()
57 | 	subrouter.HandleFunc("", h.ProcessInfo).Methods("GET")
58 | 	subrouter.HandleFunc("/json", h.ProcessInfo).Methods("GET")
59 | 
60 | 	health := "/dcgm/health"
61 | 	subrouter = s.router.PathPrefix(health).Subrouter()
62 | 	subrouter.HandleFunc("/id/{id}", h.Health).Methods("GET")
63 | 	subrouter.HandleFunc("/id/{id}/json", h.Health).Methods("GET")
64 | 	subrouter.HandleFunc("/uuid/{uuid}", h.HealthByUuid).Methods("GET")
65 | 	subrouter.HandleFunc("/uuid/{uuid}/json", h.HealthByUuid).Methods("GET")
66 | 
67 | 	dcgmStatus := "/dcgm/status"
68 | 	subrouter = s.router.PathPrefix(dcgmStatus).Subrouter()
69 | 	subrouter.HandleFunc("", h.DcgmStatus).Methods("GET")
70 | 	subrouter.HandleFunc("/json", h.DcgmStatus).Methods("GET")
71 | }
72 | 
73 | func (s *httpServer) serve() {
74 | 	if err := s.server.ListenAndServe(); err != http.ErrServerClosed {
75 | 		log.Printf("Error: %v", err)
76 | 	}
77 | }
78 | 
79 | func (s *httpServer) stop() {
80 | 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
81 | 	defer cancel()
82 | 
83 | 	if err := s.server.Shutdown(ctx); err != nil {
84 | 		log.Printf("Error: %v", err)
85 | 	} else {
86 | 		log.Println("http server stopped")
87 | 	}
88 | }
89 | 


--------------------------------------------------------------------------------
/bindings/go/samples/dcgm/topology/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 
 7 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 8 | )
 9 | 
const (
	// legend explains the link abbreviations; main prints it after the matrix.
	legend = `
Legend:
 X    = Self
 SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
 NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
 PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
 PXB  = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge)
 PIX  = Connection traversing a single PCIe switch
 PSB  = Connection traversing a single on-board PCIe switch
 NV#  = Connection traversing a bonded set of # NVLinks`
)
22 | 
23 | // based on nvidia-smi topo -m
24 | // dcgmi topo
25 | func main() {
26 | 	// choose dcgm hostengine running mode
27 | 	// 1. dcgm.Embedded
28 | 	// 2. dcgm.Standalone
29 | 	// 3. dcgm.StartHostengine
30 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
31 | 	if err != nil {
32 | 		log.Panicln(err)
33 | 	}
34 | 	defer cleanup()
35 | 
36 | 	gpus, err := dcgm.GetSupportedDevices()
37 | 	if err != nil {
38 | 		log.Panicln(err)
39 | 	}
40 | 
41 | 	for _, gpu := range gpus {
42 | 		fmt.Printf("%9s%d", "GPU", gpu)
43 | 	}
44 | 	fmt.Printf("%5s\n", "CPUAffinity")
45 | 
46 | 	numGpus := len(gpus)
47 | 	gpuTopo := make([]string, numGpus)
48 | 	for i := 0; i < numGpus; i++ {
49 | 		topo, err := dcgm.GetDeviceTopology(gpus[i])
50 | 		if err != nil {
51 | 			log.Panicln(err)
52 | 		}
53 | 
54 | 		fmt.Printf("GPU%d", gpus[i])
55 | 		for j := 0; j < len(topo); j++ {
56 | 			// skip current GPU
57 | 			gpuTopo[topo[j].GPU] = topo[j].Link.PCIPaths()
58 | 		}
59 | 		gpuTopo[i] = "X"
60 | 		for j := 0; j < numGpus; j++ {
61 | 			fmt.Printf("%5s", gpuTopo[j])
62 | 		}
63 | 		deviceInfo, err := dcgm.GetDeviceInfo(gpus[i])
64 | 		if err != nil {
65 | 			log.Panicln(err)
66 | 		}
67 | 		fmt.Printf("%5s\n", deviceInfo.CPUAffinity)
68 | 	}
69 | 	fmt.Println(legend)
70 | }
71 | 


--------------------------------------------------------------------------------
/bindings/go/samples/nvml/README.md:
--------------------------------------------------------------------------------
 1 | ## NVML Samples
 2 | 
Modelled on the [NVIDIA System Management Interface (nvidia-smi)](https://developer.nvidia.com/nvidia-system-management-interface), a command-line utility using NVML, three samples have been provided to show how to use the NVML Go bindings.
 4 | 
 5 | #### deviceInfo
 6 | 
 7 | Provides basic information about each GPU on the system.
 8 | 
 9 | ```
10 | $ go build && ./deviceInfo
11 | 
12 | # sample output
13 | 
14 | Driver Version : 384.111
15 | GPU            : 0
16 | UUID           : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51
17 | Model          : GeForce GTX 980
18 | Path           : /dev/nvidia0
19 | Power          : 180 W
20 | CPU Affinity   : NUMA node0
21 | Bus ID         : 00000000:01:00.0
22 | BAR1           : 256 MiB
23 | Bandwidth      : 15760 MB/s
24 | Cores          : 1392 MHz
25 | Memory         : 3505 MHz
26 | P2P Available  : None
27 | ---------------------------------------------------------------------
28 | GPU            : 1
29 | UUID           : GPU-8d3b966d-2248-c3f4-1784-49851a1d02b3
30 | Model          : GeForce GTX TITAN
31 | Path           : /dev/nvidia1
32 | Power          : 250 W
33 | CPU Affinity   : NUMA node0
34 | Bus ID         : 00000000:06:00.0
35 | BAR1           : 128 MiB
36 | Bandwidth      : 8000 MB/s
37 | Cores          : 1202 MHz
38 | Memory         : 3004 MHz
39 | P2P Available  : None
40 | ---------------------------------------------------------------------
41 | ```
42 | 
43 | #### dmon
44 | 
45 | Monitors each device status including its power, memory and GPU utilization.
46 | 
47 | ```
48 | $ go build && ./dmon
49 | 
50 | # sample output
51 | 
52 | # gpu   pwr  temp    sm   mem   enc   dec  mclk  pclk
53 | # Idx     W     C     %     %     %     %   MHz   MHz
54 |     0    20    43     0     8     0     0   324   135
55 |     1    10    32     0     0     0     0   324   324
56 | 
57 | ```
58 | 
59 | #### processInfo
60 | 
61 | Lists the GPU processes running on each device.
62 | 
63 | ```
64 | $ go build && ./processInfo
65 | 
66 | # sample output
67 | 
68 | # gpu     pid   type   mem   command
69 | # Idx       #    C/G     %   name
70 |     0   25712    C+G     0   nbody
71 |     1       -      -     -   -
72 | ```
73 | 


--------------------------------------------------------------------------------
/bindings/go/samples/nvml/deviceInfo/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 	"os"
 7 | 	"text/template"
 8 | 
 9 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
10 | )
11 | 
12 | const (
13 | 	DEVICEINFO = `UUID           : {{.UUID}}
14 | Model          : {{or .Model "N/A"}}
15 | Path           : {{.Path}}
16 | Power          : {{if .Power}}{{.Power}} W{{else}}N/A{{end}}
17 | Memory         : {{if .Memory}}{{.Memory}} MiB{{else}}N/A{{end}}
18 | CudaComputeCap : {{if .CudaComputeCapability.Major}}{{.CudaComputeCapability.Major}}.{{.CudaComputeCapability.Minor}}{{else}}N/A{{end}}
19 | CPU Affinity   : {{if .CPUAffinity}}NUMA node{{.CPUAffinity}}{{else}}N/A{{end}}
20 | Bus ID         : {{.PCI.BusID}}
21 | BAR1           : {{if .PCI.BAR1}}{{.PCI.BAR1}} MiB{{else}}N/A{{end}}
22 | Bandwidth      : {{if .PCI.Bandwidth}}{{.PCI.Bandwidth}} MB/s{{else}}N/A{{end}}
23 | Cores          : {{if .Clocks.Cores}}{{.Clocks.Cores}} MHz{{else}}N/A{{end}}
24 | Memory         : {{if .Clocks.Memory}}{{.Clocks.Memory}} MHz{{else}}N/A{{end}}
25 | P2P Available  : {{if not .Topology}}None{{else}}{{range .Topology}}
26 |                       {{.BusID}} - {{(.Link.String)}}{{end}}{{end}}
27 | ---------------------------------------------------------------------
28 | `
29 | )
30 | 
31 | // main prints the driver version and then renders the DEVICEINFO template
32 | // once for every GPU index below the reported device count.
33 | func main() {
34 | 	// Check Init's result instead of discarding it, so an initialization
35 | 	// failure is reported here rather than by the first query that follows.
36 | 	if err := nvml.Init(); err != nil {
37 | 		log.Panicln("Error initializing NVML:", err)
38 | 	}
39 | 	defer nvml.Shutdown()
40 | 
41 | 	count, err := nvml.GetDeviceCount()
42 | 	if err != nil {
43 | 		log.Panicln("Error getting device count:", err)
44 | 	}
45 | 
46 | 	driverVersion, err := nvml.GetDriverVersion()
47 | 	if err != nil {
48 | 		log.Panicln("Error getting driver version:", err)
49 | 	}
50 | 
51 | 	t := template.Must(template.New("Device").Parse(DEVICEINFO))
52 | 
53 | 	fmt.Printf("Driver Version : %5v\n", driverVersion)
54 | 	for i := uint(0); i < count; i++ {
55 | 		device, err := nvml.NewDevice(i)
56 | 		if err != nil {
57 | 			log.Panicf("Error getting device %d: %v\n", i, err)
58 | 		}
59 | 
60 | 		fmt.Printf("GPU %12s %d\n", ":", i)
61 | 		if err := t.Execute(os.Stdout, device); err != nil {
62 | 			log.Panicln("Template error:", err)
63 | 		}
64 | 	}
65 | }
66 | 


--------------------------------------------------------------------------------
/bindings/go/samples/nvml/dmon/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 	"os"
 7 | 	"os/signal"
 8 | 	"syscall"
 9 | 	"time"
10 | 
11 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
12 | )
13 | 
14 | const (
15 | 	DMONHEADER = `# gpu   pwr  temp    sm   mem   enc   dec  mclk  pclk
16 | # Idx     W     C     %     %     %     %   MHz   MHz`
17 | )
18 | 
19 | // main polls every GPU once per second and prints one dmon-style row per
20 | // device until SIGINT or SIGTERM arrives.
21 | func main() {
22 | 	// Check Init's result instead of discarding it, so an initialization
23 | 	// failure is reported here rather than by the first query that follows.
24 | 	if err := nvml.Init(); err != nil {
25 | 		log.Panicln("Error initializing NVML:", err)
26 | 	}
27 | 	defer nvml.Shutdown()
28 | 
29 | 	count, err := nvml.GetDeviceCount()
30 | 	if err != nil {
31 | 		log.Panicln("Error getting device count:", err)
32 | 	}
33 | 
34 | 	var devices []*nvml.Device
35 | 	for i := uint(0); i < count; i++ {
36 | 		device, err := nvml.NewDevice(i)
37 | 		if err != nil {
38 | 			log.Panicf("Error getting device %d: %v\n", i, err)
39 | 		}
40 | 		devices = append(devices, device)
41 | 	}
42 | 
43 | 	// The status readings are pointers; print "-" for a nil reading rather
44 | 	// than dereferencing it and crashing the monitor loop.
45 | 	cell := func(v *uint) string {
46 | 		if v == nil {
47 | 			return "-"
48 | 		}
49 | 		return fmt.Sprintf("%d", *v)
50 | 	}
51 | 
52 | 	sigs := make(chan os.Signal, 1)
53 | 	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
54 | 
55 | 	ticker := time.NewTicker(time.Second * 1)
56 | 	defer ticker.Stop()
57 | 
58 | 	fmt.Println(DMONHEADER)
59 | 	for {
60 | 		select {
61 | 		case <-ticker.C:
62 | 			for i, device := range devices {
63 | 				st, err := device.Status()
64 | 				if err != nil {
65 | 					log.Panicf("Error getting device %d status: %v\n", i, err)
66 | 				}
67 | 				fmt.Printf("%5d %5s %5s %5s %5s %5s %5s %5s %5s\n",
68 | 					i, cell(st.Power), cell(st.Temperature),
69 | 					cell(st.Utilization.GPU), cell(st.Utilization.Memory),
70 | 					cell(st.Utilization.Encoder), cell(st.Utilization.Decoder),
71 | 					cell(st.Clocks.Memory), cell(st.Clocks.Cores))
72 | 			}
73 | 		case <-sigs:
74 | 			return
75 | 		}
76 | 	}
77 | }
78 | 


--------------------------------------------------------------------------------
/bindings/go/samples/nvml/processInfo/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 	"os"
 7 | 	"os/signal"
 8 | 	"syscall"
 9 | 	"time"
10 | 
11 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
12 | )
13 | 
14 | const (
15 | 	PINFOHEADER = `# gpu   pid   type  mem  Command
16 | # Idx     #   C/G   MiB  name`
17 | )
18 | 
19 | // main prints, once per second, the processes running on each GPU until
20 | // SIGINT or SIGTERM arrives.
21 | func main() {
22 | 	// Check Init's result instead of discarding it, so an initialization
23 | 	// failure is reported here rather than by the first query that follows.
24 | 	if err := nvml.Init(); err != nil {
25 | 		log.Panicln("Error initializing NVML:", err)
26 | 	}
27 | 	defer nvml.Shutdown()
28 | 
29 | 	count, err := nvml.GetDeviceCount()
30 | 	if err != nil {
31 | 		log.Panicln("Error getting device count:", err)
32 | 	}
33 | 
34 | 	var devices []*nvml.Device
35 | 	for i := uint(0); i < count; i++ {
36 | 		device, err := nvml.NewDevice(i)
37 | 		if err != nil {
38 | 			log.Panicf("Error getting device %d: %v\n", i, err)
39 | 		}
40 | 		devices = append(devices, device)
41 | 	}
42 | 
43 | 	sigs := make(chan os.Signal, 1)
44 | 	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
45 | 
46 | 	ticker := time.NewTicker(time.Second * 1)
47 | 	defer ticker.Stop()
48 | 
49 | 	fmt.Println(PINFOHEADER)
50 | 	for {
51 | 		select {
52 | 		case <-ticker.C:
53 | 			for i, device := range devices {
54 | 				pInfo, err := device.GetAllRunningProcesses()
55 | 				if err != nil {
56 | 					log.Panicf("Error getting device %d processes: %v\n", i, err)
57 | 				}
58 | 				// A device with no processes still gets a placeholder row.
59 | 				if len(pInfo) == 0 {
60 | 					fmt.Printf("%5v %5s %5s %5s %-5s\n", i, "-", "-", "-", "-")
61 | 				}
62 | 				for j := range pInfo {
63 | 					fmt.Printf("%5v %5v %5v %5v %-5v\n",
64 | 						i, pInfo[j].PID, pInfo[j].Type, pInfo[j].MemoryUsed, pInfo[j].Name)
65 | 				}
66 | 			}
67 | 		case <-sigs:
68 | 			return
69 | 		}
70 | 	}
71 | }
72 | 


--------------------------------------------------------------------------------
/dcgm-exporter.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: apps/v1
16 | kind: DaemonSet
17 | metadata:
18 |   name: "dcgm-exporter"
19 |   labels:
20 |     app.kubernetes.io/name: "dcgm-exporter"
21 |     app.kubernetes.io/version: "2.4.0"
22 | spec:
23 |   updateStrategy:
24 |     type: RollingUpdate
25 |   selector:
26 |     matchLabels:
27 |       app.kubernetes.io/name: "dcgm-exporter"
28 |       app.kubernetes.io/version: "2.4.0"
29 |   template:
30 |     metadata:
31 |       labels:
32 |         app.kubernetes.io/name: "dcgm-exporter"
33 |         app.kubernetes.io/version: "2.4.0"
34 |       name: "dcgm-exporter"
35 |     spec:
36 |       containers:
37 |       - image: "nvcr.io/nvidia/k8s/dcgm-exporter:2.2.9-2.4.0-ubuntu18.04"
38 |         env:
39 |         - name: "DCGM_EXPORTER_LISTEN"
40 |           value: ":9400"
41 |         - name: "DCGM_EXPORTER_KUBERNETES"
42 |           value: "true"
43 |         name: "dcgm-exporter"
44 |         ports:
45 |         - name: "metrics"
46 |           containerPort: 9400
47 |         securityContext:
48 |           runAsNonRoot: false
49 |           runAsUser: 0
50 |         volumeMounts:
51 |         - name: "pod-gpu-resources"
52 |           readOnly: true
53 |           mountPath: "/var/lib/kubelet/pod-resources"
54 |       volumes:
55 |       - name: "pod-gpu-resources"
56 |         hostPath:
57 |           path: "/var/lib/kubelet/pod-resources"
58 | 
59 | ---
60 | 
61 | kind: Service
62 | apiVersion: v1
63 | metadata:
64 |   name: "dcgm-exporter"
65 |   labels:
66 |     app.kubernetes.io/name: "dcgm-exporter"
67 |     app.kubernetes.io/version: "2.4.0"
68 | spec:
69 |   selector:
70 |     app.kubernetes.io/name: "dcgm-exporter"
71 |     app.kubernetes.io/version: "2.4.0"
72 |   ports:
73 |   - name: "metrics"
74 |     port: 9400
75 | 


--------------------------------------------------------------------------------
/deployment/dcgm-exporter/.helmignore:
--------------------------------------------------------------------------------
 1 | # Patterns to ignore when building packages.
 2 | # This supports shell glob matching, relative path matching, and
 3 | # negation (prefixed with !). Only one pattern per line.
 4 | .DS_Store
 5 | # Common VCS dirs
 6 | .git/
 7 | .gitignore
 8 | .bzr/
 9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 | 


--------------------------------------------------------------------------------
/deployment/dcgm-exporter/Chart.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v2
 2 | name: dcgm-exporter
 3 | description: A Helm chart for DCGM exporter
 4 | version: "2.4.0"
 5 | kubeVersion: ">= 1.13.0-0"
 6 | appVersion: "2.4.0"
 7 | sources:
 8 | - https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools
 9 | home: https://github.com/nvidia/gpu-monitoring-tools/
10 | icon: https://assets.nvidiagrid.net/ngc/logos/DCGM.png
11 | keywords:
12 |   - gpu
13 |   - cuda
14 |   - compute
15 |   - monitoring
16 |   - telemetry
17 |   - tesla
18 | 


--------------------------------------------------------------------------------
/deployment/dcgm-exporter/templates/NOTES.txt:
--------------------------------------------------------------------------------
 1 | 1. Get the application URL by running these commands:
 2 | {{- if contains "NodePort" .Values.service.type }}
 3 |   export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "dcgm-exporter.fullname" . }})
 4 |   export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
 5 |   echo http://$NODE_IP:$NODE_PORT/metrics
 6 | {{- else if contains "LoadBalancer" .Values.service.type }}
 7 |      NOTE: It may take a few minutes for the LoadBalancer IP to be available.
 8 |            You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "dcgm-exporter.fullname" . }}'
 9 |   export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "dcgm-exporter.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
10 |   echo http://$SERVICE_IP:{{ .Values.service.port }}
11 | {{- else if contains "ClusterIP" .Values.service.type }}
12 |   export POD_NAME=$(kubectl get pods -n {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "dcgm-exporter.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
13 |   kubectl -n {{ .Release.Namespace }} port-forward $POD_NAME 8080:{{ .Values.service.port }} &
14 |   echo "Visit http://127.0.0.1:8080/metrics to use your application"
15 | {{- end }}
16 | 


--------------------------------------------------------------------------------
/deployment/dcgm-exporter/templates/_helpers.tpl:
--------------------------------------------------------------------------------
 1 | {{/* vim: set filetype=mustache: */}}
 2 | {{/*
 3 | Expand the name of the chart.
 4 | */}}
 5 | {{- define "dcgm-exporter.name" -}}
 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
 7 | {{- end -}}
 8 | 
 9 | {{/*
10 | Create a default fully qualified app name.
11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
12 | If release name contains chart name it will be used as a full name.
13 | */}}
14 | {{- define "dcgm-exporter.fullname" -}}
15 | {{- if .Values.fullnameOverride -}}
16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
17 | {{- else -}}
18 | {{- $name := default .Chart.Name .Values.nameOverride -}}
19 | {{- if contains $name .Release.Name -}}
20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}}
21 | {{- else -}}
22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
23 | {{- end -}}
24 | {{- end -}}
25 | {{- end -}}
26 | 
27 | {{/*
28 | Create chart name and version as used by the chart label.
29 | */}}
30 | {{- define "dcgm-exporter.chart" -}}
31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
32 | {{- end -}}
33 | 
34 | {{/*
35 | Common labels
36 | */}}
37 | {{- define "dcgm-exporter.labels" -}}
38 | helm.sh/chart: {{ include "dcgm-exporter.chart" . }}
39 | {{ include "dcgm-exporter.selectorLabels" . }}
40 | {{- if .Chart.AppVersion }}
41 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
42 | {{- end }}
43 | app.kubernetes.io/managed-by: {{ .Release.Service }}
44 | {{- end -}}
45 | 
46 | {{/*
47 | Selector labels
48 | */}}
49 | {{- define "dcgm-exporter.selectorLabels" -}}
50 | app.kubernetes.io/name: {{ include "dcgm-exporter.name" . }}
51 | app.kubernetes.io/instance: {{ .Release.Name }}
52 | {{- end -}}
53 | 
54 | {{/*
55 | Create the name of the service account to use
56 | */}}
57 | {{- define "dcgm-exporter.serviceAccountName" -}}
58 | {{- if .Values.serviceAccount.create -}}
59 |     {{ default (include "dcgm-exporter.fullname" .) .Values.serviceAccount.name }}
60 | {{- else -}}
61 |     {{ default "default" .Values.serviceAccount.name }}
62 | {{- end -}}
63 | {{- end -}}
64 | 


--------------------------------------------------------------------------------
/deployment/dcgm-exporter/templates/daemonset.yaml:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | apiVersion: apps/v1
 16 | kind: DaemonSet
 17 | metadata:
 18 |   name: {{ include "dcgm-exporter.fullname" . }}
 19 |   namespace: {{ .Release.Namespace }}
 20 |   labels:
 21 |     {{- include "dcgm-exporter.labels" . | nindent 4 }}
 22 |     app.kubernetes.io/component: "dcgm-exporter"
 23 | spec:
 24 |   updateStrategy:
 25 |     type: RollingUpdate
 26 |   selector:
 27 |     matchLabels:
 28 |       {{- include "dcgm-exporter.selectorLabels" . | nindent 6 }}
 29 |       app.kubernetes.io/component: "dcgm-exporter"
 30 |   template:
 31 |     metadata:
 32 |       labels:
 33 |         {{- include "dcgm-exporter.selectorLabels" . | nindent 8 }}
 34 |         app.kubernetes.io/component: "dcgm-exporter"
 35 |     spec:
 36 |       {{- with .Values.imagePullSecrets }}
 37 |       imagePullSecrets:
 38 |         {{- toYaml . | nindent 8 }}
 39 |       {{- end }}
 40 |       serviceAccountName: {{ include "dcgm-exporter.serviceAccountName" . }}
 41 |       {{- if .Values.podSecurityContext }}
 42 |       securityContext:
 43 |         {{- toYaml .Values.podSecurityContext | nindent 8 }}
 44 |       {{- end }}
 45 |       {{- if .Values.affinity }}
 46 |       affinity:
 47 |         {{- toYaml .Values.affinity | nindent 8 }}
 48 |       {{- end }}
 49 |       {{- if .Values.nodeSelector }}
 50 |       nodeSelector:
 51 |         {{- toYaml .Values.nodeSelector | nindent 8 }}
 52 |       {{- end }}
 53 |       {{- with .Values.tolerations }}
 54 |       tolerations:
 55 |         {{- toYaml . | nindent 6 }}
 56 |       {{- end }}
 57 |       volumes:
 58 |       - name: "pod-gpu-resources"
 59 |         hostPath:
 60 |           path: "/var/lib/kubelet/pod-resources"
 61 |       {{- range .Values.extraHostVolumes }}
 62 |       - name: {{ .name | quote }}
 63 |         hostPath:
 64 |           path: {{ .hostPath | quote }}
 65 |       {{- end }}
 66 |       containers:
 67 |       - name: exporter
 68 |         securityContext:
 69 |           {{- toYaml .Values.securityContext | nindent 10 }}
 70 |         {{- if .Values.image.tag }}
 71 |         image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
 72 |         {{- else }}
 73 |         image: "{{ .Values.image.repository }}:{{ .Chart.AppVersion }}"
 74 |         {{- end }}
 75 |         imagePullPolicy: "{{ .Values.image.pullPolicy }}"
 76 |         args: 
 77 |         {{- range $.Values.arguments }} 
 78 |         - {{ . }} 
 79 |         {{- end }}
 80 |         env:
 81 |         - name: "DCGM_EXPORTER_KUBERNETES"
 82 |           value: "true"
 83 |         - name: "DCGM_EXPORTER_LISTEN"
 84 |           value: "{{ .Values.service.address }}"
 85 |         {{- if .Values.extraEnv }}
 86 |         {{- toYaml .Values.extraEnv | nindent 8 }}
 87 |         {{- end }}
 88 |         ports:
 89 |         - name: "metrics"
 90 |           containerPort: {{ .Values.service.port }}
 91 |         volumeMounts:
 92 |         - name: "pod-gpu-resources"
 93 |           readOnly: true
 94 |           mountPath: "/var/lib/kubelet/pod-resources"
 95 |         {{- if .Values.extraVolumeMounts }}
 96 |         {{- toYaml .Values.extraVolumeMounts | nindent 8 }}
 97 |         {{- end }}
 98 |         livenessProbe:
 99 |           httpGet:
100 |             path: /health
101 |             port: {{ .Values.service.port }}
102 |           initialDelaySeconds: 5
103 |           periodSeconds: 5
104 |         readinessProbe:
105 |           httpGet:
106 |             path: /health
107 |             port: {{ .Values.service.port }}
108 |           initialDelaySeconds: 5
109 |         {{- if .Values.resources }}
110 |         resources:
111 |           {{- toYaml .Values.resources | nindent 10 }}
112 |         {{- end }}
113 | 


--------------------------------------------------------------------------------
/deployment/dcgm-exporter/templates/service-monitor.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.serviceMonitor.enabled }}
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | apiVersion: monitoring.coreos.com/v1
17 | kind: ServiceMonitor
18 | metadata:
19 |   name: {{ include "dcgm-exporter.fullname" . }}
20 |   namespace: {{ .Release.Namespace }}
21 |   labels:
22 |     {{- include "dcgm-exporter.labels" . | nindent 4 }}
23 |     app.kubernetes.io/component: "dcgm-exporter"
24 |     {{- if .Values.serviceMonitor.additionalLabels }}
25 |     {{- toYaml .Values.serviceMonitor.additionalLabels | nindent 4 }}
26 |     {{- end }}
27 | spec:
28 |   selector:
29 |     matchLabels:
30 |       {{- include "dcgm-exporter.selectorLabels" . | nindent 6 }}
31 |       app.kubernetes.io/component: "dcgm-exporter"
32 |   namespaceSelector:
33 |     matchNames:
34 |     - "{{ .Release.Namespace }}"
35 |   endpoints:
36 |   - port: "metrics"
37 |     path: "/metrics"
38 |     interval: "{{ .Values.serviceMonitor.interval }}"
39 | {{- end -}}
40 | 


--------------------------------------------------------------------------------
/deployment/dcgm-exporter/templates/service.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | apiVersion: v1
16 | kind: Service
17 | metadata:
18 |   name: {{ include "dcgm-exporter.fullname" . }}
19 |   namespace: {{ .Release.Namespace }}
20 |   labels:
21 |     {{- include "dcgm-exporter.labels" . | nindent 4 }}
22 |     app.kubernetes.io/component: "dcgm-exporter"
23 |   {{- with .Values.service.annotations }}
24 |   annotations:
25 |   {{- toYaml . | nindent 4 }}
26 |   {{- end }}
27 | spec:
28 |   type: {{ .Values.service.type }}
29 |   ports:
30 |   - name: "metrics"
31 |     port: {{ .Values.service.port }}
32 |     targetPort: {{ .Values.service.port }}
33 |     protocol: TCP
34 |   selector:
35 |     {{- include "dcgm-exporter.selectorLabels" . | nindent 4 }}
36 | 


--------------------------------------------------------------------------------
/deployment/dcgm-exporter/templates/serviceaccount.yaml:
--------------------------------------------------------------------------------
 1 | {{- if .Values.serviceAccount.create -}}
 2 | apiVersion: v1
 3 | kind: ServiceAccount
 4 | metadata:
 5 |   name: {{ include "dcgm-exporter.serviceAccountName" . }}
 6 |   namespace: {{ .Release.Namespace }}
 7 |   labels:
 8 |     {{- include "dcgm-exporter.labels" . | nindent 4 }}
 9 |     app.kubernetes.io/component: "dcgm-exporter"
10 |   {{- with .Values.serviceAccount.annotations }}
11 |   annotations:
12 |     {{- toYaml . | nindent 4 }}
13 |   {{- end }}
14 | {{- end -}}
15 | 


--------------------------------------------------------------------------------
/deployment/dcgm-exporter/values.yaml:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | image:
 16 |   repository: nvcr.io/nvidia/k8s/dcgm-exporter
 17 |   pullPolicy: IfNotPresent
 18 |   # Image tag defaults to AppVersion, but you can use the tag key
 19 |   # for the image tag, e.g:
 20 |   tag: 2.2.9-2.4.0-ubuntu18.04
 21 | 
 22 | # Comment the following line to stop profiling metrics from DCGM
 23 | arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"]
 24 | # NOTE: in general, add any command line arguments to arguments above
 25 | # and they will be passed through.
 26 | # Use "-r", "<HOST>:<PORT>" to connect to an already running hostengine
 27 | # Example arguments: ["-r", "host123:5555"]
 28 | # Use "-n" to remove the hostname tag from the output.
 29 | # Example arguments: ["-n"]
 30 | # Use "-d" to specify the devices to monitor. -d must be followed by a string
 31 | # in the following format: [f] or [g[:numeric_range][+]][i[:numeric_range]]
 32 | # Where a numeric range is something like 0-4 or 0,2,4, etc.
 33 | # Example arguments: ["-d", "g+i"] to monitor all GPUs and GPU instances or
 34 | # ["-d", "g:0-3"] to monitor GPUs 0-3.
 35 | 
 36 | imagePullSecrets: []
 37 | nameOverride: ""
 38 | fullnameOverride: ""
 39 | 
 40 | serviceAccount:
 41 |   # Specifies whether a service account should be created
 42 |   create: true
 43 |   # Annotations to add to the service account
 44 |   annotations: {}
 45 |   # The name of the service account to use.
 46 |   # If not set and create is true, a name is generated using the fullname template
 47 |   name:
 48 | 
 49 | podSecurityContext: {}
 50 |   # fsGroup: 2000
 51 | 
 52 | securityContext:
 53 |   runAsNonRoot: false
 54 |   runAsUser: 0
 55 |   capabilities:
 56 |      add: ["SYS_ADMIN"]
 57 |   # readOnlyRootFilesystem: true
 58 | 
 59 | service:
 60 |   type: ClusterIP
 61 |   port: 9400
 62 |   address: ":9400"
 63 |   # Annotations to add to the service
 64 |   annotations: {}
 65 | 
 66 | resources: {}
 67 |   # limits:
 68 |   #   cpu: 100m
 69 |   #   memory: 128Mi
 70 |   # requests:
 71 |   #   cpu: 100m
 72 |   #   memory: 128Mi
 73 | serviceMonitor:
 74 |   enabled: true
 75 |   interval: 15s
 76 |   additionalLabels: {}
 77 |     #monitoring: prometheus
 78 | 
 79 | mapPodsMetrics: false
 80 | 
 81 | nodeSelector: {}
 82 |   #node: gpu
 83 | 
 84 | tolerations: []
 85 | #- operator: Exists
 86 | 
 87 | affinity: {}
 88 |   #nodeAffinity:
 89 |   #  requiredDuringSchedulingIgnoredDuringExecution:
 90 |   #    nodeSelectorTerms:
 91 |   #    - matchExpressions:
 92 |   #      - key: nvidia-gpu
 93 |   #        operator: Exists
 94 | 
 95 | extraHostVolumes: []
 96 | #- name: host-binaries
 97 | #  hostPath: /opt/bin
 98 | 
 99 | extraVolumeMounts: []
100 | #- name: host-binaries
101 | #  mountPath: /opt/bin
102 | #  readOnly: true
103 | 
104 | extraEnv: []
105 | #- name: EXTRA_VAR
106 | #  value: "TheStringValue"
107 | 


--------------------------------------------------------------------------------
/docker/Dockerfile.ubi8:
--------------------------------------------------------------------------------
 1 | ARG GOLANG_VERSION
 2 | FROM golang:$GOLANG_VERSION AS builder
 3 | WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools
 4 | 
 5 | COPY . .
 6 | 
 7 | RUN make binary check-format
 8 | 
 9 | FROM nvcr.io/nvidia/cuda:11.2.1-base-ubi8
10 | LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
11 | 
12 | ARG DCGM_VERSION
13 | RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo &&  \
14 |     dnf clean expire-cache
15 | RUN dnf install -y datacenter-gpu-manager-${DCGM_VERSION} && dnf clean all
16 | 
17 | COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/pkg/dcgm-exporter /usr/bin/
18 | COPY etc/dcgm-exporter /etc/dcgm-exporter
19 | 
20 | ENV NVIDIA_VISIBLE_DEVICES=all
21 | # disable all constraints on the configurations required by NVIDIA container toolkit
22 | ENV NVIDIA_DISABLE_REQUIRE="true"
23 | # Required for DCP metrics
24 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
25 | 
26 | ARG VERSION
27 | 
28 | LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
29 | LABEL name="NVIDIA DCGM Exporter"
30 | LABEL vendor="NVIDIA"
31 | LABEL version="${VERSION}"
32 | LABEL release="N/A"
33 | LABEL summary="Exports GPU Metrics to Prometheus"
34 | LABEL description="See summary"
35 | 
36 | COPY ./LICENSE ./licenses/LICENSE
37 | 
38 | ENV NO_SETCAP=
39 | COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
40 | RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh
41 | 
42 | ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"]
43 | 


--------------------------------------------------------------------------------
/docker/Dockerfile.ubuntu18.04:
--------------------------------------------------------------------------------
 1 | ARG GOLANG_VERSION
 2 | FROM golang:$GOLANG_VERSION AS builder
 3 | WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools
 4 | 
 5 | COPY . .
 6 | 
 7 | RUN make binary check-format
 8 | 
 9 | FROM nvcr.io/nvidia/cuda:11.2.1-base-ubuntu18.04
10 | LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
11 | 
12 | COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/pkg/dcgm-exporter /usr/bin/
13 | COPY etc/dcgm-exporter /etc/dcgm-exporter
14 | 
15 | ARG DCGM_VERSION
16 | RUN apt-get update && apt-get install -y --no-install-recommends \
17 |     libcap2-bin gnupg2 curl ca-certificates && \
18 |     curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \
19 |     echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
20 |     echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
21 |     apt-get purge --autoremove -y curl \
22 |     && rm -rf /var/lib/apt/lists/*
23 | 
24 | RUN apt-get update && apt-get install -y --no-install-recommends \
25 |     datacenter-gpu-manager=1:${DCGM_VERSION} && apt-get purge --autoremove -y openssl && rm -rf /var/lib/apt/lists/*
26 | 
27 | # Required for DCP metrics
28 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
29 | # disable all constraints on the configurations required by NVIDIA container toolkit
30 | ENV NVIDIA_DISABLE_REQUIRE="true"
31 | ENV NVIDIA_VISIBLE_DEVICES=all
32 | 
33 | ENV NO_SETCAP=
34 | COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
35 | RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh
36 | 
37 | ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"]
38 | 


--------------------------------------------------------------------------------
/docker/Dockerfile.ubuntu20.04:
--------------------------------------------------------------------------------
 1 | # Build stage: runs the `binary` and `check-format` make targets on the repository sources.
 2 | ARG GOLANG_VERSION
 3 | FROM golang:$GOLANG_VERSION AS builder
 4 | WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools
 5 | 
 6 | COPY . .
 7 | 
 8 | RUN make binary check-format
 9 | 
10 | # Runtime stage: CUDA base image with the exporter binary, its configuration and DCGM.
11 | FROM nvcr.io/nvidia/cuda:11.2.1-base-ubuntu20.04
12 | LABEL io.k8s.display-name="NVIDIA DCGM Exporter"
13 | 
14 | COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/pkg/dcgm-exporter /usr/bin/
15 | COPY etc/dcgm-exporter /etc/dcgm-exporter
16 | 
17 | # Register the NVIDIA CUDA and machine-learning apt repositories.
18 | # libcap2-bin stays installed because the entrypoint script calls setcap; curl is
19 | # only needed to fetch the repository key and is purged in the same layer.
20 | RUN apt-get update && apt-get install -y --no-install-recommends \
21 |     libcap2-bin gnupg2 curl ca-certificates && \
22 |     curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub | apt-key add - && \
23 |     echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
24 |     echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
25 |     apt-get purge --autoremove -y curl \
26 |     && rm -rf /var/lib/apt/lists/*
27 | 
28 | # DCGM_VERSION is declared right before its first use so that changing it need
29 | # not invalidate the cached repository-setup layer above.
30 | ARG DCGM_VERSION
31 | # The apt package lists are removed in the same layer that downloads them, so
32 | # they do not end up in the image.
33 | RUN apt-get update && apt-get install -y --no-install-recommends \
34 |     datacenter-gpu-manager=1:${DCGM_VERSION} && \
35 |     apt-get purge --autoremove -y openssl && \
36 |     rm -rf /var/lib/apt/lists/*
37 | 
38 | # Required for DCP metrics
39 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
40 | # Disable all constraints on the configurations required by NVIDIA container toolkit
41 | ENV NVIDIA_DISABLE_REQUIRE="true"
42 | ENV NVIDIA_VISIBLE_DEVICES=all
43 | 
44 | # An empty NO_SETCAP lets the entrypoint attempt setcap; a non-empty value skips it.
45 | ENV NO_SETCAP=
46 | COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh
47 | RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh
48 | 
49 | ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"]
50 | 


--------------------------------------------------------------------------------
/docker/dcgm-exporter-entrypoint.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Container entrypoint for dcgm-exporter.
 4 | #
 5 | # Unless NO_SETCAP is set to a non-empty value, try to attach cap_sys_admin to
 6 | # the dcgm-exporter binary, print a warning when that is not possible, and
 7 | # finally exec dcgm-exporter with the arguments given to the container.
 8 | set -euo pipefail
 9 | 
10 | # We want to setcap only when the container is started with the right caps.
11 | # readlink -f resolves symlinks so the capability lands on the real file.
12 | DCGM_EXPORTER=$(readlink -f "$(command -v dcgm-exporter)")
13 | 
14 | # ${NO_SETCAP:-} keeps `set -u` from aborting when the variable is not defined at all.
15 | if [ -z "${NO_SETCAP:-}" ]; then
16 |    # setcap is tested directly by `if`: run as a bare command, its failure would
17 |    # trip `set -e` and end the script before Warning #1 could be printed.
18 |    if setcap 'cap_sys_admin=+ep' "$DCGM_EXPORTER"; then
19 |       # Probe the binary; if it no longer runs with the capability attached,
20 |       # warn and remove the capability again so the exporter can still start.
21 |       if ! "$DCGM_EXPORTER" -v 1>/dev/null 2>/dev/null; then
22 |          >&2 echo "Warning #2: dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To get profiling metrics with dcgm-exporter, use --cap-add SYS_ADMIN"
23 |          setcap 'cap_sys_admin=-ep' "$DCGM_EXPORTER"
24 |       fi
25 |    else
26 |       >&2 echo "Warning #1: dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To get profiling metrics with dcgm-exporter, use --cap-add SYS_ADMIN"
27 |    fi
28 | fi
29 | 
30 | # Replace the shell with dcgm-exporter, passing the command line arguments through.
31 | exec "$DCGM_EXPORTER" "$@"
32 | 


--------------------------------------------------------------------------------
/etc/dcgm-exporter/1.x-compatibility-metrics.csv:
--------------------------------------------------------------------------------
 1 | # Format,,
 2 | # If a line starts with a '#' it is considered a comment,,
 3 | # DCGM FIELD, Prometheus metric type, help message
 4 | 
 5 | # Clocks,,
 6 | dcgm_sm_clock,     gauge, SM clock frequency (in MHz).
 7 | dcgm_memory_clock, gauge, Memory clock frequency (in MHz).
 8 | 
 9 | # Temperature,,
10 | dcgm_memory_temp, gauge, Memory temperature (in C).
11 | dcgm_gpu_temp,    gauge, GPU temperature (in C).
12 | 
13 | # Power,,
14 | dcgm_power_usage,              gauge, Power draw (in W).
15 | dcgm_total_energy_consumption, counter, Total energy consumption since boot (in mJ).
16 | 
17 | # PCIe,,
18 | dcgm_pcie_tx_throughput,  counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
19 | dcgm_pcie_rx_throughput,  counter, Total number of bytes received through PCIe RX (in KB) via NVML.
20 | dcgm_pcie_replay_counter, counter, Total number of PCIe retries.
21 | 
22 | # Utilization (the sample period varies depending on the product),,
23 | dcgm_gpu_utilization,      gauge, GPU utilization (in %).
24 | dcgm_mem_copy_utilization, gauge, Memory utilization (in %).
25 | dcgm_enc_utilization,      gauge, Encoder utilization (in %).
26 | dcgm_dec_utilization,      gauge, Decoder utilization (in %).
27 | 
28 | # Errors and violations,,
29 | dcgm_xid_errors,            gauge, Value of the last XID error encountered.
30 | # dcgm_power_violation,       counter, Throttling duration due to power constraints (in us).
31 | # dcgm_thermal_violation,     counter, Throttling duration due to thermal constraints (in us).
32 | # dcgm_sync_boost_violation,  counter, Throttling duration due to sync-boost constraints (in us).
33 | # dcgm_board_limit_violation, counter, Throttling duration due to board limit constraints (in us).
34 | # dcgm_low_util_violation,    counter, Throttling duration due to low utilization (in us).
35 | # dcgm_reliability_violation, counter, Throttling duration due to reliability constraints (in us).
36 | 
37 | # Memory usage,,
38 | dcgm_fb_free, gauge, Framebuffer memory free (in MiB).
39 | dcgm_fb_used, gauge, Framebuffer memory used (in MiB).
40 | 
41 | # ECC,,
42 | # dcgm_ecc_sbe_volatile_total,  counter, Total number of single-bit volatile ECC errors.
43 | # dcgm_ecc_dbe_volatile_total,  counter, Total number of double-bit volatile ECC errors.
44 | # dcgm_ecc_sbe_aggregate_total, counter, Total number of single-bit persistent ECC errors.
45 | # dcgm_ecc_dbe_aggregate_total, counter, Total number of double-bit persistent ECC errors.
46 | 
47 | # Retired pages,,
48 | # dcgm_retired_pages_sbe,     counter, Total number of retired pages due to single-bit errors.
49 | # dcgm_retired_pages_dbe,     counter, Total number of retired pages due to double-bit errors.
50 | # dcgm_retired_pages_pending, counter, Total number of pages pending retirement.
51 | 
52 | # NVLink,,
53 | # dcgm_nvlink_flit_crc_error_count_total, counter, Total number of NVLink flow-control CRC errors.
54 | # dcgm_nvlink_data_crc_error_count_total, counter, Total number of NVLink data CRC errors.
55 | # dcgm_nvlink_replay_error_count_total,   counter, Total number of NVLink retries.
56 | # dcgm_nvlink_recovery_error_count_total, counter, Total number of NVLink recovery errors.
57 | dcgm_nvlink_bandwidth_total,            counter, Total number of NVLink bandwidth counters for all lanes
58 | 
59 | # Add DCP metrics,,
60 | dcgm_fi_prof_gr_engine_active,   gauge, Ratio of time the graphics engine is active (in %).
61 | # dcgm_fi_prof_sm_active,          gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
62 | # dcgm_fi_prof_sm_occupancy,       gauge, The ratio of number of warps resident on an SM (in %).
63 | dcgm_fi_prof_pipe_tensor_active, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
64 | dcgm_fi_prof_dram_active,        gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
65 | dcgm_fi_prof_pcie_tx_bytes,      counter, The number of bytes of active pcie tx data including both header and payload.
66 | dcgm_fi_prof_pcie_rx_bytes,      counter, The number of bytes of active pcie rx data including both header and payload.
67 | 


--------------------------------------------------------------------------------
/etc/dcgm-exporter/dcp-metrics-included.csv:
--------------------------------------------------------------------------------
 1 | # Format,,
 2 | # If a line starts with a '#' it is considered a comment,,
 3 | # DCGM FIELD, Prometheus metric type, help message
 4 | 
 5 | # Clocks,,
 6 | DCGM_FI_DEV_SM_CLOCK,  gauge, SM clock frequency (in MHz).
 7 | DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
 8 | 
 9 | # Temperature,,
10 | DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
11 | DCGM_FI_DEV_GPU_TEMP,    gauge, GPU temperature (in C).
12 | 
13 | # Power,,
14 | DCGM_FI_DEV_POWER_USAGE,              gauge, Power draw (in W).
15 | DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
16 | 
17 | # PCIE,,
18 | # DCGM_FI_DEV_PCIE_TX_THROUGHPUT,  counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
19 | # DCGM_FI_DEV_PCIE_RX_THROUGHPUT,  counter, Total number of bytes received through PCIe RX (in KB) via NVML.
20 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
21 | 
22 | # Utilization (the sample period varies depending on the product),,
23 | # DCGM_FI_DEV_GPU_UTIL,      gauge, GPU utilization (in %).
24 | DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
25 | DCGM_FI_DEV_ENC_UTIL,      gauge, Encoder utilization (in %).
26 | DCGM_FI_DEV_DEC_UTIL,      gauge, Decoder utilization (in %).
27 | 
28 | # Errors and violations,,
29 | DCGM_FI_DEV_XID_ERRORS,            gauge,   Value of the last XID error encountered.
30 | # DCGM_FI_DEV_POWER_VIOLATION,       counter, Throttling duration due to power constraints (in us).
31 | # DCGM_FI_DEV_THERMAL_VIOLATION,     counter, Throttling duration due to thermal constraints (in us).
32 | # DCGM_FI_DEV_SYNC_BOOST_VIOLATION,  counter, Throttling duration due to sync-boost constraints (in us).
33 | # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
34 | # DCGM_FI_DEV_LOW_UTIL_VIOLATION,    counter, Throttling duration due to low utilization (in us).
35 | # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
36 | 
37 | # Memory usage,,
38 | DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
39 | DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
40 | 
41 | # ECC,,
42 | # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
43 | # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
44 | # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
45 | # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
46 | 
47 | # Retired pages,,
48 | # DCGM_FI_DEV_RETIRED_SBE,     counter, Total number of retired pages due to single-bit errors.
49 | # DCGM_FI_DEV_RETIRED_DBE,     counter, Total number of retired pages due to double-bit errors.
50 | # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
51 | 
52 | # NVLink,,
53 | # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
54 | # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
55 | # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,   counter, Total number of NVLink retries.
56 | # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
57 | DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,            counter, Total number of NVLink bandwidth counters for all lanes.
58 | # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0,               counter, The number of bytes of active NVLink rx or tx data including both header and payload.
59 | 
60 | # VGPU License status,,
61 | DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
62 | 
63 | # Remapped rows,,
64 | DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
65 | DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS,   counter, Number of remapped rows for correctable errors
66 | DCGM_FI_DEV_ROW_REMAP_FAILURE,           gauge,   Whether remapping of rows has failed
67 | 
68 | # DCP metrics,,
69 | DCGM_FI_PROF_GR_ENGINE_ACTIVE,   gauge, Ratio of time the graphics engine is active (in %).
70 | # DCGM_FI_PROF_SM_ACTIVE,          gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
71 | # DCGM_FI_PROF_SM_OCCUPANCY,       gauge, The ratio of number of warps resident on an SM (in %).
72 | DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
73 | DCGM_FI_PROF_DRAM_ACTIVE,        gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
74 | # DCGM_FI_PROF_PIPE_FP64_ACTIVE,   gauge, Ratio of cycles the fp64 pipes are active (in %).
75 | # DCGM_FI_PROF_PIPE_FP32_ACTIVE,   gauge, Ratio of cycles the fp32 pipes are active (in %).
76 | # DCGM_FI_PROF_PIPE_FP16_ACTIVE,   gauge, Ratio of cycles the fp16 pipes are active (in %).
77 | DCGM_FI_PROF_PCIE_TX_BYTES,      counter, The number of bytes of active pcie tx data including both header and payload.
78 | DCGM_FI_PROF_PCIE_RX_BYTES,      counter, The number of bytes of active pcie rx data including both header and payload.
79 | 
80 | 


--------------------------------------------------------------------------------
/etc/dcgm-exporter/default-counters.csv:
--------------------------------------------------------------------------------
 1 | # Format,,
 2 | # If a line starts with a '#' it is considered a comment,,
 3 | # DCGM FIELD, Prometheus metric type, help message
 4 | 
 5 | # Clocks,,
 6 | DCGM_FI_DEV_SM_CLOCK,  gauge, SM clock frequency (in MHz).
 7 | DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
 8 | 
 9 | # Temperature,,
10 | DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
11 | DCGM_FI_DEV_GPU_TEMP,    gauge, GPU temperature (in C).
12 | 
13 | # Power,,
14 | DCGM_FI_DEV_POWER_USAGE,              gauge, Power draw (in W).
15 | DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
16 | 
17 | # PCIE,,
18 | DCGM_FI_DEV_PCIE_TX_THROUGHPUT,  counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
19 | DCGM_FI_DEV_PCIE_RX_THROUGHPUT,  counter, Total number of bytes received through PCIe RX (in KB) via NVML.
20 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
21 | 
22 | # Utilization (the sample period varies depending on the product),,
23 | # DCGM_FI_DEV_GPU_UTIL,      gauge, GPU utilization (in %).
24 | DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
25 | DCGM_FI_DEV_ENC_UTIL,      gauge, Encoder utilization (in %).
26 | DCGM_FI_DEV_DEC_UTIL,      gauge, Decoder utilization (in %).
27 | 
28 | # Errors and violations,,
29 | DCGM_FI_DEV_XID_ERRORS,            gauge,   Value of the last XID error encountered.
30 | # DCGM_FI_DEV_POWER_VIOLATION,       counter, Throttling duration due to power constraints (in us).
31 | # DCGM_FI_DEV_THERMAL_VIOLATION,     counter, Throttling duration due to thermal constraints (in us).
32 | # DCGM_FI_DEV_SYNC_BOOST_VIOLATION,  counter, Throttling duration due to sync-boost constraints (in us).
33 | # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
34 | # DCGM_FI_DEV_LOW_UTIL_VIOLATION,    counter, Throttling duration due to low utilization (in us).
35 | # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
36 | 
37 | # Memory usage,,
38 | DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
39 | DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
40 | 
41 | # ECC,,
42 | # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
43 | # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
44 | # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
45 | # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
46 | 
47 | # Retired pages,,
48 | # DCGM_FI_DEV_RETIRED_SBE,     counter, Total number of retired pages due to single-bit errors.
49 | # DCGM_FI_DEV_RETIRED_DBE,     counter, Total number of retired pages due to double-bit errors.
50 | # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
51 | 
52 | # NVLink,,
53 | # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
54 | # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
55 | # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,   counter, Total number of NVLink retries.
56 | # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
57 | DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,            counter, Total number of NVLink bandwidth counters for all lanes
58 | 
59 | # VGPU License status,,
60 | DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
61 | 
62 | # Remapped rows,,
63 | DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
64 | DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS,   counter, Number of remapped rows for correctable errors
65 | DCGM_FI_DEV_ROW_REMAP_FAILURE,           gauge,   Whether remapping of rows has failed
66 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/NVIDIA/gpu-monitoring-tools
 2 | 
 3 | go 1.14
 4 | 
 5 | require (
 6 | 	github.com/Masterminds/semver v1.5.0 // indirect
 7 | 	github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18
 8 | 	github.com/gorilla/mux v1.7.4
 9 | )
10 | 
11 | replace (
12 | 	github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm => ./bindings/go/dcgm
13 | 	k8s.io/api => k8s.io/api v0.20.2
14 | 	k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2
15 | 	k8s.io/apimachinery => k8s.io/apimachinery v0.20.2
16 | 	k8s.io/apiserver => k8s.io/apiserver v0.20.2
17 | 	k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.2
18 | 	k8s.io/client-go => k8s.io/client-go v0.20.2
19 | 	k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.2
20 | 	k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.20.2
21 | 	k8s.io/code-generator => k8s.io/code-generator v0.20.2
22 | 	k8s.io/component-base => k8s.io/component-base v0.20.2
23 | 	k8s.io/cri-api => k8s.io/cri-api v0.20.2
24 | 	k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.20.2
25 | 	k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.2
26 | 	k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.2
27 | 	k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.2
28 | 	k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.2
29 | 	k8s.io/kubectl => k8s.io/kubectl v0.20.2
30 | 	k8s.io/kubelet => k8s.io/kubelet v0.20.2
31 | 	k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.2
32 | 	k8s.io/metrics => k8s.io/metrics v0.20.2
33 | 	k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.2
34 | )
35 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww=
2 | github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y=
3 | github.com/gorilla/mux v1.7.4 h1:VuZ8uybHlWmqV03+zRzdwKL4tUnIp1MAQtp1mIFE1bc=
4 | github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
5 | 


--------------------------------------------------------------------------------
/pkg/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Runtime image for a dcgm-exporter binary that is built outside of this Dockerfile.
 2 | FROM ubuntu:16.04
 3 | 
 4 | # Version of the datacenter-gpu-manager .deb to download; used in the URL below.
 5 | ARG DCGM_VERSION
 6 | 
 7 | RUN apt-get update && apt-get install -y --no-install-recommends libgomp1 \
 8 |     ca-certificates wget && \
 9 |     rm -rf /var/lib/apt/lists/*
10 | 
11 | # Download, install and delete the package in one layer so the .deb is not kept in the image.
12 | RUN wget https://developer.download.nvidia.com/compute/redist/dcgm/${DCGM_VERSION}/DEBS/datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \
13 |     dpkg -i datacenter-gpu-manager_*.deb && \
14 |     rm -f datacenter-gpu-manager_*.deb
15 | 
16 | # The trailing slash makes it explicit that the destination is a directory.
17 | COPY dcgm-exporter /usr/local/bin/
18 | 
19 | # key=value is the supported ENV form; the whitespace-separated form is discouraged by Docker.
20 | ENV NVIDIA_VISIBLE_DEVICES=all
21 | ENV NVIDIA_DRIVER_CAPABILITIES=all
22 | 
23 | VOLUME /run/prometheus
24 | 
25 | ENTRYPOINT [ "dcgm-exporter", "-e" ]
26 | 


--------------------------------------------------------------------------------
/pkg/dcgm.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package main
18 | 
19 | import (
20 | 	"fmt"
21 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
22 | 	"math/rand"
23 | )
24 | 
25 | // NewGroup creates a DCGM default group under a randomized name; the returned func destroys it.
26 | func NewGroup() (dcgm.GroupHandle, func(), error) {
27 | 	handle, err := dcgm.NewDefaultGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64()))
28 | 	if err == nil {
29 | 		return handle, func() { dcgm.DestroyGroup(handle) }, nil
30 | 	}
31 | 	return dcgm.GroupHandle{}, func() {}, err
32 | }
33 | 
34 | // NewDeviceFields returns the FieldID of every counter, in the same order.
35 | func NewDeviceFields(counters []Counter) []dcgm.Short {
36 | 	ids := make([]dcgm.Short, 0, len(counters))
37 | 	for idx := range counters {
38 | 		ids = append(ids, counters[idx].FieldID)
39 | 	}
40 | 	return ids
41 | }
42 | 
43 | // NewFieldGroup creates a DCGM field group for deviceFields under a randomized
44 | // name. The returned func destroys that field group; on error it is a no-op.
45 | func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error) {
46 | 	handle, err := dcgm.FieldGroupCreate(fmt.Sprintf("gpu-collector-fieldgroup-%d", rand.Uint64()), deviceFields)
47 | 	if err == nil {
48 | 		return handle, func() { dcgm.FieldGroupDestroy(handle) }, nil
49 | 	}
50 | 	return dcgm.FieldHandle{}, func() {}, err
51 | }
52 | 
53 | // WatchFieldGroup is a thin wrapper over dcgm.WatchFieldsWithGroup for the
54 | // field group `field` and the entity group `group`.
55 | //
56 | // Whatever error dcgm.WatchFieldsWithGroup reports is handed back to the
57 | // caller unchanged; nil means the call succeeded.
58 | func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle) error {
59 | 	return dcgm.WatchFieldsWithGroup(field, group)
60 | }
61 | 
62 | // SetupDcgmFieldsWatch creates the entity group described by sysInfo and a
63 | // field group for deviceFields, then calls WatchFieldGroup on the pair.
64 | //
65 | // On success it returns the cleanup funcs of both groups (entity group
66 | // first). On failure every cleanup collected so far is run, in that same
67 | // order, and the triggering error is returned together with a nil slice.
68 | func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo) ([]func(), error) {
69 | 	var undo []func()
70 | 
71 | 	// abort releases whatever has been set up so far and reports cause.
72 | 	abort := func(cause error) ([]func(), error) {
73 | 		for _, release := range undo {
74 | 			release()
75 | 		}
76 | 		return nil, cause
77 | 	}
78 | 
79 | 	group, releaseGroup, err := CreateGroupFromSystemInfo(sysInfo)
80 | 	if err != nil {
81 | 		return abort(err)
82 | 	}
83 | 	undo = append(undo, releaseGroup)
84 | 
85 | 	fieldGroup, releaseFields, err := NewFieldGroup(deviceFields)
86 | 	if err != nil {
87 | 		return abort(err)
88 | 	}
89 | 	undo = append(undo, releaseFields)
90 | 
91 | 	if err = WatchFieldGroup(group, fieldGroup); err != nil {
92 | 		return abort(err)
93 | 	}
94 | 
95 | 	return undo, nil
96 | }
97 | 


--------------------------------------------------------------------------------
/pkg/go.mod:
--------------------------------------------------------------------------------
 1 | module dcgm-exporter
 2 | 
 3 | go 1.14
 4 | 
 5 | replace (
 6 | 	github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm => ../bindings/go/dcgm
 7 | 	k8s.io/api => k8s.io/api v0.20.2
 8 | 	k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2
 9 | 	k8s.io/apimachinery => k8s.io/apimachinery v0.20.2
10 | 	k8s.io/apiserver => k8s.io/apiserver v0.20.2
11 | 	k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.2
12 | 	k8s.io/client-go => k8s.io/client-go v0.20.2
13 | 	k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.2
14 | 	k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.20.2
15 | 	k8s.io/code-generator => k8s.io/code-generator v0.20.2
16 | 	k8s.io/component-base => k8s.io/component-base v0.20.2
17 | 	k8s.io/cri-api => k8s.io/cri-api v0.20.2
18 | 	k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.20.2
19 | 	k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.2
20 | 	k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.2
21 | 	k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.2
22 | 	k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.2
23 | 	k8s.io/kubectl => k8s.io/kubectl v0.20.2
24 | 	k8s.io/kubelet => k8s.io/kubelet v0.20.2
25 | 	k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.2
26 | 	k8s.io/metrics => k8s.io/metrics v0.20.2
27 | 	k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.2
28 | )
29 | 
30 | require (
31 | 	github.com/Masterminds/semver v1.5.0 // indirect
32 | 	github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-00010101000000-000000000000
33 | 	github.com/gorilla/mux v1.8.0
34 | 	github.com/sirupsen/logrus v1.7.0
35 | 	github.com/stretchr/testify v1.6.1
36 | 	github.com/urfave/cli/v2 v2.3.0
37 | 	google.golang.org/grpc v1.35.0
38 | 	k8s.io/kubelet v0.20.2
39 | 	k8s.io/kubernetes v1.18.2
40 | )
41 | 


--------------------------------------------------------------------------------
/pkg/gpu_collector.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package main
 18 | 
 19 | import (
 20 | 	"fmt"
 21 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 22 | 	"os"
 23 | )
 24 | 
 25 | func NewDCGMCollector(c []Counter, config *Config) (*DCGMCollector, func(), error) {
 26 | 	sysInfo, err := InitializeSystemInfo(config.Devices, config.UseFakeGpus)
 27 | 	if err != nil {
 28 | 		return nil, func() {}, err
 29 | 	}
 30 | 
 31 | 	hostname := ""
 32 | 	if config.NoHostname == false {
 33 | 		hostname, err = os.Hostname()
 34 | 		if err != nil {
 35 | 			return nil, func() {}, err
 36 | 		}
 37 | 	}
 38 | 
 39 | 	collector := &DCGMCollector{
 40 | 		Counters:        c,
 41 | 		DeviceFields:    NewDeviceFields(c),
 42 | 		UseOldNamespace: config.UseOldNamespace,
 43 | 		SysInfo:         sysInfo,
 44 | 		Hostname:        hostname,
 45 | 	}
 46 | 
 47 | 	cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, sysInfo)
 48 | 	if err != nil {
 49 | 		return nil, func() {}, err
 50 | 	}
 51 | 
 52 | 	collector.Cleanups = cleanups
 53 | 
 54 | 	return collector, func() { collector.Cleanup() }, nil
 55 | }
 56 | 
 57 | func (c *DCGMCollector) Cleanup() {
 58 | 	for idx := range c.Cleanups { // run every registered cleanup func, in slice order
 59 | 		c.Cleanups[idx]()
 60 | 	}
 61 | }
 62 | 
 63 | // GetMetrics fetches the latest values of c.DeviceFields for each entity
 64 | // returned by GetMonitoredEntities and converts them with ToMetric. Element i
 65 | // of the result belongs to entity i; the first DCGM error aborts the scan.
 66 | func (c *DCGMCollector) GetMetrics() ([][]Metric, error) {
 67 | 	entities := GetMonitoredEntities(c.SysInfo)
 68 | 	perEntity := make([][]Metric, len(entities))
 69 | 
 70 | 	for idx, ent := range entities {
 71 | 		id := ent.Entity
 72 | 		latest, err := dcgm.EntityGetLatestValues(id.EntityGroupId, id.EntityId, c.DeviceFields)
 73 | 		if err != nil {
 74 | 			return nil, err
 75 | 		}
 76 | 		// ent.InstanceInfo will be nil for GPUs.
 77 | 		perEntity[idx] = ToMetric(latest, c.Counters, ent.DeviceInfo, ent.InstanceInfo, c.UseOldNamespace, c.Hostname)
 78 | 	}
 79 | 	return perEntity, nil
 80 | }
 81 | 
 82 | // ToMetric pairs values[i] with counter c[i] and builds one Metric per reading
 83 | // for device d, leaving out readings that ToString maps to SkipDCGMValue.
 84 | // MigProfile and GPUInstanceID are filled only when instanceInfo is non-nil;
 85 | // Metric.UUID is "uuid" when useOld is set and "UUID" otherwise.
 86 | func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GpuInstanceInfo, useOld bool, hostname string) []Metric {
 87 | 	uuidKey := "UUID"
 88 | 	if useOld {
 89 | 		uuidKey = "uuid"
 90 | 	}
 91 | 	gpuIndex := fmt.Sprintf("%d", d.GPU)
 92 | 
 93 | 	var out []Metric
 94 | 	for idx := range values {
 95 | 		text := ToString(values[idx])
 96 | 		if text == SkipDCGMValue {
 97 | 			continue // counter with no value, or field ignored for this entity
 98 | 		}
 99 | 		entry := Metric{
100 | 			Counter:      &c[idx],
101 | 			Value:        text,
102 | 			UUID:         uuidKey,
103 | 			GPU:          gpuIndex,
104 | 			GPUUUID:      d.UUID,
105 | 			GPUDevice:    "nvidia" + gpuIndex,
106 | 			GPUModelName: d.Identifiers.Model,
107 | 			Hostname:     hostname,
108 | 			Attributes:   map[string]string{},
109 | 		}
110 | 		// Without instance info, MigProfile and GPUInstanceID keep their zero value "".
111 | 		if instanceInfo != nil {
112 | 			entry.MigProfile = instanceInfo.ProfileName
113 | 			entry.GPUInstanceID = fmt.Sprintf("%d", instanceInfo.Info.NvmlInstanceId)
114 | 		}
115 | 		out = append(out, entry)
116 | 	}
117 | 
118 | 	return out
119 | }
120 | 
121 | // ToString renders a DCGM field value as the text stored in Metric.Value.
122 | //
123 | // The value is decoded according to value.FieldType:
124 | //   - DCGM_FT_INT64: formatted with %d, unless it equals one of the INT32 or
125 | //     INT64 BLANK / NOT_FOUND / NOT_SUPPORTED / NOT_PERMISSIONED sentinels.
126 | //   - DCGM_FT_DOUBLE: formatted with %f, unless it equals one of the FP64
127 | //     BLANK / NOT_FOUND / NOT_SUPPORTED / NOT_PERMISSIONED sentinels.
128 | //   - DCGM_FT_STRING: returned as produced by value.String().
129 | //
130 | // A sentinel yields SkipDCGMValue; any other field type yields FailedToConvert.
131 | //
132 | // Sentinels are compared only against the decoding that matches the field
133 | // type. Comparing every decoding for every value could drop a valid reading
134 | // whose raw bits happen to equal a sentinel that belongs to another type.
135 | func ToString(value dcgm.FieldValue_v1) string {
136 | 	switch value.FieldType {
137 | 	case dcgm.DCGM_FT_INT64:
138 | 		switch v := value.Int64(); v {
139 | 		case dcgm.DCGM_FT_INT32_BLANK,
140 | 			dcgm.DCGM_FT_INT32_NOT_FOUND,
141 | 			dcgm.DCGM_FT_INT32_NOT_SUPPORTED,
142 | 			dcgm.DCGM_FT_INT32_NOT_PERMISSIONED,
143 | 			dcgm.DCGM_FT_INT64_BLANK,
144 | 			dcgm.DCGM_FT_INT64_NOT_FOUND,
145 | 			dcgm.DCGM_FT_INT64_NOT_SUPPORTED,
146 | 			dcgm.DCGM_FT_INT64_NOT_PERMISSIONED:
147 | 			return SkipDCGMValue
148 | 		default:
149 | 			return fmt.Sprintf("%d", v)
150 | 		}
151 | 	case dcgm.DCGM_FT_DOUBLE:
152 | 		switch v := value.Float64(); v {
153 | 		case dcgm.DCGM_FT_FP64_BLANK,
154 | 			dcgm.DCGM_FT_FP64_NOT_FOUND,
155 | 			dcgm.DCGM_FT_FP64_NOT_SUPPORTED,
156 | 			dcgm.DCGM_FT_FP64_NOT_PERMISSIONED:
157 | 			return SkipDCGMValue
158 | 		default:
159 | 			return fmt.Sprintf("%f", v)
160 | 		}
161 | 	case dcgm.DCGM_FT_STRING:
162 | 		return value.String()
163 | 	default:
164 | 		return FailedToConvert
165 | 	}
166 | }
167 | 


--------------------------------------------------------------------------------
/pkg/gpu_collector_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package main
18 | 
19 | import (
20 | 	"fmt"
21 | 	"testing"
22 | 
23 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
24 | 	"github.com/stretchr/testify/require"
25 | )
26 | 
27 | var sampleCounters = []Counter{ // fixture that TestDCGMCollector passes to testDCGMCollector
28 | 	{dcgm.DCGM_FI_DEV_GPU_TEMP, "DCGM_FI_DEV_GPU_TEMP", "gauge", "Temperature Help info"},
29 | 	{dcgm.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", "gauge", "Energy help info"},
30 | 	{dcgm.DCGM_FI_DEV_POWER_USAGE, "DCGM_FI_DEV_POWER_USAGE", "gauge", "Power help info"},
31 | }
32 | 
33 | // TestDCGMCollector initialises DCGM in dcgm.Embedded mode and runs testDCGMCollector on sampleCounters.
34 | func TestDCGMCollector(t *testing.T) {
35 | 	shutdown, err := dcgm.Init(dcgm.Embedded)
36 | 	require.NoError(t, err)
37 | 	defer shutdown()
38 | 	_, release := testDCGMCollector(t, sampleCounters)
39 | 	release()
40 | }
41 | 
42 | // testDCGMCollector creates a DCGMCollector for counters, calls GetMetrics once
43 | // and checks that every device reports one non-empty, convertible metric per
44 | // counter. It returns the collector and its cleanup func so that callers can
45 | // keep using the collector; the caller is responsible for invoking cleanup.
46 | //
47 | // Assertions pass the expected value first and the actual value second, as
48 | // testify expects, so failure messages label the two values correctly.
49 | func testDCGMCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) {
50 | 	dOpt := DeviceOptions{true, []int{-1}, []int{-1}}
51 | 	cfg := Config{
52 | 		Devices:         dOpt,
53 | 		NoHostname:      false,
54 | 		UseOldNamespace: false,
55 | 		UseFakeGpus:     false,
56 | 	}
57 | 	c, cleanup, err := NewDCGMCollector(counters, &cfg)
58 | 	require.NoError(t, err)
59 | 
60 | 	out, err := c.GetMetrics()
61 | 	require.NoError(t, err)
62 | 	require.Greater(t, len(out), 0, "Check that you have a GPU on this node")
63 | 	require.Len(t, out[0], len(counters))
64 | 
65 | 	for i, dev := range out {
66 | 		for j, metric := range dev {
67 | 			require.Equal(t, counters[j].FieldName, metric.Name)
68 | 			require.Equal(t, fmt.Sprintf("%d", i), metric.GPU)
69 | 
70 | 			require.NotEmpty(t, metric.Value)
71 | 			require.NotEqual(t, FailedToConvert, metric.Value)
72 | 		}
73 | 	}
74 | 
75 | 	return c, cleanup
76 | }
77 | 


--------------------------------------------------------------------------------
/pkg/kubernetes_test.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package main
 18 | 
 19 | import (
 20 | 	"context"
 21 | 	"fmt"
 22 | 	"io/ioutil"
 23 | 	"os"
 24 | 	"testing"
 25 | 	"time"
 26 | 
 27 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 28 | 	"github.com/stretchr/testify/require"
 29 | 	"google.golang.org/grpc"
 30 | 	podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1"
 31 | 	"k8s.io/kubernetes/pkg/kubelet/util"
 32 | )
 33 | 
 34 | var tmpDir string
 35 | 
 36 | func TestProcessPodMapper(t *testing.T) {
 37 | 	cleanup := CreateTmpDir(t)
 38 | 	defer cleanup()
 39 | 
 40 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 41 | 	require.NoError(t, err)
 42 | 	defer cleanup()
 43 | 
 44 | 	c, cleanup := testDCGMCollector(t, sampleCounters)
 45 | 	defer cleanup()
 46 | 
 47 | 	out, err := c.GetMetrics()
 48 | 	require.NoError(t, err)
 49 | 	original := append(out[:0:0], out...)
 50 | 
 51 | 	socketPath = tmpDir + "/kubelet.sock"
 52 | 	server := grpc.NewServer()
 53 | 	gpus := GetGPUUUIDs(original)
 54 | 	podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpus))
 55 | 
 56 | 	cleanup = StartMockServer(t, server, socketPath)
 57 | 	defer cleanup()
 58 | 
 59 | 	podMapper := NewPodMapper(&Config{KubernetesGPUIdType: GPUUID})
 60 | 	var sysInfo SystemInfo
 61 | 	err = podMapper.Process(out, sysInfo)
 62 | 	require.NoError(t, err)
 63 | 
 64 | 	require.Len(t, out, len(original))
 65 | 	for i, dev := range out {
 66 | 		for _, metric := range dev {
 67 | 			require.Contains(t, metric.Attributes, podAttribute)
 68 | 			require.Contains(t, metric.Attributes, namespaceAttribute)
 69 | 			require.Contains(t, metric.Attributes, containerAttribute)
 70 | 
 71 | 			// TODO currently we rely on ordering and implicit expectations of the mock implementation
 72 | 			// This should be a table comparison
 73 | 			require.Equal(t, metric.Attributes[podAttribute], fmt.Sprintf("gpu-pod-%d", i))
 74 | 			require.Equal(t, metric.Attributes[namespaceAttribute], "default")
 75 | 			require.Equal(t, metric.Attributes[containerAttribute], "default")
 76 | 		}
 77 | 	}
 78 | }
 79 | 
 80 | func GetGPUUUIDs(metrics [][]Metric) []string {
 81 | 	gpus := make([]string, len(metrics))
 82 | 	for i, dev := range metrics {
 83 | 		gpus[i] = dev[0].GPUUUID
 84 | 	}
 85 | 
 86 | 	return gpus
 87 | }
 88 | 
 89 | func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() {
 90 | 	l, err := util.CreateListener("unix://" + socket)
 91 | 	require.NoError(t, err)
 92 | 
 93 | 	stopped := make(chan interface{})
 94 | 
 95 | 	go func() {
 96 | 		server.Serve(l)
 97 | 		close(stopped)
 98 | 	}()
 99 | 
100 | 	return func() {
101 | 		server.Stop()
102 | 		select {
103 | 		case <-stopped:
104 | 			return
105 | 		case <-time.After(1 * time.Second):
106 | 			t.Fatal("Failed waiting for gRPC server to stop")
107 | 		}
108 | 	}
109 | }
110 | 
111 | func CreateTmpDir(t *testing.T) func() {
112 | 	path, err := ioutil.TempDir("", "gpu-monitoring-tools")
113 | 	require.NoError(t, err)
114 | 
115 | 	tmpDir = path
116 | 
117 | 	return func() {
118 | 		require.NoError(t, os.RemoveAll(tmpDir))
119 | 	}
120 | }
121 | 
// PodResourcesMockServer is a test double for the pod-resources lister
// service. It holds the list of GPU UUIDs it reports as allocated.
type PodResourcesMockServer struct {
	gpus []string
}

// NewPodResourcesMockServer returns a mock that reports the given UUIDs.
// The slice is stored as-is, not copied.
func NewPodResourcesMockServer(used []string) *PodResourcesMockServer {
	mock := new(PodResourcesMockServer)
	mock.gpus = used
	return mock
}
132 | 
133 | func (s *PodResourcesMockServer) List(ctx context.Context, req *podresourcesapi.ListPodResourcesRequest) (*podresourcesapi.ListPodResourcesResponse, error) {
134 | 	podResources := make([]*podresourcesapi.PodResources, len(s.gpus))
135 | 
136 | 	for i, gpu := range s.gpus {
137 | 		podResources[i] = &podresourcesapi.PodResources{
138 | 			Name:      fmt.Sprintf("gpu-pod-%d", i),
139 | 			Namespace: "default",
140 | 			Containers: []*podresourcesapi.ContainerResources{
141 | 				&podresourcesapi.ContainerResources{
142 | 					Name: "default",
143 | 					Devices: []*podresourcesapi.ContainerDevices{
144 | 						&podresourcesapi.ContainerDevices{
145 | 							ResourceName: nvidiaResourceName,
146 | 							DeviceIds:    []string{gpu},
147 | 						},
148 | 					},
149 | 				},
150 | 			},
151 | 		}
152 | 	}
153 | 
154 | 	return &podresourcesapi.ListPodResourcesResponse{
155 | 		PodResources: podResources,
156 | 	}, nil
157 | 
158 | }
159 | 


--------------------------------------------------------------------------------
/pkg/parser.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package main
 18 | 
 19 | import (
 20 | 	"encoding/csv"
 21 | 	"fmt"
 22 | 	"os"
 23 | 	"strings"
 24 | 
 25 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 26 | 	"github.com/sirupsen/logrus"
 27 | )
 28 | 
 29 | func ExtractCounters(filename string, dcpAllowed bool) ([]Counter, error) {
 30 | 	records, err := ReadCSVFile(filename)
 31 | 	if err != nil {
 32 | 		fmt.Printf("Error: %v\n", err)
 33 | 		return nil, err
 34 | 	}
 35 | 
 36 | 	counters, err := extractCounters(records, dcpAllowed)
 37 | 	if err != nil {
 38 | 		return nil, err
 39 | 	}
 40 | 
 41 | 	return counters, err
 42 | }
 43 | 
 44 | func ReadCSVFile(filename string) ([][]string, error) {
 45 | 	file, err := os.Open(filename)
 46 | 	if err != nil {
 47 | 		return nil, err
 48 | 	}
 49 | 
 50 | 	defer file.Close()
 51 | 
 52 | 	r := csv.NewReader(file)
 53 | 	records, err := r.ReadAll()
 54 | 
 55 | 	return records, err
 56 | }
 57 | 
 58 | func extractCounters(records [][]string, dcpAllowed bool) ([]Counter, error) {
 59 | 	f := make([]Counter, 0, len(records))
 60 | 
 61 | 	for i, record := range records {
 62 | 		var useOld = false
 63 | 		if len(record) == 0 {
 64 | 			continue
 65 | 		}
 66 | 
 67 | 		for j, r := range record {
 68 | 			record[j] = strings.Trim(r, " ")
 69 | 		}
 70 | 
 71 | 		if recordIsCommentOrEmpty(record) {
 72 | 			logrus.Debugf("Skipping line %d (`%v`)", i, record)
 73 | 			continue
 74 | 		}
 75 | 
 76 | 		if len(record) != 3 {
 77 | 			return nil, fmt.Errorf("Malformed CSV record, failed to parse line %d (`%v`), expected 3 fields", i, record)
 78 | 		}
 79 | 
 80 | 		fieldID, ok := dcgm.DCGM_FI[record[0]]
 81 | 		oldFieldID, oldOk := dcgm.OLD_DCGM_FI[record[0]]
 82 | 		if !ok && !oldOk {
 83 | 			return nil, fmt.Errorf("Could not find DCGM field %s", record[0])
 84 | 		}
 85 | 
 86 | 		if !ok && oldOk {
 87 | 			useOld = true
 88 | 		}
 89 | 
 90 | 		if !useOld {
 91 | 			if !dcpAllowed && fieldID >= 1000 {
 92 | 				logrus.Warnf("Skipping line %d ('%s'): DCP metrics not enabled", i, record[0])
 93 | 				continue
 94 | 			}
 95 | 
 96 | 			if _, ok := promMetricType[record[1]]; !ok {
 97 | 				return nil, fmt.Errorf("Could not find Prometheus metry type %s", record[1])
 98 | 			}
 99 | 
100 | 			f = append(f, Counter{fieldID, record[0], record[1], record[2]})
101 | 		} else {
102 | 			if !dcpAllowed && oldFieldID >= 1000 {
103 | 				logrus.Warnf("Skipping line %d ('%s'): DCP metrics not enabled", i, record[0])
104 | 				continue
105 | 			}
106 | 
107 | 			if _, ok := promMetricType[record[1]]; !ok {
108 | 				return nil, fmt.Errorf("Could not find Prometheus metry type %s", record[1])
109 | 			}
110 | 
111 | 			f = append(f, Counter{oldFieldID, record[0], record[1], record[2]})
112 | 
113 | 		}
114 | 	}
115 | 
116 | 	return f, nil
117 | }
118 | 
// recordIsCommentOrEmpty reports whether a CSV record should be ignored:
// it has no fields, its first field is empty, or its first field starts
// with '#'.
func recordIsCommentOrEmpty(s []string) bool {
	if len(s) == 0 {
		return true
	}

	first := s[0]
	return first == "" || first[0] == '#'
}
130 | 


--------------------------------------------------------------------------------
/pkg/pipeline_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package main
18 | 
19 | import (
20 | 	"testing"
21 | 
22 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
23 | 	"github.com/stretchr/testify/require"
24 | )
25 | 
26 | func TestRun(t *testing.T) {
27 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
28 | 	require.NoError(t, err)
29 | 	defer cleanup()
30 | 
31 | 	c, cleanup := testDCGMCollector(t, sampleCounters)
32 | 	defer cleanup()
33 | 
34 | 	p, cleanup, err := NewMetricsPipelineWithGPUCollector(&Config{}, c)
35 | 	defer cleanup()
36 | 
37 | 	out, err := p.run()
38 | 	require.NoError(t, err)
39 | 	require.NotEmpty(t, out)
40 | 
41 | 	// Note it is pretty difficult to make non superficial tests without
42 | 	// writting a full blown parser, always look at the results
43 | 	// We'll be testing them more throughly in the e2e tests (e.g: by running prometheus).
44 | 	t.Logf("Pipeline result is:\n%v", out)
45 | }
46 | 


--------------------------------------------------------------------------------
/pkg/server.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package main
 18 | 
 19 | import (
 20 | 	"context"
 21 | 	"net/http"
 22 | 	"sync"
 23 | 	"time"
 24 | 
 25 | 	"github.com/gorilla/mux"
 26 | 	"github.com/sirupsen/logrus"
 27 | )
 28 | 
 29 | func NewMetricsServer(c *Config, metrics chan string) (*MetricsServer, func(), error) {
 30 | 	router := mux.NewRouter()
 31 | 	serverv1 := &MetricsServer{
 32 | 		server: http.Server{
 33 | 			Addr:         c.Address,
 34 | 			Handler:      router,
 35 | 			ReadTimeout:  10 * time.Second,
 36 | 			WriteTimeout: 10 * time.Second,
 37 | 		},
 38 | 		metricsChan: metrics,
 39 | 		metrics:     "",
 40 | 	}
 41 | 
 42 | 	router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
 43 | 		w.Write([]byte(`<html>
 44 | 			<head><title>GPU Exporter</title></head>
 45 | 			<body>
 46 | 			<h1>GPU Exporter</h1>
 47 | 			<p><a href="./metrics">Metrics</a></p>
 48 | 			</body>
 49 | 			</html>`))
 50 | 	})
 51 | 
 52 | 	router.HandleFunc("/health", serverv1.Health)
 53 | 	router.HandleFunc("/metrics", serverv1.Metrics)
 54 | 
 55 | 	return serverv1, func() {}, nil
 56 | }
 57 | 
 58 | func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) {
 59 | 	defer wg.Done()
 60 | 
 61 | 	var httpwg sync.WaitGroup
 62 | 	httpwg.Add(1)
 63 | 	go func() {
 64 | 		defer httpwg.Done()
 65 | 		logrus.Info("Starting webserver")
 66 | 		if err := s.server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
 67 | 			logrus.Fatalf("Failed to Listen and Server HTTP server with err: `%v`", err)
 68 | 		}
 69 | 	}()
 70 | 
 71 | 	httpwg.Add(1)
 72 | 	go func() {
 73 | 		defer httpwg.Done()
 74 | 		for {
 75 | 			select {
 76 | 			case <-stop:
 77 | 				return
 78 | 			case m := <-s.metricsChan:
 79 | 				s.updateMetrics(m)
 80 | 			}
 81 | 		}
 82 | 	}()
 83 | 
 84 | 	<-stop
 85 | 	if err := s.server.Shutdown(context.Background()); err != nil {
 86 | 		logrus.Fatalf("Failed to shutdown HTTP server, with err: `%v`", err)
 87 | 	}
 88 | 
 89 | 	if err := WaitWithTimeout(&httpwg, 3*time.Second); err != nil {
 90 | 		logrus.Fatalf("Failed waiting for HTTP server to shutdown, with err: `%v`", err)
 91 | 	}
 92 | }
 93 | 
 94 | func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) {
 95 | 	w.WriteHeader(http.StatusOK)
 96 | 	w.Write([]byte(s.getMetrics()))
 97 | }
 98 | 
 99 | func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request) {
100 | 	if s.getMetrics() == "" {
101 | 		w.WriteHeader(http.StatusServiceUnavailable)
102 | 		w.Write([]byte("KO"))
103 | 	} else {
104 | 		w.WriteHeader(http.StatusOK)
105 | 		w.Write([]byte("OK"))
106 | 	}
107 | }
108 | 
109 | func (s *MetricsServer) updateMetrics(m string) {
110 | 	s.Lock()
111 | 	defer s.Unlock()
112 | 
113 | 	s.metrics = m
114 | }
115 | 
116 | func (s *MetricsServer) getMetrics() string {
117 | 	s.Lock()
118 | 	defer s.Unlock()
119 | 
120 | 	return s.metrics
121 | }
122 | 


--------------------------------------------------------------------------------
/pkg/system_info_test.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package main
 18 | 
 19 | import (
 20 | 	"fmt"
 21 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 22 | 	"github.com/stretchr/testify/require"
 23 | 	"testing"
 24 | )
 25 | 
 26 | const (
 27 | 	fakeProfileName string = "2fake.4gb"
 28 | )
 29 | 
 30 | func SpoofSystemInfo() SystemInfo {
 31 | 	var sysInfo SystemInfo
 32 | 	sysInfo.GpuCount = 2
 33 | 	sysInfo.MigEnabled = true
 34 | 	sysInfo.Gpus[0].DeviceInfo.GPU = 0
 35 | 	gi := GpuInstanceInfo{
 36 | 		Info:        dcgm.MigEntityInfo{"fake", 0, 0, 0, 0, 3},
 37 | 		ProfileName: fakeProfileName,
 38 | 		EntityId:    0,
 39 | 	}
 40 | 	sysInfo.Gpus[0].GpuInstances = append(sysInfo.Gpus[0].GpuInstances, gi)
 41 | 	gi2 := GpuInstanceInfo{
 42 | 		Info:        dcgm.MigEntityInfo{"fake", 0, 1, 0, 0, 3},
 43 | 		ProfileName: fakeProfileName,
 44 | 		EntityId:    14,
 45 | 	}
 46 | 	sysInfo.Gpus[1].GpuInstances = append(sysInfo.Gpus[1].GpuInstances, gi2)
 47 | 	sysInfo.Gpus[1].DeviceInfo.GPU = 1
 48 | 
 49 | 	return sysInfo
 50 | }
 51 | 
 52 | func TestMonitoredEntities(t *testing.T) {
 53 | 	sysInfo := SpoofSystemInfo()
 54 | 	sysInfo.dOpt.Flex = true
 55 | 
 56 | 	monitoring := GetMonitoredEntities(sysInfo)
 57 | 	require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring)))
 58 | 	instanceCount := 0
 59 | 	gpuCount := 0
 60 | 	for _, mi := range monitoring {
 61 | 		if mi.Entity.EntityGroupId == dcgm.FE_GPU_I {
 62 | 			instanceCount = instanceCount + 1
 63 | 			require.NotEqual(t, mi.InstanceInfo, nil, "Expected InstanceInfo to be populated but it wasn't")
 64 | 			require.Equal(t, mi.InstanceInfo.ProfileName, fakeProfileName, "Expected profile named '%s' but found '%s'", fakeProfileName, mi.InstanceInfo.ProfileName)
 65 | 			if mi.Entity.EntityId != uint(0) {
 66 | 				// One of these should be 0, the other should be 14
 67 | 				require.Equal(t, mi.Entity.EntityId, uint(14), "Expected 14 as EntityId but found %s", monitoring[1].Entity.EntityId)
 68 | 			}
 69 | 		} else {
 70 | 			gpuCount = gpuCount + 1
 71 | 			require.Equal(t, mi.InstanceInfo, (*GpuInstanceInfo)(nil), "Expected InstanceInfo to be nil but it wasn't")
 72 | 		}
 73 | 	}
 74 | 	require.Equal(t, instanceCount, 2, "Expected 2 GPU instances but found %d", instanceCount)
 75 | 	require.Equal(t, gpuCount, 0, "Expected 0 GPUs but found %d", gpuCount)
 76 | 
 77 | 	sysInfo.MigEnabled = false // we are now monitoring the GPUs
 78 | 	monitoring = GetMonitoredEntities(sysInfo)
 79 | 	require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring)))
 80 | 	for i, mi := range monitoring {
 81 | 		require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_GPU, "Expected FE_GPU but found %d", mi.Entity.EntityGroupId)
 82 | 		require.Equal(t, uint(i), mi.DeviceInfo.GPU, "Expected GPU %d but found %d", i, mi.DeviceInfo.GPU)
 83 | 		require.Equal(t, (*GpuInstanceInfo)(nil), mi.InstanceInfo, "Expected InstanceInfo not to be populated but it was")
 84 | 	}
 85 | }
 86 | 
 87 | func TestVerifyDevicePresence(t *testing.T) {
 88 | 	sysInfo := SpoofSystemInfo()
 89 | 	var dOpt DeviceOptions
 90 | 	dOpt.Flex = true
 91 | 	err := VerifyDevicePresence(&sysInfo, dOpt)
 92 | 	require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
 93 | 
 94 | 	dOpt.Flex = false
 95 | 	dOpt.GpuRange = append(dOpt.GpuRange, -1)
 96 | 	dOpt.GpuInstanceRange = append(dOpt.GpuInstanceRange, -1)
 97 | 	err = VerifyDevicePresence(&sysInfo, dOpt)
 98 | 	require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
 99 | 
100 | 	dOpt.GpuInstanceRange[0] = 10 // this GPU instance doesn't exist
101 | 	err = VerifyDevicePresence(&sysInfo, dOpt)
102 | 	require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU instance, but none found")
103 | 
104 | 	dOpt.GpuRange[0] = 10 // this GPU doesn't exist
105 | 	dOpt.GpuInstanceRange[0] = -1
106 | 	err = VerifyDevicePresence(&sysInfo, dOpt)
107 | 	require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU, but none found")
108 | 
109 | 	// Add GPUs and instances that exist
110 | 	dOpt.GpuRange[0] = 0
111 | 	dOpt.GpuRange = append(dOpt.GpuRange, 1)
112 | 	dOpt.GpuInstanceRange[0] = 0
113 | 	dOpt.GpuInstanceRange = append(dOpt.GpuInstanceRange, 14)
114 | 	err = VerifyDevicePresence(&sysInfo, dOpt)
115 | 	require.Equal(t, err, nil, "Expected to have no error, but found %s", err)
116 | }
117 | 
118 | //func TestMigProfileNames(t *testing.T) {
119 | //	sysInfo := SpoofSystemInfo()
120 | //    SetMigProfileNames(sysInfo, values)
121 | //}
122 | 


--------------------------------------------------------------------------------
/pkg/types.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package main
 18 | 
 19 | import (
 20 | 	"fmt"
 21 | 	"net/http"
 22 | 	"sync"
 23 | 	"text/template"
 24 | 
 25 | 	"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm"
 26 | )
 27 | 
 28 | var (
 29 | 	SkipDCGMValue   = "SKIPPING DCGM VALUE"
 30 | 	FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING"
 31 | 
 32 | 	nvidiaResourceName      = "nvidia.com/gpu"
 33 | 	nvidiaMigResourcePrefix = "nvidia.com/mig-"
 34 | 	MIG_UUID_PREFIX         = "MIG-"
 35 | 
 36 | 	// Note standard resource attributes
 37 | 	podAttribute       = "pod"
 38 | 	namespaceAttribute = "namespace"
 39 | 	containerAttribute = "container"
 40 | 
 41 | 	oldPodAttribute       = "pod_name"
 42 | 	oldNamespaceAttribute = "pod_namespace"
 43 | 	oldContainerAttribute = "container_name"
 44 | )
 45 | 
 46 | type KubernetesGPUIDType string
 47 | 
 48 | const (
 49 | 	GPUUID     KubernetesGPUIDType = "uid"
 50 | 	DeviceName KubernetesGPUIDType = "device-name"
 51 | )
 52 | 
 53 | const (
 54 | 	FlexKey        = "f" // Monitor all GPUs if MIG is disabled or all GPU instances if MIG is enabled
 55 | 	GPUKey         = "g" // Monitor GPUs
 56 | 	GPUInstanceKey = "i" // Monitor GPU instances - cannot be specified if MIG is disabled
 57 | )
 58 | 
 59 | type DeviceOptions struct {
 60 | 	Flex             bool  // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled.
 61 | 	GpuRange         []int // The indices of each GPU to monitor, or -1 to monitor all
 62 | 	GpuInstanceRange []int // The indices of each GPU instance to monitor, or -1 to monitor all
 63 | }
 64 | 
 65 | type Config struct {
 66 | 	CollectorsFile      string
 67 | 	Address             string
 68 | 	CollectInterval     int
 69 | 	Kubernetes          bool
 70 | 	KubernetesGPUIdType KubernetesGPUIDType
 71 | 	CollectDCP          bool
 72 | 	UseOldNamespace     bool
 73 | 	UseRemoteHE         bool
 74 | 	RemoteHEInfo        string
 75 | 	Devices             DeviceOptions
 76 | 	NoHostname          bool
 77 | 	UseFakeGpus         bool
 78 | }
 79 | 
 80 | type Transform interface {
 81 | 	Process(metrics [][]Metric, sysInfo SystemInfo) error
 82 | 	Name() string
 83 | }
 84 | 
 85 | type MetricsPipeline struct {
 86 | 	config *Config
 87 | 
 88 | 	transformations  []Transform
 89 | 	metricsFormat    *template.Template
 90 | 	migMetricsFormat *template.Template
 91 | 
 92 | 	counters     []Counter
 93 | 	gpuCollector *DCGMCollector
 94 | }
 95 | 
 96 | type DCGMCollector struct {
 97 | 	Counters        []Counter
 98 | 	DeviceFields    []dcgm.Short
 99 | 	Cleanups        []func()
100 | 	UseOldNamespace bool
101 | 	SysInfo         SystemInfo
102 | 	Hostname        string
103 | }
104 | 
105 | type Counter struct {
106 | 	FieldID   dcgm.Short
107 | 	FieldName string
108 | 	PromType  string
109 | 	Help      string
110 | }
111 | 
112 | type Metric struct {
113 | 	Counter *Counter
114 | 	Value   string
115 | 
116 | 	GPU          string
117 | 	GPUUUID      string
118 | 	GPUDevice    string
119 | 	GPUModelName string
120 | 
121 | 	UUID string
122 | 
123 | 	MigProfile    string
124 | 	GPUInstanceID string
125 | 	Hostname      string
126 | 
127 | 	Attributes map[string]string
128 | }
129 | 
130 | func (m Metric) getIDOfType(idType KubernetesGPUIDType) (string, error) {
131 | 	// For MIG devices, return the MIG profile instead of
132 | 	if m.MigProfile != "" {
133 | 		return fmt.Sprintf("%s-%s", m.GPU, m.GPUInstanceID), nil
134 | 	}
135 | 	switch idType {
136 | 	case GPUUID:
137 | 		return m.GPUUUID, nil
138 | 	case DeviceName:
139 | 		return m.GPUDevice, nil
140 | 	}
141 | 	return "", fmt.Errorf("unsupported KubernetesGPUIDType for MetricID '%s'", idType)
142 | }
143 | 
// promMetricType is the set of Prometheus metric types accepted in a counter
// definition; membership is tested with the comma-ok form.
var promMetricType = map[string]bool{"gauge": true, "counter": true, "histogram": true, "summary": true}
150 | 
// MetricsServer serves the latest metrics payload over HTTP.
type MetricsServer struct {
	// Mutex guards metrics.
	sync.Mutex

	server http.Server
	// metrics is the most recently received payload.
	metrics string
	// metricsChan delivers new payloads.
	metricsChan chan string
}
158 | 
159 | type PodMapper struct {
160 | 	Config *Config
161 | }
162 | 
// PodInfo identifies a container within a pod.
type PodInfo struct {
	// Name is the pod name.
	Name string
	// Namespace is the pod's namespace.
	Namespace string
	// Container is the container name.
	Container string
}
168 | 


--------------------------------------------------------------------------------
/pkg/utils.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package main
18 | 
19 | import (
20 | 	"fmt"
21 | 	"os"
22 | 	"os/signal"
23 | 	"sync"
24 | 	"time"
25 | )
26 | 
27 | func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error {
28 | 	c := make(chan struct{})
29 | 	go func() {
30 | 		defer close(c)
31 | 		wg.Wait()
32 | 	}()
33 | 	select {
34 | 	case <-c:
35 | 		return nil
36 | 	case <-time.After(timeout):
37 | 		return fmt.Errorf("Timeout waiting for WaitGroup")
38 | 	}
39 | }
40 | 
41 | func newOSWatcher(sigs ...os.Signal) chan os.Signal {
42 | 	sigChan := make(chan os.Signal, 1)
43 | 	signal.Notify(sigChan, sigs...)
44 | 
45 | 	return sigChan
46 | }
47 | 


--------------------------------------------------------------------------------
/service-monitor.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
# ServiceMonitor selecting the dcgm-exporter service by its name and version
# labels; metrics are scraped from the "metrics" port at /metrics.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: dcgm-exporter
  labels:
    app.kubernetes.io/name: dcgm-exporter
    app.kubernetes.io/version: "2.4.0"
spec:
  selector:
    # Must match the labels set on the exporter's Service.
    matchLabels:
      app.kubernetes.io/name: dcgm-exporter
      app.kubernetes.io/version: "2.4.0"
  endpoints:
    - port: metrics
      path: /metrics
30 | 


--------------------------------------------------------------------------------
/tests/ci-run-e2e.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash -x
 2 | # Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
set -euxo pipefail
shopt -s lastpipe

# Directory containing this script. Resolved in separate steps so that a
# failing realpath or dirname aborts the script (set -e) instead of being
# masked by the nested substitution and the exit status of `readonly`.
script_path="$(realpath "$0")"
basedir="$(dirname "${script_path}")"
readonly script_path basedir

# shellcheck source=tests/common.sh
source "${basedir}/common.sh"

# shellcheck source=tests/metrics.sh
source "${basedir}/metrics.sh"

# Image coordinates provided by the CI; default to "undefined" when unset.
CI_REGISTRY_IMAGE=${CI_REGISTRY_IMAGE:-"undefined"}
CI_COMMIT_SHORT_SHA=${CI_COMMIT_SHORT_SHA:-"undefined"}
29 | 
# Install jq from the distribution's package repositories.
install::jq() {
	apt update \
		&& apt install -y --no-install-recommends jq
}
33 | 
# Install Helm 3 with the upstream installer script.
# -f makes curl fail on HTTP errors so an error page is never piped into
# bash (the failure then propagates through pipefail); -sS keeps the output
# quiet but still prints errors; -L follows redirects.
install::helm() {
	curl -fsSL https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
}
37 | 
# Package the dcgm-exporter chart and install it, pointing the chart at the
# image built by this CI run and enabling its ServiceMonitor.
install::dcgm::exporter() {
	local -a overrides=(
		--set "image.repository=${CI_REGISTRY_IMAGE}/dcgm-exporter"
		--set "image.tag=${CI_COMMIT_SHORT_SHA}"
		--set "serviceMonitor.enabled=true"
	)

	helm package deployment/dcgm-exporter
	helm install --wait dcgm-exporter ./*.tgz "${overrides[@]}"
}
42 | 
# Install the prometheus-operator chart from the "stable" Helm repository.
install::prom() {
	local -a overrides=(
		--set "prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false"
	)

	helm repo add stable https://charts.helm.sh/stable
	helm install --wait stable/prometheus-operator --generate-name "${overrides[@]}"
}
48 | 
# Run the Prometheus query given as $1 against the in-cluster Prometheus
# service and print the ".data.result" part of the answer.
# Returns 1 when the result is empty or an empty list.
# Sets the globals IP and val.
query::prom() {
	IP="$(kubectl get svc -l app=prometheus-operator-prometheus -o jsonpath='{.items[0].spec.clusterIP}')"
	val="$(curl -sL "http://${IP}:9090/api/v1/query?query=${1}" | jq -r '.data.result')"

	case "${val}" in
		"" | "[]") return 1 ;;
	esac

	echo "$val"
}
58 | 
# Succeed when pod $1 is in phase $2, return 1 otherwise.
# Sets the global state.
query::pod::phase() {
	state="$(kubectl get pods "$1" -o jsonpath='{.status.phase}')"
	if [ "$state" != "$2" ]; then
		return 1
	fi
}
63 | 
# Dump the cluster state for debugging: pods, services, service monitors and
# the full YAML of the dcgm-exporter pods.
testing::log::kube() {
	local resource
	for resource in pods svc serviceMonitor; do
		kubectl get "${resource}"
	done

	kubectl get pods -l "app.kubernetes.io/component=dcgm-exporter" -o yaml
}
71 | 
# Install the tooling and the charts under test.
install::jq
install::helm
install::prom
install::dcgm::exporter

# From here on, log the cluster state whenever a command fails.
trap 'testing::log::kube' ERR

# Each test case provides testing::<name>::setup, ::main and ::cleanup.
for test_case in "metrics"; do
	log INFO "=================Testing ${test_case}================="
	for stage in setup main cleanup; do
		"testing::${test_case}::${stage}" "$@"
	done
done
85 | 
86 | 


--------------------------------------------------------------------------------
/tests/common.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # shellcheck disable=SC2015
17 | [ -t 2 ] && readonly LOG_TTY=1 || readonly LOG_NO_TTY=1
18 | 
19 | if [ "${LOG_TTY-0}" -eq 1 ] && [ "$(tput colors)" -ge 15 ]; then
20 | 	readonly FMT_BOLD=$(tput bold)
21 | 	readonly FMT_RED=$(tput setaf 1)
22 | 	readonly FMT_YELLOW=$(tput setaf 3)
23 | 	readonly FMT_BLUE=$(tput setaf 12)
24 | 	readonly FMT_CLEAR=$(tput sgr0)
25 | fi
26 | 
27 | log() {
28 | 	local -r level="$1"; shift
29 | 	local -r message="$*"
30 | 
31 | 	local fmt_on="${FMT_CLEAR-}"
32 | 	local -r fmt_off="${FMT_CLEAR-}"
33 | 
34 | 	case "${level}" in
35 | 		INFO)  fmt_on="${FMT_BLUE-}" ;;
36 | 		WARN)  fmt_on="${FMT_YELLOW-}" ;;
37 | 		ERROR) fmt_on="${FMT_RED-}" ;;
38 | 	esac
39 | 	printf "%s[%s]%s %b\n" "${fmt_on}" "${level}" "${fmt_off}" "${message}" >&2
40 | }
41 | 
42 | with_retry() {
43 | 	# Usage: with_retry MAX_ATTEMPTS DELAY COMMAND [ARGS...]
44 | 	# Runs COMMAND until it exits 0; returns 0 then, or 1 after MAX_ATTEMPTS failures (<= 0 retries forever).
45 | 	local -r max_attempts="$1" delay="$2"
46 | 	shift 2
47 | 	local count=0 rc errexit_was_on=0
48 | 	# Remember errexit so it is restored afterwards instead of being force-enabled for every caller.
49 | 	case "$-" in *e*) errexit_was_on=1 ;; esac
50 | 	while true; do
51 | 		set +e
52 | 		"$@"
53 | 		rc="$?"
54 | 		if [[ "${errexit_was_on}" -eq 1 ]]; then set -e; fi
55 | 		count="$((count+1))"
56 | 		if [[ "${rc}" -eq 0 ]]; then
57 | 			echo "'$*' SUCCEEDED in ${count} attempts !"
58 | 			return 0
59 | 		fi
60 | 		if [[ "${max_attempts}" -gt 0 ]] && [[ "${count}" -ge "${max_attempts}" ]]; then
61 | 			break
62 | 		fi
63 | 		echo "'$*' FAILED at attempt ${count}, will retry in ${delay} seconds ..."
64 | 		sleep "${delay}"
65 | 	done
66 | 
67 | 	echo "'$*' FAILED in ${count} attempts !"
68 | 	return 1
69 | }
70 | 


--------------------------------------------------------------------------------
/tests/gpu-pod.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Pod
 3 | metadata:
 4 |   name: nbody-pod
 5 | spec:
 6 |   restartPolicy: OnFailure
 7 |   containers:
 8 |     - name: nbody
 9 |       image: "nvcr.io/nvidia/k8s/cuda-sample:nbody"
10 |       command: ["nbody"]
11 |       args: ["-benchmark", "-i=10000000"]
12 |       resources:
13 |         limits:
14 |           nvidia.com/gpu: 1
15 | 


--------------------------------------------------------------------------------
/tests/metrics.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash -x
 2 | # Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | testing::metrics::setup() {
17 | 	:
18 | }
19 | 
20 | testing::metrics::cleanup() {
21 | 	kubectl delete -f tests/gpu-pod.yaml
22 | }
23 | 
24 | testing::metrics::utilization::increase() {
25 | 	# For a short while Prometheus might return several series; the first item seems to be the
26 | 	# oldest, so read the sample value of the last one ('.[-1]').
27 | 	val="$(query::prom "DCGM_FI_DEV_GPU_UTIL" | jq -r '.[-1].value[1]')"
28 | 	[ "$val" -ge 0 ] || return 1 # NOTE(review): only checks for a non-negative integer, not an actual increase
29 | }
30 | 
31 | testing::metrics::ensure::kube::labels() {
32 | 	# Checks that the first DCGM_FI_DEV_GPU_UTIL series has UUID/gpu labels and is attributed to nbody-pod in "default".
33 | 	val="$(query::prom "DCGM_FI_DEV_GPU_UTIL")"
34 | 	# "// empty" maps a missing label to ""; plain "jq -r" prints "null", which slipped past the emptiness checks below.
35 | 	UUID="$(echo "${val}" | jq -r '.[0].metric.UUID // empty')"
36 | 	gpu="$(echo "${val}" | jq -r '.[0].metric.gpu // empty')"
37 | 	pod="$(echo "${val}" | jq -r '.[0].metric.exported_pod // empty')"
38 | 	namespace="$(echo "${val}" | jq -r '.[0].metric.exported_namespace // empty')"
39 | 	[ "$UUID" != "" ] || return 1
40 | 	[ "$gpu" != "" ] || return 1
41 | 	[ "$pod" = "nbody-pod" ] || return 1
42 | 	[ "$namespace" = "default" ] || return 1
43 | }
44 | 
45 | testing::metrics::main() {
46 | 	# Prometheus can take a while to pickup the exporter
47 | 	with_retry 30 10s query::prom "DCGM_FI_DEV_MEMORY_TEMP"
48 | 
49 | 	kubectl create -f tests/gpu-pod.yaml
50 | 	with_retry 30 10s query::pod::phase "nbody-pod" "Running"
51 | 
52 | 	with_retry 10 10s testing::metrics::utilization::increase
53 | 	with_retry 10 10s testing::metrics::ensure::kube::labels
54 | }
55 | 


--------------------------------------------------------------------------------
/tests/variables.tfvars:
--------------------------------------------------------------------------------
1 | instance_type = "p3.2xlarge"
2 | project_name = "gpu-monitoring-tools"
3 | setup_params = "--driver --k8s-plugin --nvcr"
4 | 


--------------------------------------------------------------------------------
/vendor/github.com/Masterminds/semver/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: go
 2 | 
 3 | go:
 4 |   - 1.6.x
 5 |   - 1.7.x
 6 |   - 1.8.x
 7 |   - 1.9.x
 8 |   - 1.10.x
 9 |   - 1.11.x
10 |   - 1.12.x
11 |   - tip
12 | 
13 | # Setting sudo access to false will let Travis CI use containers rather than
14 | # VMs to run the tests. For more details see:
15 | # - http://docs.travis-ci.com/user/workers/container-based-infrastructure/
16 | # - http://docs.travis-ci.com/user/workers/standard-infrastructure/
17 | sudo: false
18 | 
19 | script:
20 |   - make setup
21 |   - make test
22 | 
23 | notifications:
24 |   webhooks:
25 |     urls:
26 |       - https://webhooks.gitter.im/e/06e3328629952dabe3e0
27 |     on_success: change  # options: [always|never|change] default: always
28 |     on_failure: always  # options: [always|never|change] default: always
29 |     on_start: never     # options: [always|never|change] default: always
30 | 


--------------------------------------------------------------------------------
/vendor/github.com/Masterminds/semver/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # 1.5.0 (2019-09-11)
  2 | 
  3 | ## Added
  4 | 
  5 | - #103: Add basic fuzzing for `NewVersion()` (thanks @jesse-c)
  6 | 
  7 | ## Changed
  8 | 
  9 | - #82: Clarify wildcard meaning in range constraints and update tests for it (thanks @greysteil)
 10 | - #83: Clarify caret operator range for pre-1.0.0 dependencies (thanks @greysteil)
 11 | - #72: Adding docs comment pointing to vert for a cli
 12 | - #71: Update the docs on pre-release comparator handling
 13 | - #89: Test with new go versions (thanks @thedevsaddam)
 14 | - #87: Added $ to ValidPrerelease for better validation (thanks @jeremycarroll)
 15 | 
 16 | ## Fixed
 17 | 
 18 | - #78: Fix unchecked error in example code (thanks @ravron)
 19 | - #70: Fix the handling of pre-releases and the 0.0.0 release edge case
 20 | - #97: Fixed copyright file for proper display on GitHub
 21 | - #107: Fix handling prerelease when sorting alphanum and num 
 22 | - #109: Fixed where Validate sometimes returns wrong message on error
 23 | 
 24 | # 1.4.2 (2018-04-10)
 25 | 
 26 | ## Changed
 27 | - #72: Updated the docs to point to vert for a console application
 28 | - #71: Update the docs on pre-release comparator handling
 29 | 
 30 | ## Fixed
 31 | - #70: Fix the handling of pre-releases and the 0.0.0 release edge case
 32 | 
 33 | # 1.4.1 (2018-04-02)
 34 | 
 35 | ## Fixed
 36 | - Fixed #64: Fix pre-release precedence issue (thanks @uudashr)
 37 | 
 38 | # 1.4.0 (2017-10-04)
 39 | 
 40 | ## Changed
 41 | - #61: Update NewVersion to parse ints with a 64bit int size (thanks @zknill)
 42 | 
 43 | # 1.3.1 (2017-07-10)
 44 | 
 45 | ## Fixed
 46 | - Fixed #57: number comparisons in prerelease sometimes inaccurate
 47 | 
 48 | # 1.3.0 (2017-05-02)
 49 | 
 50 | ## Added
 51 | - #45: Added json (un)marshaling support (thanks @mh-cbon)
 52 | - Stability marker. See https://masterminds.github.io/stability/
 53 | 
 54 | ## Fixed
 55 | - #51: Fix handling of single digit tilde constraint (thanks @dgodd)
 56 | 
 57 | ## Changed
 58 | - #55: The godoc icon moved from png to svg
 59 | 
 60 | # 1.2.3 (2017-04-03)
 61 | 
 62 | ## Fixed
 63 | - #46: Fixed 0.x.x and 0.0.x in constraints being treated as *
 64 | 
 65 | # Release 1.2.2 (2016-12-13)
 66 | 
 67 | ## Fixed
 68 | - #34: Fixed issue where hyphen range was not working with pre-release parsing.
 69 | 
 70 | # Release 1.2.1 (2016-11-28)
 71 | 
 72 | ## Fixed
 73 | - #24: Fixed edge case issue where constraint "> 0" does not handle "0.0.1-alpha"
 74 |   properly.
 75 | 
 76 | # Release 1.2.0 (2016-11-04)
 77 | 
 78 | ## Added
 79 | - #20: Added MustParse function for versions (thanks @adamreese)
 80 | - #15: Added increment methods on versions (thanks @mh-cbon)
 81 | 
 82 | ## Fixed
 83 | - Issue #21: Per the SemVer spec (section 9) a pre-release is unstable and
 84 |   might not satisfy the intended compatibility. The change here ignores pre-releases
 85 |   on constraint checks (e.g., ~ or ^) when a pre-release is not part of the
 86 |   constraint. For example, `^1.2.3` will ignore pre-releases while
 87 |   `^1.2.3-alpha` will include them.
 88 | 
 89 | # Release 1.1.1 (2016-06-30)
 90 | 
 91 | ## Changed
 92 | - Issue #9: Speed up version comparison performance (thanks @sdboyer)
 93 | - Issue #8: Added benchmarks (thanks @sdboyer)
 94 | - Updated Go Report Card URL to new location
 95 | - Updated Readme to add code snippet formatting (thanks @mh-cbon)
 96 | - Updating tagging to v[SemVer] structure for compatibility with other tools.
 97 | 
 98 | # Release 1.1.0 (2016-03-11)
 99 | 
100 | - Issue #2: Implemented validation to provide reasons a versions failed a
101 |   constraint.
102 | 
103 | # Release 1.0.1 (2015-12-31)
104 | 
105 | - Fixed #1: * constraint failing on valid versions.
106 | 
107 | # Release 1.0.0 (2015-10-20)
108 | 
109 | - Initial release
110 | 


--------------------------------------------------------------------------------
/vendor/github.com/Masterminds/semver/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2014-2019, Matt Butcher and Matt Farina
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/vendor/github.com/Masterminds/semver/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: setup
 2 | setup:
 3 | 	go get -u gopkg.in/alecthomas/gometalinter.v1
 4 | 	gometalinter.v1 --install
 5 | 
 6 | .PHONY: test
 7 | test: validate lint
 8 | 	@echo "==> Running tests"
 9 | 	go test -v
10 | 
11 | .PHONY: validate
12 | validate:
13 | 	@echo "==> Running static validations"
14 | 	@gometalinter.v1 \
15 | 	  --disable-all \
16 | 	  --enable deadcode \
17 | 	  --severity deadcode:error \
18 | 	  --enable gofmt \
19 | 	  --enable gosimple \
20 | 	  --enable ineffassign \
21 | 	  --enable misspell \
22 | 	  --enable vet \
23 | 	  --tests \
24 | 	  --vendor \
25 | 	  --deadline 60s \
26 | 	  ./... || exit_code=1
27 | 
28 | .PHONY: lint
29 | lint:
30 | 	@echo "==> Running linters"
31 | 	@gometalinter.v1 \
32 | 	  --disable-all \
33 | 	  --enable golint \
34 | 	  --vendor \
35 | 	  --deadline 60s \
36 | 	  ./... || :
37 | 


--------------------------------------------------------------------------------
/vendor/github.com/Masterminds/semver/appveyor.yml:
--------------------------------------------------------------------------------
 1 | version: build-{build}.{branch}
 2 | 
 3 | clone_folder: C:\gopath\src\github.com\Masterminds\semver
 4 | shallow_clone: true
 5 | 
 6 | environment:
 7 |   GOPATH: C:\gopath
 8 | 
 9 | platform:
10 |   - x64
11 | 
12 | install:
13 |   - go version
14 |   - go env
15 |   - go get -u gopkg.in/alecthomas/gometalinter.v1
16 |   - set PATH=%PATH%;%GOPATH%\bin
17 |   - gometalinter.v1.exe --install
18 | 
19 | build_script:
20 |   - go install -v ./...
21 | 
22 | test_script:
23 |   - "gometalinter.v1 \
24 |     --disable-all \
25 |     --enable deadcode \
26 |     --severity deadcode:error \
27 |     --enable gofmt \
28 |     --enable gosimple \
29 |     --enable ineffassign \
30 |     --enable misspell \
31 |     --enable vet \
32 |     --tests \
33 |     --vendor \
34 |     --deadline 60s \
35 |     ./... || exit_code=1"
36 |   - "gometalinter.v1 \
37 |     --disable-all \
38 |     --enable golint \
39 |     --vendor \
40 |     --deadline 60s \
41 |     ./... || :"
42 |   - go test -v
43 | 
44 | deploy: off
45 | 


--------------------------------------------------------------------------------
/vendor/github.com/Masterminds/semver/collection.go:
--------------------------------------------------------------------------------
 1 | package semver
 2 | 
 3 | // Collection is a slice of Version pointers that implements sort.Interface,
 4 | // so it can be passed directly to sort.Sort. See the sort package for details:
 5 | // https://golang.org/pkg/sort/
 6 | type Collection []*Version
 7 | 
 8 | // Len returns the length of the collection, i.e. the number of Version
 9 | // instances in the slice.
10 | func (c Collection) Len() int {
11 | 	return len(c)
12 | }
13 | 
14 | // Less is needed for the sort interface to compare two Version objects in the
15 | // slice. It reports whether the Version at index i is less than the one at j.
16 | func (c Collection) Less(i, j int) bool {
17 | 	return c[i].LessThan(c[j])
18 | }
19 | 
20 | // Swap is needed for the sort interface; it exchanges the Version objects
21 | // at positions i and j of the slice.
22 | func (c Collection) Swap(i, j int) {
23 | 	c[i], c[j] = c[j], c[i]
24 | }
25 | 


--------------------------------------------------------------------------------
/vendor/github.com/Masterminds/semver/doc.go:
--------------------------------------------------------------------------------
  1 | /*
  2 | Package semver provides the ability to work with Semantic Versions (http://semver.org) in Go.
  3 | 
  4 | Specifically it provides the ability to:
  5 | 
  6 |     * Parse semantic versions
  7 |     * Sort semantic versions
  8 |     * Check if a semantic version fits within a set of constraints
  9 |     * Optionally work with a `v` prefix
 10 | 
 11 | Parsing Semantic Versions
 12 | 
 13 | To parse a semantic version use the `NewVersion` function. For example,
 14 | 
 15 |     v, err := semver.NewVersion("1.2.3-beta.1+build345")
 16 | 
 17 | If there is an error the version wasn't parseable. The version object has methods
 18 | to get the parts of the version, compare it to other versions, convert the
 19 | version back into a string, and get the original string. For more details
 20 | please see the documentation at https://godoc.org/github.com/Masterminds/semver.
 21 | 
 22 | Sorting Semantic Versions
 23 | 
 24 | A set of versions can be sorted using the `sort` package from the standard library.
 25 | For example,
 26 | 
 27 |     raw := []string{"1.2.3", "1.0", "1.3", "2", "0.4.2",}
 28 |     vs := make([]*semver.Version, len(raw))
 29 | 	for i, r := range raw {
 30 | 		v, err := semver.NewVersion(r)
 31 | 		if err != nil {
 32 | 			t.Errorf("Error parsing version: %s", err)
 33 | 		}
 34 | 
 35 | 		vs[i] = v
 36 | 	}
 37 | 
 38 | 	sort.Sort(semver.Collection(vs))
 39 | 
 40 | Checking Version Constraints
 41 | 
 42 | Checking a version against version constraints is one of the most featureful
 43 | parts of the package.
 44 | 
 45 |     c, err := semver.NewConstraint(">= 1.2.3")
 46 |     if err != nil {
 47 |         // Handle constraint not being parseable.
 48 |     }
 49 | 
 50 |     v, err := semver.NewVersion("1.3")
 51 |     if err != nil {
 52 |         // Handle version not being parseable.
 53 |     }
 54 |     // Check if the version meets the constraints. The a variable will be true.
 55 |     a := c.Check(v)
 56 | 
 57 | Basic Comparisons
 58 | 
 59 | There are two elements to the comparisons. First, a comparison string is a list
 60 | of comma-separated AND comparisons. These lists are then joined by || into OR
 61 | comparisons. For example, `">= 1.2, < 3.0.0 || >= 4.2.3"` is looking for a
 62 | version that's greater than or equal to 1.2 and less than 3.0.0, or is
 63 | greater than or equal to 4.2.3.
 64 | 
 65 | The basic comparisons are:
 66 | 
 67 |     * `=`: equal (aliased to no operator)
 68 |     * `!=`: not equal
 69 |     * `>`: greater than
 70 |     * `<`: less than
 71 |     * `>=`: greater than or equal to
 72 |     * `<=`: less than or equal to
 73 | 
 74 | Hyphen Range Comparisons
 75 | 
 76 | There are multiple methods to handle ranges and the first is hyphens ranges.
 77 | These look like:
 78 | 
 79 |     * `1.2 - 1.4.5` which is equivalent to `>= 1.2, <= 1.4.5`
 80 |     * `2.3.4 - 4.5` which is equivalent to `>= 2.3.4, <= 4.5`
 81 | 
 82 | Wildcards In Comparisons
 83 | 
 84 | The `x`, `X`, and `*` characters can be used as a wildcard character. This works
 85 | for all comparison operators. When used on the `=` operator it falls
 86 | back to the patch level comparison (see tilde below). For example,
 87 | 
 88 |     * `1.2.x` is equivalent to `>= 1.2.0, < 1.3.0`
 89 |     * `>= 1.2.x` is equivalent to `>= 1.2.0`
 90 |     * `<= 2.x` is equivalent to `< 3`
 91 |     * `*` is equivalent to `>= 0.0.0`
 92 | 
 93 | Tilde Range Comparisons (Patch)
 94 | 
 95 | The tilde (`~`) comparison operator is for patch level ranges when a minor
 96 | version is specified and major level changes when the minor number is missing.
 97 | For example,
 98 | 
 99 |     * `~1.2.3` is equivalent to `>= 1.2.3, < 1.3.0`
100 |     * `~1` is equivalent to `>= 1, < 2`
101 |     * `~2.3` is equivalent to `>= 2.3, < 2.4`
102 |     * `~1.2.x` is equivalent to `>= 1.2.0, < 1.3.0`
103 |     * `~1.x` is equivalent to `>= 1, < 2`
104 | 
105 | Caret Range Comparisons (Major)
106 | 
107 | The caret (`^`) comparison operator is for major level changes. This is useful
108 | when comparing API versions, as a major change is API breaking. For example,
109 | 
110 |     * `^1.2.3` is equivalent to `>= 1.2.3, < 2.0.0`
111 |     * `^1.2.x` is equivalent to `>= 1.2.0, < 2.0.0`
112 |     * `^2.3` is equivalent to `>= 2.3, < 3`
113 |     * `^2.x` is equivalent to `>= 2.0.0, < 3`
114 | */
115 | package semver
116 | 


--------------------------------------------------------------------------------
/vendor/github.com/Masterminds/semver/version_fuzz.go:
--------------------------------------------------------------------------------
 1 | // +build gofuzz
 2 | 
 3 | package semver
 4 | 
 5 | func Fuzz(data []byte) int {
 6 | 	if _, err := NewVersion(string(data)); err != nil {
 7 | 		return 0
 8 | 	}
 9 | 	return 1
10 | }
11 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/api.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os"
  6 | 	"sync"
  7 | )
  8 | 
  9 | var (
 10 | 	dcgmInitCounter int
 11 | 	mux             sync.Mutex
 12 | )
 13 | 
 14 | // Init starts DCGM in the selected mode and returns a cleanup func that calls Shutdown.
 15 | // Calls are reference counted: initDcgm only runs while the counter is 0. The 3 different modes:
 16 | // 1. Embedded: Start hostengine within this process
 17 | // 2. Standalone: Connect to an already running nv-hostengine at the specified address
 18 | //    Connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket"
 19 | // 3. StartHostengine: Open an Unix socket to start and connect to the nv-hostengine and terminate before exiting
 20 | func Init(m mode, args ...string) (cleanup func(), err error) {
 21 | 	mux.Lock()
 22 | 	if dcgmInitCounter < 0 {
 23 | 		count := fmt.Sprintf("%d", dcgmInitCounter)
 24 | 		err = fmt.Errorf("Shutdown() is called %s times, before Init()", count[1:]) // count[1:] drops the minus sign
 25 | 	}
 26 | 	if dcgmInitCounter == 0 { // only the first reference actually initializes DCGM
 27 | 		err = initDcgm(m, args...)
 28 | 	}
 29 | 	dcgmInitCounter += 1 // NOTE(review): incremented even when initDcgm failed -- confirm this is intended
 30 | 	mux.Unlock()
 31 | 
 32 | 	return func() { // cleanup: calls Shutdown and reports a failure on stderr instead of returning it
 33 | 		if err := Shutdown(); err != nil {
 34 | 			fmt.Fprintf(os.Stderr, "Failed to shutdown DCGM with error: `%v`", err)
 35 | 		}
 36 | 	}, err
 37 | }
 38 | 
 39 | // Shutdown drops one Init reference and stops DCGM when the last one goes away.
 40 | func Shutdown() (err error) {
 41 | 	mux.Lock()
 42 | 	defer mux.Unlock()
 43 | 
 44 | 	switch {
 45 | 	case dcgmInitCounter <= 0:
 46 | 		err = fmt.Errorf("Init() needs to be called before Shutdown()")
 47 | 	case dcgmInitCounter == 1:
 48 | 		err = shutdown()
 49 | 	}
 50 | 	dcgmInitCounter--
 51 | 	return err
 52 | }
 53 | 
 54 | // GetAllDeviceCount counts all GPUs on the system
 55 | func GetAllDeviceCount() (uint, error) {
 56 | 	return getAllDeviceCount()
 57 | }
 58 | 
 59 | // GetSupportedDevices returns only DCGM supported GPUs
 60 | func GetSupportedDevices() ([]uint, error) {
 61 | 	return getSupportedDevices()
 62 | }
 63 | 
 64 | // GetDeviceInfo describes the given device
 65 | func GetDeviceInfo(gpuId uint) (Device, error) {
 66 | 	return getDeviceInfo(gpuId)
 67 | }
 68 | 
 69 | // GetDeviceStatus monitors GPU status including its power, memory and GPU utilization
 70 | func GetDeviceStatus(gpuId uint) (DeviceStatus, error) {
 71 | 	return latestValuesForDevice(gpuId)
 72 | }
 73 | 
 74 | // GetDeviceTopology returns device topology corresponding to the gpuId
 75 | func GetDeviceTopology(gpuId uint) ([]P2PLink, error) {
 76 | 	return getDeviceTopology(gpuId)
 77 | }
 78 | 
 79 | // WatchPidFields lets DCGM start recording stats for GPU process
 80 | // It needs to be called before calling GetProcessInfo
 81 | func WatchPidFields() (GroupHandle, error) {
 82 | 	return watchPidFields()
 83 | }
 84 | 
 85 | // GetProcessInfo provides detailed per GPU stats for this process
 86 | func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error) {
 87 | 	return getProcessInfo(group, pid)
 88 | }
 89 | 
 90 | // HealthCheckByGpuId monitors GPU health for any errors/failures/warnings
 91 | func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) {
 92 | 	return healthCheckByGpuId(gpuId)
 93 | }
 94 | 
 95 | // Policy sets GPU usage and error policies and notifies in case of any violations via callback functions
 96 | func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) {
 97 | 	return registerPolicy(gpuId, typ...)
 98 | }
 99 | 
100 | // Introspect returns DCGM hostengine memory and CPU usage
101 | func Introspect() (DcgmStatus, error) {
102 | 	return introspect()
103 | }
104 | 
105 | // GetSupportedMetricGroups returns all of the profiling metric groups for a given GPU group.
106 | func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error) {
107 | 	return getSupportedMetricGroups(grpid)
108 | }
109 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/bcast.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"sync"
 6 | )
 7 | 
 8 | type publisher struct {
 9 | 	publish        chan interface{}
10 | 	close          chan bool
11 | 	subscribers    []*subscriber
12 | 	subscriberLock sync.Mutex
13 | }
14 | 
15 | type subscriber struct {
16 | 	read  chan interface{}
17 | 	close chan bool
18 | }
19 | 
20 | func newPublisher() *publisher {
21 | 	pub := &publisher{
22 | 		publish: make(chan interface{}),
23 | 		close:   make(chan bool),
24 | 	}
25 | 	return pub
26 | }
27 | // subscriberList returns a snapshot of the current subscribers that stays valid after the lock is released.
28 | func (p *publisher) subscriberList() []*subscriber {
29 | 	p.subscriberLock.Lock()
30 | 	defer p.subscriberLock.Unlock()
31 | 	return append([]*subscriber(nil), p.subscribers...) // copy: remove() rewrites the shared backing array in place
32 | }
33 | 
34 | func (p *publisher) add() *subscriber {
35 | 	p.subscriberLock.Lock()
36 | 	defer p.subscriberLock.Unlock()
37 | 	newSub := &subscriber{
38 | 		read:  make(chan interface{}),
39 | 		close: make(chan bool),
40 | 	}
41 | 	p.subscribers = append(p.subscribers, newSub)
42 | 	return newSub
43 | }
44 | 
45 | // remove drops leaving from the subscriber list and signals its close channel.
46 | // It returns an error when leaving is not in the list.
47 | func (p *publisher) remove(leaving *subscriber) error {
48 | 	p.subscriberLock.Lock()
49 | 	defer p.subscriberLock.Unlock()
50 | 
51 | 	for idx := range p.subscribers {
52 | 		if p.subscribers[idx] != leaving {
53 | 			continue
54 | 		}
55 | 		// Signal from a goroutine so remove does not wait for a reader on the close channel.
56 | 		go func() { leaving.close <- true }()
57 | 		p.subscribers = append(p.subscribers[:idx], p.subscribers[idx+1:]...)
58 | 		return nil
59 | 	}
60 | 	return fmt.Errorf("Could not find subscriber")
61 | }
62 | 
63 | func (p *publisher) send(val interface{}) {
64 | 	p.publish <- val
65 | }
66 | 
67 | func (p *publisher) broadcast() {
68 | 	for {
69 | 		select {
70 | 		case publishing := <-p.publish:
71 | 			for _, sub := range p.subscriberList() {
72 | 				go func(s *subscriber, val interface{}) {
73 | 					s.read <- val
74 | 				}(sub, publishing)
75 | 			}
76 | 		case <-p.close:
77 | 			return
78 | 		}
79 | 	}
80 | }
81 | 
82 | func (p *publisher) closePublisher() {
83 | 	p.close <- true
84 | }
85 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/callback.c:
--------------------------------------------------------------------------------
1 | int violationNotify(void* p) {
2 |     int ViolationRegistration(void*);
3 |     return ViolationRegistration(p);
4 | }
5 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_status.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "./dcgm_agent.h"
  5 | #include "./dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | import (
  9 | 	"fmt"
 10 | 	"math/rand"
 11 | )
 12 | 
 13 | type PerfState uint
 14 | 
 15 | const (
 16 | 	PerfStateMax     = 0
 17 | 	PerfStateMin     = 15
 18 | 	PerfStateUnknown = 32
 19 | )
 20 | 
 21 | func (p PerfState) String() string {
 22 | 	if p > PerfStateMin { // p is unsigned, so the PerfStateMax (0) lower bound always holds
 23 | 		return "Unknown"
 24 | 	}
 25 | 	return fmt.Sprintf("P%d", p) // states 0..PerfStateMin render as "P<n>"
 26 | }
 27 | 
 28 | type UtilizationInfo struct {
 29 | 	GPU     int64 // %
 30 | 	Memory  int64 // %
 31 | 	Encoder int64 // %
 32 | 	Decoder int64 // %
 33 | }
 34 | 
 35 | type ECCErrorsInfo struct {
 36 | 	SingleBit int64
 37 | 	DoubleBit int64
 38 | }
 39 | 
 40 | type MemoryInfo struct {
 41 | 	GlobalUsed int64
 42 | 	ECCErrors  ECCErrorsInfo
 43 | }
 44 | 
 45 | type ClockInfo struct {
 46 | 	Cores  int64 // MHz
 47 | 	Memory int64 // MHz
 48 | }
 49 | 
 50 | type PCIThroughputInfo struct {
 51 | 	Rx      int64 // MB
 52 | 	Tx      int64 // MB
 53 | 	Replays int64
 54 | }
 55 | 
 56 | type PCIStatusInfo struct {
 57 | 	BAR1Used   int64 // MB
 58 | 	Throughput PCIThroughputInfo
 59 | 	FBUsed     int64
 60 | }
 61 | 
 62 | type DeviceStatus struct {
 63 | 	Power       float64 // W
 64 | 	Temperature int64   // °C
 65 | 	Utilization UtilizationInfo
 66 | 	Memory      MemoryInfo
 67 | 	Clocks      ClockInfo
 68 | 	PCI         PCIStatusInfo
 69 | 	Performance PerfState
 70 | 	FanSpeed    int64 // %
 71 | }
 72 | 
 73 | // latestValuesForDevice creates a temporary field group (FieldGroupCreate), starts a
 74 | // watch on gpuId (WatchFields), reads the latest value of every status field and packs
 75 | // the results into a DeviceStatus. The field group and the group returned by
 76 | // WatchFields are destroyed again before it returns.
 77 | func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) {
 78 | 	// Positions of each field in deviceFields and in the values read back for them.
 79 | 	const (
 80 | 		pwr int = iota
 81 | 		temp
 82 | 		sm
 83 | 		mem
 84 | 		enc
 85 | 		dec
 86 | 		smClock
 87 | 		memClock
 88 | 		bar1Used
 89 | 		pcieRxThroughput
 90 | 		pcieTxThroughput
 91 | 		pcieReplay
 92 | 		fbUsed
 93 | 		sbe
 94 | 		dbe
 95 | 		pstate
 96 | 		fanSpeed
 97 | 		fieldsCount
 98 | 	)
 99 | 
100 | 	// Field ids to query, stored at the positions defined above.
101 | 	deviceFields := make([]Short, fieldsCount)
102 | 	deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE
103 | 	deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP
104 | 	deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL
105 | 	deviceFields[mem] = C.DCGM_FI_DEV_MEM_COPY_UTIL
106 | 	deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL
107 | 	deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL
108 | 	deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK
109 | 	deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK
110 | 	deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED
111 | 	deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT
112 | 	deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT
113 | 	deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
114 | 	deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED
115 | 	deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL
116 | 	deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL
117 | 	deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE
118 | 	deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED
119 | 
120 | 	// The field group and the watch group each get a random numeric suffix in their name.
121 | 	fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64())
122 | 	fieldsId, err := FieldGroupCreate(fieldsName, deviceFields)
123 | 	if err != nil {
124 | 		return
125 | 	}
126 | 
127 | 	groupName := fmt.Sprintf("devStatus%d", rand.Uint64())
128 | 	groupId, err := WatchFields(gpuId, fieldsId, groupName)
129 | 	if err != nil {
130 | 		_ = FieldGroupDestroy(fieldsId)
131 | 		return
132 | 	}
133 | 
134 | 	// From here on every exit path destroys the field group first, then the group from WatchFields.
135 | 	defer func() {
136 | 		_ = FieldGroupDestroy(fieldsId)
137 | 		_ = DestroyGroup(groupId)
138 | 	}()
139 | 
140 | 	values, err := GetLatestValuesForFields(gpuId, deviceFields)
141 | 	if err != nil {
142 | 		return status, fmt.Errorf("Error getting device status: %s", err)
143 | 	}
144 | 
145 | 	status = DeviceStatus{
146 | 		Power:       values[pwr].Float64(),
147 | 		Temperature: values[temp].Int64(),
148 | 		Utilization: UtilizationInfo{
149 | 			GPU:     values[sm].Int64(),
150 | 			Memory:  values[mem].Int64(),
151 | 			Encoder: values[enc].Int64(),
152 | 			Decoder: values[dec].Int64(),
153 | 		},
154 | 		Memory: MemoryInfo{
155 | 			// GlobalUsed is not populated here and keeps its zero value.
156 | 			ECCErrors: ECCErrorsInfo{
157 | 				SingleBit: values[sbe].Int64(),
158 | 				DoubleBit: values[dbe].Int64(),
159 | 			},
160 | 		},
161 | 		Clocks: ClockInfo{
162 | 			Cores:  values[smClock].Int64(),
163 | 			Memory: values[memClock].Int64(),
164 | 		},
165 | 		PCI: PCIStatusInfo{
166 | 			BAR1Used: values[bar1Used].Int64(),
167 | 			Throughput: PCIThroughputInfo{
168 | 				Rx:      values[pcieRxThroughput].Int64(),
169 | 				Tx:      values[pcieTxThroughput].Int64(),
170 | 				Replays: values[pcieReplay].Int64(),
171 | 			},
172 | 			FBUsed: values[fbUsed].Int64(),
173 | 		},
174 | 		Performance: PerfState(values[pstate].Int64()),
175 | 		FanSpeed:    values[fanSpeed].Int64(),
176 | 	}
177 | 
178 | 	return status, nil
179 | }
180 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm
2 | 
3 | go 1.14
4 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/gpu_group.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "dcgm_agent.h"
 5 | #include "dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | import (
 9 | 	"fmt"
10 | )
11 | 
12 | type GroupHandle struct{ handle C.dcgmGpuGrp_t } // GroupHandle wraps the C dcgmGpuGrp_t id that the dcgmGroup* calls in this file take.
13 | 
14 | // CreateGroup creates an empty DCGM group (DCGM_GROUP_EMPTY) named groupName
15 | // and returns its handle; on failure the zero GroupHandle and an error are returned.
16 | func CreateGroup(groupName string) (goGroupId GroupHandle, err error) {
17 | 	nameC := C.CString(groupName)
18 | 	defer freeCString(nameC) // the C copy of the name is released when we return
19 | 
20 | 	var rawId C.dcgmGpuGrp_t
21 | 	rc := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_EMPTY, nameC, &rawId)
22 | 	if cause := errorString(rc); cause != nil {
23 | 		return GroupHandle{}, fmt.Errorf("Error creating group: %s", cause)
24 | 	}
25 | 	return GroupHandle{rawId}, nil
26 | }
27 | 
28 | // NewDefaultGroup creates a DCGM group of type DCGM_GROUP_DEFAULT named
29 | // groupName and returns its handle, or the zero GroupHandle and an error.
30 | func NewDefaultGroup(groupName string) (GroupHandle, error) {
31 | 	nameC := C.CString(groupName)
32 | 	defer freeCString(nameC) // the C copy of the name is released when we return
33 | 
34 | 	var rawId C.dcgmGpuGrp_t
35 | 	rc := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_DEFAULT, nameC, &rawId)
36 | 	if cause := errorString(rc); cause != nil {
37 | 		return GroupHandle{}, fmt.Errorf("Error creating group: %s", cause)
38 | 	}
39 | 	return GroupHandle{rawId}, nil
40 | }
41 | 
42 | // AddToGroup adds the GPU gpuId to the group groupId via dcgmGroupAddDevice.
43 | func AddToGroup(groupId GroupHandle, gpuId uint) (err error) {
44 | 	rc := C.dcgmGroupAddDevice(handle.handle, groupId.handle, C.uint(gpuId))
45 | 	if cause := errorString(rc); cause != nil {
46 | 		err = fmt.Errorf("Error adding GPU %v to group: %s", gpuId, cause)
47 | 	}
48 | 	return err
49 | }
50 | 
51 | // AddEntityToGroup adds entity entityId of type entityGroupId to the group groupId via dcgmGroupAddEntity.
52 | func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error) {
53 | 	rc := C.dcgmGroupAddEntity(handle.handle, groupId.handle, C.dcgm_field_entity_group_t(entityGroupId), C.uint(entityId))
54 | 	if cause := errorString(rc); cause != nil {
55 | 		err = fmt.Errorf("Error adding entity group type %v, entity %v to group: %s", entityGroupId, entityId, cause)
56 | 	}
57 | 	return err
58 | }
59 | 
60 | // DestroyGroup destroys the DCGM group identified by groupId via dcgmGroupDestroy.
61 | func DestroyGroup(groupId GroupHandle) (err error) {
62 | 	rc := C.dcgmGroupDestroy(handle.handle, groupId.handle)
63 | 	if cause := errorString(rc); cause != nil {
64 | 		err = fmt.Errorf("Error destroying group: %s", cause)
65 | 	}
66 | 	return err
67 | }
68 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/health.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "dcgm_agent.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | import (
  9 | 	"fmt"
 10 | 	"math/rand"
 11 | 	"unsafe"
 12 | )
 13 | 
 14 | type SystemWatch struct { // one incident reported by dcgmHealthCheck
 15 | 	Type   string // name of the watched subsystem, as produced by systemWatch
 16 | 	Status string // "Healthy", "Warning", "Failure" or "N/A", as produced by healthStatus
 17 | 	Error  string // message text attached to the incident
 18 | }
 19 | 
 20 | type DeviceHealth struct { // health-check result for a single GPU
 21 | 	GPU     uint          // id of the GPU that was checked
 22 | 	Status  string        // overall health, as produced by healthStatus
 23 | 	Watches []SystemWatch // one entry per reported incident; empty when there are none
 24 | }
 25 | 
 26 | // setHealthWatches enables every health watch (DCGM_HEALTH_WATCH_ALL) on the group groupId.
 27 | func setHealthWatches(groupId GroupHandle) (err error) {
 28 | 	if cause := errorString(C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL)); cause != nil {
 29 | 		err = fmt.Errorf("Error setting health watches: %s", cause)
 30 | 	}
 31 | 	return err
 32 | }
 33 | 
 34 | // healthCheckByGpuId runs a DCGM health check for a single GPU and reports the
 35 | // overall status plus one SystemWatch per incident. It creates a temporary,
 36 | // randomly named group holding only gpuId, enables all health watches on it
 37 | // and calls dcgmHealthCheck. The temporary group is destroyed on every return
 38 | // path, including failures after the group was created.
 39 | func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
 40 | 	name := fmt.Sprintf("health%d", rand.Uint64())
 41 | 	groupId, err := CreateGroup(name)
 42 | 	if err != nil {
 43 | 		return
 44 | 	}
 45 | 	// Best-effort cleanup: a destroy failure must not mask the health result.
 46 | 	defer func() { _ = DestroyGroup(groupId) }()
 47 | 
 48 | 	if err = AddToGroup(groupId, gpuId); err != nil {
 49 | 		return
 50 | 	}
 51 | 	if err = setHealthWatches(groupId); err != nil {
 52 | 		return
 53 | 	}
 54 | 
 55 | 	// NOTE(review): a _v4 response struct is stamped with makeVersion2 here;
 56 | 	// confirm against dcgm_structs.h that this is the version dcgmHealthCheck expects.
 57 | 	var healthResults C.dcgmHealthResponse_v4
 58 | 	healthResults.version = makeVersion2(unsafe.Sizeof(healthResults))
 59 | 
 60 | 	result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults)))
 61 | 	if err = errorString(result); err != nil {
 62 | 		return deviceHealth, fmt.Errorf("Error checking GPU health: %s", err)
 63 | 	}
 64 | 
 65 | 	// incidentCount is the number of watches that reported a warning or failure.
 66 | 	incidents := uint(healthResults.incidentCount)
 67 | 	watches := []SystemWatch{}
 68 | 	for j := uint(0); j < incidents; j++ {
 69 | 		incident := &healthResults.incidents[j]
 70 | 		watches = append(watches, SystemWatch{
 71 | 			Type:   systemWatch(int(incident.system)),
 72 | 			Status: healthStatus(int8(incident.health)),
 73 | 			Error:  *stringPtr(&incident.error.msg[0]),
 74 | 		})
 75 | 	}
 76 | 
 77 | 	deviceHealth = DeviceHealth{
 78 | 		GPU:     gpuId,
 79 | 		Status:  healthStatus(int8(healthResults.overallHealth)),
 80 | 		Watches: watches,
 81 | 	}
 82 | 	return
 83 | }
 84 | 
 85 | // healthStatus translates a DCGM health result code into a display string.
 86 | // Codes 0, 10 and 20 are recognised; anything else yields "N/A".
 87 | func healthStatus(status int8) string {
 88 | 	labels := map[int8]string{0: "Healthy", 10: "Warning", 20: "Failure"}
 89 | 
 90 | 	label, known := labels[status]
 91 | 	if !known {
 92 | 		return "N/A"
 93 | 	}
 94 | 	return label
 95 | }
 96 | 
 97 | // systemWatch translates a DCGM health-system value into a display string.
 98 | // Each recognised value is a single power of two from 1 through 512; any
 99 | // other value, including sums of several of them, yields "N/A".
100 | func systemWatch(watch int) string {
101 | 	// The strings are kept byte-for-byte (including the spelling of the entry
102 | 	// for 4) because callers may compare against the exact text.
103 | 	labels := map[int]string{
104 | 		1:   "PCIe watches",
105 | 		2:   "NVLINK watches",
106 | 		4:   "Power Managemnt unit watches",
107 | 		8:   "Microcontroller unit watches",
108 | 		16:  "Memory watches",
109 | 		32:  "Streaming Multiprocessor watches",
110 | 		64:  "Inforom watches",
111 | 		128: "Temperature watches",
112 | 		256: "Power watches",
113 | 		512: "Driver-related watches",
114 | 	}
115 | 
116 | 	label, known := labels[watch]
117 | 	if !known {
118 | 		return "N/A"
119 | 	}
120 | 	return label
121 | }
122 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/hostengine_status.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "dcgm_agent.h"
 5 | #include "dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | import (
 9 | 	"fmt"
10 | 	"unsafe"
11 | )
12 | 
13 | type DcgmStatus struct { // host engine resource usage as filled in by introspect
14 | 	Memory int64   // the reported bytesUsed divided by 1024
15 | 	CPU    float64 // the reported total CPU utilization multiplied by 100
16 | }
17 | 
18 | // introspect enables DCGM introspection and then reads the host engine's own
19 | // memory and CPU usage. Memory is reported as bytesUsed/1024 and CPU as
20 | // total*100. Both queries are made with waitIfNoData set to 1.
21 | func introspect() (engine DcgmStatus, err error) {
22 | 	// Switch introspection on (state value 1) before issuing the queries.
23 | 	rc := C.dcgmIntrospectToggleState(handle.handle, C.dcgmIntrospectState_t(1))
24 | 	if cause := errorString(rc); cause != nil {
25 | 		return DcgmStatus{}, fmt.Errorf("Error enabling DCGM introspection: %s", cause)
26 | 	}
27 | 
28 | 	wait := C.int(1)
29 | 
30 | 	// Host engine memory usage.
31 | 	var memUsage C.dcgmIntrospectMemory_t
32 | 	memUsage.version = makeVersion2(unsafe.Sizeof(memUsage))
33 | 	rc = C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memUsage, wait)
34 | 	if cause := errorString(rc); cause != nil {
35 | 		return DcgmStatus{}, fmt.Errorf("Error getting memory usage of hostengine: %s", cause)
36 | 	}
37 | 
38 | 	// Host engine CPU utilization.
39 | 	var cpuUsage C.dcgmIntrospectCpuUtil_t
40 | 	cpuUsage.version = makeVersion2(unsafe.Sizeof(cpuUsage))
41 | 	rc = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpuUsage, wait)
42 | 	if cause := errorString(rc); cause != nil {
43 | 		return DcgmStatus{}, fmt.Errorf("Error getting cpu usage of hostengine: %s", cause)
44 | 	}
45 | 
46 | 	engine.Memory = toInt64(memUsage.bytesUsed) / 1024
47 | 	engine.CPU = *dblToFloat(cpuUsage.total) * 100
48 | 	return engine, nil
49 | }
50 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/mig.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "./dcgm_agent.h"
 5 | #include "./dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | import (
 9 | 	"fmt"
10 | 	"unsafe"
11 | )
12 | 
13 | type Field_Entity_Group uint // entity type tag; passed to DCGM as C.dcgm_field_entity_group_t
14 | 
15 | const (
16 | 	FE_NONE Field_Entity_Group = iota // 0
17 | 	FE_GPU                            // 1
18 | 	FE_VGPU                           // 2
19 | 	FE_SWITCH                         // 3
20 | 	FE_GPU_I                          // 4; presumably a GPU instance — confirm against dcgm_fields.h
21 | 	FE_GPU_CI                         // 5; presumably a compute instance — confirm against dcgm_fields.h
22 | 	FE_COUNT                          // 6; number of values declared above
23 | )
24 | 
25 | type GroupEntityPair struct { // an entity addressed by its entity-group type plus id
26 | 	EntityGroupId Field_Entity_Group // copied from the C entityGroupId field
27 | 	EntityId      uint               // copied from the C entityId field
28 | }
29 | 
30 | type MigEntityInfo struct { // per-entity details copied from the C entry's info field
31 | 	GpuUuid               string // info.gpuUuid converted to a Go string
32 | 	NvmlGpuIndex          uint   // info.nvmlGpuIndex
33 | 	NvmlInstanceId        uint   // info.nvmlInstanceId
34 | 	NvmlComputeInstanceId uint   // info.nvmlComputeInstanceId
35 | 	NvmlMigProfileId      uint   // info.nvmlMigProfileId
36 | 	NvmlProfileSlices     uint   // info.nvmlProfileSlices
37 | }
38 | 
39 | type MigHierarchyInfo_v2 struct { // one converted entry of the C entityList
40 | 	Entity GroupEntityPair // the entry's entity field
41 | 	Parent GroupEntityPair // the entry's parent field
42 | 	Info   MigEntityInfo   // the entry's info field
43 | }
44 | 
45 | const ( // Go-visible copies of limits defined in the DCGM C headers
46 | 	MAX_NUM_DEVICES    uint = C.DCGM_MAX_NUM_DEVICES    // value of the C macro DCGM_MAX_NUM_DEVICES
47 | 	MAX_HIERARCHY_INFO uint = C.DCGM_MAX_HIERARCHY_INFO // same macro that sizes MigHierarchy_v2.EntityList
48 | )
49 | 
50 | type MigHierarchy_v2 struct { // Go form of C.dcgmMigHierarchy_v2, built by toMigHierarchy
51 | 	Version    uint // copied from the C version field
52 | 	Count      uint // copied from the C count field; number of EntityList entries that were filled
53 | 	EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 // fixed-size; entries at index >= Count stay zero-valued
54 | }
55 | 
56 | // GetGpuInstanceHierarchy calls dcgmGetGpuInstanceHierarchy and converts the result to Go
57 | // types; the converted struct is returned even when the call fails, alongside the error.
58 | func GetGpuInstanceHierarchy() (hierarchy MigHierarchy_v2, err error) {
59 | 	var raw C.dcgmMigHierarchy_v2
60 | 	raw.version = C.dcgmMigHierarchy_version2
61 | 	rc := C.dcgmGetGpuInstanceHierarchy(handle.handle, (*C.dcgmMigHierarchy_v2)(unsafe.Pointer(&raw)))
62 | 	hierarchy = toMigHierarchy(raw)
63 | 	if cause := errorString(rc); cause != nil {
64 | 		err = fmt.Errorf("Error retrieving DCGM MIG hierarchy: %s", cause)
65 | 	}
66 | 	return hierarchy, err
67 | }
68 | 
69 | func toMigHierarchy(c_hierarchy C.dcgmMigHierarchy_v2) MigHierarchy_v2 {
70 | 	var hierarchy MigHierarchy_v2
71 | 	hierarchy.Version = uint(c_hierarchy.version)
72 | 	hierarchy.Count = uint(c_hierarchy.count)
73 | 	for i := uint(0); i < hierarchy.Count; i++ {
74 | 		hierarchy.EntityList[i] = MigHierarchyInfo_v2{
75 | 			Entity: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].entity.entityGroupId), uint(c_hierarchy.entityList[i].entity.entityId)},
76 | 			Parent: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].parent.entityGroupId), uint(c_hierarchy.entityList[i].parent.entityId)},
77 | 			Info: MigEntityInfo{
78 | 				GpuUuid:               *stringPtr(&c_hierarchy.entityList[i].info.gpuUuid[0]),
79 | 				NvmlGpuIndex:          uint(c_hierarchy.entityList[i].info.nvmlGpuIndex),
80 | 				NvmlInstanceId:        uint(c_hierarchy.entityList[i].info.nvmlInstanceId),
81 | 				NvmlComputeInstanceId: uint(c_hierarchy.entityList[i].info.nvmlComputeInstanceId),
82 | 				NvmlMigProfileId:      uint(c_hierarchy.entityList[i].info.nvmlMigProfileId),
83 | 				NvmlProfileSlices:     uint(c_hierarchy.entityList[i].info.nvmlProfileSlices),
84 | 			},
85 | 		}
86 | 	}
87 | 
88 | 	return hierarchy
89 | }
90 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/profile.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "dcgm_agent.h"
 5 | #include "dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | import (
 9 | 	"fmt"
10 | 	"unsafe"
11 | )
12 | 
13 | type MetricGroup struct { // one profiling metric group returned by getSupportedMetricGroups
14 | 	major    uint   // the group's majorId
15 | 	minor    uint   // the group's minorId
16 | 	fieldIds []uint // the group's field ids (numFieldIds of them); nil when there are none
17 | }
18 | 
19 | // getSupportedMetricGroups asks DCGM which profiling metric groups are supported
20 | // for group grpid and returns each group's major id, minor id and field ids.
21 | func getSupportedMetricGroups(grpid uint) (groups []MetricGroup, err error) {
22 | 	var query C.dcgmProfGetMetricGroups_t
23 | 	query.version = makeVersion2(unsafe.Sizeof(query))
24 | 	query.groupId = C.ulong(grpid)
25 | 
26 | 	rc := C.dcgmProfGetSupportedMetricGroups(handle.handle, &query)
27 | 	if cause := errorString(rc); cause != nil {
28 | 		return groups, fmt.Errorf("Error getting supported metrics: %s", cause)
29 | 	}
30 | 
31 | 	total := uint(query.numMetricGroups)
32 | 	for i := uint(0); i < total; i++ {
33 | 		entry := &query.metricGroups[i]
34 | 		group := MetricGroup{
35 | 			major: uint(entry.majorId),
36 | 			minor: uint(entry.minorId),
37 | 		}
38 | 
39 | 		fieldTotal := uint(entry.numFieldIds)
40 | 		for j := uint(0); j < fieldTotal; j++ {
41 | 			group.fieldIds = append(group.fieldIds, uint(entry.fieldIds[j]))
42 | 		}
43 | 		groups = append(groups, group)
44 | 	}
45 | 
46 | 	return groups, nil
47 | }
48 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/topology.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "dcgm_agent.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | import (
  9 | 	"fmt"
 10 | 	"io/ioutil"
 11 | 	"strings"
 12 | 	"unsafe"
 13 | )
 14 | 
 15 | type P2PLinkType uint // kind of path between two GPUs, as classified by getP2PLink
 16 | 
 17 | const (
 18 | 	P2PLinkUnknown P2PLinkType = iota // PCIPaths label "N/A"
 19 | 	P2PLinkCrossCPU                   // PCIPaths label "SYS"
 20 | 	P2PLinkSameCPU                    // PCIPaths label "NODE"
 21 | 	P2PLinkHostBridge                 // PCIPaths label "PHB"
 22 | 	P2PLinkMultiSwitch                // PCIPaths label "PXB"
 23 | 	P2PLinkSingleSwitch               // PCIPaths label "PIX"
 24 | 	P2PLinkSameBoard                  // PCIPaths label "PSB"
 25 | 	SingleNVLINKLink                  // PCIPaths label "NV1"
 26 | 	TwoNVLINKLinks                    // PCIPaths label "NV2"
 27 | 	ThreeNVLINKLinks                  // PCIPaths label "NV3"
 28 | 	FourNVLINKLinks                   // PCIPaths label "NV4"
 29 | )
 30 | 
 31 | // PCIPaths returns the short topology label for the link type, e.g. "PIX" for
 32 | // P2PLinkSingleSwitch or "NV2" for TwoNVLINKLinks. P2PLinkUnknown and any
 33 | // value outside the declared constants yield "N/A".
 34 | func (l P2PLinkType) PCIPaths() string {
 35 | 	// The table is indexed by the constant's numeric value; P2PLinkUnknown is
 36 | 	// listed explicitly so that every index has a deliberate entry.
 37 | 	labels := [...]string{
 38 | 		P2PLinkUnknown:      "N/A",
 39 | 		P2PLinkCrossCPU:     "SYS",
 40 | 		P2PLinkSameCPU:      "NODE",
 41 | 		P2PLinkHostBridge:   "PHB",
 42 | 		P2PLinkMultiSwitch:  "PXB",
 43 | 		P2PLinkSingleSwitch: "PIX",
 44 | 		P2PLinkSameBoard:    "PSB",
 45 | 		SingleNVLINKLink:    "NV1",
 46 | 		TwoNVLINKLinks:      "NV2",
 47 | 		ThreeNVLINKLinks:    "NV3",
 48 | 		FourNVLINKLinks:     "NV4",
 49 | 	}
 50 | 
 51 | 	// Values beyond the last declared constant have no table entry.
 52 | 	if uint(l) < uint(len(labels)) {
 53 | 		return labels[l]
 54 | 	}
 55 | 	return "N/A"
 56 | }
 57 | 
 58 | type P2PLink struct { // one peer entry produced by getDeviceTopology
 59 | 	GPU   uint        // gpuId of the peer, taken from the topology's gpuPaths entry
 60 | 	BusID string      // bus id string that getBusid returned for the queried GPU
 61 | 	Link  P2PLinkType // link type derived from the entry's path via getP2PLink
 62 | }
 63 | 
 64 | // getP2PLink converts a DCGM_TOPOLOGY_* path value into the corresponding
 65 | // P2PLinkType. Only exact matches of a single DCGM_TOPOLOGY_* constant are
 66 | // recognised; every other value maps to P2PLinkUnknown.
 67 | func getP2PLink(path uint) P2PLinkType {
 68 | 	// One entry per DCGM topology constant handled by this package; the
 69 | 	// PCIe levels come first, followed by the NVLink counts.
 70 | 	links := map[uint]P2PLinkType{
 71 | 		C.DCGM_TOPOLOGY_BOARD:      P2PLinkSameBoard,
 72 | 		C.DCGM_TOPOLOGY_SINGLE:     P2PLinkSingleSwitch,
 73 | 		C.DCGM_TOPOLOGY_MULTIPLE:   P2PLinkMultiSwitch,
 74 | 		C.DCGM_TOPOLOGY_HOSTBRIDGE: P2PLinkHostBridge,
 75 | 		C.DCGM_TOPOLOGY_CPU:        P2PLinkSameCPU,
 76 | 		C.DCGM_TOPOLOGY_SYSTEM:     P2PLinkCrossCPU,
 77 | 		C.DCGM_TOPOLOGY_NVLINK1:    SingleNVLINKLink,
 78 | 		C.DCGM_TOPOLOGY_NVLINK2:    TwoNVLINKLinks,
 79 | 		C.DCGM_TOPOLOGY_NVLINK3:    ThreeNVLINKLinks,
 80 | 		C.DCGM_TOPOLOGY_NVLINK4:    FourNVLINKLinks,
 81 | 	}
 82 | 
 83 | 	link, known := links[path]
 84 | 	if !known {
 85 | 		return P2PLinkUnknown
 86 | 	}
 87 | 	return link
 88 | }
 89 | 
 90 | // getCPUAffinity returns the contents of the sysfs local_cpulist file for the PCI device busid,
 91 | // without the trailing newline. The first four characters of busid are dropped and the rest is
 92 | // lower-cased to form the sysfs device name; a shorter busid yields an error instead of a panic.
 93 | func getCPUAffinity(busid string) (string, error) {
 94 | 	if len(busid) < 4 {
 95 | 		return "", fmt.Errorf("Error getting device cpu affinity: invalid bus id %q", busid)
 96 | 	}
 97 | 	b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/local_cpulist", strings.ToLower(busid[4:])))
 98 | 	if err != nil {
 99 | 		return "", fmt.Errorf("Error getting device cpu affinity: %v", err)
100 | 	}
101 | 	return strings.TrimSuffix(string(b), "\n"), nil
102 | }
 97 | 
 98 | // getBusid returns the PCI bus id string that dcgmGetDeviceAttributes reports for gpuid.
 99 | func getBusid(gpuid uint) (string, error) {
100 | 	var attrs C.dcgmDeviceAttributes_t
101 | 	attrs.version = makeVersion2(unsafe.Sizeof(attrs))
102 | 
103 | 	if cause := errorString(C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &attrs)); cause != nil {
104 | 		return "", fmt.Errorf("Error getting device busid: %s", cause)
105 | 	}
106 | 	return *stringPtr(&attrs.identifiers.pciBusId[0]), nil
107 | }
108 | 
109 | // getDeviceTopology lists the peer GPUs DCGM reports for gpuid and the link type to each.
110 | // DCGM_ST_NOT_SUPPORTED is not treated as an error: it yields no links and a nil error.
111 | func getDeviceTopology(gpuid uint) (links []P2PLink, err error) {
112 | 	var topology C.dcgmDeviceTopology_t
113 | 	topology.version = makeVersion2(unsafe.Sizeof(topology))
114 | 
115 | 	result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology)
116 | 	switch result {
117 | 	case C.DCGM_ST_OK: // continue below and build the link list
118 | 	case C.DCGM_ST_NOT_SUPPORTED:
119 | 		return links, nil
120 | 	default:
121 | 		return links, fmt.Errorf("Error getting device topology: %s", errorString(result))
122 | 	}
123 | 
124 | 	busid, err := getBusid(gpuid)
125 | 	if err != nil {
126 | 		return
127 | 	}
128 | 
129 | 	// NOTE(review): every link carries the bus id of gpuid itself, not of the
130 | 	// peer GPU — confirm this is what callers expect.
131 | 	for i := uint(0); i < uint(topology.numGpus); i++ {
132 | 		peer := &topology.gpuPaths[i]
133 | 		links = append(links, P2PLink{GPU: uint(peer.gpuId), BusID: busid, Link: getP2PLink(uint(peer.path))})
134 | 	}
135 | 	return
136 | }
137 | 


--------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/utils.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "stdlib.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | 	"math"
 12 | 	"unsafe"
 13 | )
 14 | 
 15 | const (
 16 | 	dcgmInt32Blank = 0x7ffffff0         // 2147483632
 17 | 	dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792
 18 | )
 19 | 
 20 | func uintPtr(c C.uint) *uint { // returns a pointer to a fresh Go uint holding the value of c
 21 | 	i := uint(c)
 22 | 	return &i
 23 | }
 24 | 
 25 | func uintPtrInt(c C.int) *uint { // converts the C int c to a Go uint and returns a pointer to the result
 26 | 	i := uint(c)
 27 | 	return &i
 28 | }
 29 | 
 30 | // uintPtrUnsafe copies the uint that p points at and returns a pointer to the copy (nil for a nil p).
 31 | func uintPtrUnsafe(p unsafe.Pointer) *uint {
 32 | 	if p == nil {
 33 | 		return nil
 34 | 	}
 35 | 	copied := *(*uint)(p) // NOTE(review): reads a full Go uint; confirm callers never pass a pointer to a narrower C type
 36 | 	return &copied
 37 | }
 38 | 
 39 | func uint64Ptr(c C.longlong) *uint64 { // converts the C long long c to uint64 and returns a pointer to the result
 40 | 	i := uint64(c)
 41 | 	return &i
 42 | }
 43 | 
 44 | func int64Ptr(c C.longlong) *int64 { // returns a pointer to a fresh Go int64 holding the value of c
 45 | 	i := int64(c)
 46 | 	return &i
 47 | }
 48 | 
 49 | func uint64PtrUint(c C.uint) *uint64 { // widens the C unsigned int c to uint64 and returns a pointer to the result
 50 | 	i := uint64(c)
 51 | 	return &i
 52 | }
 53 | 
 54 | // uint64PtrUnsafe copies the uint64 that p points at and returns a pointer to the copy (nil for a nil p).
 55 | func uint64PtrUnsafe(p unsafe.Pointer) *uint64 {
 56 | 	if p == nil {
 57 | 		return nil
 58 | 	}
 59 | 	copied := *(*uint64)(p)
 60 | 	return &copied
 61 | }
 62 | 
 63 | // toInt64 converts a C long long to a Go int64.
 64 | func toInt64(c C.longlong) int64 {
 65 | 	return int64(c)
 66 | }
 67 | 
 68 | func dblToUint(val C.double) *uint { // converts the C double val to a Go uint and returns a pointer to the result
 69 | 	i := uint(val)
 70 | 	return &i
 71 | }
 72 | 
 73 | func dblToFloat(val C.double) *float64 { // returns a pointer to a fresh Go float64 holding the value of val
 74 | 	i := float64(val)
 75 | 	return &i
 76 | }
 77 | 
 78 | // dblToFloatUnsafe reads the C double that val points at and returns a pointer to a Go float64 copy (nil for a nil val).
 79 | func dblToFloatUnsafe(val unsafe.Pointer) *float64 {
 80 | 	if val == nil {
 81 | 		return nil
 82 | 	}
 83 | 	converted := float64(*(*C.double)(val))
 84 | 	return &converted
 85 | }
 86 | 
 87 | func stringPtr(c *C.char) *string { // copies the C string c into a Go string (C.GoString) and returns a pointer to it
 88 | 	s := C.GoString(c)
 89 | 	return &s
 90 | }
 91 | 
 92 | // errorString converts a DCGM return code into a Go error carrying the C-side message; DCGM_ST_OK maps to nil.
 93 | func errorString(result C.dcgmReturn_t) error {
 94 | 	if result != C.DCGM_ST_OK {
 95 | 		return fmt.Errorf("%v", C.GoString(C.errorString(result)))
 96 | 	}
 97 | 	return nil
 98 | }
 99 | 
100 | func freeCString(cStr *C.char) { // releases cStr with C.free; used in this package for strings made by C.CString
101 | 	C.free(unsafe.Pointer(cStr))
102 | }
103 | 
104 | // IsInt32Blank reports whether value is greater than or equal to
105 | // dcgmInt32Blank (0x7ffffff0), i.e. lies in the range this package treats
106 | // as a blank 32-bit value.
107 | func IsInt32Blank(value int) bool {
108 | 	return value >= dcgmInt32Blank
109 | }
110 | 
111 | // IsInt64Blank reports whether value is greater than or equal to
112 | // dcgmInt64Blank (0x7ffffffffffffff0), i.e. lies in the range this package
113 | // treats as a blank 64-bit value.
114 | func IsInt64Blank(value int64) bool {
115 | 	return value >= dcgmInt64Blank
116 | }
117 | 
118 | func blank64(val *int64) *int64 { // returns nil when *val is a blank value per IsInt64Blank, otherwise val unchanged
119 | 	if val != nil && IsInt64Blank(*val) {
120 | 		return nil
121 | 	}
122 | 	return val // also reached for a nil val, which is passed through
123 | }
124 | 
125 | func blank32(val *uint) *uint { // returns nil when *val is a blank value per IsInt32Blank, otherwise val unchanged
126 | 	if val != nil && IsInt32Blank(int(*val)) {
127 | 		return nil
128 | 	}
129 | 	return val // also reached for a nil val, which is passed through
130 | }
131 | 
132 | // makeVersion1 builds a DCGM struct version word: struct_type (a struct size at the call sites) OR-ed with 1<<24.
133 | func makeVersion1(struct_type uintptr) C.uint {
134 | 	return C.uint(struct_type | 1<<24)
135 | }
136 | 
137 | // makeVersion2 builds a DCGM struct version word: struct_type (a struct size at the call sites) OR-ed with 2<<24.
138 | func makeVersion2(struct_type uintptr) C.uint {
139 | 	return C.uint(struct_type | 2<<24)
140 | }
141 | 
142 | func roundFloat(f *float64) *float64 { // returns a pointer to math.Round(*f); never returns nil
143 | 	var val float64 // stays 0 when f is nil, so a nil input yields a pointer to 0
144 | 	if f != nil {
145 | 		val = math.Round(*f)
146 | 	}
147 | 	return &val
148 | }
149 | 


--------------------------------------------------------------------------------
/vendor/github.com/gorilla/mux/AUTHORS:
--------------------------------------------------------------------------------
1 | # This is the official list of gorilla/mux authors for copyright purposes.
2 | #
3 | # Please keep the list sorted.
4 | 
5 | Google LLC (https://opensource.google.com/)
6 | Kamil Kisielk <kamil@kamilkisiel.net>
7 | Matt Silverlock <matt@eatsleeprepeat.net>
8 | Rodrigo Moraes (https://github.com/moraes)
9 | 


--------------------------------------------------------------------------------
/vendor/github.com/gorilla/mux/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012-2018 The Gorilla Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 | 	 * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 | 	 * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 | 	 * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/vendor/github.com/gorilla/mux/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/gorilla/mux
2 | 
3 | go 1.12
4 | 


--------------------------------------------------------------------------------
/vendor/github.com/gorilla/mux/middleware.go:
--------------------------------------------------------------------------------
 1 | package mux
 2 | 
 3 | import (
 4 | 	"net/http"
 5 | 	"strings"
 6 | )
 7 | 
 8 | // MiddlewareFunc is a function which receives an http.Handler and returns another http.Handler.
 9 | // Typically, the returned handler is a closure which does something with the http.ResponseWriter and http.Request passed
10 | // to it, and then calls the handler passed as parameter to the MiddlewareFunc.
11 | type MiddlewareFunc func(http.Handler) http.Handler
12 | 
13 | // middleware interface is anything which implements a MiddlewareFunc named Middleware.
14 | type middleware interface {
15 | 	Middleware(handler http.Handler) http.Handler
16 | }
17 | 
18 | // Middleware allows MiddlewareFunc to implement the middleware interface by calling mw itself on handler.
19 | func (mw MiddlewareFunc) Middleware(handler http.Handler) http.Handler {
20 | 	return mw(handler)
21 | }
22 | 
23 | // Use appends each given MiddlewareFunc to the chain, in argument order. Middleware can be used to intercept or otherwise modify requests and/or responses, and are executed in the order that they are applied to the Router.
24 | func (r *Router) Use(mwf ...MiddlewareFunc) {
25 | 	for i := range mwf {
26 | 		r.middlewares = append(r.middlewares, mwf[i])
27 | 	}
28 | }
29 | 
30 | // useInterface appends a single value implementing the middleware interface to the chain. Middleware can be used to intercept or otherwise modify requests and/or responses, and are executed in the order that they are applied to the Router.
31 | func (r *Router) useInterface(mw middleware) {
32 | 	r.middlewares = append(r.middlewares, mw)
33 | }
34 | 
35 | // CORSMethodMiddleware returns middleware that sets the Access-Control-Allow-Methods
36 | // response header to the comma-joined methods of every route matching the request,
37 | // but only when OPTIONS is among those methods. Routes that do not explicitly
38 | // handle OPTIONS requests are left untouched. See examples for usage.
39 | func CORSMethodMiddleware(r *Router) MiddlewareFunc {
40 | 	return func(next http.Handler) http.Handler {
41 | 		corsHandler := func(w http.ResponseWriter, req *http.Request) {
42 | 			if allMethods, err := getAllMethodsForRoute(r, req); err == nil {
43 | 				for _, method := range allMethods {
44 | 					if method == http.MethodOptions {
45 | 						w.Header().Set("Access-Control-Allow-Methods", strings.Join(allMethods, ","))
46 | 					}
47 | 				}
48 | 			}
49 | 
50 | 			next.ServeHTTP(w, req)
51 | 		}
52 | 		return http.HandlerFunc(corsHandler)
53 | 	}
54 | }
55 | 
56 | // getAllMethodsForRoute collects the methods of every route that either matches
57 | // req or fails to match only because of the request method.
58 | func getAllMethodsForRoute(r *Router, req *http.Request) ([]string, error) {
59 | 	var collected []string
60 | 	for _, candidate := range r.routes {
61 | 		var probe RouteMatch
62 | 		if !candidate.Match(req, &probe) && probe.MatchErr != ErrMethodMismatch {
63 | 			continue
64 | 		}
65 | 
66 | 		methods, err := candidate.GetMethods()
67 | 		if err != nil {
68 | 			return nil, err
69 | 		}
70 | 		collected = append(collected, methods...)
71 | 	}
72 | 
73 | 	return collected, nil
74 | }
75 | 


--------------------------------------------------------------------------------
/vendor/github.com/gorilla/mux/test_helpers.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2012 The Gorilla Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package mux
 6 | 
 7 | import "net/http"
 8 | 
 9 | // SetURLVars sets the URL variables for the given request, to be accessed via
10 | // mux.Vars for testing route behaviour. Arguments are not modified; a shallow
11 | // copy of the request is returned.
12 | //
13 | // This API should only be used for testing purposes; it provides a way to
14 | // inject variables into the request context. Alternatively, URL variables
15 | // can be set by making a route that captures the required variables,
16 | // starting a server and sending the request to that server.
17 | func SetURLVars(r *http.Request, val map[string]string) *http.Request {
18 | 	return requestWithVars(r, val)
19 | }
20 | 


--------------------------------------------------------------------------------
/vendor/modules.txt:
--------------------------------------------------------------------------------
 1 | # github.com/Masterminds/semver v1.5.0
 2 | ## explicit
 3 | github.com/Masterminds/semver
 4 | # github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18 => ./bindings/go/dcgm
 5 | ## explicit
 6 | github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm
 7 | # github.com/gorilla/mux v1.7.4
 8 | ## explicit
 9 | github.com/gorilla/mux
10 | # github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm => ./bindings/go/dcgm
11 | # k8s.io/api => k8s.io/api v0.20.2
12 | # k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2
13 | # k8s.io/apimachinery => k8s.io/apimachinery v0.20.2
14 | # k8s.io/apiserver => k8s.io/apiserver v0.20.2
15 | # k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.2
16 | # k8s.io/client-go => k8s.io/client-go v0.20.2
17 | # k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.2
18 | # k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.20.2
19 | # k8s.io/code-generator => k8s.io/code-generator v0.20.2
20 | # k8s.io/component-base => k8s.io/component-base v0.20.2
21 | # k8s.io/cri-api => k8s.io/cri-api v0.20.2
22 | # k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.20.2
23 | # k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.2
24 | # k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.2
25 | # k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.2
26 | # k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.2
27 | # k8s.io/kubectl => k8s.io/kubectl v0.20.2
28 | # k8s.io/kubelet => k8s.io/kubelet v0.20.2
29 | # k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.2
30 | # k8s.io/metrics => k8s.io/metrics v0.20.2
31 | # k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.2
32 | 


--------------------------------------------------------------------------------