├── .github └── PR_TEMPLATE.md ├── .gitignore ├── .gitlab-ci.yml ├── .gitmodules ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── RELEASE.md ├── bindings └── go │ ├── dcgm │ ├── admin.go │ ├── api.go │ ├── bcast.go │ ├── callback.c │ ├── const.go │ ├── dcgm_agent.h │ ├── dcgm_errors.h │ ├── dcgm_fields.h │ ├── dcgm_structs.h │ ├── dcgm_test.go │ ├── device_info.go │ ├── device_status.go │ ├── fields.go │ ├── go.mod │ ├── gpu_group.go │ ├── health.go │ ├── hostengine_status.go │ ├── mig.go │ ├── policy.go │ ├── process_info.go │ ├── profile.go │ ├── topology.go │ └── utils.go │ ├── nvml │ ├── bindings.go │ ├── mig.go │ ├── mig_test.go │ ├── nvml.go │ ├── nvml.h │ ├── nvml_dl.go │ ├── nvml_dl_windows.go │ ├── nvml_test.go │ └── nvsmi │ │ └── nvsmi.go │ └── samples │ ├── dcgm │ ├── README.md │ ├── deviceInfo │ │ └── main.go │ ├── dmon │ │ └── main.go │ ├── health │ │ └── main.go │ ├── hostengineStatus │ │ └── main.go │ ├── policy │ │ └── main.go │ ├── processInfo │ │ └── main.go │ ├── restApi │ │ ├── README.md │ │ ├── handlers │ │ │ ├── byIds.go │ │ │ ├── byUuids.go │ │ │ ├── dcgm.go │ │ │ └── utils.go │ │ ├── main.go │ │ └── server.go │ └── topology │ │ └── main.go │ └── nvml │ ├── README.md │ ├── deviceInfo │ └── main.go │ ├── dmon │ └── main.go │ └── processInfo │ └── main.go ├── dcgm-exporter.yaml ├── deployment └── dcgm-exporter │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── daemonset.yaml │ ├── service-monitor.yaml │ ├── service.yaml │ └── serviceaccount.yaml │ └── values.yaml ├── docker ├── Dockerfile.ubi8 ├── Dockerfile.ubuntu18.04 ├── Dockerfile.ubuntu20.04 └── dcgm-exporter-entrypoint.sh ├── etc └── dcgm-exporter │ ├── 1.x-compatibility-metrics.csv │ ├── dcp-metrics-included.csv │ └── default-counters.csv ├── go.mod ├── go.sum ├── grafana └── dcgm-exporter-dashboard.json ├── pkg ├── Dockerfile ├── dcgm.go ├── go.mod ├── go.sum ├── gpu_collector.go ├── gpu_collector_test.go ├── kubernetes.go ├── 
kubernetes_test.go ├── main.go ├── parser.go ├── pipeline.go ├── pipeline_test.go ├── server.go ├── system_info.go ├── system_info_test.go ├── types.go └── utils.go ├── service-monitor.yaml ├── tests ├── ci-run-e2e.sh ├── common.sh ├── gpu-pod.yaml ├── metrics.sh └── variables.tfvars └── vendor ├── github.com ├── Masterminds │ └── semver │ │ ├── .travis.yml │ │ ├── CHANGELOG.md │ │ ├── LICENSE.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── appveyor.yml │ │ ├── collection.go │ │ ├── constraints.go │ │ ├── doc.go │ │ ├── version.go │ │ └── version_fuzz.go ├── NVIDIA │ └── gpu-monitoring-tools │ │ └── bindings │ │ └── go │ │ └── dcgm │ │ ├── admin.go │ │ ├── api.go │ │ ├── bcast.go │ │ ├── callback.c │ │ ├── const.go │ │ ├── dcgm_agent.h │ │ ├── dcgm_errors.h │ │ ├── dcgm_fields.h │ │ ├── dcgm_structs.h │ │ ├── device_info.go │ │ ├── device_status.go │ │ ├── fields.go │ │ ├── go.mod │ │ ├── gpu_group.go │ │ ├── health.go │ │ ├── hostengine_status.go │ │ ├── mig.go │ │ ├── policy.go │ │ ├── process_info.go │ │ ├── profile.go │ │ ├── topology.go │ │ └── utils.go └── gorilla │ └── mux │ ├── AUTHORS │ ├── LICENSE │ ├── README.md │ ├── doc.go │ ├── go.mod │ ├── middleware.go │ ├── mux.go │ ├── regexp.go │ ├── route.go │ └── test_helpers.go └── modules.txt /.github/PR_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **Please open your pull requests on the [GitLab repository](https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools.git).** 2 | 3 | _Make sure to complete the following items:_ 4 | 5 | - _A reference to a related issue._ 6 | - _A small description of the changes proposed in the pull request._ 7 | - _One commit per change and descriptive commit messages._ 8 | - _Sign-off your work following these [guidelines](https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools/blob/master/CONTRIBUTING.md)._ 9 | - _Test run of your changes._ 10 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | dcgm-exporter 4 | !etc/dcgm-exporter/ 5 | !deployment/dcgm-exporter/ 6 | tags 7 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: docker:latest 2 | services: 3 | - docker:dind 4 | 5 | stages: 6 | - aws_kube_setup 7 | - e2e_tests 8 | - aws_kube_clean 9 | 10 | variables: 11 | GIT_SUBMODULE_STRATEGY: recursive 12 | TF_VAR_FILE: "$CI_PROJECT_DIR/tests/variables.tfvars" 13 | 14 | build: 15 | stage: aws_kube_setup 16 | script: 17 | - apk add make 18 | - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" 19 | 20 | - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" all 21 | - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push 22 | - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push-short 23 | - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push-latest 24 | - make REGISTRY="${CI_REGISTRY_IMAGE}" VERSION="${CI_COMMIT_SHORT_SHA}" push-ci 25 | 26 | aws_kube_setup: 27 | extends: .aws_kube_setup 28 | only: 29 | - master 30 | - tags 31 | 32 | e2e: 33 | stage: e2e_tests 34 | only: 35 | - master 36 | script: 37 | - source aws-kube-ci/hostname 38 | - apk add --no-cache openssh-client rsync 39 | - rsync -e "ssh -i aws-kube-ci/key -o StrictHostKeyChecking=no" -av --exclude="vendor/" "${CI_PROJECT_DIR}" "${instance_hostname}:~/" 40 | - rc=0 41 | - ssh -i aws-kube-ci/key ${instance_hostname} \ 42 | "export CI_COMMIT_SHORT_SHA=${CI_COMMIT_SHORT_SHA} && 43 | export CI_REGISTRY_IMAGE=${CI_REGISTRY_IMAGE} && 44 | cd ~/gpu-monitoring-tools && sudo -E ./tests/ci-run-e2e.sh" 45 | 46 | aws_kube_clean: 47 | extends: .aws_kube_clean 48 | 
only: 49 | - master 50 | - tags 51 | 52 | release: 53 | stage: aws_kube_clean 54 | only: 55 | - tags 56 | script: 57 | - apk add make 58 | - docker login -u "${REGISTRY_USER}" -p "${REGISTRY_TOKEN}" 59 | 60 | - make VERSION="${CI_COMMIT_TAG}" all 61 | - make VERSION="${CI_COMMIT_TAG}" push 62 | - make VERSION="${CI_COMMIT_TAG}" push-short 63 | - make VERSION="${CI_COMMIT_TAG}" push-latest 64 | 65 | include: 66 | project: nvidia/container-infrastructure/aws-kube-ci 67 | file: aws-kube-ci.yml 68 | ref: 21.02.23 69 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "aws-kube-ci"] 2 | path = aws-kube-ci 3 | url = https://gitlab.com/nvidia/container-infrastructure/aws-kube-ci.git 4 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribute to the GPU Monitoring Tools Project 2 | 3 | Want to hack on the NVIDIA GPU Monitoring Tools Project? Awesome! 4 | We only require you to sign your work; the section below describes how. 5 | 6 | ## Sign your work 7 | 8 | The sign-off is a simple line at the end of the explanation for the patch. Your 9 | signature certifies that you wrote the patch or otherwise have the right to pass 10 | it on as an open-source patch. The rules are pretty simple: if you can certify 11 | the below (from [developercertificate.org](http://developercertificate.org/)): 12 | 13 | ``` 14 | Developer Certificate of Origin 15 | Version 1.1 16 | 17 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 18 | 1 Letterman Drive 19 | Suite D4700 20 | San Francisco, CA, 94129 21 | 22 | Everyone is permitted to copy and distribute verbatim copies of this 23 | license document, but changing it is not allowed. 
24 | 25 | Developer's Certificate of Origin 1.1 26 | 27 | By making a contribution to this project, I certify that: 28 | 29 | (a) The contribution was created in whole or in part by me and I 30 | have the right to submit it under the open source license 31 | indicated in the file; or 32 | 33 | (b) The contribution is based upon previous work that, to the best 34 | of my knowledge, is covered under an appropriate open source 35 | license and I have the right under that license to submit that 36 | work with modifications, whether created in whole or in part 37 | by me, under the same open source license (unless I am 38 | permitted to submit under a different license), as indicated 39 | in the file; or 40 | 41 | (c) The contribution was provided directly to me by some other 42 | person who certified (a), (b) or (c) and I have not modified 43 | it. 44 | 45 | (d) I understand and agree that this project and the contribution 46 | are public and that a record of the contribution (including all 47 | personal information I submit with it, including my sign-off) is 48 | maintained indefinitely and may be redistributed consistent with 49 | this project or the open source license(s) involved. 50 | ``` 51 | 52 | Then you just add a line to every git commit message: 53 | 54 | Signed-off-by: Joe Smith 55 | 56 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 57 | 58 | If you set your `user.name` and `user.email` git configs, you can sign your 59 | commit automatically with `git commit -s`. 60 | 61 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | DOCKER ?= docker 16 | MKDIR ?= mkdir 17 | REGISTRY ?= nvidia 18 | 19 | DCGM_VERSION := 2.2.9 20 | GOLANG_VERSION := 1.14.2 21 | VERSION := 2.4.0 22 | FULL_VERSION := $(DCGM_VERSION)-$(VERSION) 23 | 24 | NON_TEST_FILES := pkg/dcgm.go pkg/gpu_collector.go pkg/parser.go pkg/pipeline.go pkg/server.go pkg/system_info.go pkg/types.go pkg/utils.go pkg/kubernetes.go pkg/main.go 25 | MAIN_TEST_FILES := pkg/system_info_test.go 26 | 27 | .PHONY: all binary install check-format 28 | all: ubuntu18.04 ubuntu20.04 ubi8 29 | 30 | binary: 31 | cd pkg; go build 32 | 33 | test-main: $(NON_TEST_FILES) $(MAIN_TEST_FILES) 34 | cd pkg; go test 35 | # Install the binary as 0755 and the counter CSVs as 0644 so that nothing is installed world-writable. 36 | install: binary 37 | install -m 755 pkg/dcgm-exporter /usr/bin/dcgm-exporter 38 | install -m 644 -D ./etc/dcgm-exporter/default-counters.csv /etc/dcgm-exporter/default-counters.csv 39 | install -m 644 -D ./etc/dcgm-exporter/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv 40 | 41 | check-format: 42 | test $$(gofmt -l pkg bindings | tee /dev/stderr | wc -l) -eq 0 43 | 44 | push: 45 | $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" 46 | $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" 47 | $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8" 48 | 49 | push-short: 50 | $(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(DCGM_VERSION)" 51 | $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(DCGM_VERSION)" 52 | 53 | push-ci: 54 | $(DOCKER) tag 
"$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:$(VERSION)" 55 | $(DOCKER) push "$(REGISTRY)/dcgm-exporter:$(VERSION)" 56 | 57 | push-latest: 58 | $(DOCKER) tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" "$(REGISTRY)/dcgm-exporter:latest" 59 | $(DOCKER) push "$(REGISTRY)/dcgm-exporter:latest" 60 | 61 | ubuntu20.04: 62 | $(DOCKER) build --pull \ 63 | --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ 64 | --build-arg "DCGM_VERSION=$(DCGM_VERSION)" \ 65 | --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu20.04" \ 66 | --file docker/Dockerfile.ubuntu20.04 . 67 | 68 | ubuntu18.04: 69 | $(DOCKER) build --pull \ 70 | --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ 71 | --build-arg "DCGM_VERSION=$(DCGM_VERSION)" \ 72 | --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubuntu18.04" \ 73 | --file docker/Dockerfile.ubuntu18.04 . 74 | 75 | ubi8: 76 | $(DOCKER) build --pull \ 77 | --build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \ 78 | --build-arg "DCGM_VERSION=$(DCGM_VERSION)" \ 79 | --build-arg "VERSION=$(FULL_VERSION)" \ 80 | --tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-ubi8" \ 81 | --file docker/Dockerfile.ubi8 . 82 | 83 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release 2 | 3 | This document describes the release process as well as the versioning strategy for the DCGM exporter. 4 | In the future this document will also contain information about the Go bindings. 
5 | 6 | ## Versioning 7 | 8 | The DCGM container has three major components: 9 | - The DCGM Version (e.g: 1.17.3) 10 | - The Exporter Version (e.g: 2.0.0) 11 | - The platform of the container (e.g: ubuntu18.04) 12 | 13 | The overall version of the DCGM container has four forms: 14 | - The long form: `${DCGM_VERSION}-${EXPORTER_VERSION}-${PLATFORM}` 15 | - The short form: `${DCGM_VERSION}` 16 | - The latest tag: `latest` 17 | - The commit form: `${CI_COMMIT_SHORT_SHA}` only available on the gitlab registry 18 | 19 | The long form is a unique tag that once pushed will always refer to the same container. 20 | This means that no updates will be made to that tag and it will always point to the same container. 21 | 22 | The short form refers to the latest EXPORTER_VERSION with the platform fixed to ubuntu18.04. 23 | The latest tag refers to the latest short form (i.e: latest DCGM_VERSION and EXPORTER_VERSION). 24 | 25 | Note: We do not maintain multiple version branches. 26 | 27 | ## Releases 28 | 29 | Release of newer versions is done on demand and does not follow DCGM's release cadence. 30 | Though it is very likely that when a new version of DCGM comes out a new version of the exporter will be released. 31 | 32 | Every commit to the master branch generates an image on the GitLab registry. 33 | Tagging a version will push an image to the nvidia/dcgm-exporter repository on Docker Hub. 34 | -------------------------------------------------------------------------------- /bindings/go/dcgm/api.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "sync" 7 | ) 8 | 9 | var ( 10 | dcgmInitCounter int 11 | mux sync.Mutex 12 | ) 13 | 14 | // Init starts DCGM, based on the user selected mode 15 | // DCGM can be started in 3 different modes: 16 | // 1. Embedded: Start hostengine within this process 17 | // 2. 
Standalone: Connect to an already running nv-hostengine at the specified address 18 | // Connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket" 19 | // 3. StartHostengine: Open an Unix socket to start and connect to the nv-hostengine and terminate before exiting 20 | func Init(m mode, args ...string) (cleanup func(), err error) { 21 | mux.Lock() 22 | if dcgmInitCounter < 0 { 23 | count := fmt.Sprintf("%d", dcgmInitCounter) 24 | err = fmt.Errorf("Shutdown() is called %s times, before Init()", count[1:]) 25 | } 26 | if dcgmInitCounter == 0 { 27 | err = initDcgm(m, args...) 28 | } 29 | dcgmInitCounter += 1 30 | mux.Unlock() 31 | 32 | return func() { 33 | if err := Shutdown(); err != nil { 34 | fmt.Fprintf(os.Stderr, "Failed to shutdown DCGM with error: `%v`", err) 35 | } 36 | }, err 37 | } 38 | 39 | // Shutdown stops DCGM and destroy all connections 40 | func Shutdown() (err error) { 41 | mux.Lock() 42 | if dcgmInitCounter <= 0 { 43 | err = fmt.Errorf("Init() needs to be called before Shutdown()") 44 | } 45 | if dcgmInitCounter == 1 { 46 | err = shutdown() 47 | } 48 | dcgmInitCounter -= 1 49 | mux.Unlock() 50 | 51 | return 52 | } 53 | 54 | // GetAllDeviceCount counts all GPUs on the system 55 | func GetAllDeviceCount() (uint, error) { 56 | return getAllDeviceCount() 57 | } 58 | 59 | // GetSupportedDevices returns only DCGM supported GPUs 60 | func GetSupportedDevices() ([]uint, error) { 61 | return getSupportedDevices() 62 | } 63 | 64 | // GetDeviceInfo describes the given device 65 | func GetDeviceInfo(gpuId uint) (Device, error) { 66 | return getDeviceInfo(gpuId) 67 | } 68 | 69 | // GetDeviceStatus monitors GPU status including its power, memory and GPU utilization 70 | func GetDeviceStatus(gpuId uint) (DeviceStatus, error) { 71 | return latestValuesForDevice(gpuId) 72 | } 73 | 74 | // GetDeviceTopology returns device topology corresponding to the gpuId 75 | func GetDeviceTopology(gpuId uint) ([]P2PLink, error) { 76 | 
return getDeviceTopology(gpuId) 77 | } 78 | 79 | // WatchPidFields lets DCGM start recording stats for GPU process 80 | // It needs to be called before calling GetProcessInfo 81 | func WatchPidFields() (GroupHandle, error) { 82 | return watchPidFields() 83 | } 84 | 85 | // GetProcessInfo provides detailed per GPU stats for this process 86 | func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error) { 87 | return getProcessInfo(group, pid) 88 | } 89 | 90 | // HealthCheckByGpuId monitors GPU health for any errors/failures/warnings 91 | func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) { 92 | return healthCheckByGpuId(gpuId) 93 | } 94 | 95 | // Policy sets GPU usage and error policies and notifies in case of any violations via callback functions 96 | func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) { 97 | return registerPolicy(gpuId, typ...) 98 | } 99 | 100 | // Introspect returns DCGM hostengine memory and CPU usage 101 | func Introspect() (DcgmStatus, error) { 102 | return introspect() 103 | } 104 | 105 | // Get all of the profiling metric groups for a given GPU group. 
106 | func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error) { 107 | return getSupportedMetricGroups(grpid) 108 | } 109 | -------------------------------------------------------------------------------- /bindings/go/dcgm/bcast.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | ) 7 | 8 | type publisher struct { 9 | publish chan interface{} 10 | close chan bool 11 | subscribers []*subscriber 12 | subscriberLock sync.Mutex 13 | } 14 | 15 | type subscriber struct { 16 | read chan interface{} 17 | close chan bool 18 | } 19 | 20 | func newPublisher() *publisher { 21 | pub := &publisher{ 22 | publish: make(chan interface{}), 23 | close: make(chan bool), 24 | } 25 | return pub 26 | } 27 | // subscriberList returns a snapshot copy of the current subscribers, taken under subscriberLock. 28 | func (p *publisher) subscriberList() []*subscriber { 29 | p.subscriberLock.Lock() 30 | defer p.subscriberLock.Unlock() 31 | return append([]*subscriber(nil), p.subscribers...) // real copy: callers iterate after the lock is released, and remove() compacts the backing array in place 32 | } 33 | 34 | func (p *publisher) add() *subscriber { 35 | p.subscriberLock.Lock() 36 | defer p.subscriberLock.Unlock() 37 | newSub := &subscriber{ 38 | read: make(chan interface{}), 39 | close: make(chan bool), 40 | } 41 | p.subscribers = append(p.subscribers, newSub) 42 | return newSub 43 | } 44 | 45 | func (p *publisher) remove(leaving *subscriber) error { 46 | p.subscriberLock.Lock() 47 | defer p.subscriberLock.Unlock() 48 | subscriberIndex := -1 49 | for i, sub := range p.subscribers { 50 | if sub == leaving { 51 | subscriberIndex = i 52 | break 53 | } 54 | } 55 | if subscriberIndex == -1 { 56 | return fmt.Errorf("Could not find subscriber") 57 | } 58 | go func() { leaving.close <- true }() 59 | p.subscribers = append(p.subscribers[:subscriberIndex], p.subscribers[subscriberIndex+1:]...) 
60 | return nil 61 | } 62 | 63 | func (p *publisher) send(val interface{}) { 64 | p.publish <- val 65 | } 66 | 67 | func (p *publisher) broadcast() { 68 | for { 69 | select { 70 | case publishing := <-p.publish: 71 | for _, sub := range p.subscriberList() { 72 | go func(s *subscriber, val interface{}) { 73 | s.read <- val 74 | }(sub, publishing) 75 | } 76 | case <-p.close: 77 | return 78 | } 79 | } 80 | } 81 | 82 | func (p *publisher) closePublisher() { 83 | p.close <- true 84 | } 85 | -------------------------------------------------------------------------------- /bindings/go/dcgm/callback.c: -------------------------------------------------------------------------------- 1 | int violationNotify(void* p) { 2 | int ViolationRegistration(void*); 3 | return ViolationRegistration(p); 4 | } 5 | -------------------------------------------------------------------------------- /bindings/go/dcgm/dcgm_test.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | import ( 4 | "math" 5 | "strconv" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvsmi" 10 | ) 11 | 12 | func check(err error, t *testing.T) { 13 | if err != nil { 14 | t.Errorf("%v\n", err) 15 | } 16 | } 17 | 18 | func TestDeviceCount(t *testing.T) { 19 | cleanup, err := Init(Embedded) 20 | check(err, t) 21 | defer cleanup() 22 | 23 | count, err := GetAllDeviceCount() 24 | check(err, t) 25 | 26 | query := "count" 27 | c := nvsmi.DeviceCount(query) 28 | 29 | if c != count { 30 | t.Errorf("Device Count from dcgm is wrong, got %d, want: %d", count, c) 31 | } 32 | } 33 | 34 | func BenchmarkDeviceCount1(b *testing.B) { 35 | Init(Embedded) 36 | 37 | b.StartTimer() 38 | for n := 0; n < b.N; n++ { 39 | GetAllDeviceCount() 40 | } 41 | b.StopTimer() 42 | 43 | Shutdown() 44 | } 45 | 46 | func TestDeviceInfo(t *testing.T) { 47 | cleanup, err := Init(Embedded) 48 | check(err, t) 49 | defer cleanup() 50 | 51 | fields := []string{ 
52 | "driver_version", 53 | "name", 54 | "serial", 55 | "uuid", 56 | "pci.bus_id", 57 | "vbios_version", 58 | "inforom.img", 59 | "power.limit", 60 | } 61 | 62 | gpus, err := GetSupportedDevices() 63 | check(err, t) 64 | 65 | for _, gpu := range gpus { 66 | info, err := GetDeviceInfo(gpu) 67 | check(err, t) 68 | 69 | id := strconv.FormatUint(uint64(gpu), 10) 70 | 71 | for _, val := range fields { 72 | var msg, output string 73 | res := nvsmi.Query(id, val) 74 | 75 | switch val { 76 | case "driver_version": 77 | msg = "Driver version" 78 | output = info.Identifiers.DriverVersion 79 | case "name": 80 | msg = "Device name" 81 | output = info.Identifiers.Model 82 | case "serial": 83 | msg = "Device Serial number" 84 | output = info.Identifiers.Serial 85 | case "uuid": 86 | msg = "Device UUID" 87 | output = info.UUID 88 | case "pci.bus_id": 89 | msg = "Device PCI busId" 90 | output = info.PCI.BusID 91 | case "vbios_version": 92 | msg = "Device vbios version" 93 | output = info.Identifiers.Vbios 94 | case "inforom.img": 95 | msg = "Device inforom image" 96 | output = info.Identifiers.InforomImageVersion 97 | case "power.limit": 98 | msg = "Device power limit" 99 | output = strconv.FormatUint(uint64(info.Power), 10) 100 | power, err := strconv.ParseFloat(res, 64) 101 | check(err, t) 102 | res = strconv.FormatUint(uint64(math.Round(power)), 10) 103 | } 104 | 105 | if strings.Compare(res, output) != 0 { 106 | if strings.Contains(output, "NOT_SUPPORTED") { 107 | continue 108 | } 109 | 110 | t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res) 111 | } 112 | } 113 | } 114 | } 115 | 116 | func BenchmarkDeviceInfo1(b *testing.B) { 117 | Init(Embedded) 118 | 119 | b.StartTimer() 120 | for n := 0; n < b.N; n++ { 121 | // assuming there will be atleast 1 GPU attached 122 | GetDeviceInfo(uint(0)) 123 | } 124 | b.StopTimer() 125 | 126 | Shutdown() 127 | } 128 | 129 | func TestDeviceStatus(t *testing.T) { 130 | cleanup, err := Init(Embedded) 131 | check(err, t) 132 | 
defer cleanup() 133 | 134 | gpus, err := GetSupportedDevices() 135 | check(err, t) 136 | 137 | fields := []string{ 138 | "power.draw", 139 | "temperature.gpu", 140 | "utilization.gpu", 141 | "utilization.memory", 142 | "encoder.stats.averageFps", 143 | "clocks.current.sm", 144 | "clocks.current.memory", 145 | } 146 | 147 | for _, gpu := range gpus { 148 | status, err := GetDeviceStatus(gpu) 149 | check(err, t) 150 | 151 | id := strconv.FormatUint(uint64(gpu), 10) 152 | 153 | for _, val := range fields { 154 | var msg, output string 155 | res := nvsmi.Query(id, val) 156 | 157 | switch val { 158 | case "power.draw": 159 | msg = "Device power utilization" 160 | output = strconv.FormatUint(uint64(math.Round(status.Power)), 10) 161 | power, err := strconv.ParseFloat(res, 64) 162 | check(err, t) 163 | res = strconv.FormatUint(uint64(math.Round(power)), 10) 164 | case "temperature.gpu": 165 | msg = "Device temperature" 166 | output = strconv.FormatUint(uint64(status.Temperature), 10) 167 | case "utilization.gpu": 168 | msg = "Device gpu utilization" 169 | output = strconv.FormatUint(uint64(status.Utilization.GPU), 10) 170 | case "utilization.memory": 171 | msg = "Device memory utilization" 172 | output = strconv.FormatUint(uint64(status.Utilization.Memory), 10) 173 | case "encoder.stats.averageFps": 174 | msg = "Device encoder utilization" 175 | output = strconv.FormatUint(uint64(status.Utilization.Encoder), 10) 176 | case "clocks.current.sm": 177 | msg = "Device sm clock" 178 | output = strconv.FormatUint(uint64(status.Clocks.Cores), 10) 179 | case "clocks.current.memory": 180 | msg = "Device mem clock" 181 | output = strconv.FormatUint(uint64(status.Clocks.Memory), 10) 182 | } 183 | 184 | if strings.Compare(res, output) != 0 { 185 | t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res) 186 | } 187 | } 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /bindings/go/dcgm/device_status.go: 
-------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "./dcgm_agent.h" 5 | #include "./dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "math/rand" 11 | ) 12 | 13 | type PerfState uint 14 | 15 | const ( 16 | PerfStateMax = 0 17 | PerfStateMin = 15 18 | PerfStateUnknown = 32 19 | ) 20 | 21 | func (p PerfState) String() string { 22 | if p >= PerfStateMax && p <= PerfStateMin { 23 | return fmt.Sprintf("P%d", p) 24 | } 25 | return "Unknown" 26 | } 27 | 28 | type UtilizationInfo struct { 29 | GPU int64 // % 30 | Memory int64 // % 31 | Encoder int64 // % 32 | Decoder int64 // % 33 | } 34 | 35 | type ECCErrorsInfo struct { 36 | SingleBit int64 37 | DoubleBit int64 38 | } 39 | 40 | type MemoryInfo struct { 41 | GlobalUsed int64 42 | ECCErrors ECCErrorsInfo 43 | } 44 | 45 | type ClockInfo struct { 46 | Cores int64 // MHz 47 | Memory int64 // MHz 48 | } 49 | 50 | type PCIThroughputInfo struct { 51 | Rx int64 // MB 52 | Tx int64 // MB 53 | Replays int64 54 | } 55 | 56 | type PCIStatusInfo struct { 57 | BAR1Used int64 // MB 58 | Throughput PCIThroughputInfo 59 | FBUsed int64 60 | } 61 | 62 | type DeviceStatus struct { 63 | Power float64 // W 64 | Temperature int64 // °C 65 | Utilization UtilizationInfo 66 | Memory MemoryInfo 67 | Clocks ClockInfo 68 | PCI PCIStatusInfo 69 | Performance PerfState 70 | FanSpeed int64 // % 71 | } 72 | 73 | func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) { 74 | const ( 75 | pwr int = iota 76 | temp 77 | sm 78 | mem 79 | enc 80 | dec 81 | smClock 82 | memClock 83 | bar1Used 84 | pcieRxThroughput 85 | pcieTxThroughput 86 | pcieReplay 87 | fbUsed 88 | sbe 89 | dbe 90 | pstate 91 | fanSpeed 92 | fieldsCount 93 | ) 94 | 95 | deviceFields := make([]Short, fieldsCount) 96 | deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE 97 | deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP 98 | deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL 99 | deviceFields[mem] = 
C.DCGM_FI_DEV_MEM_COPY_UTIL 100 | deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL 101 | deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL 102 | deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK 103 | deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK 104 | deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED 105 | deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT 106 | deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT 107 | deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER 108 | deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED 109 | deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 110 | deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 111 | deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE 112 | deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED 113 | 114 | fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64()) 115 | fieldsId, err := FieldGroupCreate(fieldsName, deviceFields) 116 | if err != nil { 117 | return 118 | } 119 | 120 | groupName := fmt.Sprintf("devStatus%d", rand.Uint64()) 121 | groupId, err := WatchFields(gpuId, fieldsId, groupName) 122 | if err != nil { 123 | _ = FieldGroupDestroy(fieldsId) 124 | return 125 | } 126 | 127 | values, err := GetLatestValuesForFields(gpuId, deviceFields) 128 | if err != nil { 129 | _ = FieldGroupDestroy(fieldsId) 130 | _ = DestroyGroup(groupId) 131 | return status, fmt.Errorf("Error getting device status: %s", err) 132 | } 133 | 134 | power := values[pwr].Float64() 135 | 136 | gpuUtil := UtilizationInfo{ 137 | GPU: values[sm].Int64(), 138 | Memory: values[mem].Int64(), 139 | Encoder: values[enc].Int64(), 140 | Decoder: values[dec].Int64(), 141 | } 142 | 143 | memory := MemoryInfo{ 144 | ECCErrors: ECCErrorsInfo{ 145 | SingleBit: values[sbe].Int64(), 146 | DoubleBit: values[dbe].Int64(), 147 | }, 148 | } 149 | 150 | clocks := ClockInfo{ 151 | Cores: values[smClock].Int64(), 152 | Memory: values[memClock].Int64(), 153 | } 154 | 155 | pci := PCIStatusInfo{ 156 | BAR1Used: values[bar1Used].Int64(), 157 | 
Throughput: PCIThroughputInfo{ 158 | Rx: values[pcieRxThroughput].Int64(), 159 | Tx: values[pcieTxThroughput].Int64(), 160 | Replays: values[pcieReplay].Int64(), 161 | }, 162 | FBUsed: values[fbUsed].Int64(), 163 | } 164 | 165 | status = DeviceStatus{ 166 | Power: power, 167 | Temperature: values[temp].Int64(), 168 | Utilization: gpuUtil, 169 | Memory: memory, 170 | Clocks: clocks, 171 | PCI: pci, 172 | Performance: PerfState(values[pstate].Int64()), 173 | FanSpeed: values[fanSpeed].Int64(), 174 | } 175 | 176 | _ = FieldGroupDestroy(fieldsId) 177 | _ = DestroyGroup(groupId) 178 | return 179 | } 180 | -------------------------------------------------------------------------------- /bindings/go/dcgm/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm 2 | 3 | go 1.14 4 | -------------------------------------------------------------------------------- /bindings/go/dcgm/gpu_group.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | ) 11 | 12 | type GroupHandle struct{ handle C.dcgmGpuGrp_t } 13 | 14 | func CreateGroup(groupName string) (goGroupId GroupHandle, err error) { 15 | var cGroupId C.dcgmGpuGrp_t 16 | cname := C.CString(groupName) 17 | defer freeCString(cname) 18 | 19 | result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_EMPTY, cname, &cGroupId) 20 | if err = errorString(result); err != nil { 21 | return goGroupId, fmt.Errorf("Error creating group: %s", err) 22 | } 23 | 24 | goGroupId = GroupHandle{cGroupId} 25 | return 26 | } 27 | 28 | func NewDefaultGroup(groupName string) (GroupHandle, error) { 29 | var cGroupId C.dcgmGpuGrp_t 30 | 31 | cname := C.CString(groupName) 32 | defer freeCString(cname) 33 | 34 | result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_DEFAULT, cname, &cGroupId) 
35 | if err := errorString(result); err != nil { 36 | return GroupHandle{}, fmt.Errorf("Error creating group: %s", err) 37 | } 38 | 39 | return GroupHandle{cGroupId}, nil 40 | } 41 | 42 | func AddToGroup(groupId GroupHandle, gpuId uint) (err error) { 43 | result := C.dcgmGroupAddDevice(handle.handle, groupId.handle, C.uint(gpuId)) 44 | if err = errorString(result); err != nil { 45 | return fmt.Errorf("Error adding GPU %v to group: %s", gpuId, err) 46 | } 47 | 48 | return 49 | } 50 | 51 | func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error) { 52 | result := C.dcgmGroupAddEntity(handle.handle, groupId.handle, C.dcgm_field_entity_group_t(entityGroupId), C.uint(entityId)) 53 | if err = errorString(result); err != nil { 54 | return fmt.Errorf("Error adding entity group type %v, entity %v to group: %s", entityGroupId, entityId, err) 55 | } 56 | 57 | return 58 | } 59 | 60 | func DestroyGroup(groupId GroupHandle) (err error) { 61 | result := C.dcgmGroupDestroy(handle.handle, groupId.handle) 62 | if err = errorString(result); err != nil { 63 | return fmt.Errorf("Error destroying group: %s", err) 64 | } 65 | 66 | return 67 | } 68 | -------------------------------------------------------------------------------- /bindings/go/dcgm/health.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "math/rand" 11 | "unsafe" 12 | ) 13 | 14 | type SystemWatch struct { 15 | Type string 16 | Status string 17 | Error string 18 | } 19 | 20 | type DeviceHealth struct { 21 | GPU uint 22 | Status string 23 | Watches []SystemWatch 24 | } 25 | 26 | func setHealthWatches(groupId GroupHandle) (err error) { 27 | result := C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL) 28 | if err = errorString(result); err != nil { 29 | return fmt.Errorf("Error setting health 
watches: %s", err) 30 | } 31 | return 32 | } 33 | 34 | func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { 35 | name := fmt.Sprintf("health%d", rand.Uint64()) 36 | groupId, err := CreateGroup(name) 37 | if err != nil { 38 | return 39 | } 40 | 41 | err = AddToGroup(groupId, gpuId) 42 | if err != nil { 43 | return 44 | } 45 | 46 | err = setHealthWatches(groupId) 47 | if err != nil { 48 | return 49 | } 50 | 51 | var healthResults C.dcgmHealthResponse_v4 52 | healthResults.version = makeVersion2(unsafe.Sizeof(healthResults)) 53 | 54 | result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) 55 | 56 | if err = errorString(result); err != nil { 57 | return deviceHealth, fmt.Errorf("Error checking GPU health: %s", err) 58 | } 59 | 60 | status := healthStatus(int8(healthResults.overallHealth)) 61 | watches := []SystemWatch{} 62 | 63 | // number of watches that encountred error/warning 64 | incidents := uint(healthResults.incidentCount) 65 | 66 | for j := uint(0); j < incidents; j++ { 67 | watch := SystemWatch{ 68 | Type: systemWatch(int(healthResults.incidents[j].system)), 69 | Status: healthStatus(int8(healthResults.incidents[j].health)), 70 | 71 | Error: *stringPtr(&healthResults.incidents[j].error.msg[0]), 72 | } 73 | watches = append(watches, watch) 74 | } 75 | 76 | deviceHealth = DeviceHealth{ 77 | GPU: gpuId, 78 | Status: status, 79 | Watches: watches, 80 | } 81 | _ = DestroyGroup(groupId) 82 | return 83 | } 84 | 85 | func healthStatus(status int8) string { 86 | switch status { 87 | case 0: 88 | return "Healthy" 89 | case 10: 90 | return "Warning" 91 | case 20: 92 | return "Failure" 93 | } 94 | return "N/A" 95 | } 96 | 97 | func systemWatch(watch int) string { 98 | switch watch { 99 | case 1: 100 | return "PCIe watches" 101 | case 2: 102 | return "NVLINK watches" 103 | case 4: 104 | return "Power Managemnt unit watches" 105 | case 8: 106 | return "Microcontroller unit watches" 107 | 
case 16: 108 | return "Memory watches" 109 | case 32: 110 | return "Streaming Multiprocessor watches" 111 | case 64: 112 | return "Inforom watches" 113 | case 128: 114 | return "Temperature watches" 115 | case 256: 116 | return "Power watches" 117 | case 512: 118 | return "Driver-related watches" 119 | } 120 | return "N/A" 121 | } 122 | -------------------------------------------------------------------------------- /bindings/go/dcgm/hostengine_status.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "unsafe" 11 | ) 12 | 13 | type DcgmStatus struct { 14 | Memory int64 15 | CPU float64 16 | } 17 | 18 | func introspect() (engine DcgmStatus, err error) { 19 | enableIntrospect := C.dcgmIntrospectState_t(1) 20 | result := C.dcgmIntrospectToggleState(handle.handle, enableIntrospect) 21 | 22 | if err = errorString(result); err != nil { 23 | return engine, fmt.Errorf("Error enabling DCGM introspection: %s", err) 24 | } 25 | 26 | var memory C.dcgmIntrospectMemory_t 27 | memory.version = makeVersion2(unsafe.Sizeof(memory)) 28 | waitIfNoData := 1 29 | result = C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData)) 30 | 31 | if err = errorString(result); err != nil { 32 | return engine, fmt.Errorf("Error getting memory usage of hostengine: %s", err) 33 | } 34 | 35 | var cpu C.dcgmIntrospectCpuUtil_t 36 | 37 | cpu.version = makeVersion2(unsafe.Sizeof(cpu)) 38 | result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData)) 39 | 40 | if err = errorString(result); err != nil { 41 | return engine, fmt.Errorf("Error getting cpu usage of hostengine: %s", err) 42 | } 43 | 44 | engine = DcgmStatus{ 45 | Memory: toInt64(memory.bytesUsed) / 1024, 46 | CPU: *dblToFloat(cpu.total) * 100, 47 | } 48 | return 49 | } 50 | 
-------------------------------------------------------------------------------- /bindings/go/dcgm/mig.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "./dcgm_agent.h" 5 | #include "./dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "unsafe" 11 | ) 12 | 13 | type Field_Entity_Group uint 14 | 15 | const ( 16 | FE_NONE Field_Entity_Group = iota 17 | FE_GPU 18 | FE_VGPU 19 | FE_SWITCH 20 | FE_GPU_I 21 | FE_GPU_CI 22 | FE_COUNT 23 | ) 24 | 25 | type GroupEntityPair struct { 26 | EntityGroupId Field_Entity_Group 27 | EntityId uint 28 | } 29 | 30 | type MigEntityInfo struct { 31 | GpuUuid string 32 | NvmlGpuIndex uint 33 | NvmlInstanceId uint 34 | NvmlComputeInstanceId uint 35 | NvmlMigProfileId uint 36 | NvmlProfileSlices uint 37 | } 38 | 39 | type MigHierarchyInfo_v2 struct { 40 | Entity GroupEntityPair 41 | Parent GroupEntityPair 42 | Info MigEntityInfo 43 | } 44 | 45 | const ( 46 | MAX_NUM_DEVICES uint = C.DCGM_MAX_NUM_DEVICES 47 | MAX_HIERARCHY_INFO uint = C.DCGM_MAX_HIERARCHY_INFO 48 | ) 49 | 50 | type MigHierarchy_v2 struct { 51 | Version uint 52 | Count uint 53 | EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 54 | } 55 | 56 | func GetGpuInstanceHierarchy() (hierarchy MigHierarchy_v2, err error) { 57 | var c_hierarchy C.dcgmMigHierarchy_v2 58 | c_hierarchy.version = C.dcgmMigHierarchy_version2 59 | ptr_hierarchy := (*C.dcgmMigHierarchy_v2)(unsafe.Pointer(&c_hierarchy)) 60 | result := C.dcgmGetGpuInstanceHierarchy(handle.handle, ptr_hierarchy) 61 | 62 | if err = errorString(result); err != nil { 63 | return toMigHierarchy(c_hierarchy), fmt.Errorf("Error retrieving DCGM MIG hierarchy: %s", err) 64 | } 65 | 66 | return toMigHierarchy(c_hierarchy), nil 67 | } 68 | 69 | func toMigHierarchy(c_hierarchy C.dcgmMigHierarchy_v2) MigHierarchy_v2 { 70 | var hierarchy MigHierarchy_v2 71 | hierarchy.Version = uint(c_hierarchy.version) 72 | hierarchy.Count = 
uint(c_hierarchy.count) 73 | for i := uint(0); i < hierarchy.Count; i++ { 74 | hierarchy.EntityList[i] = MigHierarchyInfo_v2{ 75 | Entity: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].entity.entityGroupId), uint(c_hierarchy.entityList[i].entity.entityId)}, 76 | Parent: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].parent.entityGroupId), uint(c_hierarchy.entityList[i].parent.entityId)}, 77 | Info: MigEntityInfo{ 78 | GpuUuid: *stringPtr(&c_hierarchy.entityList[i].info.gpuUuid[0]), 79 | NvmlGpuIndex: uint(c_hierarchy.entityList[i].info.nvmlGpuIndex), 80 | NvmlInstanceId: uint(c_hierarchy.entityList[i].info.nvmlInstanceId), 81 | NvmlComputeInstanceId: uint(c_hierarchy.entityList[i].info.nvmlComputeInstanceId), 82 | NvmlMigProfileId: uint(c_hierarchy.entityList[i].info.nvmlMigProfileId), 83 | NvmlProfileSlices: uint(c_hierarchy.entityList[i].info.nvmlProfileSlices), 84 | }, 85 | } 86 | } 87 | 88 | return hierarchy 89 | } 90 | -------------------------------------------------------------------------------- /bindings/go/dcgm/profile.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "unsafe" 11 | ) 12 | 13 | type MetricGroup struct { 14 | major uint 15 | minor uint 16 | fieldIds []uint 17 | } 18 | 19 | func getSupportedMetricGroups(grpid uint) (groups []MetricGroup, err error) { 20 | 21 | var groupInfo C.dcgmProfGetMetricGroups_t 22 | groupInfo.version = makeVersion2(unsafe.Sizeof(groupInfo)) 23 | groupInfo.groupId = C.ulong(grpid) 24 | 25 | result := C.dcgmProfGetSupportedMetricGroups(handle.handle, &groupInfo) 26 | 27 | if err = errorString(result); err != nil { 28 | return groups, fmt.Errorf("Error getting supported metrics: %s", err) 29 | } 30 | 31 | var count = uint(groupInfo.numMetricGroups) 32 | 33 | for i := uint(0); i < count; i++ { 34 | var group MetricGroup 35 | 
// P2PLinkType enumerates the kinds of connection that can exist between two
// GPUs, as produced by getP2PLink.
type P2PLinkType uint

const (
	P2PLinkUnknown P2PLinkType = iota
	P2PLinkCrossCPU
	P2PLinkSameCPU
	P2PLinkHostBridge
	P2PLinkMultiSwitch
	P2PLinkSingleSwitch
	P2PLinkSameBoard
	SingleNVLINKLink
	TwoNVLINKLinks
	ThreeNVLINKLinks
	FourNVLINKLinks
)

// pciPathLabels maps every known link type to its short display label.
// P2PLinkUnknown is deliberately absent so it falls back to "N/A".
var pciPathLabels = map[P2PLinkType]string{
	P2PLinkSameBoard:    "PSB",
	P2PLinkSingleSwitch: "PIX",
	P2PLinkMultiSwitch:  "PXB",
	P2PLinkHostBridge:   "PHB",
	P2PLinkSameCPU:      "NODE",
	P2PLinkCrossCPU:     "SYS",
	SingleNVLINKLink:    "NV1",
	TwoNVLINKLinks:      "NV2",
	ThreeNVLINKLinks:    "NV3",
	FourNVLINKLinks:     "NV4",
}

// PCIPaths returns the short label for the link type ("PSB", "PIX", "NV2",
// ...). P2PLinkUnknown and any value outside the enumeration yield "N/A".
func (l P2PLinkType) PCIPaths() string {
	if label, known := pciPathLabels[l]; known {
		return label
	}
	return "N/A"
}
// getCPUAffinity returns the contents of the sysfs file
// /sys/bus/pci/devices/<id>/local_cpulist for the given PCI device, with the
// trailing newline removed.
//
// <id> is busid with its first four characters dropped and the remainder
// lower-cased (presumably to turn an 8-digit PCI domain into the 4-digit form
// used by sysfs — verify against the bus ids DCGM reports).
//
// An error is returned when busid is too short to contain anything after the
// dropped prefix, or when the sysfs file cannot be read.
func getCPUAffinity(busid string) (string, error) {
	const skippedPrefixLen = 4

	// Slicing a shorter string would panic; reject it as a normal error.
	if len(busid) <= skippedPrefixLen {
		return "", fmt.Errorf("Error getting device cpu affinity: malformed bus id %q", busid)
	}

	path := fmt.Sprintf("/sys/bus/pci/devices/%s/local_cpulist", strings.ToLower(busid[skippedPrefixLen:]))
	b, err := ioutil.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("Error getting device cpu affinity: %v", err)
	}
	return strings.TrimSuffix(string(b), "\n"), nil
}
// uintPtrUnsafe reads a uint through p and returns a pointer to a private
// copy of that value. A nil p yields nil.
// NOTE(review): the read is as wide as a Go uint — assumes p points at
// storage at least that large; confirm against callers.
func uintPtrUnsafe(p unsafe.Pointer) *uint {
	var copied *uint
	if p != nil {
		copied = new(uint)
		*copied = *(*uint)(p)
	}
	return copied
}

// uint64PtrUnsafe reads a uint64 through p and returns a pointer to a private
// copy of that value. A nil p yields nil.
func uint64PtrUnsafe(p unsafe.Pointer) *uint64 {
	var copied *uint64
	if p != nil {
		copied = new(uint64)
		*copied = *(*uint64)(p)
	}
	return copied
}
*string { 88 | s := C.GoString(c) 89 | return &s 90 | } 91 | 92 | func errorString(result C.dcgmReturn_t) error { 93 | if result == C.DCGM_ST_OK { 94 | return nil 95 | } 96 | err := C.GoString(C.errorString(result)) 97 | return fmt.Errorf("%v", err) 98 | } 99 | 100 | func freeCString(cStr *C.char) { 101 | C.free(unsafe.Pointer(cStr)) 102 | } 103 | 104 | func IsInt32Blank(value int) bool { 105 | if value >= dcgmInt32Blank { 106 | return true 107 | } 108 | return false 109 | } 110 | 111 | func IsInt64Blank(value int64) bool { 112 | if value >= dcgmInt64Blank { 113 | return true 114 | } 115 | return false 116 | } 117 | 118 | func blank64(val *int64) *int64 { 119 | if val != nil && IsInt64Blank(*val) { 120 | return nil 121 | } 122 | return val 123 | } 124 | 125 | func blank32(val *uint) *uint { 126 | if val != nil && IsInt32Blank(int(*val)) { 127 | return nil 128 | } 129 | return val 130 | } 131 | 132 | func makeVersion1(struct_type uintptr) C.uint { 133 | version := C.uint(struct_type | 1<<24) 134 | return version 135 | } 136 | 137 | func makeVersion2(struct_type uintptr) C.uint { 138 | version := C.uint(struct_type | 2<<24) 139 | return version 140 | } 141 | 142 | func roundFloat(f *float64) *float64 { 143 | var val float64 144 | if f != nil { 145 | val = math.Round(*f) 146 | } 147 | return &val 148 | } 149 | -------------------------------------------------------------------------------- /bindings/go/nvml/mig_test.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestSetMigMode(t *testing.T) { 8 | // Initialize NVML 9 | err := Init() 10 | if err != nil { 11 | t.Errorf("%v", err) 12 | } 13 | defer Shutdown() 14 | 15 | // Grab a reference to our first device 16 | device, err := NewDevice(0) 17 | if err != nil { 18 | t.Errorf("%v", err) 19 | } 20 | 21 | // Disable MIG on the device 22 | _, err = device.SetMigMode(DEVICE_MIG_DISABLE) 23 | if err != nil { 24 | 
t.Errorf("error enabling MIG mode on Device: %v", err) 25 | } 26 | 27 | // Ensure MIG Mode is disabled on the device 28 | current, pending, err := device.GetMigMode() 29 | if err != nil { 30 | t.Errorf("error getting MIG mode on Device: %v", err) 31 | } 32 | if current != pending || current != DEVICE_MIG_DISABLE { 33 | t.Errorf("Expected MIG mode on Device to be DEVICE_MIG_DISABLE, got (current %v, pending %v)", current, pending) 34 | } 35 | 36 | // Enable MIG on the device 37 | _, err = device.SetMigMode(DEVICE_MIG_ENABLE) 38 | if err != nil { 39 | t.Errorf("error enabling MIG mode on Device: %v", err) 40 | } 41 | 42 | // Ensure MIG Mode is enabled on the device 43 | current, pending, err = device.GetMigMode() 44 | if err != nil { 45 | t.Errorf("error getting MIG mode on Device: %v", err) 46 | } 47 | if current != pending || current != DEVICE_MIG_ENABLE { 48 | t.Errorf("Expected MIG mode on Device to be DEVICE_MIG_ENABLE, got (current %v, pending %v)", current, pending) 49 | } 50 | 51 | // Disable MIG on the device 52 | _, err = device.SetMigMode(DEVICE_MIG_DISABLE) 53 | if err != nil { 54 | t.Errorf("error enabling MIG mode on Device: %v", err) 55 | } 56 | 57 | // Ensure MIG Mode is disabled on the device 58 | current, pending, err = device.GetMigMode() 59 | if err != nil { 60 | t.Errorf("error getting MIG mode on Device: %v", err) 61 | } 62 | if current != pending || current != DEVICE_MIG_DISABLE { 63 | t.Errorf("Expected MIG mode on Device to be DEVICE_MIG_DISABLE, got (current %v, pending %v)", current, pending) 64 | } 65 | } 66 | 67 | func TestParseMigDeviceUUID(t *testing.T) { 68 | tests := []struct { 69 | name string 70 | uuid string 71 | expectedGPU string 72 | expectedGi uint 73 | expectedCi uint 74 | expectedError bool 75 | }{ 76 | { 77 | name: "Successfull Parsing", 78 | uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", 79 | expectedGPU: "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5", 80 | expectedGi: 1, 81 | expectedCi: 5, 82 | }, 83 | { 84 | name: 
"Fail, Missing MIG at the beginning of UUID", 85 | uuid: "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", 86 | expectedError: true, 87 | }, 88 | { 89 | name: "Fail, Missing GPU at the beginning of GPU UUID", 90 | uuid: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5", 91 | expectedError: true, 92 | }, 93 | { 94 | name: "Fail, GI not parsable", 95 | uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/xx/5", 96 | expectedError: true, 97 | }, 98 | { 99 | name: "Fail, CI not a parsable", 100 | uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/xx", 101 | expectedError: true, 102 | }, 103 | } 104 | 105 | for _, tc := range tests { 106 | t.Run(tc.name, func(t *testing.T) { 107 | gpu, gi, ci, err := ParseMigDeviceUUID(tc.uuid) 108 | if tc.expectedError && err != nil { 109 | return 110 | } 111 | if tc.expectedError && err == nil { 112 | t.Fatalf("Expected an error, but didn't get one: uuid: %v, (gpu: %v, gi: %v, ci: %v)", tc.uuid, gpu, gi, ci) 113 | } 114 | if !tc.expectedError && err != nil { 115 | t.Fatalf("Unexpected error: %v, uuid: %v, (gpu: %v, gi: %v, ci: %v)", err, tc.uuid, gpu, gi, ci) 116 | } 117 | if gpu != tc.expectedGPU || gi != tc.expectedGi || ci != tc.expectedCi { 118 | t.Fatalf("MIG UUID parsed incorrectly: uuid: %v, (gpu: %v, gi: %v, ci: %v)", tc.uuid, gpu, gi, ci) 119 | } 120 | }) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /bindings/go/nvml/nvml_dl.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 2 | 3 | // +build linux darwin 4 | 5 | package nvml 6 | 7 | import ( 8 | "unsafe" 9 | ) 10 | 11 | /* 12 | #include 13 | #include "nvml.h" 14 | 15 | // We wrap the call to nvmlInit() here to ensure that we pick up the correct 16 | // version of this call. The macro magic in nvml.h that #defines the symbol 17 | // 'nvmlInit' to 'nvmlInit_v2' is unfortunately lost on cgo. 
18 | static nvmlReturn_t nvmlInit_dl(void) { 19 | return nvmlInit(); 20 | } 21 | */ 22 | import "C" 23 | 24 | type dlhandles struct{ handles []unsafe.Pointer } 25 | 26 | var dl dlhandles 27 | 28 | // Initialize NVML, opening a dynamic reference to the NVML library in the process. 29 | func (dl *dlhandles) nvmlInit() C.nvmlReturn_t { 30 | handle := C.dlopen(C.CString("libnvidia-ml.so.1"), C.RTLD_LAZY|C.RTLD_GLOBAL) 31 | if handle == C.NULL { 32 | return C.NVML_ERROR_LIBRARY_NOT_FOUND 33 | } 34 | dl.handles = append(dl.handles, handle) 35 | return C.nvmlInit_dl() 36 | } 37 | 38 | // Shutdown NVML, closing our dynamic reference to the NVML library in the process. 39 | func (dl *dlhandles) nvmlShutdown() C.nvmlReturn_t { 40 | ret := C.nvmlShutdown() 41 | if ret != C.NVML_SUCCESS { 42 | return ret 43 | } 44 | 45 | for _, handle := range dl.handles { 46 | err := C.dlclose(handle) 47 | if err != 0 { 48 | return C.NVML_ERROR_UNKNOWN 49 | } 50 | } 51 | 52 | return C.NVML_SUCCESS 53 | } 54 | 55 | // Check to see if a specific symbol is present in the NVML library. 56 | func (dl *dlhandles) lookupSymbol(symbol string) C.nvmlReturn_t { 57 | for _, handle := range dl.handles { 58 | C.dlerror() 59 | C.dlsym(handle, C.CString(symbol)) 60 | if unsafe.Pointer(C.dlerror()) == C.NULL { 61 | return C.NVML_SUCCESS 62 | } 63 | } 64 | return C.NVML_ERROR_FUNCTION_NOT_FOUND 65 | } 66 | -------------------------------------------------------------------------------- /bindings/go/nvml/nvml_dl_windows.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 2 | 3 | // +build windows 4 | 5 | package nvml 6 | 7 | import ( 8 | "syscall" 9 | ) 10 | 11 | /* 12 | #include "nvml.h" 13 | 14 | // We wrap the call to nvmlInit() here to ensure that we pick up the correct 15 | // version of this call. 
The macro magic in nvml.h that #defines the symbol 16 | // 'nvmlInit' to 'nvmlInit_v2' is unfortunately lost on cgo. 17 | static nvmlReturn_t nvmlInit_dl(void) { 18 | return nvmlInit(); 19 | } 20 | */ 21 | import "C" 22 | 23 | type dlhandles struct{ handles []*syscall.LazyDLL } 24 | 25 | var dl dlhandles 26 | 27 | // Initialize NVML, opening a dynamic reference to the NVML library in the process. 28 | func (dl *dlhandles) nvmlInit() C.nvmlReturn_t { 29 | handle := syscall.NewLazyDLL("nvml.dll") 30 | if handle == nil { 31 | return C.NVML_ERROR_LIBRARY_NOT_FOUND 32 | } 33 | dl.handles = append(dl.handles, handle) 34 | return C.nvmlInit_dl() 35 | } 36 | 37 | // Shutdown NVML, closing our dynamic reference to the NVML library in the process. 38 | func (dl *dlhandles) nvmlShutdown() C.nvmlReturn_t { 39 | ret := C.nvmlShutdown() 40 | if ret != C.NVML_SUCCESS { 41 | return ret 42 | } 43 | 44 | dl.handles = dl.handles[:0] 45 | 46 | return C.NVML_SUCCESS 47 | } 48 | 49 | // Check to see if a specific symbol is present in the NVML library. 
const (
	bin       = "nvidia-smi"
	gpuArg    = "--id="
	queryArg  = "--query-gpu="
	formatArg = "--format=csv,noheader,nounits"
)

// runSmi executes nvidia-smi with args and returns its standard output with
// surrounding whitespace removed, together with the error from running it.
func runSmi(args ...string) (string, error) {
	var out bytes.Buffer

	cmd := exec.Command(bin, args...)
	cmd.Stdout = &out

	err := cmd.Run()
	return strings.TrimSpace(out.String()), err
}

// Query runs `nvidia-smi --id=<id> --query-gpu=<query>` in CSV mode (no
// header, no units) and returns the trimmed output.
//
// The signature has no error result, so a failure to run the command is
// printed and whatever output was captured (possibly none) is returned.
func Query(id string, query string) string {
	out, err := runSmi(gpuArg+id, queryArg+query, formatArg)
	if err != nil {
		// The original built this message with fmt.Errorf and discarded it.
		fmt.Printf("nvsmi exec error: %v\n", err)
	}
	return out
}

// DeviceCount runs `nvidia-smi --query-gpu=<query>` in CSV mode and returns
// the number of output lines, i.e. one per device.
//
// It returns 0 when the command fails or prints nothing. Splitting an empty
// string still yields one element, which used to be reported as one device.
func DeviceCount(query string) uint {
	out, err := runSmi(queryArg+query, formatArg)
	if err != nil {
		fmt.Printf("nvsmi exec error: %v\n", err)
		return 0
	}
	if out == "" {
		return 0
	}
	return uint(len(strings.Split(out, "\n")))
}
{{.Identifiers.Model}} 19 | Serial Number : {{.Identifiers.Serial}} 20 | Vbios : {{or .Identifiers.Vbios "N/A"}} 21 | InforomImage Version : {{.Identifiers.InforomImageVersion}} 22 | Bus ID : {{.PCI.BusID}} 23 | BAR1 (MB) : {{or .PCI.BAR1 "N/A"}} 24 | FrameBuffer Memory (MB): {{or .PCI.FBTotal "N/A"}} 25 | Bandwidth (MB/s) : {{or .PCI.Bandwidth "N/A"}} 26 | Cores (MHz) : {{or .Clocks.Cores "N/A"}} 27 | Memory (MHz) : {{or .Clocks.Memory "N/A"}} 28 | Power (W) : {{or .Power "N/A"}} 29 | CPUAffinity : {{or .CPUAffinity "N/A"}} 30 | P2P Available : {{if not .Topology}}None{{else}}{{range .Topology}} 31 | GPU{{.GPU}} - (BusID){{.BusID}} - {{.Link.PCIPaths}}{{end}}{{end}} 32 | --------------------------------------------------------------------- 33 | ` 34 | ) 35 | 36 | var ( 37 | connectAddr = flag.String("connect", "localhost", "Provide nv-hostengine connection address.") 38 | isSocket = flag.String("socket", "0", "Connecting to Unix socket?") 39 | ) 40 | 41 | // mini version of nvidia-smi -q 42 | // dcgmi discovery -i apc 43 | func main() { 44 | // choose dcgm hostengine running mode 45 | // 1. dcgm.Embedded 46 | // 2. dcgm.Standalone -connect "addr", -socket "isSocket" 47 | // 3. 
dcgm.StartHostengine 48 | flag.Parse() 49 | cleanup, err := dcgm.Init(dcgm.Standalone, *connectAddr, *isSocket) 50 | if err != nil { 51 | log.Panicln(err) 52 | } 53 | defer cleanup() 54 | 55 | defer func() { 56 | if err := dcgm.Shutdown(); err != nil { 57 | log.Panicln(err) 58 | } 59 | }() 60 | 61 | count, err := dcgm.GetAllDeviceCount() 62 | if err != nil { 63 | log.Panicln(err) 64 | } 65 | 66 | t := template.Must(template.New("Device").Parse(deviceInfo)) 67 | 68 | for i := uint(0); i < count; i++ { 69 | deviceInfo, err := dcgm.GetDeviceInfo(i) 70 | if err != nil { 71 | log.Panicln(err) 72 | } 73 | 74 | if err = t.Execute(os.Stdout, deviceInfo); err != nil { 75 | log.Panicln("Template error:", err) 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/dmon/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "os/signal" 8 | "syscall" 9 | "time" 10 | 11 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 12 | ) 13 | 14 | const ( 15 | header = `# gpu pwr temp sm mem enc dec mclk pclk 16 | # Idx W C % % % % MHz MHz` 17 | ) 18 | 19 | // modelled on nvidia-smi dmon 20 | // dcgmi dmon -e 155,150,203,204,206,207,100,101 21 | func main() { 22 | sigs := make(chan os.Signal, 1) 23 | signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) 24 | 25 | cleanup, err := dcgm.Init(dcgm.Embedded) 26 | if err != nil { 27 | log.Panicln(err) 28 | } 29 | defer cleanup() 30 | 31 | gpus, err := dcgm.GetSupportedDevices() 32 | if err != nil { 33 | log.Panicln(err) 34 | } 35 | 36 | ticker := time.NewTicker(time.Second * 1) 37 | defer ticker.Stop() 38 | 39 | fmt.Println(header) 40 | for { 41 | select { 42 | case <-ticker.C: 43 | for _, gpu := range gpus { 44 | st, err := dcgm.GetDeviceStatus(gpu) 45 | if err != nil { 46 | log.Panicln(err) 47 | } 48 | fmt.Printf("%5d %5d %5d %5d %5d %5d %5d %5d %5d\n", 49 | 
gpu, int64(st.Power), st.Temperature, st.Utilization.GPU, st.Utilization.Memory, 50 | st.Utilization.Encoder, st.Utilization.Decoder, st.Clocks.Memory, st.Clocks.Cores) 51 | } 52 | 53 | case <-sigs: 54 | return 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/health/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "os/signal" 7 | "syscall" 8 | "text/template" 9 | "time" 10 | 11 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 12 | ) 13 | 14 | const ( 15 | healthStatus = `GPU : {{.GPU}} 16 | Status : {{.Status}} 17 | {{range .Watches}} 18 | Type : {{.Type}} 19 | Status : {{.Status}} 20 | Error : {{.Error}} 21 | {{end}} 22 | ` 23 | ) 24 | 25 | // create group: dcgmi group -c "name" --default 26 | // enable watches: dcgmi health -s a 27 | // check: dcgmi health -g 1 -c 28 | func main() { 29 | sigs := make(chan os.Signal, 1) 30 | signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) 31 | 32 | cleanup, err := dcgm.Init(dcgm.Embedded) 33 | if err != nil { 34 | log.Panicln(err) 35 | } 36 | defer cleanup() 37 | 38 | gpus, err := dcgm.GetSupportedDevices() 39 | if err != nil { 40 | log.Panicln(err) 41 | } 42 | 43 | ticker := time.NewTicker(time.Second * 1) 44 | defer ticker.Stop() 45 | 46 | t := template.Must(template.New("Health").Parse(healthStatus)) 47 | for { 48 | select { 49 | case <-ticker.C: 50 | for _, gpu := range gpus { 51 | h, err := dcgm.HealthCheckByGpuId(gpu) 52 | if err != nil { 53 | log.Panicln(err) 54 | } 55 | 56 | if err = t.Execute(os.Stdout, h); err != nil { 57 | log.Panicln("Template error:", err) 58 | } 59 | } 60 | case <-sigs: 61 | return 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/hostengineStatus/main.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 8 | ) 9 | 10 | // dcgmi introspect --enable 11 | // dcgmi introspect -s -H 12 | func main() { 13 | cleanup, err := dcgm.Init(dcgm.Embedded) 14 | if err != nil { 15 | log.Panicln(err) 16 | } 17 | defer cleanup() 18 | 19 | st, err := dcgm.Introspect() 20 | if err != nil { 21 | log.Panicln(err) 22 | } 23 | 24 | fmt.Printf("Memory %2s %v KB\nCPU %5s %.2f %s\n", ":", st.Memory, ":", st.CPU, "%") 25 | } 26 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/policy/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 8 | ) 9 | 10 | // dcgmi group -c "name" --default 11 | // dcgmi policy -g GROUPID --set 0,0 -x -n -p -e -P 250 -T 100 -M 10 12 | // dcgmi policy -g GROUPID --reg 13 | func main() { 14 | cleanup, err := dcgm.Init(dcgm.Embedded) 15 | if err != nil { 16 | log.Panicln(err) 17 | } 18 | defer cleanup() 19 | 20 | gpus, err := dcgm.GetSupportedDevices() 21 | if err != nil { 22 | log.Panicln(err) 23 | } 24 | 25 | // Choose policy conditions to register violation callback. 26 | // Note: Need to be root for some options 27 | // Available options are: 28 | // 1. dcgm.DbePolicy 29 | // 2. dcgm.PCIePolicy 30 | // 3. dcgm.MaxRtPgPolicy 31 | // 4. dcgm.ThermalPolicy 32 | // 5. dcgm.PowerPolicy 33 | // 6. dcgm.NvlinkPolicy 34 | // 7. 
dcgm.XidPolicy 35 | for _, gpu := range gpus { 36 | c, err := dcgm.Policy(gpu, dcgm.XidPolicy) 37 | if err != nil { 38 | log.Panicln(err) 39 | } 40 | 41 | pe := <-c 42 | fmt.Printf("GPU %8s %v\nError %6s %v\nTimestamp %2s %v\nData %7s %v\n", 43 | ":", gpu, ":", pe.Condition, ":", pe.Timestamp, ":", pe.Data) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/processInfo/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "log" 6 | "os" 7 | "text/template" 8 | "time" 9 | 10 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 11 | ) 12 | 13 | const ( 14 | processInfo = `---------------------------------------------------------------------- 15 | GPU ID : {{.GPU}} 16 | ----------Execution Stats--------------------------------------------- 17 | PID : {{.PID}} 18 | Name : {{or .Name "N/A"}} 19 | Start Time : {{.ProcessUtilization.StartTime.String}} 20 | End Time : {{.ProcessUtilization.EndTime.String}} 21 | ----------Performance Stats------------------------------------------- 22 | Energy Consumed (Joules) : {{or .ProcessUtilization.EnergyConsumed "N/A"}} 23 | Max GPU Memory Used (bytes) : {{or .Memory.GlobalUsed "N/A"}} 24 | Avg SM Clock (MHz) : {{or .Clocks.Cores "N/A"}} 25 | Avg Memory Clock (MHz) : {{or .Clocks.Memory "N/A"}} 26 | Avg SM Utilization (%) : {{or .GpuUtilization.GPU "N/A"}} 27 | Avg Memory Utilization (%) : {{or .GpuUtilization.Memory "N/A"}} 28 | Avg PCIe Rx Bandwidth (MB) : {{or .PCI.Throughput.Rx "N/A"}} 29 | Avg PCIe Tx Bandwidth (MB) : {{or .PCI.Throughput.Tx "N/A"}} 30 | ----------Event Stats------------------------------------------------- 31 | Single Bit ECC Errors : {{or .Memory.ECCErrors.SingleBit "N/A"}} 32 | Double Bit ECC Errors : {{or .Memory.ECCErrors.DoubleBit "N/A"}} 33 | Critical XID Errors : {{.XIDErrors.NumErrors}} 34 | ----------Slowdown
Stats---------------------------------------------- 35 | Due to - Power (%) : {{or .Violations.Power "N/A"}} 36 | - Thermal (%) : {{or .Violations.Thermal "N/A"}} 37 | - Reliability (%) : {{or .Violations.Reliability "N/A"}} 38 | - Board Limit (%) : {{or .Violations.BoardLimit "N/A"}} 39 | - Low Utilization (%) : {{or .Violations.LowUtilization "N/A"}} 40 | - Sync Boost (%) : {{or .Violations.SyncBoost "N/A"}} 41 | ----------Process Utilization----------------------------------------- 42 | Avg SM Utilization (%) : {{or .ProcessUtilization.SmUtil "N/A"}} 43 | Avg Memory Utilization (%) : {{or .ProcessUtilization.MemUtil "N/A"}} 44 | ---------------------------------------------------------------------- 45 | ` 46 | ) 47 | 48 | var process = flag.Uint("pid", 0, "Provide pid to get this process information.") 49 | 50 | // run as root, for enabling health watches 51 | // dcgmi stats -e 52 | // dcgmi stats --pid ENTERPID -v 53 | // sample: sudo ./processInfo -pid PID 54 | func main() { 55 | cleanup, err := dcgm.Init(dcgm.Embedded) 56 | if err != nil { 57 | log.Panicln(err) 58 | } 59 | defer cleanup() 60 | 61 | // Request DCGM to start recording stats for GPU process fields 62 | group, err := dcgm.WatchPidFields() 63 | if err != nil { 64 | log.Panicln(err) 65 | } 66 | 67 | // Before retrieving process stats, wait few seconds for watches to be enabled and collect data 68 | log.Println("Enabling DCGM watches to start collecting process stats. 
This may take a few seconds....") 69 | time.Sleep(3000 * time.Millisecond) 70 | 71 | flag.Parse() 72 | pidInfo, err := dcgm.GetProcessInfo(group, *process) 73 | if err != nil { 74 | log.Panicln(err) 75 | } 76 | 77 | t := template.Must(template.New("Process").Parse(processInfo)) 78 | for _, gpu := range pidInfo { 79 | 80 | if err = t.Execute(os.Stdout, gpu); err != nil { 81 | log.Panicln("Template error:", err) 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/restApi/README.md: -------------------------------------------------------------------------------- 1 | ## DCGM REST API 2 | 3 | A sample REST API is provided, demonstrating various endpoints for getting GPU metrics via DCGM. 4 | 5 | 6 | ``` 7 | # Start the http server 8 | # By default the http server is started at localhost:8070 9 | 10 | $ go build && ./restApi 11 | 12 | # Query GPU 0 info 13 | $ GPUID=0 14 | $ curl localhost:8070/dcgm/device/info/id/$GPUID 15 | 16 | # sample output 17 | 18 | Driver Version : 384.130 19 | GPU : 0 20 | DCGMSupported : Yes 21 | UUID : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51 22 | Brand : GeForce 23 | Model : GeForce GTX 980 24 | Serial Number : 0324414056639 25 | Vbios : 84.04.1F.00.02 26 | InforomImage Version : G001.0000.01.03 27 | Bus ID : 00000000:01:00.0 28 | BAR1 (MB) : 256 29 | FrameBuffer Memory (MB): 4036 30 | Bandwidth (MB/s) : 15760 31 | Cores (MHz) : 1392 32 | Memory (MHz) : 3505 33 | Power (W) : 180 34 | CPUAffinity : 0-11 35 | P2P Available : None 36 | --------------------------------------------------------------------- 37 | 38 | $ curl localhost:8070/dcgm/device/info/id/$GPUID/json 39 | 40 | # Query GPU info using its UUID 41 | 42 | $ UUID=$(curl -s localhost:8070/dcgm/device/info/id/$GPUID | grep -i uuid | cut -d ":" -f2 ) 43 | $ curl localhost:8070/dcgm/device/info/uuid/$UUID 44 | $ curl localhost:8070/dcgm/device/info/uuid/$UUID/json 45 | 46 | # sample output 47 | 48 | 
{"GPU":0,"DCGMSupported":"Yes","UUID":"GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51","Power":180,"PCI":{"BusID":"00000000:01:00.0","BAR1":256,"FBTotal":4036,"Bandwidth":15760},"Clocks":{"Cores":1392,"Memory":3505},"Identifiers":{"Brand":"GeForce","Model":"GeForce GTX 980","Serial":"0324414056639","Vbios":"84.04.1F.00.02","InforomImageVersion":"G001.0000.01.03","DriverVersion":"384.130"},"Topology":null,"CPUAffinity":"0-11"} 49 | 50 | # Query GPU status 51 | 52 | $ curl localhost:8070/dcgm/device/status/id/$GPUID 53 | $ curl localhost:8070/dcgm/device/status/id/$GPUID/json 54 | 55 | # sample output 56 | 57 | Power (W) : 20.985 58 | Temperature (°C) : 47 59 | Sm Utilization (%) : 2 60 | Memory Utilization (%) : 8 61 | Encoder Utilization (%) : 0 62 | Decoder Utilization (%) : 0 63 | Memory Clock (MHz : 324 64 | SM Clock (MHz) : 135 65 | 66 | $ curl localhost:8070/dcgm/device/status/uuid/$UUID 67 | 68 | # sample output 69 | 70 | {"Power":20.793,"Temperature":43,"Utilization":{"GPU":0,"Memory":8,"Encoder":0,"Decoder":0},"Memory":{"GlobalUsed":null,"ECCErrors":{"SingleBit":9223372036854775794,"DoubleBit":9223372036854775794}},"Clocks":{"Cores":135,"Memory":324},"PCI":{"BAR1Used":9,"Throughput":{"Rx":129,"Tx":47,"Replays":0},"FBUsed":423},"Performance":8,"FanSpeed":29} 71 | 72 | $ curl localhost:8070/dcgm/device/status/uuid/$UUID/json 73 | 74 | # Query GPU process info 75 | 76 | # Run CUDA nbody sample and get its PID 77 | $ PID=$(pgrep nbody) 78 | 79 | $ curl localhost:8070/dcgm/process/info/pid/$PID 80 | $ curl localhost:8070/dcgm/process/info/pid/$PID/json 81 | 82 | # sample output 83 | 84 | 
{"GPU":0,"PID":19132,"Name":"nbody","ProcessUtilization":{"StartTime":1529980640,"EndTime":0,"EnergyConsumed":1346,"SmUtil":0,"MemUtil":0},"PCI":{"BAR1Used":null,"Throughput":{"Rx":null,"Tx":null,"Replays":0},"FBUsed":null},"Memory":{"GlobalUsed":84279296,"ECCErrors":{"SingleBit":0,"DoubleBit":0}},"GpuUtilization":{"GPU":null,"Memory":null,"Encoder":null,"Decoder":null},"Clocks":{"Cores":null,"Memory":null},"Violations":{"Power":0,"Thermal":0,"Reliability":0,"BoardLimit":0,"LowUtilization":0,"SyncBoost":0},"XIDErrors":{"NumErrors":0,"TimeStamp":[]}} 85 | 86 | # Query GPU health 87 | 88 | $ curl localhost:8070/dcgm/health/id/$GPUID 89 | $ curl localhost:8070/dcgm/health/id/$GPUID/json 90 | $ curl localhost:8070/dcgm/health/uuid/$UUID 91 | $ curl localhost:8070/dcgm/health/uuid/$UUID/json 92 | 93 | # sample output 94 | 95 | {"GPU":0,"Status":"Healthy","Watches":[]} 96 | 97 | # Query DCGM hostengine memory and CPU usage 98 | 99 | $ curl localhost:8070/dcgm/status 100 | $ curl localhost:8070/dcgm/status/json 101 | 102 | # sample output 103 | 104 | {"Memory":18380,"CPU":0.16482222745467387} 105 | 106 | ``` -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/restApi/handlers/byIds.go: -------------------------------------------------------------------------------- 1 | package handlers 2 | 3 | import ( 4 | "net/http" 5 | ) 6 | 7 | func DeviceInfo(resp http.ResponseWriter, req *http.Request) { 8 | device := getDeviceInfo(resp, req) 9 | if device == nil { 10 | return 11 | } 12 | if isJson(req) { 13 | encode(resp, req, device) 14 | return 15 | } 16 | print(resp, req, device, deviceInfo) 17 | } 18 | 19 | func DeviceStatus(resp http.ResponseWriter, req *http.Request) { 20 | st := getDeviceStatus(resp, req) 21 | if st == nil { 22 | return 23 | } 24 | if isJson(req) { 25 | encode(resp, req, st) 26 | return 27 | } 28 | print(resp, req, st, deviceStatus) 29 | } 30 | 31 | func ProcessInfo(resp http.ResponseWriter, req 
*http.Request) { 32 | pInfo := getProcessInfo(resp, req) 33 | if len(pInfo) == 0 { 34 | return 35 | } 36 | if isJson(req) { 37 | encode(resp, req, pInfo) 38 | return 39 | } 40 | processPrint(resp, req, pInfo) 41 | } 42 | 43 | func Health(resp http.ResponseWriter, req *http.Request) { 44 | h := getHealth(resp, req) 45 | if h == nil { 46 | return 47 | } 48 | if isJson(req) { 49 | encode(resp, req, h) 50 | return 51 | } 52 | print(resp, req, h, healthStatus) 53 | } 54 | 55 | func DcgmStatus(resp http.ResponseWriter, req *http.Request) { 56 | st := getDcgmStatus(resp, req) 57 | if st == nil { 58 | return 59 | } 60 | if isJson(req) { 61 | encode(resp, req, st) 62 | return 63 | } 64 | print(resp, req, st, hostengine) 65 | } 66 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/restApi/handlers/byUuids.go: -------------------------------------------------------------------------------- 1 | package handlers 2 | 3 | import ( 4 | "log" 5 | "net/http" 6 | 7 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 8 | ) 9 | 10 | // map of uuids and device id 11 | var uuids map[string]uint 12 | 13 | func DevicesUuids() { 14 | uuids = make(map[string]uint) 15 | count, err := dcgm.GetAllDeviceCount() 16 | if err != nil { 17 | log.Printf("(DCGM) Error getting devices: %s", err) 18 | return 19 | } 20 | 21 | for i := uint(0); i < count; i++ { 22 | deviceInfo, err := dcgm.GetDeviceInfo(i) 23 | if err != nil { 24 | log.Printf("(DCGM) Error getting device information: %s", err) 25 | return 26 | } 27 | uuids[deviceInfo.UUID] = i 28 | } 29 | } 30 | 31 | func DeviceInfoByUuid(resp http.ResponseWriter, req *http.Request) { 32 | device := getDeviceInfo(resp, req) 33 | if device == nil { 34 | return 35 | } 36 | if isJson(req) { 37 | encode(resp, req, device) 38 | return 39 | } 40 | print(resp, req, device, deviceInfo) 41 | } 42 | 43 | func DeviceStatusByUuid(resp http.ResponseWriter, req *http.Request) { 44 | st := 
getDeviceStatus(resp, req) 45 | if st == nil { 46 | return 47 | } 48 | if isJson(req) { 49 | encode(resp, req, st) 50 | return 51 | } 52 | print(resp, req, st, deviceStatus) 53 | } 54 | 55 | func HealthByUuid(resp http.ResponseWriter, req *http.Request) { 56 | h := getHealth(resp, req) 57 | if h == nil { 58 | return 59 | } 60 | if isJson(req) { 61 | encode(resp, req, h) 62 | return 63 | } 64 | print(resp, req, h, healthStatus) 65 | } 66 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/restApi/handlers/dcgm.go: -------------------------------------------------------------------------------- 1 | package handlers 2 | 3 | import ( 4 | "log" 5 | "math" 6 | "net/http" 7 | "time" 8 | 9 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 10 | "github.com/gorilla/mux" 11 | ) 12 | 13 | func getDcgmStatus(resp http.ResponseWriter, req *http.Request) (status *dcgm.DcgmStatus) { 14 | st, err := dcgm.Introspect() 15 | if err != nil { 16 | http.Error(resp, err.Error(), http.StatusInternalServerError) 17 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 18 | return 19 | } 20 | return &st 21 | 22 | } 23 | 24 | func getDeviceInfo(resp http.ResponseWriter, req *http.Request) (device *dcgm.Device) { 25 | var id uint 26 | params := mux.Vars(req) 27 | for k, v := range params { 28 | switch k { 29 | case "id": 30 | id = getId(resp, req, v) 31 | case "uuid": 32 | id = getIdByUuid(resp, req, v) 33 | } 34 | } 35 | 36 | if id == math.MaxUint32 { 37 | return 38 | } 39 | 40 | if !isValidId(id, resp, req) { 41 | return 42 | } 43 | d, err := dcgm.GetDeviceInfo(id) 44 | if err != nil { 45 | http.Error(resp, err.Error(), http.StatusInternalServerError) 46 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 47 | return 48 | } 49 | return &d 50 | } 51 | 52 | func getDeviceStatus(resp http.ResponseWriter, req *http.Request) (status *dcgm.DeviceStatus) { 53 | var id uint 54 | params := mux.Vars(req) 55 | for 
k, v := range params { 56 | switch k { 57 | case "id": 58 | id = getId(resp, req, v) 59 | case "uuid": 60 | id = getIdByUuid(resp, req, v) 61 | } 62 | } 63 | 64 | if id == math.MaxUint32 { 65 | return 66 | } 67 | 68 | if !isValidId(id, resp, req) { 69 | return 70 | } 71 | 72 | if !isDcgmSupported(id, resp, req) { 73 | return 74 | } 75 | 76 | st, err := dcgm.GetDeviceStatus(id) 77 | if err != nil { 78 | http.Error(resp, err.Error(), http.StatusInternalServerError) 79 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 80 | return 81 | } 82 | return &st 83 | } 84 | 85 | func getHealth(resp http.ResponseWriter, req *http.Request) (health *dcgm.DeviceHealth) { 86 | var id uint 87 | params := mux.Vars(req) 88 | for k, v := range params { 89 | switch k { 90 | case "id": 91 | id = getId(resp, req, v) 92 | case "uuid": 93 | id = getIdByUuid(resp, req, v) 94 | } 95 | } 96 | 97 | if id == math.MaxUint32 { 98 | return 99 | } 100 | 101 | if !isValidId(id, resp, req) { 102 | return 103 | } 104 | 105 | h, err := dcgm.HealthCheckByGpuId(id) 106 | if err != nil { 107 | http.Error(resp, err.Error(), http.StatusInternalServerError) 108 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 109 | return 110 | } 111 | return &h 112 | } 113 | 114 | func getProcessInfo(resp http.ResponseWriter, req *http.Request) (pInfo []dcgm.ProcessInfo) { 115 | params := mux.Vars(req) 116 | pid := getId(resp, req, params["pid"]) 117 | if pid == math.MaxUint32 { 118 | return 119 | } 120 | group, err := dcgm.WatchPidFields() 121 | if err != nil { 122 | http.Error(resp, err.Error(), http.StatusInternalServerError) 123 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 124 | return 125 | } 126 | 127 | // wait for watches to be enabled 128 | log.Printf("Enabling DCGM watches to start collecting process stats. 
This may take a few seconds....") 129 | time.Sleep(3000 * time.Millisecond) 130 | pInfo, err = dcgm.GetProcessInfo(group, pid) 131 | if err != nil { 132 | http.Error(resp, err.Error(), http.StatusInternalServerError) 133 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 134 | } 135 | return 136 | } 137 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/restApi/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "os/signal" 7 | "syscall" 8 | 9 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 10 | ) 11 | 12 | // res: curl localhost:8070/dcgm/device/info/id/0 13 | 14 | func main() { 15 | stopSig := make(chan os.Signal, 1) 16 | signal.Notify(stopSig, syscall.SIGINT, syscall.SIGTERM) 17 | 18 | cleanup, err := dcgm.Init(dcgm.Embedded) 19 | if err != nil { 20 | log.Panicln(err) 21 | } 22 | defer cleanup() 23 | 24 | addr := ":8070" 25 | server := newHttpServer(addr) 26 | 27 | go func() { 28 | log.Printf("Running http server on localhost%s", addr) 29 | server.serve() 30 | }() 31 | defer server.stop() 32 | 33 | <-stopSig 34 | return 35 | } 36 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/restApi/server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "log" 6 | "net/http" 7 | "time" 8 | 9 | h "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/samples/dcgm/restApi/handlers" 10 | "github.com/gorilla/mux" 11 | ) 12 | 13 | const timeout = 5 * time.Second 14 | 15 | type httpServer struct { 16 | router *mux.Router 17 | server *http.Server 18 | } 19 | 20 | func newHttpServer(addr string) *httpServer { 21 | r := mux.NewRouter() 22 | 23 | s := &httpServer{ 24 | router: r, 25 | server: &http.Server{ 26 | Addr: addr, 27 | Handler: r, 28 | ReadTimeout: timeout, 29 
| WriteTimeout: timeout, 30 | }, 31 | } 32 | 33 | // make a global map of device uuids and ids 34 | h.DevicesUuids() 35 | 36 | s.handler() 37 | return s 38 | } 39 | 40 | func (s *httpServer) handler() { 41 | deviceInfo := "/dcgm/device/info" 42 | subrouter := s.router.PathPrefix(deviceInfo).Subrouter() 43 | subrouter.HandleFunc("/id/{id}", h.DeviceInfo).Methods("GET") 44 | subrouter.HandleFunc("/id/{id}/json", h.DeviceInfo).Methods("GET") 45 | subrouter.HandleFunc("/uuid/{uuid}", h.DeviceInfoByUuid).Methods("GET") 46 | subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceInfoByUuid).Methods("GET") 47 | 48 | deviceStatus := "/dcgm/device/status" 49 | subrouter = s.router.PathPrefix(deviceStatus).Subrouter() 50 | subrouter.HandleFunc("/id/{id}", h.DeviceStatus).Methods("GET") 51 | subrouter.HandleFunc("/id/{id}/json", h.DeviceStatus).Methods("GET") 52 | subrouter.HandleFunc("/uuid/{uuid}", h.DeviceStatusByUuid).Methods("GET") 53 | subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceStatusByUuid).Methods("GET") 54 | 55 | processInfo := "/dcgm/process/info/pid/{pid}" 56 | subrouter = s.router.PathPrefix(processInfo).Subrouter() 57 | subrouter.HandleFunc("", h.ProcessInfo).Methods("GET") 58 | subrouter.HandleFunc("/json", h.ProcessInfo).Methods("GET") 59 | 60 | health := "/dcgm/health" 61 | subrouter = s.router.PathPrefix(health).Subrouter() 62 | subrouter.HandleFunc("/id/{id}", h.Health).Methods("GET") 63 | subrouter.HandleFunc("/id/{id}/json", h.Health).Methods("GET") 64 | subrouter.HandleFunc("/uuid/{uuid}", h.HealthByUuid).Methods("GET") 65 | subrouter.HandleFunc("/uuid/{uuid}/json", h.HealthByUuid).Methods("GET") 66 | 67 | dcgmStatus := "/dcgm/status" 68 | subrouter = s.router.PathPrefix(dcgmStatus).Subrouter() 69 | subrouter.HandleFunc("", h.DcgmStatus).Methods("GET") 70 | subrouter.HandleFunc("/json", h.DcgmStatus).Methods("GET") 71 | } 72 | 73 | func (s *httpServer) serve() { 74 | if err := s.server.ListenAndServe(); err != http.ErrServerClosed { 75 | 
log.Printf("Error: %v", err) 76 | } 77 | } 78 | 79 | func (s *httpServer) stop() { 80 | ctx, cancel := context.WithTimeout(context.Background(), timeout) 81 | defer cancel() 82 | 83 | if err := s.server.Shutdown(ctx); err != nil { 84 | log.Printf("Error: %v", err) 85 | } else { 86 | log.Println("http server stopped") 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /bindings/go/samples/dcgm/topology/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 8 | ) 9 | 10 | const ( 11 | legend = ` 12 | Legend: 13 | X = Self 14 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 15 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 16 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 17 | PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) 18 | PIX = Connection traversing a single PCIe switch 19 | PSB = Connection traversing a single on-board PCIe switch 20 | NV# = Connection traversing a bonded set of # NVLinks` 21 | ) 22 | 23 | // based on nvidia-smi topo -m 24 | // dcgmi topo 25 | func main() { 26 | // choose dcgm hostengine running mode 27 | // 1. dcgm.Embedded 28 | // 2. dcgm.Standalone 29 | // 3. 
dcgm.StartHostengine 30 | cleanup, err := dcgm.Init(dcgm.Embedded) 31 | if err != nil { 32 | log.Panicln(err) 33 | } 34 | defer cleanup() 35 | 36 | gpus, err := dcgm.GetSupportedDevices() 37 | if err != nil { 38 | log.Panicln(err) 39 | } 40 | 41 | for _, gpu := range gpus { 42 | fmt.Printf("%9s%d", "GPU", gpu) 43 | } 44 | fmt.Printf("%5s\n", "CPUAffinity") 45 | 46 | numGpus := len(gpus) 47 | gpuTopo := make([]string, numGpus) 48 | for i := 0; i < numGpus; i++ { 49 | topo, err := dcgm.GetDeviceTopology(gpus[i]) 50 | if err != nil { 51 | log.Panicln(err) 52 | } 53 | 54 | fmt.Printf("GPU%d", gpus[i]) 55 | for j := 0; j < len(topo); j++ { 56 | // skip current GPU 57 | gpuTopo[topo[j].GPU] = topo[j].Link.PCIPaths() 58 | } 59 | gpuTopo[i] = "X" 60 | for j := 0; j < numGpus; j++ { 61 | fmt.Printf("%5s", gpuTopo[j]) 62 | } 63 | deviceInfo, err := dcgm.GetDeviceInfo(gpus[i]) 64 | if err != nil { 65 | log.Panicln(err) 66 | } 67 | fmt.Printf("%5s\n", deviceInfo.CPUAffinity) 68 | } 69 | fmt.Println(legend) 70 | } 71 | -------------------------------------------------------------------------------- /bindings/go/samples/nvml/README.md: -------------------------------------------------------------------------------- 1 | ## NVML Samples 2 | 3 | Modelled on the [NVIDIA System Management Interface (nvidia-smi)](https://developer.nvidia.com/nvidia-system-management-interface), a command line utility using NVML, three samples have been provided to show how to use NVML go bindings. 4 | 5 | #### deviceInfo 6 | 7 | Provides basic information about each GPU on the system. 
8 | 9 | ``` 10 | $ go build && ./deviceInfo 11 | 12 | # sample output 13 | 14 | Driver Version : 384.111 15 | GPU : 0 16 | UUID : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51 17 | Model : GeForce GTX 980 18 | Path : /dev/nvidia0 19 | Power : 180 W 20 | CPU Affinity : NUMA node0 21 | Bus ID : 00000000:01:00.0 22 | BAR1 : 256 MiB 23 | Bandwidth : 15760 MB/s 24 | Cores : 1392 MHz 25 | Memory : 3505 MHz 26 | P2P Available : None 27 | --------------------------------------------------------------------- 28 | GPU : 1 29 | UUID : GPU-8d3b966d-2248-c3f4-1784-49851a1d02b3 30 | Model : GeForce GTX TITAN 31 | Path : /dev/nvidia1 32 | Power : 250 W 33 | CPU Affinity : NUMA node0 34 | Bus ID : 00000000:06:00.0 35 | BAR1 : 128 MiB 36 | Bandwidth : 8000 MB/s 37 | Cores : 1202 MHz 38 | Memory : 3004 MHz 39 | P2P Available : None 40 | --------------------------------------------------------------------- 41 | ``` 42 | 43 | #### dmon 44 | 45 | Monitors each device status including its power, memory and GPU utilization. 46 | 47 | ``` 48 | $ go build && ./dmon 49 | 50 | # sample output 51 | 52 | # gpu pwr temp sm mem enc dec mclk pclk 53 | # Idx W C % % % % MHz MHz 54 | 0 20 43 0 8 0 0 324 135 55 | 1 10 32 0 0 0 0 324 324 56 | 57 | ``` 58 | 59 | #### processInfo 60 | 61 | Informs about GPU processes running on all devices. 
62 | 63 | ``` 64 | $ go build && ./processInfo 65 | 66 | # sample output 67 | 68 | # gpu pid type mem command 69 | # Idx # C/G % name 70 | 0 25712 C+G 0 nbody 71 | 1 - - - - 72 | ``` 73 | -------------------------------------------------------------------------------- /bindings/go/samples/nvml/deviceInfo/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "text/template" 8 | 9 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" 10 | ) 11 | 12 | const ( 13 | DEVICEINFO = `UUID : {{.UUID}} 14 | Model : {{or .Model "N/A"}} 15 | Path : {{.Path}} 16 | Power : {{if .Power}}{{.Power}} W{{else}}N/A{{end}} 17 | Memory : {{if .Memory}}{{.Memory}} MiB{{else}}N/A{{end}} 18 | CudaComputeCap : {{if .CudaComputeCapability.Major}}{{.CudaComputeCapability.Major}}.{{.CudaComputeCapability.Minor}}{{else}}N/A{{end}} 19 | CPU Affinity : {{if .CPUAffinity}}NUMA node{{.CPUAffinity}}{{else}}N/A{{end}} 20 | Bus ID : {{.PCI.BusID}} 21 | BAR1 : {{if .PCI.BAR1}}{{.PCI.BAR1}} MiB{{else}}N/A{{end}} 22 | Bandwidth : {{if .PCI.Bandwidth}}{{.PCI.Bandwidth}} MB/s{{else}}N/A{{end}} 23 | Cores : {{if .Clocks.Cores}}{{.Clocks.Cores}} MHz{{else}}N/A{{end}} 24 | Memory : {{if .Clocks.Memory}}{{.Clocks.Memory}} MHz{{else}}N/A{{end}} 25 | P2P Available : {{if not .Topology}}None{{else}}{{range .Topology}} 26 | {{.BusID}} - {{(.Link.String)}}{{end}}{{end}} 27 | --------------------------------------------------------------------- 28 | ` 29 | ) 30 | 31 | func main() { 32 | nvml.Init() 33 | defer nvml.Shutdown() 34 | 35 | count, err := nvml.GetDeviceCount() 36 | if err != nil { 37 | log.Panicln("Error getting device count:", err) 38 | } 39 | 40 | driverVersion, err := nvml.GetDriverVersion() 41 | if err != nil { 42 | log.Panicln("Error getting driver version:", err) 43 | } 44 | 45 | t := template.Must(template.New("Device").Parse(DEVICEINFO)) 46 | 47 | fmt.Printf("Driver Version : %5v\n", 
driverVersion) 48 | for i := uint(0); i < count; i++ { 49 | device, err := nvml.NewDevice(i) 50 | if err != nil { 51 | log.Panicf("Error getting device %d: %v\n", i, err) 52 | } 53 | 54 | fmt.Printf("GPU %12s %d\n", ":", i) 55 | err = t.Execute(os.Stdout, device) 56 | if err != nil { 57 | log.Panicln("Template error:", err) 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /bindings/go/samples/nvml/dmon/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "os/signal" 8 | "syscall" 9 | "time" 10 | 11 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" 12 | ) 13 | 14 | const ( 15 | DMONHEADER = `# gpu pwr temp sm mem enc dec mclk pclk 16 | # Idx W C % % % % MHz MHz` 17 | ) 18 | 19 | func main() { 20 | nvml.Init() 21 | defer nvml.Shutdown() 22 | 23 | count, err := nvml.GetDeviceCount() 24 | if err != nil { 25 | log.Panicln("Error getting device count:", err) 26 | } 27 | 28 | var devices []*nvml.Device 29 | for i := uint(0); i < count; i++ { 30 | device, err := nvml.NewDevice(i) 31 | if err != nil { 32 | log.Panicf("Error getting device %d: %v\n", i, err) 33 | } 34 | devices = append(devices, device) 35 | } 36 | 37 | sigs := make(chan os.Signal, 1) 38 | signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) 39 | 40 | ticker := time.NewTicker(time.Second * 1) 41 | defer ticker.Stop() 42 | 43 | fmt.Println(DMONHEADER) 44 | for { 45 | select { 46 | case <-ticker.C: 47 | for i, device := range devices { 48 | st, err := device.Status() 49 | if err != nil { 50 | log.Panicf("Error getting device %d status: %v\n", i, err) 51 | } 52 | fmt.Printf("%5d %5d %5d %5d %5d %5d %5d %5d %5d\n", 53 | i, *st.Power, *st.Temperature, *st.Utilization.GPU, *st.Utilization.Memory, 54 | *st.Utilization.Encoder, *st.Utilization.Decoder, *st.Clocks.Memory, *st.Clocks.Cores) 55 | } 56 | case <-sigs: 57 | return 58 | } 59 | } 60 | } 
61 | -------------------------------------------------------------------------------- /bindings/go/samples/nvml/processInfo/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "os/signal" 8 | "syscall" 9 | "time" 10 | 11 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" 12 | ) 13 | 14 | const ( 15 | PINFOHEADER = `# gpu pid type mem Command 16 | # Idx # C/G MiB name` 17 | ) 18 | 19 | func main() { 20 | nvml.Init() 21 | defer nvml.Shutdown() 22 | 23 | count, err := nvml.GetDeviceCount() 24 | if err != nil { 25 | log.Panicln("Error getting device count:", err) 26 | } 27 | 28 | var devices []*nvml.Device 29 | for i := uint(0); i < count; i++ { 30 | device, err := nvml.NewDevice(i) 31 | if err != nil { 32 | log.Panicf("Error getting device %d: %v\n", i, err) 33 | } 34 | devices = append(devices, device) 35 | } 36 | 37 | sigs := make(chan os.Signal, 1) 38 | signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) 39 | 40 | ticker := time.NewTicker(time.Second * 1) 41 | defer ticker.Stop() 42 | 43 | fmt.Println(PINFOHEADER) 44 | for { 45 | select { 46 | case <-ticker.C: 47 | for i, device := range devices { 48 | pInfo, err := device.GetAllRunningProcesses() 49 | if err != nil { 50 | log.Panicf("Error getting device %d processes: %v\n", i, err) 51 | } 52 | if len(pInfo) == 0 { 53 | fmt.Printf("%5v %5s %5s %5s %-5s\n", i, "-", "-", "-", "-") 54 | } 55 | for j := range pInfo { 56 | fmt.Printf("%5v %5v %5v %5v %-5v\n", 57 | i, pInfo[j].PID, pInfo[j].Type, pInfo[j].MemoryUsed, pInfo[j].Name) 58 | } 59 | } 60 | case <-sigs: 61 | return 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /dcgm-exporter.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: apps/v1 16 | kind: DaemonSet 17 | metadata: 18 | name: "dcgm-exporter" 19 | labels: 20 | app.kubernetes.io/name: "dcgm-exporter" 21 | app.kubernetes.io/version: "2.4.0" 22 | spec: 23 | updateStrategy: 24 | type: RollingUpdate 25 | selector: 26 | matchLabels: 27 | app.kubernetes.io/name: "dcgm-exporter" 28 | app.kubernetes.io/version: "2.4.0" 29 | template: 30 | metadata: 31 | labels: 32 | app.kubernetes.io/name: "dcgm-exporter" 33 | app.kubernetes.io/version: "2.4.0" 34 | name: "dcgm-exporter" 35 | spec: 36 | containers: 37 | - image: "nvcr.io/nvidia/k8s/dcgm-exporter:2.2.9-2.4.0-ubuntu18.04" 38 | env: 39 | - name: "DCGM_EXPORTER_LISTEN" 40 | value: ":9400" 41 | - name: "DCGM_EXPORTER_KUBERNETES" 42 | value: "true" 43 | name: "dcgm-exporter" 44 | ports: 45 | - name: "metrics" 46 | containerPort: 9400 47 | securityContext: 48 | runAsNonRoot: false 49 | runAsUser: 0 50 | volumeMounts: 51 | - name: "pod-gpu-resources" 52 | readOnly: true 53 | mountPath: "/var/lib/kubelet/pod-resources" 54 | volumes: 55 | - name: "pod-gpu-resources" 56 | hostPath: 57 | path: "/var/lib/kubelet/pod-resources" 58 | 59 | --- 60 | 61 | kind: Service 62 | apiVersion: v1 63 | metadata: 64 | name: "dcgm-exporter" 65 | labels: 66 | app.kubernetes.io/name: "dcgm-exporter" 67 | app.kubernetes.io/version: "2.4.0" 68 | spec: 69 | selector: 70 | app.kubernetes.io/name: "dcgm-exporter" 71 | 
app.kubernetes.io/version: "2.4.0" 72 | ports: 73 | - name: "metrics" 74 | port: 9400 75 | -------------------------------------------------------------------------------- /deployment/dcgm-exporter/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /deployment/dcgm-exporter/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: dcgm-exporter 3 | description: A Helm chart for DCGM exporter 4 | version: "2.4.0" 5 | kubeVersion: ">= 1.13.0-0" 6 | appVersion: "2.4.0" 7 | sources: 8 | - https://gitlab.com/nvidia/container-toolkit/gpu-monitoring-tools 9 | home: https://github.com/nvidia/gpu-monitoring-tools/ 10 | icon: https://assets.nvidiagrid.net/ngc/logos/DCGM.png 11 | keywords: 12 | - gpu 13 | - cuda 14 | - compute 15 | - monitoring 16 | - telemetry 17 | - tesla 18 | -------------------------------------------------------------------------------- /deployment/dcgm-exporter/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application URL by running these commands: 2 | {{- if contains "NodePort" .Values.service.type }} 3 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "dcgm-exporter.fullname" . 
}}) 4 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 5 | echo http://$NODE_IP:$NODE_PORT/metrics 6 | {{- else if contains "LoadBalancer" .Values.service.type }} 7 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 8 | You can watch the status of the service by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "dcgm-exporter.fullname" . }}' 9 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "dcgm-exporter.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") 10 | echo http://$SERVICE_IP:{{ .Values.service.port }} 11 | {{- else if contains "ClusterIP" .Values.service.type }} 12 | export POD_NAME=$(kubectl get pods -n {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "dcgm-exporter.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 13 | kubectl -n {{ .Release.Namespace }} port-forward $POD_NAME 8080:{{ .Values.service.port }} & 14 | echo "Visit http://127.0.0.1:8080/metrics to use your application" 15 | {{- end }} 16 | -------------------------------------------------------------------------------- /deployment/dcgm-exporter/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "dcgm-exporter.name" -}} 6 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create a default fully qualified app name. 11 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 12 | If release name contains chart name it will be used as a full name. 
13 | */}} 14 | {{- define "dcgm-exporter.fullname" -}} 15 | {{- if .Values.fullnameOverride -}} 16 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} 17 | {{- else -}} 18 | {{- $name := default .Chart.Name .Values.nameOverride -}} 19 | {{- if contains $name .Release.Name -}} 20 | {{- .Release.Name | trunc 63 | trimSuffix "-" -}} 21 | {{- else -}} 22 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} 23 | {{- end -}} 24 | {{- end -}} 25 | {{- end -}} 26 | 27 | {{/* 28 | Create chart name and version as used by the chart label. 29 | */}} 30 | {{- define "dcgm-exporter.chart" -}} 31 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 32 | {{- end -}} 33 | 34 | {{/* 35 | Common labels 36 | */}} 37 | {{- define "dcgm-exporter.labels" -}} 38 | helm.sh/chart: {{ include "dcgm-exporter.chart" . }} 39 | {{ include "dcgm-exporter.selectorLabels" . }} 40 | {{- if .Chart.AppVersion }} 41 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 42 | {{- end }} 43 | app.kubernetes.io/managed-by: {{ .Release.Service }} 44 | {{- end -}} 45 | 46 | {{/* 47 | Selector labels 48 | */}} 49 | {{- define "dcgm-exporter.selectorLabels" -}} 50 | app.kubernetes.io/name: {{ include "dcgm-exporter.name" . }} 51 | app.kubernetes.io/instance: {{ .Release.Name }} 52 | {{- end -}} 53 | 54 | {{/* 55 | Create the name of the service account to use 56 | */}} 57 | {{- define "dcgm-exporter.serviceAccountName" -}} 58 | {{- if .Values.serviceAccount.create -}} 59 | {{ default (include "dcgm-exporter.fullname" .) .Values.serviceAccount.name }} 60 | {{- else -}} 61 | {{ default "default" .Values.serviceAccount.name }} 62 | {{- end -}} 63 | {{- end -}} 64 | -------------------------------------------------------------------------------- /deployment/dcgm-exporter/templates/daemonset.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: apps/v1 16 | kind: DaemonSet 17 | metadata: 18 | name: {{ include "dcgm-exporter.fullname" . }} 19 | namespace: {{ .Release.Namespace }} 20 | labels: 21 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 22 | app.kubernetes.io/component: "dcgm-exporter" 23 | spec: 24 | updateStrategy: 25 | type: RollingUpdate 26 | selector: 27 | matchLabels: 28 | {{- include "dcgm-exporter.selectorLabels" . | nindent 6 }} 29 | app.kubernetes.io/component: "dcgm-exporter" 30 | template: 31 | metadata: 32 | labels: 33 | {{- include "dcgm-exporter.selectorLabels" . | nindent 8 }} 34 | app.kubernetes.io/component: "dcgm-exporter" 35 | spec: 36 | {{- with .Values.imagePullSecrets }} 37 | imagePullSecrets: 38 | {{- toYaml . | nindent 8 }} 39 | {{- end }} 40 | serviceAccountName: {{ include "dcgm-exporter.serviceAccountName" . }} 41 | {{- if .Values.podSecurityContext }} 42 | securityContext: 43 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 44 | {{- end }} 45 | {{- if .Values.affinity }} 46 | affinity: 47 | {{- toYaml .Values.affinity | nindent 8 }} 48 | {{- end }} 49 | {{- if .Values.nodeSelector }} 50 | nodeSelector: 51 | {{- toYaml .Values.nodeSelector | nindent 8 }} 52 | {{- end }} 53 | {{- with .Values.tolerations }} 54 | tolerations: 55 | {{- toYaml . 
| nindent 6 }} 56 | {{- end }} 57 | volumes: 58 | - name: "pod-gpu-resources" 59 | hostPath: 60 | path: "/var/lib/kubelet/pod-resources" 61 | {{- range .Values.extraHostVolumes }} 62 | - name: {{ .name | quote }} 63 | hostPath: 64 | path: {{ .hostPath | quote }} 65 | {{- end }} 66 | containers: 67 | - name: exporter 68 | securityContext: 69 | {{- toYaml .Values.securityContext | nindent 10 }} 70 | {{- if .Values.image.tag }} 71 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" 72 | {{- else }} 73 | image: "{{ .Values.image.repository }}:{{ .Chart.AppVersion }}" 74 | {{- end }} 75 | imagePullPolicy: "{{ .Values.image.pullPolicy }}" 76 | args: 77 | {{- range $.Values.arguments }} 78 | - {{ . }} 79 | {{- end }} 80 | env: 81 | - name: "DCGM_EXPORTER_KUBERNETES" 82 | value: "true" 83 | - name: "DCGM_EXPORTER_LISTEN" 84 | value: "{{ .Values.service.address }}" 85 | {{- if .Values.extraEnv }} 86 | {{- toYaml .Values.extraEnv | nindent 8 }} 87 | {{- end }} 88 | ports: 89 | - name: "metrics" 90 | containerPort: {{ .Values.service.port }} 91 | volumeMounts: 92 | - name: "pod-gpu-resources" 93 | readOnly: true 94 | mountPath: "/var/lib/kubelet/pod-resources" 95 | {{- if .Values.extraVolumeMounts }} 96 | {{- toYaml .Values.extraVolumeMounts | nindent 8 }} 97 | {{- end }} 98 | livenessProbe: 99 | httpGet: 100 | path: /health 101 | port: {{ .Values.service.port }} 102 | initialDelaySeconds: 5 103 | periodSeconds: 5 104 | readinessProbe: 105 | httpGet: 106 | path: /health 107 | port: {{ .Values.service.port }} 108 | initialDelaySeconds: 5 109 | {{- if .Values.resources }} 110 | resources: 111 | {{- toYaml .Values.resources | nindent 10 }} 112 | {{- end }} 113 | -------------------------------------------------------------------------------- /deployment/dcgm-exporter/templates/service-monitor.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceMonitor.enabled }} 2 | # Copyright (c) 2020, NVIDIA 
CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | apiVersion: monitoring.coreos.com/v1 17 | kind: ServiceMonitor 18 | metadata: 19 | name: {{ include "dcgm-exporter.fullname" . }} 20 | namespace: {{ .Release.Namespace }} 21 | labels: 22 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 23 | app.kubernetes.io/component: "dcgm-exporter" 24 | {{- if .Values.serviceMonitor.additionalLabels }} 25 | {{- toYaml .Values.serviceMonitor.additionalLabels | nindent 4 }} 26 | {{- end }} 27 | spec: 28 | selector: 29 | matchLabels: 30 | {{- include "dcgm-exporter.selectorLabels" . | nindent 6 }} 31 | app.kubernetes.io/component: "dcgm-exporter" 32 | namespaceSelector: 33 | matchNames: 34 | - "{{ .Release.Namespace }}" 35 | endpoints: 36 | - port: "metrics" 37 | path: "/metrics" 38 | interval: "{{ .Values.serviceMonitor.interval }}" 39 | {{- end -}} 40 | -------------------------------------------------------------------------------- /deployment/dcgm-exporter/templates/service.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: v1 16 | kind: Service 17 | metadata: 18 | name: {{ include "dcgm-exporter.fullname" . }} 19 | namespace: {{ .Release.Namespace }} 20 | labels: 21 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 22 | app.kubernetes.io/component: "dcgm-exporter" 23 | {{- with .Values.service.annotations }} 24 | annotations: 25 | {{- toYaml . | nindent 4 }} 26 | {{- end }} 27 | spec: 28 | type: {{ .Values.service.type }} 29 | ports: 30 | - name: "metrics" 31 | port: {{ .Values.service.port }} 32 | targetPort: {{ .Values.service.port }} 33 | protocol: TCP 34 | selector: 35 | {{- include "dcgm-exporter.selectorLabels" . | nindent 4 }} 36 | -------------------------------------------------------------------------------- /deployment/dcgm-exporter/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "dcgm-exporter.serviceAccountName" . }} 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 9 | app.kubernetes.io/component: "dcgm-exporter" 10 | {{- with .Values.serviceAccount.annotations }} 11 | annotations: 12 | {{- toYaml . 
| nindent 4 }} 13 | {{- end }} 14 | {{- end -}} 15 | -------------------------------------------------------------------------------- /deployment/dcgm-exporter/values.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | image: 16 | repository: nvcr.io/nvidia/k8s/dcgm-exporter 17 | pullPolicy: IfNotPresent 18 | # Image tag defaults to AppVersion, but you can use the tag key 19 | # for the image tag, e.g: 20 | tag: 2.2.9-2.4.0-ubuntu18.04 21 | 22 | # Comment the following line to stop profiling metrics from DCGM 23 | arguments: ["-f", "/etc/dcgm-exporter/dcp-metrics-included.csv"] 24 | # NOTE: in general, add any command line arguments to arguments above 25 | # and they will be passed through. 26 | # Use "-r", "<HOST>:<PORT>" to connect to an already running hostengine 27 | # Example arguments: ["-r", "host123:5555"] 28 | # Use "-n" to remove the hostname tag from the output. 29 | # Example arguments: ["-n"] 30 | # Use "-d" to specify the devices to monitor. -d must be followed by a string 31 | # in the following format: [f] or [g[:numeric_range][+]][i[:numeric_range]] 32 | # Where a numeric range is something like 0-4 or 0,2,4, etc. 33 | # Example arguments: ["-d", "g+i"] to monitor all GPUs and GPU instances or 34 | # ["-d", "g:0-3"] to monitor GPUs 0-3. 
35 | 36 | imagePullSecrets: [] 37 | nameOverride: "" 38 | fullnameOverride: "" 39 | 40 | serviceAccount: 41 | # Specifies whether a service account should be created 42 | create: true 43 | # Annotations to add to the service account 44 | annotations: {} 45 | # The name of the service account to use. 46 | # If not set and create is true, a name is generated using the fullname template 47 | name: 48 | 49 | podSecurityContext: {} 50 | # fsGroup: 2000 51 | 52 | securityContext: 53 | runAsNonRoot: false 54 | runAsUser: 0 55 | capabilities: 56 | add: ["SYS_ADMIN"] 57 | # readOnlyRootFilesystem: true 58 | 59 | service: 60 | type: ClusterIP 61 | port: 9400 62 | address: ":9400" 63 | # Annotations to add to the service 64 | annotations: {} 65 | 66 | resources: {} 67 | # limits: 68 | # cpu: 100m 69 | # memory: 128Mi 70 | # requests: 71 | # cpu: 100m 72 | # memory: 128Mi 73 | serviceMonitor: 74 | enabled: true 75 | interval: 15s 76 | additionalLabels: {} 77 | #monitoring: prometheus 78 | 79 | mapPodsMetrics: false 80 | 81 | nodeSelector: {} 82 | #node: gpu 83 | 84 | tolerations: [] 85 | #- operator: Exists 86 | 87 | affinity: {} 88 | #nodeAffinity: 89 | # requiredDuringSchedulingIgnoredDuringExecution: 90 | # nodeSelectorTerms: 91 | # - matchExpressions: 92 | # - key: nvidia-gpu 93 | # operator: Exists 94 | 95 | extraHostVolumes: [] 96 | #- name: host-binaries 97 | # hostPath: /opt/bin 98 | 99 | extraVolumeMounts: [] 100 | #- name: host-binaries 101 | # mountPath: /opt/bin 102 | # readOnly: true 103 | 104 | extraEnv: [] 105 | #- name: EXTRA_VAR 106 | # value: "TheStringValue" 107 | -------------------------------------------------------------------------------- /docker/Dockerfile.ubi8: -------------------------------------------------------------------------------- 1 | ARG GOLANG_VERSION 2 | FROM golang:$GOLANG_VERSION AS builder 3 | WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools 4 | 5 | COPY . . 
6 | 7 | RUN make binary check-format 8 | 9 | FROM nvcr.io/nvidia/cuda:11.2.1-base-ubi8 10 | LABEL io.k8s.display-name="NVIDIA DCGM Exporter" 11 | 12 | ARG DCGM_VERSION 13 | RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \ 14 | dnf clean expire-cache 15 | RUN dnf install -y datacenter-gpu-manager-${DCGM_VERSION} 16 | 17 | COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/pkg/dcgm-exporter /usr/bin/ 18 | COPY etc/dcgm-exporter /etc/dcgm-exporter 19 | 20 | ENV NVIDIA_VISIBLE_DEVICES=all 21 | # disable all constraints on the configurations required by NVIDIA container toolkit 22 | ENV NVIDIA_DISABLE_REQUIRE="true" 23 | # Required for DCP metrics 24 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 25 | 26 | ARG VERSION 27 | 28 | LABEL io.k8s.display-name="NVIDIA DCGM Exporter" 29 | LABEL name="NVIDIA DCGM Exporter" 30 | LABEL vendor="NVIDIA" 31 | LABEL version="${VERSION}" 32 | LABEL release="N/A" 33 | LABEL summary="Exports GPU Metrics to Prometheus" 34 | LABEL description="See summary" 35 | 36 | COPY ./LICENSE ./licenses/LICENSE 37 | 38 | ENV NO_SETCAP= 39 | COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh 40 | RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh 41 | 42 | ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"] 43 | -------------------------------------------------------------------------------- /docker/Dockerfile.ubuntu18.04: -------------------------------------------------------------------------------- 1 | ARG GOLANG_VERSION 2 | FROM golang:$GOLANG_VERSION AS builder 3 | WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools 4 | 5 | COPY . . 
6 | 7 | RUN make binary check-format 8 | 9 | FROM nvcr.io/nvidia/cuda:11.2.1-base-ubuntu18.04 10 | LABEL io.k8s.display-name="NVIDIA DCGM Exporter" 11 | 12 | COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/pkg/dcgm-exporter /usr/bin/ 13 | COPY etc/dcgm-exporter /etc/dcgm-exporter 14 | 15 | ARG DCGM_VERSION 16 | RUN apt-get update && apt-get install -y --no-install-recommends \ 17 | libcap2-bin gnupg2 curl ca-certificates && \ 18 | curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \ 19 | echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ 20 | echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ 21 | apt-get purge --autoremove -y curl \ 22 | && rm -rf /var/lib/apt/lists/* 23 | 24 | RUN apt-get update && apt-get install -y --no-install-recommends \ 25 | datacenter-gpu-manager=1:${DCGM_VERSION} && apt-get purge --autoremove -y openssl 26 | 27 | # Required for DCP metrics 28 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 29 | # disable all constraints on the configurations required by NVIDIA container toolkit 30 | ENV NVIDIA_DISABLE_REQUIRE="true" 31 | ENV NVIDIA_VISIBLE_DEVICES=all 32 | 33 | ENV NO_SETCAP= 34 | COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh 35 | RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh 36 | 37 | ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"] 38 | -------------------------------------------------------------------------------- /docker/Dockerfile.ubuntu20.04: -------------------------------------------------------------------------------- 1 | ARG GOLANG_VERSION 2 | FROM golang:$GOLANG_VERSION AS builder 3 | WORKDIR /go/src/github.com/NVIDIA/gpu-monitoring-tools 4 | 5 | COPY . . 
6 | 7 | RUN make binary check-format 8 | 9 | FROM nvcr.io/nvidia/cuda:11.2.1-base-ubuntu20.04 10 | LABEL io.k8s.display-name="NVIDIA DCGM Exporter" 11 | 12 | COPY --from=builder /go/src/github.com/NVIDIA/gpu-monitoring-tools/pkg/dcgm-exporter /usr/bin/ 13 | COPY etc/dcgm-exporter /etc/dcgm-exporter 14 | 15 | ARG DCGM_VERSION 16 | RUN apt-get update && apt-get install -y --no-install-recommends \ 17 | libcap2-bin gnupg2 curl ca-certificates && \ 18 | curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub | apt-key add - && \ 19 | echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ 20 | echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \ 21 | apt-get purge --autoremove -y curl \ 22 | && rm -rf /var/lib/apt/lists/* 23 | 24 | RUN apt-get update && apt-get install -y --no-install-recommends \ 25 | datacenter-gpu-manager=1:${DCGM_VERSION} && apt-get purge --autoremove -y openssl 26 | 27 | # Required for DCP metrics 28 | ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32 29 | # disable all constraints on the configurations required by NVIDIA container toolkit 30 | ENV NVIDIA_DISABLE_REQUIRE="true" 31 | ENV NVIDIA_VISIBLE_DEVICES=all 32 | 33 | ENV NO_SETCAP= 34 | COPY docker/dcgm-exporter-entrypoint.sh /usr/local/dcgm/dcgm-exporter-entrypoint.sh 35 | RUN chmod +x /usr/local/dcgm/dcgm-exporter-entrypoint.sh 36 | 37 | ENTRYPOINT ["/usr/local/dcgm/dcgm-exporter-entrypoint.sh"] 38 | -------------------------------------------------------------------------------- /docker/dcgm-exporter-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # We want to setcap only when the container is started with the right caps 5 | DCGM_EXPORTER=$(readlink -f $(which dcgm-exporter)) 6 
| if [ -z "$NO_SETCAP" ]; then 7 | # Test setcap directly: under 'set -e' a bare failing setcap would abort the script before $? could be checked 8 | if setcap 'cap_sys_admin=+ep' $DCGM_EXPORTER; then 9 | if ! $DCGM_EXPORTER -v 1>/dev/null 2>/dev/null; then 10 | >&2 echo "Warning #2: dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To get profiling metrics with dcgm-exporter, use --cap-add SYS_ADMIN" 11 | setcap 'cap_sys_admin=-ep' $DCGM_EXPORTER 12 | fi 13 | else 14 | >&2 echo "Warning #1: dcgm-exporter doesn't have sufficient privileges to expose profiling metrics. To get profiling metrics with dcgm-exporter, use --cap-add SYS_ADMIN" 15 | fi 16 | 17 | fi 18 | 19 | # Pass the command line arguments to dcgm-exporter 20 | set -- $DCGM_EXPORTER "$@" 21 | exec "$@" 22 | -------------------------------------------------------------------------------- /etc/dcgm-exporter/1.x-compatibility-metrics.csv: -------------------------------------------------------------------------------- 1 | # Format,, 2 | # If line starts with a '#' it is considered a comment,, 3 | # DCGM FIELD, Prometheus metric type, help message 4 | 5 | # Clocks,, 6 | dcgm_sm_clock, gauge, SM clock frequency (in MHz). 7 | dcgm_memory_clock, gauge, Memory clock frequency (in MHz). 8 | 9 | # Temperature,, 10 | dcgm_memory_temp, gauge, Memory temperature (in C). 11 | dcgm_gpu_temp, gauge, GPU temperature (in C). 12 | 13 | # Power,, 14 | dcgm_power_usage, gauge, Power draw (in W). 15 | dcgm_total_energy_consumption, counter, Total energy consumption since boot (in mJ). 16 | 17 | # PCIe,, 18 | dcgm_pcie_tx_throughput, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. 19 | dcgm_pcie_rx_throughput, counter, Total number of bytes received through PCIe RX (in KB) via NVML. 20 | dcgm_pcie_replay_counter, counter, Total number of PCIe retries. 21 | 22 | # Utilization (the sample period varies depending on the product),, 23 | dcgm_gpu_utilization, gauge, GPU utilization (in %). 24 | dcgm_mem_copy_utilization, gauge, Memory utilization (in %). 
25 | dcgm_enc_utilization, gauge, Encoder utilization (in %). 26 | dcgm_dec_utilization, gauge, Decoder utilization (in %). 27 | 28 | # Errors and violations,, 29 | dcgm_xid_errors, gauge, Value of the last XID error encountered. 30 | # dcgm_power_violation, counter, Throttling duration due to power constraints (in us). 31 | # dcgm_thermal_violation, counter, Throttling duration due to thermal constraints (in us). 32 | # dcgm_sync_boost_violation, counter, Throttling duration due to sync-boost constraints (in us). 33 | # dcgm_board_limit_violation, counter, Throttling duration due to board limit constraints (in us). 34 | # dcgm_low_util_violation, counter, Throttling duration due to low utilization (in us). 35 | # dcgm_reliability_violation, counter, Throttling duration due to reliability constraints (in us). 36 | 37 | # Memory usage,, 38 | dcgm_fb_free, gauge, Framebuffer memory free (in MiB). 39 | dcgm_fb_used, gauge, Framebuffer memory used (in MiB). 40 | 41 | # ECC,, 42 | # dcgm_ecc_sbe_volatile_total, counter, Total number of single-bit volatile ECC errors. 43 | # dcgm_ecc_dbe_volatile_total, counter, Total number of double-bit volatile ECC errors. 44 | # dcgm_ecc_sbe_aggregate_total, counter, Total number of single-bit persistent ECC errors. 45 | # dcgm_ecc_dbe_aggregate_total, counter, Total number of double-bit persistent ECC errors. 46 | 47 | # Retired pages,, 48 | # dcgm_retired_pages_sbe, counter, Total number of retired pages due to single-bit errors. 49 | # dcgm_retired_pages_dbe, counter, Total number of retired pages due to double-bit errors. 50 | # dcgm_retired_pages_pending, counter, Total number of pages pending retirement. 51 | 52 | # NVLink,, 53 | # dcgm_nvlink_flit_crc_error_count_total, counter, Total number of NVLink flow-control CRC errors. 54 | # dcgm_nvlink_data_crc_error_count_total, counter, Total number of NVLink data CRC errors. 55 | # dcgm_nvlink_replay_error_count_total, counter, Total number of NVLink retries. 
56 | # dcgm_nvlink_recovery_error_count_total, counter, Total number of NVLink recovery errors. 57 | dcgm_nvlink_bandwidth_total, counter, Total number of NVLink bandwidth counters for all lanes 58 | 59 | # Add DCP metrics,, 60 | dcgm_fi_prof_gr_engine_active, gauge, Ratio of time the graphics engine is active (in %). 61 | # dcgm_fi_prof_sm_active, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). 62 | # dcgm_fi_prof_sm_occupancy, gauge, The ratio of number of warps resident on an SM (in %). 63 | dcgm_fi_prof_pipe_tensor_active, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). 64 | dcgm_fi_prof_dram_active, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). 65 | dcgm_fi_prof_pcie_tx_bytes, counter, The number of bytes of active pcie tx data including both header and payload. 66 | dcgm_fi_prof_pcie_rx_bytes, counter, The number of bytes of active pcie rx data including both header and payload. 67 | -------------------------------------------------------------------------------- /etc/dcgm-exporter/dcp-metrics-included.csv: -------------------------------------------------------------------------------- 1 | # Format,, 2 | # If line starts with a '#' it is considered a comment,, 3 | # DCGM FIELD, Prometheus metric type, help message 4 | 5 | # Clocks,, 6 | DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). 7 | DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 8 | 9 | # Temperature,, 10 | DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). 11 | DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). 12 | 13 | # Power,, 14 | DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). 15 | DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). 16 | 17 | # PCIE,, 18 | # DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. 
19 | # DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. 20 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. 21 | 22 | # Utilization (the sample period varies depending on the product),, 23 | # DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). 24 | DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). 25 | DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). 26 | DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). 27 | 28 | # Errors and violations,, 29 | DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. 30 | # DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). 31 | # DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). 32 | # DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). 33 | # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). 34 | # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). 35 | # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). 36 | 37 | # Memory usage,, 38 | DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). 39 | DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). 40 | 41 | # ECC,, 42 | # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. 43 | # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. 44 | # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. 45 | # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. 46 | 47 | # Retired pages,, 48 | # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. 
49 | # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. 50 | # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. 51 | 52 | # NVLink,, 53 | # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. 54 | # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. 55 | # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. 56 | # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. 57 | DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. 58 | # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. 59 | 60 | # VGPU License status,, 61 | DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status 62 | 63 | # Remapped rows,, 64 | DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors 65 | DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors 66 | DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed 67 | 68 | # DCP metrics,, 69 | DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). 70 | # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). 71 | # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). 72 | DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). 73 | DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). 74 | # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). 
75 | # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). 76 | # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). 77 | DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload. 78 | DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload. 79 | 80 | -------------------------------------------------------------------------------- /etc/dcgm-exporter/default-counters.csv: -------------------------------------------------------------------------------- 1 | # Format,, 2 | # If line starts with a '#' it is considered a comment,, 3 | # DCGM FIELD, Prometheus metric type, help message 4 | 5 | # Clocks,, 6 | DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). 7 | DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). 8 | 9 | # Temperature,, 10 | DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). 11 | DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). 12 | 13 | # Power,, 14 | DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). 15 | DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). 16 | 17 | # PCIE,, 18 | DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. 19 | DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. 20 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. 21 | 22 | # Utilization (the sample period varies depending on the product),, 23 | # DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). 24 | DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). 25 | DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). 26 | DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). 
27 | 28 | # Errors and violations,, 29 | DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. 30 | # DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). 31 | # DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). 32 | # DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). 33 | # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). 34 | # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). 35 | # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). 36 | 37 | # Memory usage,, 38 | DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). 39 | DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). 40 | 41 | # ECC,, 42 | # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. 43 | # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. 44 | # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. 45 | # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. 46 | 47 | # Retired pages,, 48 | # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. 49 | # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. 50 | # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. 51 | 52 | # NVLink,, 53 | # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. 54 | # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. 55 | # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. 
56 | # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. 57 | DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes 58 | 59 | # VGPU License status,, 60 | DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status 61 | 62 | # Remapped rows,, 63 | DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors 64 | DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors 65 | DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed 66 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/gpu-monitoring-tools 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/Masterminds/semver v1.5.0 // indirect 7 | github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18 8 | github.com/gorilla/mux v1.7.4 9 | ) 10 | 11 | replace ( 12 | github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm => ./bindings/go/dcgm 13 | k8s.io/api => k8s.io/api v0.20.2 14 | k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2 15 | k8s.io/apimachinery => k8s.io/apimachinery v0.20.2 16 | k8s.io/apiserver => k8s.io/apiserver v0.20.2 17 | k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.2 18 | k8s.io/client-go => k8s.io/client-go v0.20.2 19 | k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.2 20 | k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.20.2 21 | k8s.io/code-generator => k8s.io/code-generator v0.20.2 22 | k8s.io/component-base => k8s.io/component-base v0.20.2 23 | k8s.io/cri-api => k8s.io/cri-api v0.20.2 24 | k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.20.2 25 | k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.2 26 | k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.2 
27 | k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.2 28 | k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.2 29 | k8s.io/kubectl => k8s.io/kubectl v0.20.2 30 | k8s.io/kubelet => k8s.io/kubelet v0.20.2 31 | k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.2 32 | k8s.io/metrics => k8s.io/metrics v0.20.2 33 | k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.2 34 | ) 35 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= 2 | github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= 3 | github.com/gorilla/mux v1.7.4 h1:VuZ8uybHlWmqV03+zRzdwKL4tUnIp1MAQtp1mIFE1bc= 4 | github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= 5 | -------------------------------------------------------------------------------- /pkg/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | ARG DCGM_VERSION 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends libgomp1 \ 6 | ca-certificates wget && \ 7 | rm -rf /var/lib/apt/lists/* 8 | 9 | RUN wget https://developer.download.nvidia.com/compute/redist/dcgm/${DCGM_VERSION}/DEBS/datacenter-gpu-manager_${DCGM_VERSION}_amd64.deb && \ 10 | dpkg -i datacenter-gpu-manager_*.deb && \ 11 | rm -f datacenter-gpu-manager_*.deb 12 | 13 | COPY dcgm-exporter /usr/local/bin 14 | 15 | ENV NVIDIA_VISIBLE_DEVICES all 16 | ENV NVIDIA_DRIVER_CAPABILITIES all 17 | 18 | VOLUME /run/prometheus 19 | 20 | ENTRYPOINT [ "dcgm-exporter", "-e" ] 21 | -------------------------------------------------------------------------------- /pkg/dcgm.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 22 | "math/rand" 23 | ) 24 | 25 | func NewGroup() (dcgm.GroupHandle, func(), error) { 26 | group, err := dcgm.NewDefaultGroup(fmt.Sprintf("gpu-collector-group-%d", rand.Uint64())) 27 | if err != nil { 28 | return dcgm.GroupHandle{}, func() {}, err 29 | } 30 | 31 | return group, func() { dcgm.DestroyGroup(group) }, nil 32 | } 33 | 34 | func NewDeviceFields(counters []Counter) []dcgm.Short { 35 | deviceFields := make([]dcgm.Short, len(counters)) 36 | for i, f := range counters { 37 | deviceFields[i] = f.FieldID 38 | } 39 | 40 | return deviceFields 41 | } 42 | 43 | func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error) { 44 | name := fmt.Sprintf("gpu-collector-fieldgroup-%d", rand.Uint64()) 45 | fieldGroup, err := dcgm.FieldGroupCreate(name, deviceFields) 46 | if err != nil { 47 | return dcgm.FieldHandle{}, func() {}, err 48 | } 49 | 50 | return fieldGroup, func() { dcgm.FieldGroupDestroy(fieldGroup) }, nil 51 | } 52 | 53 | func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle) error { 54 | err := dcgm.WatchFieldsWithGroup(field, group) 55 | if err != nil { 56 | return err 57 | } 58 | 59 | return nil 60 | } 61 | 62 | func SetupDcgmFieldsWatch(deviceFields []dcgm.Short, sysInfo SystemInfo) ([]func(), 
error) { 63 | var err error 64 | var cleanups []func() 65 | var cleanup func() 66 | var group dcgm.GroupHandle 67 | var fieldGroup dcgm.FieldHandle 68 | 69 | group, cleanup, err = CreateGroupFromSystemInfo(sysInfo) 70 | if err != nil { 71 | goto fail 72 | } 73 | 74 | cleanups = append(cleanups, cleanup) 75 | 76 | fieldGroup, cleanup, err = NewFieldGroup(deviceFields) 77 | if err != nil { 78 | goto fail 79 | } 80 | 81 | cleanups = append(cleanups, cleanup) 82 | 83 | err = WatchFieldGroup(group, fieldGroup) 84 | if err != nil { 85 | goto fail 86 | } 87 | 88 | return cleanups, nil 89 | 90 | fail: 91 | for _, f := range cleanups { 92 | f() 93 | } 94 | 95 | return nil, err 96 | } 97 | -------------------------------------------------------------------------------- /pkg/go.mod: -------------------------------------------------------------------------------- 1 | module dcgm-exporter 2 | 3 | go 1.14 4 | 5 | replace ( 6 | github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm => ../bindings/go/dcgm 7 | k8s.io/api => k8s.io/api v0.20.2 8 | k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2 9 | k8s.io/apimachinery => k8s.io/apimachinery v0.20.2 10 | k8s.io/apiserver => k8s.io/apiserver v0.20.2 11 | k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.2 12 | k8s.io/client-go => k8s.io/client-go v0.20.2 13 | k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.2 14 | k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.20.2 15 | k8s.io/code-generator => k8s.io/code-generator v0.20.2 16 | k8s.io/component-base => k8s.io/component-base v0.20.2 17 | k8s.io/cri-api => k8s.io/cri-api v0.20.2 18 | k8s.io/csi-translation-lib => k8s.io/csi-translation-lib v0.20.2 19 | k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.2 20 | k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.2 21 | k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.2 22 | k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.2 23 | k8s.io/kubectl => k8s.io/kubectl v0.20.2 24 | 
k8s.io/kubelet => k8s.io/kubelet v0.20.2 25 | k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.2 26 | k8s.io/metrics => k8s.io/metrics v0.20.2 27 | k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.2 28 | ) 29 | 30 | require ( 31 | github.com/Masterminds/semver v1.5.0 // indirect 32 | github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-00010101000000-000000000000 33 | github.com/gorilla/mux v1.8.0 34 | github.com/sirupsen/logrus v1.7.0 35 | github.com/stretchr/testify v1.6.1 36 | github.com/urfave/cli/v2 v2.3.0 37 | google.golang.org/grpc v1.35.0 38 | k8s.io/kubelet v0.20.2 39 | k8s.io/kubernetes v1.18.2 40 | ) 41 | -------------------------------------------------------------------------------- /pkg/gpu_collector.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 22 | "os" 23 | ) 24 | 25 | func NewDCGMCollector(c []Counter, config *Config) (*DCGMCollector, func(), error) { 26 | sysInfo, err := InitializeSystemInfo(config.Devices, config.UseFakeGpus) 27 | if err != nil { 28 | return nil, func() {}, err 29 | } 30 | 31 | hostname := "" 32 | if config.NoHostname == false { 33 | hostname, err = os.Hostname() 34 | if err != nil { 35 | return nil, func() {}, err 36 | } 37 | } 38 | 39 | collector := &DCGMCollector{ 40 | Counters: c, 41 | DeviceFields: NewDeviceFields(c), 42 | UseOldNamespace: config.UseOldNamespace, 43 | SysInfo: sysInfo, 44 | Hostname: hostname, 45 | } 46 | 47 | cleanups, err := SetupDcgmFieldsWatch(collector.DeviceFields, sysInfo) 48 | if err != nil { 49 | return nil, func() {}, err 50 | } 51 | 52 | collector.Cleanups = cleanups 53 | 54 | return collector, func() { collector.Cleanup() }, nil 55 | } 56 | 57 | func (c *DCGMCollector) Cleanup() { 58 | for _, c := range c.Cleanups { 59 | c() 60 | } 61 | } 62 | 63 | func (c *DCGMCollector) GetMetrics() ([][]Metric, error) { 64 | monitoringInfo := GetMonitoredEntities(c.SysInfo) 65 | count := len(monitoringInfo) 66 | 67 | metrics := make([][]Metric, count) 68 | 69 | for i, mi := range monitoringInfo { 70 | vals, err := dcgm.EntityGetLatestValues(mi.Entity.EntityGroupId, mi.Entity.EntityId, c.DeviceFields) 71 | if err != nil { 72 | return nil, err 73 | } 74 | 75 | // InstanceInfo will be nil for GPUs 76 | metrics[i] = ToMetric(vals, c.Counters, mi.DeviceInfo, mi.InstanceInfo, c.UseOldNamespace, c.Hostname) 77 | } 78 | 79 | return metrics, nil 80 | } 81 | 82 | func ToMetric(values []dcgm.FieldValue_v1, c []Counter, d dcgm.Device, instanceInfo *GpuInstanceInfo, useOld bool, hostname string) []Metric { 83 | var metrics []Metric 84 | 85 | for i, val := range values { 86 | v := ToString(val) 87 | // Filter out counters with no value and ignored fields 
for this entity 88 | if v == SkipDCGMValue { 89 | continue 90 | } 91 | uuid := "UUID" 92 | if useOld { 93 | uuid = "uuid" 94 | } 95 | m := Metric{ 96 | Counter: &c[i], 97 | Value: v, 98 | 99 | UUID: uuid, 100 | GPU: fmt.Sprintf("%d", d.GPU), 101 | GPUUUID: d.UUID, 102 | GPUDevice: fmt.Sprintf("nvidia%d", d.GPU), 103 | GPUModelName: d.Identifiers.Model, 104 | Hostname: hostname, 105 | 106 | Attributes: map[string]string{}, 107 | } 108 | if instanceInfo != nil { 109 | m.MigProfile = instanceInfo.ProfileName 110 | m.GPUInstanceID = fmt.Sprintf("%d", instanceInfo.Info.NvmlInstanceId) 111 | } else { 112 | m.MigProfile = "" 113 | m.GPUInstanceID = "" 114 | } 115 | metrics = append(metrics, m) 116 | } 117 | 118 | return metrics 119 | } 120 | 121 | func ToString(value dcgm.FieldValue_v1) string { 122 | switch v := value.Int64(); v { 123 | case dcgm.DCGM_FT_INT32_BLANK: 124 | return SkipDCGMValue 125 | case dcgm.DCGM_FT_INT32_NOT_FOUND: 126 | return SkipDCGMValue 127 | case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: 128 | return SkipDCGMValue 129 | case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: 130 | return SkipDCGMValue 131 | case dcgm.DCGM_FT_INT64_BLANK: 132 | return SkipDCGMValue 133 | case dcgm.DCGM_FT_INT64_NOT_FOUND: 134 | return SkipDCGMValue 135 | case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: 136 | return SkipDCGMValue 137 | case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: 138 | return SkipDCGMValue 139 | } 140 | switch v := value.Float64(); v { 141 | case dcgm.DCGM_FT_FP64_BLANK: 142 | return SkipDCGMValue 143 | case dcgm.DCGM_FT_FP64_NOT_FOUND: 144 | return SkipDCGMValue 145 | case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: 146 | return SkipDCGMValue 147 | case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: 148 | return SkipDCGMValue 149 | } 150 | switch v := value.FieldType; v { 151 | case dcgm.DCGM_FT_STRING: 152 | return value.String() 153 | case dcgm.DCGM_FT_DOUBLE: 154 | return fmt.Sprintf("%f", value.Float64()) 155 | case dcgm.DCGM_FT_INT64: 156 | return fmt.Sprintf("%d", value.Int64()) 157 | default: 158 
| return FailedToConvert 159 | } 160 | 161 | return FailedToConvert 162 | } 163 | -------------------------------------------------------------------------------- /pkg/gpu_collector_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "testing" 22 | 23 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 24 | "github.com/stretchr/testify/require" 25 | ) 26 | 27 | var sampleCounters = []Counter{ 28 | {dcgm.DCGM_FI_DEV_GPU_TEMP, "DCGM_FI_DEV_GPU_TEMP", "gauge", "Temperature Help info"}, 29 | {dcgm.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION", "gauge", "Energy help info"}, 30 | {dcgm.DCGM_FI_DEV_POWER_USAGE, "DCGM_FI_DEV_POWER_USAGE", "gauge", "Power help info"}, 31 | } 32 | 33 | func TestDCGMCollector(t *testing.T) { 34 | cleanup, err := dcgm.Init(dcgm.Embedded) 35 | require.NoError(t, err) 36 | defer cleanup() 37 | 38 | _, cleanup = testDCGMCollector(t, sampleCounters) 39 | cleanup() 40 | } 41 | 42 | func testDCGMCollector(t *testing.T, counters []Counter) (*DCGMCollector, func()) { 43 | dOpt := DeviceOptions{true, []int{-1}, []int{-1}} 44 | cfg := Config{ 45 | Devices: dOpt, 46 | NoHostname: false, 47 | UseOldNamespace: false, 48 | UseFakeGpus: false, 49 | } 50 | c, 
cleanup, err := NewDCGMCollector(counters, &cfg) 51 | require.NoError(t, err) 52 | 53 | out, err := c.GetMetrics() 54 | require.NoError(t, err) 55 | require.Greater(t, len(out), 0, "Check that you have a GPU on this node") 56 | require.Len(t, out[0], len(counters)) 57 | 58 | for i, dev := range out { 59 | for j, metric := range dev { 60 | require.Equal(t, metric.Name, counters[j].FieldName) 61 | require.Equal(t, metric.GPU, fmt.Sprintf("%d", i)) 62 | 63 | require.NotEmpty(t, metric.Value) 64 | require.NotEqual(t, metric.Value, FailedToConvert) 65 | } 66 | } 67 | 68 | return c, cleanup 69 | } 70 | -------------------------------------------------------------------------------- /pkg/kubernetes_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "context" 21 | "fmt" 22 | "io/ioutil" 23 | "os" 24 | "testing" 25 | "time" 26 | 27 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 28 | "github.com/stretchr/testify/require" 29 | "google.golang.org/grpc" 30 | podresourcesapi "k8s.io/kubernetes/pkg/kubelet/apis/podresources/v1alpha1" 31 | "k8s.io/kubernetes/pkg/kubelet/util" 32 | ) 33 | 34 | var tmpDir string 35 | 36 | func TestProcessPodMapper(t *testing.T) { 37 | cleanup := CreateTmpDir(t) 38 | defer cleanup() 39 | 40 | cleanup, err := dcgm.Init(dcgm.Embedded) 41 | require.NoError(t, err) 42 | defer cleanup() 43 | 44 | c, cleanup := testDCGMCollector(t, sampleCounters) 45 | defer cleanup() 46 | 47 | out, err := c.GetMetrics() 48 | require.NoError(t, err) 49 | original := append(out[:0:0], out...) 50 | 51 | socketPath = tmpDir + "/kubelet.sock" 52 | server := grpc.NewServer() 53 | gpus := GetGPUUUIDs(original) 54 | podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpus)) 55 | 56 | cleanup = StartMockServer(t, server, socketPath) 57 | defer cleanup() 58 | 59 | podMapper := NewPodMapper(&Config{KubernetesGPUIdType: GPUUID}) 60 | var sysInfo SystemInfo 61 | err = podMapper.Process(out, sysInfo) 62 | require.NoError(t, err) 63 | 64 | require.Len(t, out, len(original)) 65 | for i, dev := range out { 66 | for _, metric := range dev { 67 | require.Contains(t, metric.Attributes, podAttribute) 68 | require.Contains(t, metric.Attributes, namespaceAttribute) 69 | require.Contains(t, metric.Attributes, containerAttribute) 70 | 71 | // TODO currently we rely on ordering and implicit expectations of the mock implementation 72 | // This should be a table comparison 73 | require.Equal(t, metric.Attributes[podAttribute], fmt.Sprintf("gpu-pod-%d", i)) 74 | require.Equal(t, metric.Attributes[namespaceAttribute], "default") 75 | require.Equal(t, metric.Attributes[containerAttribute], "default") 76 | } 77 | } 78 | } 79 | 80 | func 
GetGPUUUIDs(metrics [][]Metric) []string { 81 | gpus := make([]string, len(metrics)) 82 | for i, dev := range metrics { 83 | gpus[i] = dev[0].GPUUUID 84 | } 85 | 86 | return gpus 87 | } 88 | 89 | func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() { 90 | l, err := util.CreateListener("unix://" + socket) 91 | require.NoError(t, err) 92 | 93 | stopped := make(chan interface{}) 94 | 95 | go func() { 96 | server.Serve(l) 97 | close(stopped) 98 | }() 99 | 100 | return func() { 101 | server.Stop() 102 | select { 103 | case <-stopped: 104 | return 105 | case <-time.After(1 * time.Second): 106 | t.Fatal("Failed waiting for gRPC server to stop") 107 | } 108 | } 109 | } 110 | 111 | func CreateTmpDir(t *testing.T) func() { 112 | path, err := ioutil.TempDir("", "gpu-monitoring-tools") 113 | require.NoError(t, err) 114 | 115 | tmpDir = path 116 | 117 | return func() { 118 | require.NoError(t, os.RemoveAll(tmpDir)) 119 | } 120 | } 121 | 122 | // Contains a list of UUIDs 123 | type PodResourcesMockServer struct { 124 | gpus []string 125 | } 126 | 127 | func NewPodResourcesMockServer(used []string) *PodResourcesMockServer { 128 | return &PodResourcesMockServer{ 129 | gpus: used, 130 | } 131 | } 132 | 133 | func (s *PodResourcesMockServer) List(ctx context.Context, req *podresourcesapi.ListPodResourcesRequest) (*podresourcesapi.ListPodResourcesResponse, error) { 134 | podResources := make([]*podresourcesapi.PodResources, len(s.gpus)) 135 | 136 | for i, gpu := range s.gpus { 137 | podResources[i] = &podresourcesapi.PodResources{ 138 | Name: fmt.Sprintf("gpu-pod-%d", i), 139 | Namespace: "default", 140 | Containers: []*podresourcesapi.ContainerResources{ 141 | &podresourcesapi.ContainerResources{ 142 | Name: "default", 143 | Devices: []*podresourcesapi.ContainerDevices{ 144 | &podresourcesapi.ContainerDevices{ 145 | ResourceName: nvidiaResourceName, 146 | DeviceIds: []string{gpu}, 147 | }, 148 | }, 149 | }, 150 | }, 151 | } 152 | } 153 | 154 | return 
&podresourcesapi.ListPodResourcesResponse{ 155 | PodResources: podResources, 156 | }, nil 157 | 158 | } 159 | -------------------------------------------------------------------------------- /pkg/parser.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "encoding/csv" 21 | "fmt" 22 | "os" 23 | "strings" 24 | 25 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 26 | "github.com/sirupsen/logrus" 27 | ) 28 | 29 | func ExtractCounters(filename string, dcpAllowed bool) ([]Counter, error) { 30 | records, err := ReadCSVFile(filename) 31 | if err != nil { 32 | fmt.Printf("Error: %v\n", err) 33 | return nil, err 34 | } 35 | 36 | counters, err := extractCounters(records, dcpAllowed) 37 | if err != nil { 38 | return nil, err 39 | } 40 | 41 | return counters, err 42 | } 43 | 44 | func ReadCSVFile(filename string) ([][]string, error) { 45 | file, err := os.Open(filename) 46 | if err != nil { 47 | return nil, err 48 | } 49 | 50 | defer file.Close() 51 | 52 | r := csv.NewReader(file) 53 | records, err := r.ReadAll() 54 | 55 | return records, err 56 | } 57 | 58 | func extractCounters(records [][]string, dcpAllowed bool) ([]Counter, error) { 59 | f := make([]Counter, 0, len(records)) 60 | 61 | for i, record := range records { 62 
| var useOld = false 63 | if len(record) == 0 { 64 | continue 65 | } 66 | 67 | for j, r := range record { 68 | record[j] = strings.Trim(r, " ") 69 | } 70 | 71 | if recordIsCommentOrEmpty(record) { 72 | logrus.Debugf("Skipping line %d (`%v`)", i, record) 73 | continue 74 | } 75 | 76 | if len(record) != 3 { 77 | return nil, fmt.Errorf("Malformed CSV record, failed to parse line %d (`%v`), expected 3 fields", i, record) 78 | } 79 | 80 | fieldID, ok := dcgm.DCGM_FI[record[0]] 81 | oldFieldID, oldOk := dcgm.OLD_DCGM_FI[record[0]] 82 | if !ok && !oldOk { 83 | return nil, fmt.Errorf("Could not find DCGM field %s", record[0]) 84 | } 85 | 86 | if !ok && oldOk { 87 | useOld = true 88 | } 89 | 90 | if !useOld { 91 | if !dcpAllowed && fieldID >= 1000 { 92 | logrus.Warnf("Skipping line %d ('%s'): DCP metrics not enabled", i, record[0]) 93 | continue 94 | } 95 | 96 | if _, ok := promMetricType[record[1]]; !ok { 97 | return nil, fmt.Errorf("Could not find Prometheus metry type %s", record[1]) 98 | } 99 | 100 | f = append(f, Counter{fieldID, record[0], record[1], record[2]}) 101 | } else { 102 | if !dcpAllowed && oldFieldID >= 1000 { 103 | logrus.Warnf("Skipping line %d ('%s'): DCP metrics not enabled", i, record[0]) 104 | continue 105 | } 106 | 107 | if _, ok := promMetricType[record[1]]; !ok { 108 | return nil, fmt.Errorf("Could not find Prometheus metry type %s", record[1]) 109 | } 110 | 111 | f = append(f, Counter{oldFieldID, record[0], record[1], record[2]}) 112 | 113 | } 114 | } 115 | 116 | return f, nil 117 | } 118 | 119 | func recordIsCommentOrEmpty(s []string) bool { 120 | if len(s) == 0 { 121 | return true 122 | } 123 | 124 | if len(s[0]) < 1 || s[0][0] == '#' { 125 | return true 126 | } 127 | 128 | return false 129 | } 130 | -------------------------------------------------------------------------------- /pkg/pipeline_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "testing" 21 | 22 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 23 | "github.com/stretchr/testify/require" 24 | ) 25 | 26 | func TestRun(t *testing.T) { 27 | cleanup, err := dcgm.Init(dcgm.Embedded) 28 | require.NoError(t, err) 29 | defer cleanup() 30 | 31 | c, cleanup := testDCGMCollector(t, sampleCounters) 32 | defer cleanup() 33 | 34 | p, cleanup, err := NewMetricsPipelineWithGPUCollector(&Config{}, c) 35 | defer cleanup() 36 | 37 | out, err := p.run() 38 | require.NoError(t, err) 39 | require.NotEmpty(t, out) 40 | 41 | // Note it is pretty difficult to make non superficial tests without 42 | // writting a full blown parser, always look at the results 43 | // We'll be testing them more throughly in the e2e tests (e.g: by running prometheus). 44 | t.Logf("Pipeline result is:\n%v", out) 45 | } 46 | -------------------------------------------------------------------------------- /pkg/server.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "context" 21 | "net/http" 22 | "sync" 23 | "time" 24 | 25 | "github.com/gorilla/mux" 26 | "github.com/sirupsen/logrus" 27 | ) 28 | 29 | func NewMetricsServer(c *Config, metrics chan string) (*MetricsServer, func(), error) { 30 | router := mux.NewRouter() 31 | serverv1 := &MetricsServer{ 32 | server: http.Server{ 33 | Addr: c.Address, 34 | Handler: router, 35 | ReadTimeout: 10 * time.Second, 36 | WriteTimeout: 10 * time.Second, 37 | }, 38 | metricsChan: metrics, 39 | metrics: "", 40 | } 41 | 42 | router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { 43 | w.Write([]byte(` 44 | GPU Exporter 45 | 46 |

GPU Exporter

47 |

Metrics

48 | 49 | `)) 50 | }) 51 | 52 | router.HandleFunc("/health", serverv1.Health) 53 | router.HandleFunc("/metrics", serverv1.Metrics) 54 | 55 | return serverv1, func() {}, nil 56 | } 57 | 58 | func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) { 59 | defer wg.Done() 60 | 61 | var httpwg sync.WaitGroup 62 | httpwg.Add(1) 63 | go func() { 64 | defer httpwg.Done() 65 | logrus.Info("Starting webserver") 66 | if err := s.server.ListenAndServe(); err != nil && err != http.ErrServerClosed { 67 | logrus.Fatalf("Failed to Listen and Server HTTP server with err: `%v`", err) 68 | } 69 | }() 70 | 71 | httpwg.Add(1) 72 | go func() { 73 | defer httpwg.Done() 74 | for { 75 | select { 76 | case <-stop: 77 | return 78 | case m := <-s.metricsChan: 79 | s.updateMetrics(m) 80 | } 81 | } 82 | }() 83 | 84 | <-stop 85 | if err := s.server.Shutdown(context.Background()); err != nil { 86 | logrus.Fatalf("Failed to shutdown HTTP server, with err: `%v`", err) 87 | } 88 | 89 | if err := WaitWithTimeout(&httpwg, 3*time.Second); err != nil { 90 | logrus.Fatalf("Failed waiting for HTTP server to shutdown, with err: `%v`", err) 91 | } 92 | } 93 | 94 | func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) { 95 | w.WriteHeader(http.StatusOK) 96 | w.Write([]byte(s.getMetrics())) 97 | } 98 | 99 | func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request) { 100 | if s.getMetrics() == "" { 101 | w.WriteHeader(http.StatusServiceUnavailable) 102 | w.Write([]byte("KO")) 103 | } else { 104 | w.WriteHeader(http.StatusOK) 105 | w.Write([]byte("OK")) 106 | } 107 | } 108 | 109 | func (s *MetricsServer) updateMetrics(m string) { 110 | s.Lock() 111 | defer s.Unlock() 112 | 113 | s.metrics = m 114 | } 115 | 116 | func (s *MetricsServer) getMetrics() string { 117 | s.Lock() 118 | defer s.Unlock() 119 | 120 | return s.metrics 121 | } 122 | -------------------------------------------------------------------------------- /pkg/system_info_test.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 22 | "github.com/stretchr/testify/require" 23 | "testing" 24 | ) 25 | 26 | const ( 27 | fakeProfileName string = "2fake.4gb" 28 | ) 29 | 30 | func SpoofSystemInfo() SystemInfo { 31 | var sysInfo SystemInfo 32 | sysInfo.GpuCount = 2 33 | sysInfo.MigEnabled = true 34 | sysInfo.Gpus[0].DeviceInfo.GPU = 0 35 | gi := GpuInstanceInfo{ 36 | Info: dcgm.MigEntityInfo{"fake", 0, 0, 0, 0, 3}, 37 | ProfileName: fakeProfileName, 38 | EntityId: 0, 39 | } 40 | sysInfo.Gpus[0].GpuInstances = append(sysInfo.Gpus[0].GpuInstances, gi) 41 | gi2 := GpuInstanceInfo{ 42 | Info: dcgm.MigEntityInfo{"fake", 0, 1, 0, 0, 3}, 43 | ProfileName: fakeProfileName, 44 | EntityId: 14, 45 | } 46 | sysInfo.Gpus[1].GpuInstances = append(sysInfo.Gpus[1].GpuInstances, gi2) 47 | sysInfo.Gpus[1].DeviceInfo.GPU = 1 48 | 49 | return sysInfo 50 | } 51 | 52 | func TestMonitoredEntities(t *testing.T) { 53 | sysInfo := SpoofSystemInfo() 54 | sysInfo.dOpt.Flex = true 55 | 56 | monitoring := GetMonitoredEntities(sysInfo) 57 | require.Equal(t, len(monitoring), 2, fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring))) 58 | 
instanceCount := 0 59 | gpuCount := 0 60 | for _, mi := range monitoring { 61 | if mi.Entity.EntityGroupId == dcgm.FE_GPU_I { 62 | instanceCount = instanceCount + 1 63 | require.NotEqual(t, mi.InstanceInfo, nil, "Expected InstanceInfo to be populated but it wasn't") 64 | require.Equal(t, mi.InstanceInfo.ProfileName, fakeProfileName, "Expected profile named '%s' but found '%s'", fakeProfileName, mi.InstanceInfo.ProfileName) 65 | if mi.Entity.EntityId != uint(0) { 66 | // One of these should be 0, the other should be 14 67 | require.Equal(t, mi.Entity.EntityId, uint(14), "Expected 14 as EntityId but found %s", monitoring[1].Entity.EntityId) 68 | } 69 | } else { 70 | gpuCount = gpuCount + 1 71 | require.Equal(t, mi.InstanceInfo, (*GpuInstanceInfo)(nil), "Expected InstanceInfo to be nil but it wasn't") 72 | } 73 | } 74 | require.Equal(t, instanceCount, 2, "Expected 2 GPU instances but found %d", instanceCount) 75 | require.Equal(t, gpuCount, 0, "Expected 0 GPUs but found %d", gpuCount) 76 | 77 | sysInfo.MigEnabled = false // we are now monitoring the GPUs 78 | monitoring = GetMonitoredEntities(sysInfo) 79 | require.Equal(t, 2, len(monitoring), fmt.Sprintf("Should have 2 monitored entities but found %d", len(monitoring))) 80 | for i, mi := range monitoring { 81 | require.Equal(t, mi.Entity.EntityGroupId, dcgm.FE_GPU, "Expected FE_GPU but found %d", mi.Entity.EntityGroupId) 82 | require.Equal(t, uint(i), mi.DeviceInfo.GPU, "Expected GPU %d but found %d", i, mi.DeviceInfo.GPU) 83 | require.Equal(t, (*GpuInstanceInfo)(nil), mi.InstanceInfo, "Expected InstanceInfo not to be populated but it was") 84 | } 85 | } 86 | 87 | func TestVerifyDevicePresence(t *testing.T) { 88 | sysInfo := SpoofSystemInfo() 89 | var dOpt DeviceOptions 90 | dOpt.Flex = true 91 | err := VerifyDevicePresence(&sysInfo, dOpt) 92 | require.Equal(t, err, nil, "Expected to have no error, but found %s", err) 93 | 94 | dOpt.Flex = false 95 | dOpt.GpuRange = append(dOpt.GpuRange, -1) 96 | dOpt.GpuInstanceRange 
= append(dOpt.GpuInstanceRange, -1) 97 | err = VerifyDevicePresence(&sysInfo, dOpt) 98 | require.Equal(t, err, nil, "Expected to have no error, but found %s", err) 99 | 100 | dOpt.GpuInstanceRange[0] = 10 // this GPU instance doesn't exist 101 | err = VerifyDevicePresence(&sysInfo, dOpt) 102 | require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU instance, but none found") 103 | 104 | dOpt.GpuRange[0] = 10 // this GPU doesn't exist 105 | dOpt.GpuInstanceRange[0] = -1 106 | err = VerifyDevicePresence(&sysInfo, dOpt) 107 | require.NotEqual(t, err, nil, "Expected to have an error for a non-existent GPU, but none found") 108 | 109 | // Add GPUs and instances that exist 110 | dOpt.GpuRange[0] = 0 111 | dOpt.GpuRange = append(dOpt.GpuRange, 1) 112 | dOpt.GpuInstanceRange[0] = 0 113 | dOpt.GpuInstanceRange = append(dOpt.GpuInstanceRange, 14) 114 | err = VerifyDevicePresence(&sysInfo, dOpt) 115 | require.Equal(t, err, nil, "Expected to have no error, but found %s", err) 116 | } 117 | 118 | //func TestMigProfileNames(t *testing.T) { 119 | // sysInfo := SpoofSystemInfo() 120 | // SetMigProfileNames(sysInfo, values) 121 | //} 122 | -------------------------------------------------------------------------------- /pkg/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "net/http" 22 | "sync" 23 | "text/template" 24 | 25 | "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm" 26 | ) 27 | 28 | var ( 29 | SkipDCGMValue = "SKIPPING DCGM VALUE" 30 | FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING" 31 | 32 | nvidiaResourceName = "nvidia.com/gpu" 33 | nvidiaMigResourcePrefix = "nvidia.com/mig-" 34 | MIG_UUID_PREFIX = "MIG-" 35 | 36 | // Note standard resource attributes 37 | podAttribute = "pod" 38 | namespaceAttribute = "namespace" 39 | containerAttribute = "container" 40 | 41 | oldPodAttribute = "pod_name" 42 | oldNamespaceAttribute = "pod_namespace" 43 | oldContainerAttribute = "container_name" 44 | ) 45 | 46 | type KubernetesGPUIDType string 47 | 48 | const ( 49 | GPUUID KubernetesGPUIDType = "uid" 50 | DeviceName KubernetesGPUIDType = "device-name" 51 | ) 52 | 53 | const ( 54 | FlexKey = "f" // Monitor all GPUs if MIG is disabled or all GPU instances if MIG is enabled 55 | GPUKey = "g" // Monitor GPUs 56 | GPUInstanceKey = "i" // Monitor GPU instances - cannot be specified if MIG is disabled 57 | ) 58 | 59 | type DeviceOptions struct { 60 | Flex bool // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled. 
// Config groups the exporter's settings in one struct. The code that fills
// and reads these fields is outside this file section; the per-field notes
// below state only what the types here show.
type Config struct {
	CollectorsFile      string
	Address             string
	CollectInterval     int // NOTE(review): unit is not visible here - confirm at the point of use
	Kubernetes          bool
	KubernetesGPUIdType KubernetesGPUIDType // one of GPUUID or DeviceName
	CollectDCP          bool
	UseOldNamespace     bool
	UseRemoteHE         bool
	RemoteHEInfo        string
	Devices             DeviceOptions // which GPUs / GPU instances to monitor
	NoHostname          bool
	UseFakeGpus         bool
}

// Transform is implemented by the entries of MetricsPipeline.transformations.
// Process receives the metrics together with the SystemInfo and may return an
// error; Name returns a string (presumably a label for the step - verify
// against the implementations).
type Transform interface {
	Process(metrics [][]Metric, sysInfo SystemInfo) error
	Name() string
}

// MetricsPipeline bundles the configuration, the Transform list, two output
// templates (one regular, one for MIG) and the counters/collector it works
// with. All fields are unexported.
type MetricsPipeline struct {
	config *Config

	transformations  []Transform
	metricsFormat    *template.Template
	migMetricsFormat *template.Template

	counters     []Counter
	gpuCollector *DCGMCollector
}

// DCGMCollector holds the counters and DCGM field IDs for a collection
// together with the SystemInfo and hostname. Cleanups is a list of
// parameterless functions (presumably run on teardown - verify against the
// collector's cleanup code).
type DCGMCollector struct {
	Counters        []Counter
	DeviceFields    []dcgm.Short
	Cleanups        []func()
	UseOldNamespace bool
	SysInfo         SystemInfo
	Hostname        string
}

// Counter describes one metric definition: the DCGM field ID, its name, a
// Prometheus type string and a help text.
// NOTE(review): PromType is presumably validated against promMetricType
// (gauge, counter, histogram, summary) - confirm in the parser.
type Counter struct {
	FieldID   dcgm.Short
	FieldName string
	PromType  string
	Help      string
}

// Metric is a single value for a Counter plus the identifying strings of the
// device it belongs to.
type Metric struct {
	Counter *Counter
	Value   string

	GPU          string
	GPUUUID      string // returned by getIDOfType for GPUUID
	GPUDevice    string // returned by getIDOfType for DeviceName
	GPUModelName string

	UUID string

	// A non-empty MigProfile makes getIDOfType return "<GPU>-<GPUInstanceID>"
	// regardless of the requested ID type.
	MigProfile    string
	GPUInstanceID string
	Hostname      string

	Attributes map[string]string
}
idType) 142 | } 143 | 144 | var promMetricType = map[string]bool{ 145 | "gauge": true, 146 | "counter": true, 147 | "histogram": true, 148 | "summary": true, 149 | } 150 | 151 | type MetricsServer struct { 152 | sync.Mutex 153 | 154 | server http.Server 155 | metrics string 156 | metricsChan chan string 157 | } 158 | 159 | type PodMapper struct { 160 | Config *Config 161 | } 162 | 163 | type PodInfo struct { 164 | Name string 165 | Namespace string 166 | Container string 167 | } 168 | -------------------------------------------------------------------------------- /pkg/utils.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package main 18 | 19 | import ( 20 | "fmt" 21 | "os" 22 | "os/signal" 23 | "sync" 24 | "time" 25 | ) 26 | 27 | func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error { 28 | c := make(chan struct{}) 29 | go func() { 30 | defer close(c) 31 | wg.Wait() 32 | }() 33 | select { 34 | case <-c: 35 | return nil 36 | case <-time.After(timeout): 37 | return fmt.Errorf("Timeout waiting for WaitGroup") 38 | } 39 | } 40 | 41 | func newOSWatcher(sigs ...os.Signal) chan os.Signal { 42 | sigChan := make(chan os.Signal, 1) 43 | signal.Notify(sigChan, sigs...) 
44 | 45 | return sigChan 46 | } 47 | -------------------------------------------------------------------------------- /service-monitor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | apiVersion: monitoring.coreos.com/v1 16 | kind: ServiceMonitor 17 | metadata: 18 | name: "dcgm-exporter" 19 | labels: 20 | app.kubernetes.io/name: "dcgm-exporter" 21 | app.kubernetes.io/version: "2.4.0" 22 | spec: 23 | selector: 24 | matchLabels: 25 | app.kubernetes.io/name: "dcgm-exporter" 26 | app.kubernetes.io/version: "2.4.0" 27 | endpoints: 28 | - port: "metrics" 29 | path: "/metrics" 30 | -------------------------------------------------------------------------------- /tests/ci-run-e2e.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash -x 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
set -euxo pipefail
shopt -s lastpipe

readonly basedir="$(dirname "$(realpath "$0")")"

# shellcheck source=tests/common.sh
source "${basedir}/common.sh"

# shellcheck source=tests/metrics.sh
source "${basedir}/metrics.sh"

CI_REGISTRY_IMAGE=${CI_REGISTRY_IMAGE:-"undefined"}
CI_COMMIT_SHORT_SHA=${CI_COMMIT_SHORT_SHA:-"undefined"}

# Install jq, which the Prometheus query helpers use to parse JSON.
install::jq() {
  apt update && apt install -y --no-install-recommends jq
}

# Install Helm 3 via the upstream installer script.
# -f makes curl fail on an HTTP error instead of piping an error page to bash.
install::helm() {
  curl -fsSL https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
}

# Package the local chart and install it with the image built by this CI run.
install::dcgm::exporter() {
  helm package deployment/dcgm-exporter
  helm install --wait dcgm-exporter ./*.tgz --set "image.repository=${CI_REGISTRY_IMAGE}/dcgm-exporter" --set "image.tag=${CI_COMMIT_SHORT_SHA}" --set "serviceMonitor.enabled=true"
}

# Install the prometheus-operator chart from the "stable" repository.
install::prom() {
  helm repo add stable https://charts.helm.sh/stable
  helm install --wait stable/prometheus-operator --generate-name \
    --set "prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false"
}

# query::prom QUERY
# Run QUERY against the in-cluster Prometheus and print `.data.result`.
# Returns 1 when there is no usable result: empty output, an empty array, or
# the literal `null` that jq prints when the response has no `.data.result`.
query::prom() {
  local ip val
  ip="$(kubectl get svc -l app=prometheus-operator-prometheus -o jsonpath='{.items[0].spec.clusterIP}')"
  val="$(curl -sL "http://${ip}:9090/api/v1/query?query=$1" | jq -r '.data.result')"

  case "${val}" in
    "" | "[]" | "null") return 1 ;;
  esac

  echo "$val"
}

# query::pod::phase POD PHASE
# Succeeds only when POD's .status.phase equals PHASE.
query::pod::phase() {
  local state
  state="$(kubectl get pods "$1" -o jsonpath='{.status.phase}')"
  [ "$state" = "$2" ] || return 1
}

# Dump cluster state; installed below as the ERR trap.
testing::log::kube() {
  kubectl get pods
  kubectl get svc
  kubectl get serviceMonitor

  kubectl get pods -l "app.kubernetes.io/component=dcgm-exporter" -o yaml
}

install::jq
install::helm
install::prom
install::dcgm::exporter

trap 'testing::log::kube' ERR

for test_case in "metrics"; do
  log INFO "=================Testing ${test_case}================="
  "testing::${test_case}::setup" "$@"
  "testing::${test_case}::main" "$@"
  "testing::${test_case}::cleanup" "$@"
done
# shellcheck disable=SC2015
[ -t 2 ] && readonly LOG_TTY=1 || readonly LOG_NO_TTY=1

# Colour codes are defined only when stderr is a terminal that reports enough
# colours; a failing `tput colors` (e.g. unset TERM) is treated as 0.
if [ "${LOG_TTY-0}" -eq 1 ] && [ "$(tput colors 2>/dev/null || echo 0)" -ge 15 ]; then
  readonly FMT_BOLD=$(tput bold)
  readonly FMT_RED=$(tput setaf 1)
  readonly FMT_YELLOW=$(tput setaf 3)
  readonly FMT_BLUE=$(tput setaf 12)
  readonly FMT_CLEAR=$(tput sgr0)
fi

# log LEVEL MESSAGE...
# Print "[LEVEL] MESSAGE" to stderr. INFO, WARN and ERROR are coloured when
# the FMT_* variables are set. MESSAGE is printed with %b, so backslash
# escapes in it are interpreted.
log() {
  local -r level="$1"; shift
  local -r message="$*"

  local fmt_on="${FMT_CLEAR-}"
  local -r fmt_off="${FMT_CLEAR-}"

  case "${level}" in
    INFO) fmt_on="${FMT_BLUE-}" ;;
    WARN) fmt_on="${FMT_YELLOW-}" ;;
    ERROR) fmt_on="${FMT_RED-}" ;;
  esac
  printf "%s[%s]%s %b\n" "${fmt_on}" "${level}" "${fmt_off}" "${message}" >&2
}

# with_retry MAX_ATTEMPTS DELAY COMMAND [ARGS...]
# Run COMMAND until it succeeds, sleeping DELAY between attempts.
# MAX_ATTEMPTS <= 0 retries forever. Returns 0 on the first success and 1
# once MAX_ATTEMPTS attempts have failed.
with_retry() {
  local -r max_attempts="$1" delay="$2"
  shift 2
  local count=0 rc

  while true; do
    # Running the command as an `if` condition suspends errexit for it
    # without toggling `set -e`, so the caller's shell options are untouched.
    if "$@"; then
      rc=0
    else
      rc="$?"
    fi
    count="$((count+1))"

    if [[ "${rc}" -eq 0 ]]; then
      echo "'$*' SUCCEEDED in ${count} attempts !"
      return 0
    fi

    if [[ "${max_attempts}" -le 0 ]] || [[ "${count}" -lt "${max_attempts}" ]]; then
      echo "'$*' FAILED at attempt ${count}, will retry in ${delay} seconds ..."
      sleep "${delay}"
    else
      break
    fi
  done

  echo "'$*' FAILED in ${count} attempts !"
  return 1
}
# Nothing to prepare for the metrics test.
testing::metrics::setup() {
  :
}

# Remove the GPU workload pod created by testing::metrics::main.
testing::metrics::cleanup() {
  kubectl delete -f tests/gpu-pod.yaml
}

# Succeeds when the last DCGM_FI_DEV_GPU_UTIL sample is an integer >= 0.
# NOTE(review): despite the name, this does not compare against an earlier
# value, so it does not detect an increase - confirm the intended check.
testing::metrics::utilization::increase() {
  local val
  # For a short while we might have multiple values returned
  # In this case it seems like the first item is the oldest
  val="$(query::prom "DCGM_FI_DEV_GPU_UTIL" | jq -r '.[-1].value[1]')"
  [ "$val" -ge 0 ] || return 1
}

# Succeeds when the first DCGM_FI_DEV_GPU_UTIL series carries non-empty UUID
# and gpu labels and is attributed to pod "nbody-pod" in namespace "default".
testing::metrics::ensure::kube::labels() {
  local val uuid gpu pod namespace
  val="$(query::prom "DCGM_FI_DEV_GPU_UTIL")"
  # `// empty` turns a missing label into an empty string; plain `jq -r`
  # prints the literal "null", which would pass the non-empty checks below.
  uuid="$(echo "${val}" | jq -r '.[0].metric.UUID // empty')"
  gpu="$(echo "${val}" | jq -r '.[0].metric.gpu // empty')"
  pod="$(echo "${val}" | jq -r '.[0].metric.exported_pod // empty')"
  namespace="$(echo "${val}" | jq -r '.[0].metric.exported_namespace // empty')"

  [ "$uuid" != "" ] || return 1
  [ "$gpu" != "" ] || return 1

  [ "$pod" = "nbody-pod" ] || return 1
  [ "$namespace" = "default" ] || return 1
}

# Wait for Prometheus to scrape the exporter, start the GPU workload pod,
# then check the utilization metric and its Kubernetes labels.
testing::metrics::main() {
  # Prometheus can take a while to pickup the exporter
  with_retry 30 10s query::prom "DCGM_FI_DEV_MEMORY_TEMP"

  kubectl create -f tests/gpu-pod.yaml
  with_retry 30 10s query::pod::phase "nbody-pod" "Running"

  with_retry 10 10s testing::metrics::utilization::increase
  with_retry 10 10s testing::metrics::ensure::kube::labels
}
1.11.x 10 | - 1.12.x 11 | - tip 12 | 13 | # Setting sudo access to false will let Travis CI use containers rather than 14 | # VMs to run the tests. For more details see: 15 | # - http://docs.travis-ci.com/user/workers/container-based-infrastructure/ 16 | # - http://docs.travis-ci.com/user/workers/standard-infrastructure/ 17 | sudo: false 18 | 19 | script: 20 | - make setup 21 | - make test 22 | 23 | notifications: 24 | webhooks: 25 | urls: 26 | - https://webhooks.gitter.im/e/06e3328629952dabe3e0 27 | on_success: change # options: [always|never|change] default: always 28 | on_failure: always # options: [always|never|change] default: always 29 | on_start: never # options: [always|never|change] default: always 30 | -------------------------------------------------------------------------------- /vendor/github.com/Masterminds/semver/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 1.5.0 (2019-09-11) 2 | 3 | ## Added 4 | 5 | - #103: Add basic fuzzing for `NewVersion()` (thanks @jesse-c) 6 | 7 | ## Changed 8 | 9 | - #82: Clarify wildcard meaning in range constraints and update tests for it (thanks @greysteil) 10 | - #83: Clarify caret operator range for pre-1.0.0 dependencies (thanks @greysteil) 11 | - #72: Adding docs comment pointing to vert for a cli 12 | - #71: Update the docs on pre-release comparator handling 13 | - #89: Test with new go versions (thanks @thedevsaddam) 14 | - #87: Added $ to ValidPrerelease for better validation (thanks @jeremycarroll) 15 | 16 | ## Fixed 17 | 18 | - #78: Fix unchecked error in example code (thanks @ravron) 19 | - #70: Fix the handling of pre-releases and the 0.0.0 release edge case 20 | - #97: Fixed copyright file for proper display on GitHub 21 | - #107: Fix handling prerelease when sorting alphanum and num 22 | - #109: Fixed where Validate sometimes returns wrong message on error 23 | 24 | # 1.4.2 (2018-04-10) 25 | 26 | ## Changed 27 | - #72: Updated the docs to point to vert 
for a console appliaction 28 | - #71: Update the docs on pre-release comparator handling 29 | 30 | ## Fixed 31 | - #70: Fix the handling of pre-releases and the 0.0.0 release edge case 32 | 33 | # 1.4.1 (2018-04-02) 34 | 35 | ## Fixed 36 | - Fixed #64: Fix pre-release precedence issue (thanks @uudashr) 37 | 38 | # 1.4.0 (2017-10-04) 39 | 40 | ## Changed 41 | - #61: Update NewVersion to parse ints with a 64bit int size (thanks @zknill) 42 | 43 | # 1.3.1 (2017-07-10) 44 | 45 | ## Fixed 46 | - Fixed #57: number comparisons in prerelease sometimes inaccurate 47 | 48 | # 1.3.0 (2017-05-02) 49 | 50 | ## Added 51 | - #45: Added json (un)marshaling support (thanks @mh-cbon) 52 | - Stability marker. See https://masterminds.github.io/stability/ 53 | 54 | ## Fixed 55 | - #51: Fix handling of single digit tilde constraint (thanks @dgodd) 56 | 57 | ## Changed 58 | - #55: The godoc icon moved from png to svg 59 | 60 | # 1.2.3 (2017-04-03) 61 | 62 | ## Fixed 63 | - #46: Fixed 0.x.x and 0.0.x in constraints being treated as * 64 | 65 | # Release 1.2.2 (2016-12-13) 66 | 67 | ## Fixed 68 | - #34: Fixed issue where hyphen range was not working with pre-release parsing. 69 | 70 | # Release 1.2.1 (2016-11-28) 71 | 72 | ## Fixed 73 | - #24: Fixed edge case issue where constraint "> 0" does not handle "0.0.1-alpha" 74 | properly. 75 | 76 | # Release 1.2.0 (2016-11-04) 77 | 78 | ## Added 79 | - #20: Added MustParse function for versions (thanks @adamreese) 80 | - #15: Added increment methods on versions (thanks @mh-cbon) 81 | 82 | ## Fixed 83 | - Issue #21: Per the SemVer spec (section 9) a pre-release is unstable and 84 | might not satisfy the intended compatibility. The change here ignores pre-releases 85 | on constraint checks (e.g., ~ or ^) when a pre-release is not part of the 86 | constraint. For example, `^1.2.3` will ignore pre-releases while 87 | `^1.2.3-alpha` will include them. 
88 | 89 | # Release 1.1.1 (2016-06-30) 90 | 91 | ## Changed 92 | - Issue #9: Speed up version comparison performance (thanks @sdboyer) 93 | - Issue #8: Added benchmarks (thanks @sdboyer) 94 | - Updated Go Report Card URL to new location 95 | - Updated Readme to add code snippet formatting (thanks @mh-cbon) 96 | - Updating tagging to v[SemVer] structure for compatibility with other tools. 97 | 98 | # Release 1.1.0 (2016-03-11) 99 | 100 | - Issue #2: Implemented validation to provide reasons a versions failed a 101 | constraint. 102 | 103 | # Release 1.0.1 (2015-12-31) 104 | 105 | - Fixed #1: * constraint failing on valid versions. 106 | 107 | # Release 1.0.0 (2015-10-20) 108 | 109 | - Initial release 110 | -------------------------------------------------------------------------------- /vendor/github.com/Masterminds/semver/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014-2019, Matt Butcher and Matt Farina 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /vendor/github.com/Masterminds/semver/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: setup 2 | setup: 3 | go get -u gopkg.in/alecthomas/gometalinter.v1 4 | gometalinter.v1 --install 5 | 6 | .PHONY: test 7 | test: validate lint 8 | @echo "==> Running tests" 9 | go test -v 10 | 11 | .PHONY: validate 12 | validate: 13 | @echo "==> Running static validations" 14 | @gometalinter.v1 \ 15 | --disable-all \ 16 | --enable deadcode \ 17 | --severity deadcode:error \ 18 | --enable gofmt \ 19 | --enable gosimple \ 20 | --enable ineffassign \ 21 | --enable misspell \ 22 | --enable vet \ 23 | --tests \ 24 | --vendor \ 25 | --deadline 60s \ 26 | ./... || exit_code=1 27 | 28 | .PHONY: lint 29 | lint: 30 | @echo "==> Running linters" 31 | @gometalinter.v1 \ 32 | --disable-all \ 33 | --enable golint \ 34 | --vendor \ 35 | --deadline 60s \ 36 | ./... || : 37 | -------------------------------------------------------------------------------- /vendor/github.com/Masterminds/semver/appveyor.yml: -------------------------------------------------------------------------------- 1 | version: build-{build}.{branch} 2 | 3 | clone_folder: C:\gopath\src\github.com\Masterminds\semver 4 | shallow_clone: true 5 | 6 | environment: 7 | GOPATH: C:\gopath 8 | 9 | platform: 10 | - x64 11 | 12 | install: 13 | - go version 14 | - go env 15 | - go get -u gopkg.in/alecthomas/gometalinter.v1 16 | - set PATH=%PATH%;%GOPATH%\bin 17 | - gometalinter.v1.exe --install 18 | 19 | build_script: 20 | - go install -v ./... 
// Collection is a collection of Version instances and implements the sort
// interface. See the sort package for more details.
// https://golang.org/pkg/sort/
type Collection []*Version

// Len returns the length of a collection, i.e. the number of Version
// instances in the slice.
func (c Collection) Len() int {
	return len(c)
}

// Less is needed for the sort interface to compare two Version objects on the
// slice. It reports whether the element at index i is less than the element
// at index j, as decided by Version.LessThan.
func (c Collection) Less(i, j int) bool {
	return c[i].LessThan(c[j])
}

// Swap is needed for the sort interface to replace the Version objects
// at two different positions in the slice.
func (c Collection) Swap(i, j int) {
	c[i], c[j] = c[j], c[i]
}
3 | 4 | Specifically it provides the ability to: 5 | 6 | * Parse semantic versions 7 | * Sort semantic versions 8 | * Check if a semantic version fits within a set of constraints 9 | * Optionally work with a `v` prefix 10 | 11 | Parsing Semantic Versions 12 | 13 | To parse a semantic version use the `NewVersion` function. For example, 14 | 15 | v, err := semver.NewVersion("1.2.3-beta.1+build345") 16 | 17 | If there is an error the version wasn't parseable. The version object has methods 18 | to get the parts of the version, compare it to other versions, convert the 19 | version back into a string, and get the original string. For more details 20 | please see the documentation at https://godoc.org/github.com/Masterminds/semver. 21 | 22 | Sorting Semantic Versions 23 | 24 | A set of versions can be sorted using the `sort` package from the standard library. 25 | For example, 26 | 27 | raw := []string{"1.2.3", "1.0", "1.3", "2", "0.4.2",} 28 | vs := make([]*semver.Version, len(raw)) 29 | for i, r := range raw { 30 | v, err := semver.NewVersion(r) 31 | if err != nil { 32 | t.Errorf("Error parsing version: %s", err) 33 | } 34 | 35 | vs[i] = v 36 | } 37 | 38 | sort.Sort(semver.Collection(vs)) 39 | 40 | Checking Version Constraints 41 | 42 | Checking a version against version constraints is one of the most featureful 43 | parts of the package. 44 | 45 | c, err := semver.NewConstraint(">= 1.2.3") 46 | if err != nil { 47 | // Handle constraint not being parseable. 48 | } 49 | 50 | v, err := semver.NewVersion("1.3") 51 | if err != nil { 52 | // Handle version not being parseable. 53 | } 54 | // Check if the version meets the constraints. The a variable will be true. 55 | a := c.Check(v) 56 | 57 | Basic Comparisons 58 | 59 | There are two elements to the comparisons. First, a comparison string is a list 60 | of comma separated and comparisons. These are then separated by || separated or 61 | comparisons. 
For example, `">= 1.2, < 3.0.0 || >= 4.2.3"` is looking for a 62 | comparison that's greater than or equal to 1.2 and less than 3.0.0 or is 63 | greater than or equal to 4.2.3. 64 | 65 | The basic comparisons are: 66 | 67 | * `=`: equal (aliased to no operator) 68 | * `!=`: not equal 69 | * `>`: greater than 70 | * `<`: less than 71 | * `>=`: greater than or equal to 72 | * `<=`: less than or equal to 73 | 74 | Hyphen Range Comparisons 75 | 76 | There are multiple methods to handle ranges and the first is hyphens ranges. 77 | These look like: 78 | 79 | * `1.2 - 1.4.5` which is equivalent to `>= 1.2, <= 1.4.5` 80 | * `2.3.4 - 4.5` which is equivalent to `>= 2.3.4, <= 4.5` 81 | 82 | Wildcards In Comparisons 83 | 84 | The `x`, `X`, and `*` characters can be used as a wildcard character. This works 85 | for all comparison operators. When used on the `=` operator it falls 86 | back to the pack level comparison (see tilde below). For example, 87 | 88 | * `1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` 89 | * `>= 1.2.x` is equivalent to `>= 1.2.0` 90 | * `<= 2.x` is equivalent to `<= 3` 91 | * `*` is equivalent to `>= 0.0.0` 92 | 93 | Tilde Range Comparisons (Patch) 94 | 95 | The tilde (`~`) comparison operator is for patch level ranges when a minor 96 | version is specified and major level changes when the minor number is missing. 97 | For example, 98 | 99 | * `~1.2.3` is equivalent to `>= 1.2.3, < 1.3.0` 100 | * `~1` is equivalent to `>= 1, < 2` 101 | * `~2.3` is equivalent to `>= 2.3, < 2.4` 102 | * `~1.2.x` is equivalent to `>= 1.2.0, < 1.3.0` 103 | * `~1.x` is equivalent to `>= 1, < 2` 104 | 105 | Caret Range Comparisons (Major) 106 | 107 | The caret (`^`) comparison operator is for major level changes. This is useful 108 | when comparisons of API versions as a major change is API breaking. 
For example, 109 | 110 | * `^1.2.3` is equivalent to `>= 1.2.3, < 2.0.0` 111 | * `^1.2.x` is equivalent to `>= 1.2.0, < 2.0.0` 112 | * `^2.3` is equivalent to `>= 2.3, < 3` 113 | * `^2.x` is equivalent to `>= 2.0.0, < 3` 114 | */ 115 | package semver 116 | -------------------------------------------------------------------------------- /vendor/github.com/Masterminds/semver/version_fuzz.go: -------------------------------------------------------------------------------- 1 | // +build gofuzz 2 | 3 | package semver 4 | 5 | func Fuzz(data []byte) int { 6 | if _, err := NewVersion(string(data)); err != nil { 7 | return 0 8 | } 9 | return 1 10 | } 11 | -------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/api.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "sync" 7 | ) 8 | 9 | var ( 10 | dcgmInitCounter int 11 | mux sync.Mutex 12 | ) 13 | 14 | // Init starts DCGM, based on the user selected mode; the returned cleanup calls Shutdown() and reports a failure on stderr 15 | // DCGM can be started in 3 different modes: 16 | // 1. Embedded: Start hostengine within this process 17 | // 2. Standalone: Connect to an already running nv-hostengine at the specified address 18 | // Connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket" 19 | // 3. StartHostengine: Open a Unix socket to start and connect to the nv-hostengine and terminate before exiting 20 | func Init(m mode, args ...string) (cleanup func(), err error) { 21 | mux.Lock() 22 | if dcgmInitCounter < 0 { // Shutdown() has been called more often than Init() 23 | count := fmt.Sprintf("%d", dcgmInitCounter) 24 | err = fmt.Errorf("Shutdown() is called %s times, before Init()", count[1:]) 25 | } 26 | if dcgmInitCounter == 0 { // only the first outstanding Init() starts DCGM 27 | err = initDcgm(m, args...)
28 | } 29 | dcgmInitCounter += 1 // incremented even when err was set above 30 | mux.Unlock() 31 | 32 | return func() { 33 | if err := Shutdown(); err != nil { 34 | fmt.Fprintf(os.Stderr, "Failed to shutdown DCGM with error: `%v`", err) 35 | } 36 | }, err 37 | } 38 | 39 | // Shutdown stops DCGM and destroys all connections once the last outstanding Init() is released 40 | func Shutdown() (err error) { 41 | mux.Lock() 42 | if dcgmInitCounter <= 0 { 43 | err = fmt.Errorf("Init() needs to be called before Shutdown()") 44 | } 45 | if dcgmInitCounter == 1 { // last outstanding Init(): actually shut DCGM down 46 | err = shutdown() 47 | } 48 | dcgmInitCounter -= 1 // may go negative; Init() reports that case as an error 49 | mux.Unlock() 50 | 51 | return 52 | } 53 | 54 | // GetAllDeviceCount counts all GPUs on the system 55 | func GetAllDeviceCount() (uint, error) { 56 | return getAllDeviceCount() 57 | } 58 | 59 | // GetSupportedDevices returns only DCGM supported GPUs 60 | func GetSupportedDevices() ([]uint, error) { 61 | return getSupportedDevices() 62 | } 63 | 64 | // GetDeviceInfo describes the given device 65 | func GetDeviceInfo(gpuId uint) (Device, error) { 66 | return getDeviceInfo(gpuId) 67 | } 68 | 69 | // GetDeviceStatus monitors GPU status including its power, memory and GPU utilization 70 | func GetDeviceStatus(gpuId uint) (DeviceStatus, error) { 71 | return latestValuesForDevice(gpuId) 72 | } 73 | 74 | // GetDeviceTopology returns device topology corresponding to the gpuId 75 | func GetDeviceTopology(gpuId uint) ([]P2PLink, error) { 76 | return getDeviceTopology(gpuId) 77 | } 78 | 79 | // WatchPidFields lets DCGM start recording stats for GPU process 80 | // It needs to be called before calling GetProcessInfo 81 | func WatchPidFields() (GroupHandle, error) { 82 | return watchPidFields() 83 | } 84 | 85 | // GetProcessInfo provides detailed per GPU stats for the process with the given pid 86 | func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error) { 87 | return getProcessInfo(group, pid) 88 | } 89 | 90 | // HealthCheckByGpuId monitors GPU health for any errors/failures/warnings 91 | func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) { 92 | return
healthCheckByGpuId(gpuId) 93 | } 94 | 95 | // Policy sets GPU usage and error policies and notifies in case of any violations via callback functions 96 | func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error) { 97 | return registerPolicy(gpuId, typ...) 98 | } 99 | 100 | // Introspect returns DCGM hostengine memory and CPU usage 101 | func Introspect() (DcgmStatus, error) { 102 | return introspect() 103 | } 104 | 105 | // GetSupportedMetricGroups returns all of the profiling metric groups for a given GPU group. 106 | func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error) { 107 | return getSupportedMetricGroups(grpid) 108 | } 109 | -------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/bcast.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | ) 7 | 8 | type publisher struct { 9 | publish chan interface{} 10 | close chan bool 11 | subscribers []*subscriber 12 | subscriberLock sync.Mutex 13 | } 14 | 15 | type subscriber struct { 16 | read chan interface{} 17 | close chan bool 18 | } 19 | 20 | func newPublisher() *publisher { 21 | pub := &publisher{ 22 | publish: make(chan interface{}), 23 | close: make(chan bool), 24 | } 25 | return pub 26 | } 27 | // subscriberList returns a snapshot copy of the current subscribers that stays valid after the lock is released. 28 | func (p *publisher) subscriberList() []*subscriber { 29 | p.subscriberLock.Lock() 30 | defer p.subscriberLock.Unlock() 31 | return append([]*subscriber(nil), p.subscribers...) // real copy: p.subscribers[:] shared the backing array that remove() rewrites in place 32 | } 33 | 34 | func (p *publisher) add() *subscriber { 35 | p.subscriberLock.Lock() 36 | defer p.subscriberLock.Unlock() 37 | newSub := &subscriber{ 38 | read: make(chan interface{}), 39 | close: make(chan bool), 40 | } 41 | p.subscribers = append(p.subscribers, newSub) 42 | return newSub 43 | } 44 | 45 | func (p *publisher) remove(leaving *subscriber) error { 46 | p.subscriberLock.Lock() 47 | defer p.subscriberLock.Unlock() 48 | subscriberIndex := -1 49 | for i, sub := range
p.subscribers { 50 | if sub == leaving { 51 | subscriberIndex = i 52 | break 53 | } 54 | } 55 | if subscriberIndex == -1 { 56 | return fmt.Errorf("Could not find subscriber") 57 | } 58 | go func() { leaving.close <- true }() 59 | p.subscribers = append(p.subscribers[:subscriberIndex], p.subscribers[subscriberIndex+1:]...) 60 | return nil 61 | } 62 | 63 | func (p *publisher) send(val interface{}) { 64 | p.publish <- val 65 | } 66 | 67 | func (p *publisher) broadcast() { 68 | for { 69 | select { 70 | case publishing := <-p.publish: 71 | for _, sub := range p.subscriberList() { 72 | go func(s *subscriber, val interface{}) { 73 | s.read <- val 74 | }(sub, publishing) 75 | } 76 | case <-p.close: 77 | return 78 | } 79 | } 80 | } 81 | 82 | func (p *publisher) closePublisher() { 83 | p.close <- true 84 | } 85 | -------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/callback.c: -------------------------------------------------------------------------------- 1 | int violationNotify(void* p) { 2 | int ViolationRegistration(void*); 3 | return ViolationRegistration(p); 4 | } 5 | -------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/device_status.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "./dcgm_agent.h" 5 | #include "./dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "math/rand" 11 | ) 12 | 13 | type PerfState uint 14 | 15 | const ( 16 | PerfStateMax = 0 17 | PerfStateMin = 15 18 | PerfStateUnknown = 32 19 | ) 20 | 21 | func (p PerfState) String() string { 22 | if p >= PerfStateMax && p <= PerfStateMin { 23 | return fmt.Sprintf("P%d", p) 24 | } 25 | return "Unknown" 26 | } 27 | 28 | type UtilizationInfo struct { 29 | GPU int64 // % 30 | Memory int64 // % 31 | Encoder int64 // % 32 | Decoder int64 // % 
33 | } 34 | 35 | type ECCErrorsInfo struct { 36 | SingleBit int64 37 | DoubleBit int64 38 | } 39 | 40 | type MemoryInfo struct { 41 | GlobalUsed int64 42 | ECCErrors ECCErrorsInfo 43 | } 44 | 45 | type ClockInfo struct { 46 | Cores int64 // MHz 47 | Memory int64 // MHz 48 | } 49 | 50 | type PCIThroughputInfo struct { 51 | Rx int64 // MB 52 | Tx int64 // MB 53 | Replays int64 54 | } 55 | 56 | type PCIStatusInfo struct { 57 | BAR1Used int64 // MB 58 | Throughput PCIThroughputInfo 59 | FBUsed int64 60 | } 61 | 62 | type DeviceStatus struct { 63 | Power float64 // W 64 | Temperature int64 // °C 65 | Utilization UtilizationInfo 66 | Memory MemoryInfo 67 | Clocks ClockInfo 68 | PCI PCIStatusInfo 69 | Performance PerfState 70 | FanSpeed int64 // % 71 | } 72 | 73 | func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) { 74 | const ( 75 | pwr int = iota 76 | temp 77 | sm 78 | mem 79 | enc 80 | dec 81 | smClock 82 | memClock 83 | bar1Used 84 | pcieRxThroughput 85 | pcieTxThroughput 86 | pcieReplay 87 | fbUsed 88 | sbe 89 | dbe 90 | pstate 91 | fanSpeed 92 | fieldsCount 93 | ) 94 | 95 | deviceFields := make([]Short, fieldsCount) 96 | deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE 97 | deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP 98 | deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL 99 | deviceFields[mem] = C.DCGM_FI_DEV_MEM_COPY_UTIL 100 | deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL 101 | deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL 102 | deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK 103 | deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK 104 | deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED 105 | deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT 106 | deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT 107 | deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER 108 | deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED 109 | deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 110 | deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 111 | 
deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE 112 | deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED 113 | 114 | fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64()) 115 | fieldsId, err := FieldGroupCreate(fieldsName, deviceFields) 116 | if err != nil { 117 | return 118 | } 119 | 120 | groupName := fmt.Sprintf("devStatus%d", rand.Uint64()) 121 | groupId, err := WatchFields(gpuId, fieldsId, groupName) 122 | if err != nil { 123 | _ = FieldGroupDestroy(fieldsId) 124 | return 125 | } 126 | 127 | values, err := GetLatestValuesForFields(gpuId, deviceFields) 128 | if err != nil { 129 | _ = FieldGroupDestroy(fieldsId) 130 | _ = DestroyGroup(groupId) 131 | return status, fmt.Errorf("Error getting device status: %s", err) 132 | } 133 | 134 | power := values[pwr].Float64() 135 | 136 | gpuUtil := UtilizationInfo{ 137 | GPU: values[sm].Int64(), 138 | Memory: values[mem].Int64(), 139 | Encoder: values[enc].Int64(), 140 | Decoder: values[dec].Int64(), 141 | } 142 | 143 | memory := MemoryInfo{ 144 | ECCErrors: ECCErrorsInfo{ 145 | SingleBit: values[sbe].Int64(), 146 | DoubleBit: values[dbe].Int64(), 147 | }, 148 | } 149 | 150 | clocks := ClockInfo{ 151 | Cores: values[smClock].Int64(), 152 | Memory: values[memClock].Int64(), 153 | } 154 | 155 | pci := PCIStatusInfo{ 156 | BAR1Used: values[bar1Used].Int64(), 157 | Throughput: PCIThroughputInfo{ 158 | Rx: values[pcieRxThroughput].Int64(), 159 | Tx: values[pcieTxThroughput].Int64(), 160 | Replays: values[pcieReplay].Int64(), 161 | }, 162 | FBUsed: values[fbUsed].Int64(), 163 | } 164 | 165 | status = DeviceStatus{ 166 | Power: power, 167 | Temperature: values[temp].Int64(), 168 | Utilization: gpuUtil, 169 | Memory: memory, 170 | Clocks: clocks, 171 | PCI: pci, 172 | Performance: PerfState(values[pstate].Int64()), 173 | FanSpeed: values[fanSpeed].Int64(), 174 | } 175 | 176 | _ = FieldGroupDestroy(fieldsId) 177 | _ = DestroyGroup(groupId) 178 | return 179 | } 180 | 
-------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm 2 | 3 | go 1.14 4 | -------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/gpu_group.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | ) 11 | 12 | type GroupHandle struct{ handle C.dcgmGpuGrp_t } 13 | 14 | func CreateGroup(groupName string) (goGroupId GroupHandle, err error) { 15 | var cGroupId C.dcgmGpuGrp_t 16 | cname := C.CString(groupName) 17 | defer freeCString(cname) 18 | 19 | result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_EMPTY, cname, &cGroupId) 20 | if err = errorString(result); err != nil { 21 | return goGroupId, fmt.Errorf("Error creating group: %s", err) 22 | } 23 | 24 | goGroupId = GroupHandle{cGroupId} 25 | return 26 | } 27 | 28 | func NewDefaultGroup(groupName string) (GroupHandle, error) { 29 | var cGroupId C.dcgmGpuGrp_t 30 | 31 | cname := C.CString(groupName) 32 | defer freeCString(cname) 33 | 34 | result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_DEFAULT, cname, &cGroupId) 35 | if err := errorString(result); err != nil { 36 | return GroupHandle{}, fmt.Errorf("Error creating group: %s", err) 37 | } 38 | 39 | return GroupHandle{cGroupId}, nil 40 | } 41 | 42 | func AddToGroup(groupId GroupHandle, gpuId uint) (err error) { 43 | result := C.dcgmGroupAddDevice(handle.handle, groupId.handle, C.uint(gpuId)) 44 | if err = errorString(result); err != nil { 45 | return fmt.Errorf("Error adding GPU %v to group: %s", gpuId, err) 46 | } 47 | 48 | return 49 | } 50 | 51 | func 
AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error) { 52 | result := C.dcgmGroupAddEntity(handle.handle, groupId.handle, C.dcgm_field_entity_group_t(entityGroupId), C.uint(entityId)) 53 | if err = errorString(result); err != nil { 54 | return fmt.Errorf("Error adding entity group type %v, entity %v to group: %s", entityGroupId, entityId, err) 55 | } 56 | 57 | return 58 | } 59 | 60 | func DestroyGroup(groupId GroupHandle) (err error) { 61 | result := C.dcgmGroupDestroy(handle.handle, groupId.handle) 62 | if err = errorString(result); err != nil { 63 | return fmt.Errorf("Error destroying group: %s", err) 64 | } 65 | 66 | return 67 | } 68 | -------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/health.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "math/rand" 11 | "unsafe" 12 | ) 13 | 14 | type SystemWatch struct { 15 | Type string 16 | Status string 17 | Error string 18 | } 19 | 20 | type DeviceHealth struct { 21 | GPU uint 22 | Status string 23 | Watches []SystemWatch 24 | } 25 | 26 | func setHealthWatches(groupId GroupHandle) (err error) { 27 | result := C.dcgmHealthSet(handle.handle, groupId.handle, C.DCGM_HEALTH_WATCH_ALL) 28 | if err = errorString(result); err != nil { 29 | return fmt.Errorf("Error setting health watches: %s", err) 30 | } 31 | return 32 | } 33 | 34 | func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { 35 | name := fmt.Sprintf("health%d", rand.Uint64()) 36 | groupId, err := CreateGroup(name) 37 | if err != nil { 38 | return 39 | } 40 | 41 | err = AddToGroup(groupId, gpuId) 42 | if err != nil { 43 | return 44 | } 45 | 46 | err = setHealthWatches(groupId) 47 | if err != nil { 48 | return 49 | } 50 | 51 | var 
healthResults C.dcgmHealthResponse_v4 52 | healthResults.version = makeVersion2(unsafe.Sizeof(healthResults)) 53 | 54 | result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) 55 | 56 | if err = errorString(result); err != nil { 57 | return deviceHealth, fmt.Errorf("Error checking GPU health: %s", err) 58 | } 59 | 60 | status := healthStatus(int8(healthResults.overallHealth)) 61 | watches := []SystemWatch{} 62 | 63 | // number of watches that encountred error/warning 64 | incidents := uint(healthResults.incidentCount) 65 | 66 | for j := uint(0); j < incidents; j++ { 67 | watch := SystemWatch{ 68 | Type: systemWatch(int(healthResults.incidents[j].system)), 69 | Status: healthStatus(int8(healthResults.incidents[j].health)), 70 | 71 | Error: *stringPtr(&healthResults.incidents[j].error.msg[0]), 72 | } 73 | watches = append(watches, watch) 74 | } 75 | 76 | deviceHealth = DeviceHealth{ 77 | GPU: gpuId, 78 | Status: status, 79 | Watches: watches, 80 | } 81 | _ = DestroyGroup(groupId) 82 | return 83 | } 84 | 85 | func healthStatus(status int8) string { 86 | switch status { 87 | case 0: 88 | return "Healthy" 89 | case 10: 90 | return "Warning" 91 | case 20: 92 | return "Failure" 93 | } 94 | return "N/A" 95 | } 96 | 97 | func systemWatch(watch int) string { 98 | switch watch { 99 | case 1: 100 | return "PCIe watches" 101 | case 2: 102 | return "NVLINK watches" 103 | case 4: 104 | return "Power Managemnt unit watches" 105 | case 8: 106 | return "Microcontroller unit watches" 107 | case 16: 108 | return "Memory watches" 109 | case 32: 110 | return "Streaming Multiprocessor watches" 111 | case 64: 112 | return "Inforom watches" 113 | case 128: 114 | return "Temperature watches" 115 | case 256: 116 | return "Power watches" 117 | case 512: 118 | return "Driver-related watches" 119 | } 120 | return "N/A" 121 | } 122 | -------------------------------------------------------------------------------- 
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/hostengine_status.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "unsafe" 11 | ) 12 | 13 | type DcgmStatus struct { 14 | Memory int64 15 | CPU float64 16 | } 17 | 18 | func introspect() (engine DcgmStatus, err error) { 19 | enableIntrospect := C.dcgmIntrospectState_t(1) 20 | result := C.dcgmIntrospectToggleState(handle.handle, enableIntrospect) 21 | 22 | if err = errorString(result); err != nil { 23 | return engine, fmt.Errorf("Error enabling DCGM introspection: %s", err) 24 | } 25 | 26 | var memory C.dcgmIntrospectMemory_t 27 | memory.version = makeVersion2(unsafe.Sizeof(memory)) 28 | waitIfNoData := 1 29 | result = C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData)) 30 | 31 | if err = errorString(result); err != nil { 32 | return engine, fmt.Errorf("Error getting memory usage of hostengine: %s", err) 33 | } 34 | 35 | var cpu C.dcgmIntrospectCpuUtil_t 36 | 37 | cpu.version = makeVersion2(unsafe.Sizeof(cpu)) 38 | result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData)) 39 | 40 | if err = errorString(result); err != nil { 41 | return engine, fmt.Errorf("Error getting cpu usage of hostengine: %s", err) 42 | } 43 | 44 | engine = DcgmStatus{ 45 | Memory: toInt64(memory.bytesUsed) / 1024, 46 | CPU: *dblToFloat(cpu.total) * 100, 47 | } 48 | return 49 | } 50 | -------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/mig.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "./dcgm_agent.h" 5 | #include "./dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "unsafe" 11 | ) 12 | 13 | type 
Field_Entity_Group uint 14 | 15 | const ( 16 | FE_NONE Field_Entity_Group = iota 17 | FE_GPU 18 | FE_VGPU 19 | FE_SWITCH 20 | FE_GPU_I 21 | FE_GPU_CI 22 | FE_COUNT 23 | ) 24 | 25 | type GroupEntityPair struct { 26 | EntityGroupId Field_Entity_Group 27 | EntityId uint 28 | } 29 | 30 | type MigEntityInfo struct { 31 | GpuUuid string 32 | NvmlGpuIndex uint 33 | NvmlInstanceId uint 34 | NvmlComputeInstanceId uint 35 | NvmlMigProfileId uint 36 | NvmlProfileSlices uint 37 | } 38 | 39 | type MigHierarchyInfo_v2 struct { 40 | Entity GroupEntityPair 41 | Parent GroupEntityPair 42 | Info MigEntityInfo 43 | } 44 | 45 | const ( 46 | MAX_NUM_DEVICES uint = C.DCGM_MAX_NUM_DEVICES 47 | MAX_HIERARCHY_INFO uint = C.DCGM_MAX_HIERARCHY_INFO 48 | ) 49 | 50 | type MigHierarchy_v2 struct { 51 | Version uint 52 | Count uint 53 | EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 54 | } 55 | 56 | func GetGpuInstanceHierarchy() (hierarchy MigHierarchy_v2, err error) { 57 | var c_hierarchy C.dcgmMigHierarchy_v2 58 | c_hierarchy.version = C.dcgmMigHierarchy_version2 59 | ptr_hierarchy := (*C.dcgmMigHierarchy_v2)(unsafe.Pointer(&c_hierarchy)) 60 | result := C.dcgmGetGpuInstanceHierarchy(handle.handle, ptr_hierarchy) 61 | 62 | if err = errorString(result); err != nil { 63 | return toMigHierarchy(c_hierarchy), fmt.Errorf("Error retrieving DCGM MIG hierarchy: %s", err) 64 | } 65 | 66 | return toMigHierarchy(c_hierarchy), nil 67 | } 68 | 69 | func toMigHierarchy(c_hierarchy C.dcgmMigHierarchy_v2) MigHierarchy_v2 { 70 | var hierarchy MigHierarchy_v2 71 | hierarchy.Version = uint(c_hierarchy.version) 72 | hierarchy.Count = uint(c_hierarchy.count) 73 | for i := uint(0); i < hierarchy.Count; i++ { 74 | hierarchy.EntityList[i] = MigHierarchyInfo_v2{ 75 | Entity: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].entity.entityGroupId), uint(c_hierarchy.entityList[i].entity.entityId)}, 76 | Parent: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].parent.entityGroupId), 
uint(c_hierarchy.entityList[i].parent.entityId)}, 77 | Info: MigEntityInfo{ 78 | GpuUuid: *stringPtr(&c_hierarchy.entityList[i].info.gpuUuid[0]), 79 | NvmlGpuIndex: uint(c_hierarchy.entityList[i].info.nvmlGpuIndex), 80 | NvmlInstanceId: uint(c_hierarchy.entityList[i].info.nvmlInstanceId), 81 | NvmlComputeInstanceId: uint(c_hierarchy.entityList[i].info.nvmlComputeInstanceId), 82 | NvmlMigProfileId: uint(c_hierarchy.entityList[i].info.nvmlMigProfileId), 83 | NvmlProfileSlices: uint(c_hierarchy.entityList[i].info.nvmlProfileSlices), 84 | }, 85 | } 86 | } 87 | 88 | return hierarchy 89 | } 90 | -------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/profile.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "unsafe" 11 | ) 12 | 13 | type MetricGroup struct { 14 | major uint 15 | minor uint 16 | fieldIds []uint 17 | } 18 | 19 | func getSupportedMetricGroups(grpid uint) (groups []MetricGroup, err error) { 20 | 21 | var groupInfo C.dcgmProfGetMetricGroups_t 22 | groupInfo.version = makeVersion2(unsafe.Sizeof(groupInfo)) 23 | groupInfo.groupId = C.ulong(grpid) 24 | 25 | result := C.dcgmProfGetSupportedMetricGroups(handle.handle, &groupInfo) 26 | 27 | if err = errorString(result); err != nil { 28 | return groups, fmt.Errorf("Error getting supported metrics: %s", err) 29 | } 30 | 31 | var count = uint(groupInfo.numMetricGroups) 32 | 33 | for i := uint(0); i < count; i++ { 34 | var group MetricGroup 35 | group.major = uint(groupInfo.metricGroups[i].majorId) 36 | group.minor = uint(groupInfo.metricGroups[i].minorId) 37 | 38 | var fieldCount = uint(groupInfo.metricGroups[i].numFieldIds) 39 | 40 | for j := uint(0); j < fieldCount; j++ { 41 | group.fieldIds = append(group.fieldIds, uint(groupInfo.metricGroups[i].fieldIds[j])) 
42 | } 43 | groups = append(groups, group) 44 | } 45 | 46 | return groups, nil 47 | } 48 | -------------------------------------------------------------------------------- /vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/topology.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | import ( 9 | "fmt" 10 | "io/ioutil" 11 | "strings" 12 | "unsafe" 13 | ) 14 | 15 | type P2PLinkType uint 16 | 17 | const ( 18 | P2PLinkUnknown P2PLinkType = iota 19 | P2PLinkCrossCPU 20 | P2PLinkSameCPU 21 | P2PLinkHostBridge 22 | P2PLinkMultiSwitch 23 | P2PLinkSingleSwitch 24 | P2PLinkSameBoard 25 | SingleNVLINKLink 26 | TwoNVLINKLinks 27 | ThreeNVLINKLinks 28 | FourNVLINKLinks 29 | ) 30 | 31 | func (l P2PLinkType) PCIPaths() string { 32 | switch l { 33 | case P2PLinkSameBoard: 34 | return "PSB" 35 | case P2PLinkSingleSwitch: 36 | return "PIX" 37 | case P2PLinkMultiSwitch: 38 | return "PXB" 39 | case P2PLinkHostBridge: 40 | return "PHB" 41 | case P2PLinkSameCPU: 42 | return "NODE" 43 | case P2PLinkCrossCPU: 44 | return "SYS" 45 | case SingleNVLINKLink: 46 | return "NV1" 47 | case TwoNVLINKLinks: 48 | return "NV2" 49 | case ThreeNVLINKLinks: 50 | return "NV3" 51 | case FourNVLINKLinks: 52 | return "NV4" 53 | case P2PLinkUnknown: 54 | } 55 | return "N/A" 56 | } 57 | 58 | type P2PLink struct { 59 | GPU uint 60 | BusID string 61 | Link P2PLinkType 62 | } 63 | 64 | func getP2PLink(path uint) P2PLinkType { 65 | switch path { 66 | case C.DCGM_TOPOLOGY_BOARD: 67 | return P2PLinkSameBoard 68 | case C.DCGM_TOPOLOGY_SINGLE: 69 | return P2PLinkSingleSwitch 70 | case C.DCGM_TOPOLOGY_MULTIPLE: 71 | return P2PLinkMultiSwitch 72 | case C.DCGM_TOPOLOGY_HOSTBRIDGE: 73 | return P2PLinkHostBridge 74 | case C.DCGM_TOPOLOGY_CPU: 75 | return P2PLinkSameCPU 76 | case C.DCGM_TOPOLOGY_SYSTEM: 77 | return P2PLinkCrossCPU 78 | case C.DCGM_TOPOLOGY_NVLINK1: 79 
| return SingleNVLINKLink 80 | case C.DCGM_TOPOLOGY_NVLINK2: 81 | return TwoNVLINKLinks 82 | case C.DCGM_TOPOLOGY_NVLINK3: 83 | return ThreeNVLINKLinks 84 | case C.DCGM_TOPOLOGY_NVLINK4: 85 | return FourNVLINKLinks 86 | } 87 | return P2PLinkUnknown 88 | } 89 | 90 | // getCPUAffinity returns the contents of /sys/bus/pci/devices/<id>/local_cpulist without the trailing newline, 91 | // where <id> is busid lower-cased with its first four characters removed. 92 | func getCPUAffinity(busid string) (string, error) { 93 | // Guard the busid[4:] slice below: a bus id shorter than four characters used to panic. 94 | if len(busid) < 4 { 95 | return "", fmt.Errorf("Error getting device cpu affinity: invalid bus id %q", busid) 96 | } 97 | b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/local_cpulist", strings.ToLower(busid[4:]))) 98 | if err != nil { 99 | return "", fmt.Errorf("Error getting device cpu affinity: %v", err) 100 | } 101 | return strings.TrimSuffix(string(b), "\n"), nil 102 | } 103 | 104 | // getBusid returns the PCI bus id string DCGM reports in the attributes of the given GPU. 105 | func getBusid(gpuid uint) (string, error) { 106 | var device C.dcgmDeviceAttributes_t 107 | device.version = makeVersion2(unsafe.Sizeof(device)) 108 | 109 | result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) 110 | if err := errorString(result); err != nil { 111 | return "", fmt.Errorf("Error getting device busid: %s", err) 112 | } 113 | return *stringPtr(&device.identifiers.pciBusId[0]), nil 114 | } 115 | 116 | // getDeviceTopology lists the P2P links DCGM reports for gpuid; DCGM_ST_NOT_SUPPORTED yields an empty list and no error. Every link carries the bus id of gpuid itself. 117 | func getDeviceTopology(gpuid uint) (links []P2PLink, err error) { 118 | var topology C.dcgmDeviceTopology_t 119 | topology.version = makeVersion2(unsafe.Sizeof(topology)) 120 | 121 | result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology) 122 | if result == C.DCGM_ST_NOT_SUPPORTED { 123 | return links, nil 124 | } 125 | if result != C.DCGM_ST_OK { 126 | return links, fmt.Errorf("Error getting device topology: %s", errorString(result)) 127 | } 128 | 129 | busid, err := getBusid(gpuid) 130 | if err != nil { 131 | return 132 | } 133 | 134 | for i := uint(0); i < uint(topology.numGpus); i++ { 135 | gpu := topology.gpuPaths[i].gpuId 136 | p2pLink := P2PLink{ 137 | GPU: uint(gpu), 138 | BusID: busid, 139 | Link: getP2PLink(uint(topology.gpuPaths[i].path)), 140 | } 141 | links = append(links, p2pLink) 142 | } 143 | return 144 | } 145 | --------------------------------------------------------------------------------
/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm/utils.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "stdlib.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "fmt" 11 | "math" 12 | "unsafe" 13 | ) 14 | 15 | const ( 16 | dcgmInt32Blank = 0x7ffffff0 // 2147483632 17 | dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792 18 | ) 19 | 20 | func uintPtr(c C.uint) *uint { 21 | i := uint(c) 22 | return &i 23 | } 24 | 25 | func uintPtrInt(c C.int) *uint { 26 | i := uint(c) 27 | return &i 28 | } 29 | 30 | func uintPtrUnsafe(p unsafe.Pointer) *uint { 31 | if p == nil { 32 | return nil 33 | } 34 | uintP := (*uint)(unsafe.Pointer(p)) 35 | val := *uintP 36 | return &val 37 | } 38 | 39 | func uint64Ptr(c C.longlong) *uint64 { 40 | i := uint64(c) 41 | return &i 42 | } 43 | 44 | func int64Ptr(c C.longlong) *int64 { 45 | i := int64(c) 46 | return &i 47 | } 48 | 49 | func uint64PtrUint(c C.uint) *uint64 { 50 | i := uint64(c) 51 | return &i 52 | } 53 | 54 | func uint64PtrUnsafe(p unsafe.Pointer) *uint64 { 55 | if p == nil { 56 | return nil 57 | } 58 | uintP := (*uint64)(unsafe.Pointer(p)) 59 | val := *uintP 60 | return &val 61 | } 62 | 63 | func toInt64(c C.longlong) int64 { 64 | i := int64(c) 65 | return i 66 | } 67 | 68 | func dblToUint(val C.double) *uint { 69 | i := uint(val) 70 | return &i 71 | } 72 | 73 | func dblToFloat(val C.double) *float64 { 74 | i := float64(val) 75 | return &i 76 | } 77 | 78 | func dblToFloatUnsafe(val unsafe.Pointer) *float64 { 79 | if val == nil { 80 | return nil 81 | } 82 | dblP := (*C.double)(unsafe.Pointer(val)) 83 | floatP := float64(*dblP) 84 | return &floatP 85 | } 86 | 87 | func stringPtr(c *C.char) *string { 88 | s := C.GoString(c) 89 | return &s 90 | } 91 | 92 | func errorString(result C.dcgmReturn_t) error { 93 | if result == C.DCGM_ST_OK { 94 | return nil 95 | } 96 | err := C.GoString(C.errorString(result)) 97 | 
return fmt.Errorf("%v", err) 98 | } 99 | 100 | func freeCString(cStr *C.char) { 101 | C.free(unsafe.Pointer(cStr)) 102 | } 103 | 104 | func IsInt32Blank(value int) bool { 105 | if value >= dcgmInt32Blank { 106 | return true 107 | } 108 | return false 109 | } 110 | 111 | func IsInt64Blank(value int64) bool { 112 | if value >= dcgmInt64Blank { 113 | return true 114 | } 115 | return false 116 | } 117 | 118 | func blank64(val *int64) *int64 { 119 | if val != nil && IsInt64Blank(*val) { 120 | return nil 121 | } 122 | return val 123 | } 124 | 125 | func blank32(val *uint) *uint { 126 | if val != nil && IsInt32Blank(int(*val)) { 127 | return nil 128 | } 129 | return val 130 | } 131 | 132 | func makeVersion1(struct_type uintptr) C.uint { 133 | version := C.uint(struct_type | 1<<24) 134 | return version 135 | } 136 | 137 | func makeVersion2(struct_type uintptr) C.uint { 138 | version := C.uint(struct_type | 2<<24) 139 | return version 140 | } 141 | 142 | func roundFloat(f *float64) *float64 { 143 | var val float64 144 | if f != nil { 145 | val = math.Round(*f) 146 | } 147 | return &val 148 | } 149 | -------------------------------------------------------------------------------- /vendor/github.com/gorilla/mux/AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the official list of gorilla/mux authors for copyright purposes. 2 | # 3 | # Please keep the list sorted. 4 | 5 | Google LLC (https://opensource.google.com/) 6 | Kamil Kisielk 7 | Matt Silverlock 8 | Rodrigo Moraes (https://github.com/moraes) 9 | -------------------------------------------------------------------------------- /vendor/github.com/gorilla/mux/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2018 The Gorilla Authors. All rights reserved. 
2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /vendor/github.com/gorilla/mux/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/gorilla/mux 2 | 3 | go 1.12 4 | -------------------------------------------------------------------------------- /vendor/github.com/gorilla/mux/middleware.go: -------------------------------------------------------------------------------- 1 | package mux 2 | 3 | import ( 4 | "net/http" 5 | "strings" 6 | ) 7 | 8 | // MiddlewareFunc is a function which receives an http.Handler and returns another http.Handler. 9 | // Typically, the returned handler is a closure which does something with the http.ResponseWriter and http.Request passed 10 | // to it, and then calls the handler passed as parameter to the MiddlewareFunc. 11 | type MiddlewareFunc func(http.Handler) http.Handler 12 | 13 | // middleware interface is anything which implements a MiddlewareFunc named Middleware. 14 | type middleware interface { 15 | Middleware(handler http.Handler) http.Handler 16 | } 17 | 18 | // Middleware allows MiddlewareFunc to implement the middleware interface. 19 | func (mw MiddlewareFunc) Middleware(handler http.Handler) http.Handler { 20 | return mw(handler) 21 | } 22 | 23 | // Use appends a MiddlewareFunc to the chain. Middleware can be used to intercept or otherwise modify requests and/or responses, and are executed in the order that they are applied to the Router. 24 | func (r *Router) Use(mwf ...MiddlewareFunc) { 25 | for _, fn := range mwf { 26 | r.middlewares = append(r.middlewares, fn) 27 | } 28 | } 29 | 30 | // useInterface appends a middleware to the chain. Middleware can be used to intercept or otherwise modify requests and/or responses, and are executed in the order that they are applied to the Router. 
31 | func (r *Router) useInterface(mw middleware) { 32 | r.middlewares = append(r.middlewares, mw) 33 | } 34 | 35 | // CORSMethodMiddleware automatically sets the Access-Control-Allow-Methods response header 36 | // on requests for routes that have an OPTIONS method matcher to all the method matchers on 37 | // the route. Routes that do not explicitly handle OPTIONS requests will not be processed 38 | // by the middleware. See examples for usage. 39 | func CORSMethodMiddleware(r *Router) MiddlewareFunc { 40 | return func(next http.Handler) http.Handler { 41 | return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { 42 | allMethods, err := getAllMethodsForRoute(r, req) 43 | if err == nil { 44 | for _, v := range allMethods { 45 | if v == http.MethodOptions { 46 | w.Header().Set("Access-Control-Allow-Methods", strings.Join(allMethods, ",")) 47 | } 48 | } 49 | } 50 | 51 | next.ServeHTTP(w, req) 52 | }) 53 | } 54 | } 55 | 56 | // getAllMethodsForRoute returns all the methods from method matchers matching a given 57 | // request. 58 | func getAllMethodsForRoute(r *Router, req *http.Request) ([]string, error) { 59 | var allMethods []string 60 | 61 | for _, route := range r.routes { 62 | var match RouteMatch 63 | if route.Match(req, &match) || match.MatchErr == ErrMethodMismatch { 64 | methods, err := route.GetMethods() 65 | if err != nil { 66 | return nil, err 67 | } 68 | 69 | allMethods = append(allMethods, methods...) 70 | } 71 | } 72 | 73 | return allMethods, nil 74 | } 75 | -------------------------------------------------------------------------------- /vendor/github.com/gorilla/mux/test_helpers.go: -------------------------------------------------------------------------------- 1 | // Copyright 2012 The Gorilla Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package mux 6 | 7 | import "net/http" 8 | 9 | // SetURLVars sets the URL variables for the given request, to be accessed via 10 | // mux.Vars for testing route behaviour. Arguments are not modified, a shallow 11 | // copy is returned. 12 | // 13 | // This API should only be used for testing purposes; it provides a way to 14 | // inject variables into the request context. Alternatively, URL variables 15 | // can be set by making a route that captures the required variables, 16 | // starting a server and sending the request to that server. 17 | func SetURLVars(r *http.Request, val map[string]string) *http.Request { 18 | return requestWithVars(r, val) 19 | } 20 | -------------------------------------------------------------------------------- /vendor/modules.txt: -------------------------------------------------------------------------------- 1 | # github.com/Masterminds/semver v1.5.0 2 | ## explicit 3 | github.com/Masterminds/semver 4 | # github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm v0.0.0-20210325210537-29b4f1784f18 => ./bindings/go/dcgm 5 | ## explicit 6 | github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm 7 | # github.com/gorilla/mux v1.7.4 8 | ## explicit 9 | github.com/gorilla/mux 10 | # github.com/NVIDIA/gpu-monitoring-tools/bindings/go/dcgm => ./bindings/go/dcgm 11 | # k8s.io/api => k8s.io/api v0.20.2 12 | # k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.20.2 13 | # k8s.io/apimachinery => k8s.io/apimachinery v0.20.2 14 | # k8s.io/apiserver => k8s.io/apiserver v0.20.2 15 | # k8s.io/cli-runtime => k8s.io/cli-runtime v0.20.2 16 | # k8s.io/client-go => k8s.io/client-go v0.20.2 17 | # k8s.io/cloud-provider => k8s.io/cloud-provider v0.20.2 18 | # k8s.io/cluster-bootstrap => k8s.io/cluster-bootstrap v0.20.2 19 | # k8s.io/code-generator => k8s.io/code-generator v0.20.2 20 | # k8s.io/component-base => k8s.io/component-base v0.20.2 21 | # k8s.io/cri-api => k8s.io/cri-api v0.20.2 22 | # k8s.io/csi-translation-lib => 
k8s.io/csi-translation-lib v0.20.2 23 | # k8s.io/kube-aggregator => k8s.io/kube-aggregator v0.20.2 24 | # k8s.io/kube-controller-manager => k8s.io/kube-controller-manager v0.20.2 25 | # k8s.io/kube-proxy => k8s.io/kube-proxy v0.20.2 26 | # k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.20.2 27 | # k8s.io/kubectl => k8s.io/kubectl v0.20.2 28 | # k8s.io/kubelet => k8s.io/kubelet v0.20.2 29 | # k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.20.2 30 | # k8s.io/metrics => k8s.io/metrics v0.20.2 31 | # k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.20.2 32 | --------------------------------------------------------------------------------