├── .gitmodules ├── staticcheck.conf ├── .envrc ├── hack ├── VERSION └── header.txt ├── internal ├── pkg │ ├── os │ │ ├── README.md │ │ └── os.go │ ├── exec │ │ ├── README.md │ │ └── exec.go │ ├── elf │ │ ├── README.md │ │ ├── elf.go │ │ └── types.go │ ├── devicemonitoring │ │ ├── const.go │ │ └── types.go │ ├── prerequisites │ │ ├── types.go │ │ ├── validation.go │ │ ├── variables.go │ │ ├── dcgmlib_rule.go │ │ └── validation_test.go │ ├── testutils │ │ ├── types.go │ │ └── const.go │ ├── devicewatcher │ │ ├── variables.go │ │ ├── const.go │ │ └── types.go │ ├── collector │ │ ├── variables.go │ │ ├── const.go │ │ ├── xid_collector.go │ │ └── base_collector.go │ ├── transformation │ │ ├── variables.go │ │ ├── transformer.go │ │ ├── const.go │ │ ├── transformer_test.go │ │ └── types.go │ ├── appconfig │ │ ├── const.go │ │ └── types.go │ ├── nvmlprovider │ │ ├── types.go │ │ └── provider_test.go │ ├── registry │ │ ├── types.go │ │ └── registry.go │ ├── counters │ │ ├── variables.go │ │ ├── const.go │ │ ├── exporter_counters_test.go │ │ ├── types.go │ │ └── exporter_counters.go │ ├── logging │ │ ├── json_handler.go │ │ └── const.go │ ├── kubeclient │ │ └── client.go │ ├── devicewatchlistmanager │ │ └── types.go │ ├── capabilities │ │ └── capabilities_test.go │ ├── server │ │ └── types.go │ ├── stdout │ │ ├── stdoutprocessor.go │ │ ├── capture.go │ │ ├── capture_test_wrapper.go │ │ └── capture_test.go │ ├── dcgmprovider │ │ ├── smart_init.go │ │ └── types.go │ ├── hostname │ │ └── hostname.go │ ├── deviceinfo │ │ └── types.go │ └── utils │ │ └── utils.go ├── README.md └── mocks │ └── pkg │ ├── exec │ ├── mock_cmd.go │ └── mock_exec.go │ ├── elf │ └── mock_elf.go │ ├── collector │ └── mock_collector.go │ ├── nvmlprovider │ └── mock_client.go │ ├── transformation │ └── mock_transformer.go │ ├── transformations │ └── mock_transformer.go │ ├── devicewatcher │ └── mock_device_watcher.go │ ├── devicewatchlistmanager │ └── mock_device_watchlist_manager.go │ └── os │ └── 
mock_dir_entry.go ├── docker ├── dcgm-exporter-entrypoint.sh └── build-cross.sh ├── tests ├── integration │ ├── testdata │ │ ├── web-config.yml │ │ ├── tlsCertificate.crt │ │ └── tlsCertificate.key │ ├── README.md │ ├── start_read_test.go │ ├── helpers_test.go │ └── start_with_tls_test.go ├── gpu-pod.yaml ├── e2e │ ├── internal │ │ └── framework │ │ │ └── utils.go │ ├── Makefile │ ├── README.md │ ├── main_test.go │ └── e2e_verify_tls_test.go └── docker │ ├── Makefile │ └── docker_suite_test.go ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── submit-question.yml │ ├── bug_report_form.yml │ └── feature_request_form.yml ├── workflows │ └── go.yml ├── PR_TEMPLATE.md └── ISSUE_TEMPLATE.md ├── .golangci.yml ├── deployment ├── templates │ ├── role.yaml │ ├── clusterrole.yaml │ ├── rolebinding.yaml │ ├── clusterrolebinding.yaml │ ├── serviceaccount.yaml │ ├── service.yaml │ ├── service-monitor.yaml │ ├── NOTES.txt │ ├── web-config-configmap.yaml │ ├── tls-secret.yaml │ └── _helpers.tpl ├── Chart.yaml └── .helmignore ├── .devcontainer ├── devcontainer.json └── Dockerfile ├── cmd └── dcgm-exporter │ └── main.go ├── service-monitor.yaml ├── RELEASE.md ├── packaging └── config-files │ └── systemd │ └── nvidia-dcgm-exporter.service ├── pkg └── cmd │ └── const.go ├── .dockerignore ├── .hadolint.yaml ├── .vscode └── launch.json ├── scripts └── test_coverage.sh ├── CONTRIBUTING.md ├── dcgm-exporter.yaml ├── security.md └── etc └── 1.x-compatibility-metrics.csv /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /staticcheck.conf: -------------------------------------------------------------------------------- 1 | checks = ["all", "-ST1005"] -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | export GOROOT=$(go1.24.5 env GOROOT) 2 
| export PATH=$GOROOT/bin:$PATH 3 | 4 | -------------------------------------------------------------------------------- /hack/VERSION: -------------------------------------------------------------------------------- 1 | OLD_DCGM_VERSION=4.4.1 2 | OLD_EXPORTER_VERSION=4.7.0 3 | NEW_DCGM_VERSION=4.4.2 4 | NEW_EXPORTER_VERSION=4.7.1 5 | -------------------------------------------------------------------------------- /internal/pkg/os/README.md: -------------------------------------------------------------------------------- 1 | # OS - wrapper package for system os package 2 | 3 | The package allows mocking os package functions for testing purposes. 4 | 5 | 6 | -------------------------------------------------------------------------------- /internal/pkg/exec/README.md: -------------------------------------------------------------------------------- 1 | # Exec - wrapper package for system os/exec package 2 | 3 | The package allows mocking os/exec package functions for testing purposes. 4 | -------------------------------------------------------------------------------- /internal/pkg/elf/README.md: -------------------------------------------------------------------------------- 1 | # ELF - wrapper package for system debug/elf package 2 | 3 | The package allows mocking debug/elf package functions for testing purposes. 
4 | -------------------------------------------------------------------------------- /docker/dcgm-exporter-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | # Entrypoint for dcgm-exporter 5 | # Capability checking is done in Go code (internal/pkg/capabilities) 6 | 7 | exec /usr/bin/dcgm-exporter "$@" 8 | -------------------------------------------------------------------------------- /tests/integration/testdata/web-config.yml: -------------------------------------------------------------------------------- 1 | tls_server_config: 2 | cert_file: tlsCertificate.crt 3 | key_file: tlsCertificate.key 4 | basic_auth_users: 5 | # password: password 6 | alice: $2y$10$CWCEbs7mt8QFrToJR9OcG.tduEKpx9SROhWnaDHQgQX4Q6hqWNASa -------------------------------------------------------------------------------- /internal/README.md: -------------------------------------------------------------------------------- 1 | `/internal` 2 | 3 | Code intended for private use only, not for external import. 4 | Note that this layout pattern is enforced by the Go compiler itself. See the Go 1.4 [`release notes`](https://golang.org/doc/go1.4#internalpackages) for more details. -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | # GitHub info on config.yml 2 | # https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/configuring-issue-templates-for-your-repository#configuring-the-template-chooser 3 | # Set to 'false' if you only want the templates to be used. 
4 | blank_issues_enabled: true -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | linters: 2 | enable: 3 | - contextcheck 4 | - gocritic 5 | - gofmt 6 | - goimports 7 | - gosec 8 | - gosimple 9 | - govet 10 | - ineffassign 11 | - misspell 12 | - staticcheck 13 | - unused 14 | - gofumpt 15 | 16 | linters-settings: 17 | goimports: 18 | local-prefixes: github.com/NVIDIA/dcgm-exporter 19 | 20 | -------------------------------------------------------------------------------- /deployment/templates/role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: Role 3 | metadata: 4 | name: dcgm-exporter-read-cm 5 | namespace: {{ include "dcgm-exporter.namespace" . }} 6 | labels: 7 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 8 | rules: 9 | - apiGroups: [""] 10 | resources: ["configmaps"] 11 | resourceNames: ["exporter-metrics-config-map"] 12 | verbs: ["get"] 13 | -------------------------------------------------------------------------------- /deployment/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: dcgm-exporter 3 | description: A Helm chart for DCGM exporter 4 | version: "4.7.1" 5 | kubeVersion: ">= 1.19.0-0" 6 | appVersion: "4.7.1" 7 | sources: 8 | - https://github.com/nvidia/dcgm-exporter 9 | home: https://github.com/nvidia/dcgm-exporter/ 10 | icon: https://assets.nvidiagrid.net/ngc/logos/DCGM.png 11 | keywords: 12 | - gpu 13 | - cuda 14 | - compute 15 | - monitoring 16 | - telemetry 17 | - tesla 18 | -------------------------------------------------------------------------------- /deployment/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 
2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Set up Go 17 | uses: actions/setup-go@v2 18 | with: 19 | go-version: 1.24 20 | 21 | - name: Build 22 | run: make binary 23 | 24 | - name: Lint 25 | run: make check-format 26 | -------------------------------------------------------------------------------- /.github/PR_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **Please open your pull requests on [gitlab repository](https://gitlab.com/nvidia/dcgm-exporter.git)** 2 | 3 | _Make sure to complete the following items:_ 4 | 5 | - _A reference to a related issue._ 6 | - _A small description of the changes proposed in the pull request._ 7 | - _One commit per change and descriptive commit messages._ 8 | - _Sign-off your work following these [guidelines](https://gitlab.com/nvidia/dcgm-exporter/blob/master/CONTRIBUTING.md)._ 9 | - _Test run of your changes._ 10 | -------------------------------------------------------------------------------- /tests/gpu-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: nbody-pod 5 | spec: 6 | automountServiceAccountToken: false 7 | 
restartPolicy: OnFailure 8 | containers: 9 | - name: nbody 10 | image: "nvcr.io/nvidia/k8s/cuda-sample:nbody" 11 | command: ["nbody"] 12 | args: ["-benchmark", "-i=10000000"] 13 | resources: 14 | limits: 15 | cpu: 200m 16 | memory: 256Mi 17 | nvidia.com/gpu: 1 18 | requests: 19 | cpu: 100m 20 | memory: 128Mi 21 | -------------------------------------------------------------------------------- /deployment/templates/clusterrole.yaml: -------------------------------------------------------------------------------- 1 | {{- if or (and (or .Values.kubernetes.enablePodLabels .Values.kubernetes.enablePodUID) .Values.kubernetes.rbac.create) .Values.kubernetesDRA.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRole 4 | metadata: 5 | name: {{ include "dcgm-exporter.fullname" . }}-read-pods 6 | labels: 7 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 8 | rules: 9 | - apiGroups: ["", "resource.k8s.io"] 10 | resources: ["pods", "resourceslices"] 11 | verbs: ["get", "list", "watch"] 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /deployment/templates/rolebinding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{ include "dcgm-exporter.fullname" . }} 5 | namespace: {{ include "dcgm-exporter.namespace" . }} 6 | labels: 7 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 8 | subjects: 9 | - kind: ServiceAccount 10 | name: {{ include "dcgm-exporter.serviceAccountName" . }} 11 | namespace: {{ include "dcgm-exporter.namespace" . }} 12 | roleRef: 13 | kind: Role 14 | name: dcgm-exporter-read-cm 15 | apiGroup: rbac.authorization.k8s.io 16 | -------------------------------------------------------------------------------- /hack/header.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024, NVIDIA CORPORATION. 
All rights reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /deployment/templates/clusterrolebinding.yaml: -------------------------------------------------------------------------------- 1 | {{- if or (and (or .Values.kubernetes.enablePodLabels .Values.kubernetes.enablePodUID) .Values.kubernetes.rbac.create) .Values.kubernetesDRA.enabled }} 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | kind: ClusterRoleBinding 4 | metadata: 5 | name: {{ include "dcgm-exporter.fullname" . }}-read-pods 6 | labels: 7 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 8 | subjects: 9 | - kind: ServiceAccount 10 | name: {{ include "dcgm-exporter.serviceAccountName" . }} 11 | namespace: {{ include "dcgm-exporter.namespace" . }} 12 | roleRef: 13 | kind: ClusterRole 14 | name: {{ include "dcgm-exporter.fullname" . 
}}-read-pods 15 | apiGroup: rbac.authorization.k8s.io 16 | {{- end }} 17 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // Read here https://containers.dev/implementors/json_reference/ 2 | { 3 | "name": "dcgm-exporter-container", 4 | "build": { 5 | "dockerfile": "Dockerfile" 6 | }, 7 | "privileged": true, 8 | "runArgs": [ 9 | "-v", "/run/docker.sock:/run/docker.sock:rw", 10 | "--mount", "type=bind,src=${env:HOME}/.ssh,dst=/home/developer/.ssh", 11 | "-p", "2222:22", 12 | "--name", "vscode_dev_container", 13 | "-e", "DCGM_BUILD_INSIDE_DOCKER=1", 14 | "-e", "NVIDIA_DRIVER_CAPABILITIES=compute,utility", 15 | "-e", "NVIDIA_VISIBLE_DEVICES=ALL", 16 | "--cap-add=SYS_ADMIN", 17 | "--security-opt", 18 | "seccomp=unconfined", 19 | "--gpus=all" 20 | ], 21 | "postStartCommand": "docker run --privileged --rm tonistiigi/binfmt --install all" 22 | } 23 | -------------------------------------------------------------------------------- /internal/pkg/devicemonitoring/const.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package devicemonitoring 18 | 19 | const ( 20 | PARENT_ID_IGNORED = 0 21 | ) 22 | -------------------------------------------------------------------------------- /internal/pkg/prerequisites/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package prerequisites 18 | 19 | type rule interface { 20 | Validate() error 21 | } 22 | -------------------------------------------------------------------------------- /internal/pkg/testutils/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package testutils 18 | 19 | type WatchedEntityKey struct { 20 | ParentID uint 21 | ChildID uint 22 | } 23 | -------------------------------------------------------------------------------- /internal/pkg/devicewatcher/variables.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package devicewatcher 18 | 19 | var doNothing = func() { 20 | // This function is intentionally left blank 21 | } 22 | -------------------------------------------------------------------------------- /internal/pkg/collector/variables.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package collector 18 | 19 | import osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" 20 | 21 | var os osinterface.OS = osinterface.RealOS{} 22 | -------------------------------------------------------------------------------- /internal/pkg/elf/elf.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package elf 18 | 19 | import ( 20 | "debug/elf" 21 | ) 22 | 23 | var _ ELF = (*RealELF)(nil) 24 | 25 | type RealELF struct{} 26 | 27 | func (r RealELF) Open(name string) (*elf.File, error) { 28 | return elf.Open(name) 29 | } 30 | -------------------------------------------------------------------------------- /internal/pkg/transformation/variables.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package transformation 18 | 19 | import osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" 20 | 21 | var os osinterface.OS = osinterface.RealOS{} 22 | 23 | var doNothing = func() { 24 | // This function is intentionally left blank 25 | } 26 | -------------------------------------------------------------------------------- /internal/pkg/devicewatcher/const.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package devicewatcher 18 | 19 | const ( 20 | DCGM_ST_NOT_CONFIGURED = "Setting not configured" 21 | 22 | maxKeepAge = 600.0 // How long to keep data for this field in seconds 23 | maxKeepSamples = 0 // Maximum number of samples to keep. 
0=no limit 24 | ) 25 | -------------------------------------------------------------------------------- /internal/pkg/elf/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package elf 18 | 19 | import "debug/elf" 20 | 21 | //go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/elf/mock_elf.go -package=elf -copyright_file=../../../hack/header.txt . ELF 22 | type ELF interface { 23 | Open(name string) (*elf.File, error) 24 | } 25 | -------------------------------------------------------------------------------- /internal/pkg/prerequisites/validation.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package prerequisites 18 | 19 | var rules = []rule{ 20 | dcgmLibExistsRule{}, 21 | } 22 | 23 | func Validate() error { 24 | for _, rule := range rules { 25 | err := rule.Validate() 26 | if err != nil { 27 | return err 28 | } 29 | } 30 | 31 | return nil 32 | } 33 | -------------------------------------------------------------------------------- /internal/pkg/appconfig/const.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package appconfig 18 | 19 | const ( 20 | GPUUID KubernetesGPUIDType = "uid" 21 | DeviceName KubernetesGPUIDType = "device-name" 22 | 23 | NvidiaResourceName = "nvidia.com/gpu" 24 | NvidiaMigResourcePrefix = "nvidia.com/mig-" 25 | MIG_UUID_PREFIX = "MIG-" 26 | ) 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### What happened? 2 | Tell us what happened and provide as many details as possible, including logs. 3 | 4 | ### What did you expect to happen? 5 | Tell us about expected behaviour. 6 | 7 | ### What is the GPU model? 
8 | Tell us about the hardware configuration of the GPU, including the output of 'nvidia-smi' 9 | 10 | ### What is the environment? 11 | Is DCGM-Exporter running on bare metal or in a virtual environment, container, pod, etc? 12 | 13 | ### How did you deploy the dcgm-exporter and what is the configuration? 14 | Tell us how you deployed DCGM-Exporter. Did you use helm, build from source or use the GPU Operator? 15 | 16 | ### How can we reproduce the issue? 17 | Clear and concise steps to reproduce an issue can help everyone by allowing us to identify and fix problems more quickly. 18 | 19 | ### What is the version? 20 | Tell us about DCGM-exporter version. 21 | 22 | ### Anything else we need to know? 23 | Any small detail can help. 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /internal/pkg/nvmlprovider/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | //go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/nvmlprovider/mock_client.go -package=nvmlprovider -copyright_file=../../../hack/header.txt . 
NVML 18 | 19 | package nvmlprovider 20 | 21 | type NVML interface { 22 | GetMIGDeviceInfoByID(string) (*MIGDeviceInfo, error) 23 | Cleanup() 24 | } 25 | -------------------------------------------------------------------------------- /internal/pkg/registry/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package registry 18 | 19 | import ( 20 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 21 | 22 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" 23 | ) 24 | 25 | // MetricsByCounterGroup represents a group of metrics by specific counter groups 26 | type MetricsByCounterGroup map[dcgm.Field_Entity_Group]collector.MetricsByCounter 27 | -------------------------------------------------------------------------------- /internal/pkg/counters/variables.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package counters 18 | 19 | import osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" 20 | 21 | var os osinterface.OS = osinterface.RealOS{} 22 | 23 | var promMetricType = map[string]bool{ 24 | "gauge": true, 25 | "counter": true, 26 | "histogram": true, 27 | "summary": true, 28 | "label": true, 29 | } 30 | -------------------------------------------------------------------------------- /internal/pkg/logging/json_handler.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
// SetupGlobalLogger installs a JSON-emitting slog logger as the process-wide
// default. Records are written to w; opts is handed to the JSON handler
// unchanged.
func SetupGlobalLogger(w io.Writer, opts *slog.HandlerOptions) {
	slog.SetDefault(slog.New(slog.NewJSONHandler(w, opts)))
}
// Attribute keys used when emitting structured log records.
const (
	// DeviceInfoKey is the key for device information.
	DeviceInfoKey = "deviceInfo"
	// DumpKey is the key for a dump attribute.
	DumpKey = "dump"
	// ErrorKey is the key for an error attribute.
	ErrorKey = "error"
	// FieldEntityGroupKey is the key for a field entity group.
	FieldEntityGroupKey = "fieldEntityGroup"
	// GroupIDKey is the key for a group identifier.
	GroupIDKey = "groupID"
	// MetricsKey is the key for metrics.
	MetricsKey = "metrics"
	// StackTrace is the key for a stack trace.
	StackTrace = "stacktrace"
)
// undefinedConfigMapData is the literal "none".
// NOTE(review): presumably the sentinel meaning "no ConfigMap configured" —
// confirm against its users.
const undefinedConfigMapData = "none"

// Numeric boundaries.
// NOTE(review): presumably the first DCGM field IDs of the CPU and DCP
// (profiling) ranges — confirm against the code that compares against them.
const (
	dcpFieldsStart = 1000
	cpuFieldsStart = 1100
)

// Names of the DCGM_EXP_* counters.
const (
	DCGMExpClockEventsCount = "DCGM_EXP_CLOCK_EVENTS_COUNT"
	DCGMExpXIDErrorsCount   = "DCGM_EXP_XID_ERRORS_COUNT"
	DCGMExpGPUHealthStatus  = "DCGM_EXP_GPU_HEALTH_STATUS"
	DCGMExpP2PStatus        = "DCGM_EXP_P2P_STATUS"
)
// Info bundles a DCGM entity with the device data associated with it.
type Info struct {
	// Entity is the DCGM group/entity pair for this record.
	Entity dcgm.GroupEntityPair
	// DeviceInfo is the DCGM device record for this entry.
	DeviceInfo dcgm.Device
	// InstanceInfo points to GPU instance details.
	// NOTE(review): presumably nil when the entity is not a GPU instance —
	// confirm against the code that populates it.
	InstanceInfo *deviceinfo.GPUInstanceInfo
	// ParentId is the identifier of the parent entity.
	// NOTE(review): Go naming would spell this ParentID; the field is exported,
	// so renaming it would break callers.
	ParentId uint
	// ParentType is the entity group of the parent entity.
	ParentType dcgm.Field_Entity_Group
}
15 | */ 16 | 17 | package kubeclient 18 | 19 | import ( 20 | "k8s.io/client-go/kubernetes" 21 | "k8s.io/client-go/rest" 22 | ) 23 | 24 | func GetKubeClient() (kubernetes.Interface, error) { 25 | config, err := rest.InClusterConfig() 26 | if err != nil { 27 | return nil, err 28 | } 29 | 30 | client, err := kubernetes.NewForConfig(config) 31 | if err != nil { 32 | return nil, err 33 | } 34 | 35 | return client, err 36 | } 37 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release 2 | 3 | This documents the release process as well as the versioning strategy for the DCGM exporter. 4 | 5 | ## Versioning 6 | 7 | The DCGM container has three major components: 8 | - The DCGM Version (e.g: 4.2.3) 9 | - The Exporter Version (e.g: 4.1.1) 10 | - The platform of the container (e.g: ubuntu22.04) 11 | 12 | The overall version of the DCGM container has three forms: 13 | - The long form: `${DCGM_VERSION}-${EXPORTER_VERSION}-${PLATFORM}` 14 | - The short form: `${DCGM_VERSION}` 15 | - The latest tag: `latest` 16 | 17 | The long form is a unique tag that once pushed will always refer to the same container. 18 | This means that no updates will be made to that tag and it will always point to the same container. 19 | 20 | The short form refers to the latest EXPORTER_VERSION with the platform fixed to ubuntu22.04. 21 | The latest tag refers to the latest short form (i.e: latest DCGM_VERSION and EXPORTER_VERSION). 22 | 23 | Note: We do not maintain multiple version branches. The Exporter functions with the latest go-dcgm bindings. 24 | 25 | ## Releases 26 | 27 | Newer versions are released on demand but tend to follow DCGM's release cadence. 
28 | -------------------------------------------------------------------------------- /packaging/config-files/systemd/nvidia-dcgm-exporter.service: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | [Unit] 17 | Description=NVIDIA DCGM-exporter service 18 | Wants=nvidia-dcgm.service 19 | After=nvidia-dcgm.service 20 | 21 | [Service] 22 | User=root 23 | PrivateTmp=false 24 | 25 | StandardOutput=append:/var/log/dcgm-exporter.log 26 | StandardError=append:/var/log/dcgm-exporter.log 27 | 28 | ExecStart=/usr/bin/dcgm-exporter -f /etc/dcgm-exporter/default-counters.csv 29 | 30 | Restart=on-abort 31 | 32 | [Install] 33 | WantedBy=multi-user.target 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/submit-question.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Submit question 17 | description: Ask a general question about DCGM-exporter 18 | labels: ["question"] 19 | 20 | body: 21 | - type: markdown 22 | attributes: 23 | value: | 24 | Thanks for taking the time to fill out this question! 25 | 26 | - type: textarea 27 | id: description 28 | attributes: 29 | label: Ask your question 30 | description: What is your question? 31 | validations: 32 | required: true -------------------------------------------------------------------------------- /pkg/cmd/const.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package cmd 18 | 19 | // DCGMDbgLvl is a DCGM library debug level. 
const (
	DCGMDbgLvlNone  = "NONE"
	DCGMDbgLvlFatal = "FATAL"
	DCGMDbgLvlError = "ERROR"
	DCGMDbgLvlWarn  = "WARN"
	DCGMDbgLvlInfo  = "INFO"
	DCGMDbgLvlDebug = "DEBUG"
	DCGMDbgLvlVerb  = "VERB"
)

// DCGMDbgLvlValues lists every DCGMDbgLvl* constant, in declaration order.
var DCGMDbgLvlValues = []string{
	DCGMDbgLvlNone, DCGMDbgLvlFatal, DCGMDbgLvlError, DCGMDbgLvlWarn,
	DCGMDbgLvlInfo, DCGMDbgLvlDebug, DCGMDbgLvlVerb,
}
-------------------------------------------------------------------------------- 1 | # Prerequisites 2 | 3 | - Machine with an NVIDIA GPU 4 | - Operating system: Linux 5 | - [Golang >= 1.24 installed](https://golang.org/) 6 | - [DCGM installed](https://developer.nvidia.com/dcgm) 7 | 8 | # Integration Tests 9 | 10 | ## Basics 11 | 12 | From the dcgm-exporter root directory run the tests: 13 | 14 | ``` 15 | make test-integration 16 | ``` 17 | 18 | ## Quickly iterating on a single test 19 | 20 | Run a single test: 21 | 22 | ``` 23 | make test-integration -e TEST_ARGS="-test.run [test name here]" 24 | ``` 25 | 26 | Example: 27 | ``` 28 | make test-integration -e TEST_ARGS="-test.run TestStartAndReadMetrics" 29 | ``` 30 | 31 | **WARNING**: It takes about 30 seconds before the dcgm-exporter instance reads the available metrics. Some metrics require at least two data points to compute a value, meaning at least one polling interval must pass before results are available. By default, dcgm-exporter uses a 30-second polling interval, hence the delay. 32 | 33 | 34 | # Testing Philosophy 35 | 36 | * Tests are assumed to be runnable on any Linux machine with a compatible NVIDIA GPU. 37 | * Tests are the best documentation. 38 | * The reader should easily read and understand the tested scenario. 39 | * One file must contain only one test scenario.
40 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Version control 2 | .git 3 | .gitignore 4 | .gitlab-ci.yml 5 | 6 | # Build artifacts 7 | dist/ 8 | *.tar.gz 9 | *.zip 10 | bin/ 11 | cmd/dcgm-exporter/dcgm-exporter 12 | 13 | # Test files 14 | tests/ 15 | tests.cov 16 | *.test 17 | 18 | # Documentation 19 | README.md 20 | CONTRIBUTING.md 21 | RELEASE.md 22 | security.md 23 | *.md 24 | 25 | # CI/CD and deployment 26 | .github/ 27 | .gitlab/ 28 | deployment/ 29 | examples/ 30 | grafana/ 31 | 32 | # Docker files - allow Dockerfile, build script, and entrypoint 33 | .dockerignore 34 | !docker/Dockerfile 35 | !docker/build-cross.sh 36 | !docker/dcgm-exporter-entrypoint.sh 37 | 38 | # IDE and editor files 39 | .vscode/ 40 | .idea/ 41 | *.swp 42 | *.swo 43 | *~ 44 | 45 | # Temporary files 46 | tmp/ 47 | *.tmp 48 | *.log 49 | 50 | # OS files 51 | .DS_Store 52 | Thumbs.db 53 | 54 | # Local configuration 55 | .env 56 | .env.* 57 | *.local 58 | 59 | # Static analysis 60 | staticcheck.conf 61 | lint/ 62 | 63 | # Packaging 64 | packaging/ 65 | 66 | # Scripts that aren't needed in build 67 | scripts/ 68 | # Note: hack/ contains VERSION file needed by Makefile 69 | 70 | # Service definitions 71 | *.yaml 72 | *.yml 73 | 74 | # Note: Makefile is needed for the build process 75 | 76 | # Coverage reports 77 | coverage.out 78 | coverage.html 79 | 80 | -------------------------------------------------------------------------------- /.hadolint.yaml: -------------------------------------------------------------------------------- 1 | # Hadolint configuration for dcgm-exporter Dockerfiles 2 | # https://github.com/hadolint/hadolint 3 | 4 | # Ignored rules with justification: 5 | # - DL3008/DL3041: Package version pinning not used because: 6 | # * We intentionally use the latest DCGM version available in NVIDIA repos 7 | # * Version control is provided by 
the versioned CUDA base image (e.g., cuda:13.0.1) 8 | # * Allows automatic security patches and bug fixes within compatible versions 9 | # * Pinning would require Dockerfile updates for every DCGM patch release 10 | # * Build tools (wget, gcc) are ephemeral and don't affect final image 11 | # - DL3029: --platform flag required for multi-arch builds (amd64/arm64) 12 | # - DL3002: USER root required for Docker to grant CAP_SYS_ADMIN capability 13 | # * Docker only grants capabilities to root processes (UID 0) 14 | # * Non-root execution available via --user flag (basic metrics only) 15 | # * Documented in Dockerfile with security measures 16 | ignored: 17 | - DL3008 # apt-get version pinning 18 | - DL3029 # --platform flag 19 | - DL3002 # USER root 20 | - DL3041 # dnf version pinning (same reasoning as DL3008) 21 | - DL3059 # Multiple consecutive RUN (acceptable for clarity) 22 | - SC2086 # ShellCheck - quoting (handled where needed) 23 | 24 | -------------------------------------------------------------------------------- /docker/build-cross.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
# Cross-build helper: selects the C compiler for the requested target
# architecture and runs `make install` with cgo enabled.
#
# Inputs (environment):
#   TARGETOS    target operating system, defaults to "linux"
#   TARGETARCH  target architecture, defaults to "amd64"

set -e

TARGETOS=${TARGETOS:-linux}
TARGETARCH=${TARGETARCH:-amd64}

# Configure cross-compilation based on target architecture
if [ "$TARGETARCH" = "arm64" ]; then
    export CC=aarch64-linux-gnu-gcc
    # Prepend the cross toolchain's library directory. The ${VAR:+...} form
    # avoids a trailing ':' when LD_LIBRARY_PATH is unset or empty; an empty
    # entry would make the dynamic loader search the current directory.
    export LD_LIBRARY_PATH="/usr/aarch64-linux-gnu/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
else
    export CC=gcc
fi

echo "Building dcgm-exporter for $TARGETOS/$TARGETARCH using CC=$CC"

# Execute build with all necessary environment variables. The expansions are
# quoted so that unusual values cannot be word-split by the shell.
GOOS="$TARGETOS" GOARCH="$TARGETARCH" CGO_ENABLED=1 CC="$CC" make install

echo "Build completed successfully"
// Manager creates and looks up watch lists keyed by DCGM entity group.
type Manager interface {
	// CreateEntityWatchList creates the watch list for the given entity group
	// using the supplied watcher and reports any failure as an error.
	// NOTE(review): the int64 parameter is unnamed here; presumably a
	// collection/update interval — confirm against the implementation.
	CreateEntityWatchList(dcgm.Field_Entity_Group, devicewatcher.Watcher, int64) error
	// EntityWatchList returns the WatchList for the given entity group.
	// NOTE(review): the bool presumably reports whether a watch list exists
	// for that group — verify against the implementation.
	EntityWatchList(dcgm.Field_Entity_Group) (WatchList, bool)
}
// Label names.
const (
	windowSizeInMSLabel = "window_size_in_ms"
	PeerGPULabel        = "peer_gpu"
	LinkStatusLabel     = "link_status"
)

// Placeholder strings.
// NOTE(review): presumably substituted when a DCGM value is skipped or cannot
// be rendered as a string — confirm against their users.
const (
	skipDCGMValue   = "SKIPPING DCGM VALUE"
	FailedToConvert = "ERROR - FAILED TO CONVERT TO STRING"
)

// Link status values.
const (
	LinkStatusOK                   = "OK"
	LinkStatusChipsetNotSupported  = "ChipsetNotSupported"
	LinkStatusTopologyNotSupported = "TopologyNotSupported"
	LinkStatusDisabledByRegKey     = "DisabledByRegKey"
	LinkStatusNotSupported         = "NotSupported"
	LinkStatusUnknown              = "Unknown"
)
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package transformation 18 | 19 | import ( 20 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" 21 | ) 22 | 23 | // GetTransformations return list of transformation applicable for metrics 24 | func GetTransformations(c *appconfig.Config) []Transform { 25 | var transformations []Transform 26 | if c.Kubernetes { 27 | podMapper := NewPodMapper(c) 28 | transformations = append(transformations, podMapper) 29 | } 30 | 31 | if c.HPCJobMappingDir != "" { 32 | hpcMapper := newHPCMapper(c) 33 | transformations = append(transformations, hpcMapper) 34 | } 35 | 36 | return transformations 37 | } 38 | -------------------------------------------------------------------------------- /internal/pkg/devicewatcher/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | //go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/devicewatcher/mock_device_watcher.go -package=devicewatcher -copyright_file=../../../hack/header.txt . 
// Watcher selects DCGM fields for an entity group and sets up watches on them.
type Watcher interface {
	// GetDeviceFields maps the given counters to DCGM field IDs for the given
	// entity group.
	// NOTE(review): how counters are filtered per entity group is not visible
	// from this declaration — confirm against the implementation.
	GetDeviceFields([]counters.Counter, dcgm.Field_Entity_Group) []dcgm.Short
	// WatchDeviceFields takes field IDs, a device info provider and an int64,
	// and returns group handles, a field handle, a slice of functions and an
	// error.
	// NOTE(review): the int64 is presumably the watch/update interval and the
	// returned functions presumably clean up — confirm both against the
	// implementation.
	WatchDeviceFields([]dcgm.Short, deviceinfo.Provider, int64) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error)
}
// A leading "~" is expanded to the current user's home directory only when it
// is the whole path or is followed by a path separator; "~user" forms are left
// untouched. Environment variables are expanded first, and the result is
// cleaned and made absolute relative to the current working directory.
//
// An error is returned when the home directory or the working directory
// cannot be determined; the returned path is empty in that case.
func ResolvePath(path string) (string, error) {
	// Expand environment variables like $HOME
	path = os.ExpandEnv(path)

	// Only "~" and "~/..." refer to the current user's home directory. A bare
	// prefix match would also rewrite "~other/x" into "<home>other/x".
	if path == "~" || (strings.HasPrefix(path, "~") && os.IsPathSeparator(path[1])) {
		home, err := os.UserHomeDir()
		if err != nil {
			return "", err
		}
		path = filepath.Join(home, path[1:])
	}

	// Clean up the path and make it absolute
	return filepath.Abs(path)
}
15 | */ 16 | 17 | package capabilities 18 | 19 | import ( 20 | "testing" 21 | ) 22 | 23 | func TestGetCurrentCapabilities(t *testing.T) { 24 | caps := GetCurrentCapabilities() 25 | if caps == "" { 26 | t.Error("Expected non-empty capability string") 27 | } 28 | t.Logf("Current capabilities: %s", caps) 29 | } 30 | 31 | func TestCheckSysAdmin(t *testing.T) { 32 | // This test will vary depending on how it's run 33 | // Just verify it doesn't panic 34 | hasCap := CheckSysAdmin() 35 | t.Logf("Has CAP_SYS_ADMIN: %v", hasCap) 36 | } 37 | 38 | func TestIsRunningAsRoot(t *testing.T) { 39 | // Just verify it doesn't panic 40 | isRoot := IsRunningAsRoot() 41 | t.Logf("Running as root: %v", isRoot) 42 | } 43 | 44 | func TestLogCapabilityInfo(t *testing.T) { 45 | // Just verify it doesn't panic 46 | LogCapabilityInfo() 47 | } 48 | -------------------------------------------------------------------------------- /internal/pkg/testutils/const.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
// fakeProfileName is the profile name shared by the mock GPU instance
// fixtures below.
var fakeProfileName = "2fake.4gb"

// Fixed fixture values for use in tests.
var (
	// MockGPUInstanceInfo1 is a GPU instance fixture on the GPU with UUID
	// "fake"; its entity ID is 0 and its NVML instance ID is left at zero.
	MockGPUInstanceInfo1 = deviceinfo.GPUInstanceInfo{
		Info:        dcgm.MigEntityInfo{GpuUuid: "fake", NvmlProfileSlices: 3},
		ProfileName: fakeProfileName,
		EntityId:    0,
	}

	// MockGPUInstanceInfo2 is a second GPU instance fixture on the same GPU
	// UUID, with NVML instance ID 1 and entity ID 14.
	MockGPUInstanceInfo2 = deviceinfo.GPUInstanceInfo{
		Info:        dcgm.MigEntityInfo{GpuUuid: "fake", NvmlInstanceId: 1, NvmlProfileSlices: 3},
		ProfileName: fakeProfileName,
		EntityId:    14,
	}

	// MockNVLinkVal1 is an NVLink status fixture for link index 0 (state 2).
	MockNVLinkVal1 = dcgm.NvLinkStatus{
		State: 2,
		Index: 0,
	}

	// MockNVLinkVal2 is an NVLink status fixture for link index 1 (state 3).
	MockNVLinkVal2 = dcgm.NvLinkStatus{
		State: 3,
		Index: 1,
	}
)
// Attribute (label) names used by the transformation package.
const (
	// Standard resource attribute names.
	podAttribute       = "pod"
	namespaceAttribute = "namespace"
	containerAttribute = "container"
	uidAttribute       = "pod_uid"
	vgpuAttribute      = "vgpu"

	// Attribute name for HPC jobs.
	hpcJobAttribute = "hpc_job"

	// Alternative names for the pod, namespace and container attributes.
	// NOTE(review): presumably the legacy label names kept for backward
	// compatibility — confirm against the code that selects them.
	oldPodAttribute       = "pod_name"
	oldNamespaceAttribute = "pod_namespace"
	oldContainerAttribute = "container_name"

	// Attribute names with the "dra_" prefix: claim, driver, pool and device.
	draClaimName      = "dra_claim_name"
	draClaimNamespace = "dra_claim_namespace"
	draDriverName     = "dra_driver_name"
	draPoolName       = "dra_pool_name"
	draDeviceName     = "dra_device_name"

	// "dra_"-prefixed attribute names for MIG devices.
	draMigProfile    = "dra_mig_profile"
	draMigDeviceUUID = "dra_mig_device_uuid"

	// DRAGPUDriverName is the driver name "gpu.nvidia.com".
	DRAGPUDriverName = "gpu.nvidia.com"
)
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Go: Launch e2e", 9 | "type": "go", 10 | "request": "launch", 11 | "mode": "test", 12 | "program": "${workspaceFolder}/tests/e2e", 13 | "args": [ 14 | "-test.v", 15 | "--ginkgo.v", 16 | "-kubeconfig", 17 | "~/.kube/config", 18 | "-chart", 19 | "./../../deployment/", 20 | "-image-repository", 21 | "nvidia/dcgm-exporter", 22 | "-arguments", 23 | "{-f=/etc/dcgm-exporter/default-counters.csv}" 24 | ], 25 | "env": {}, 26 | "buildFlags": "-tags=e2e" 27 | }, 28 | { 29 | "name": "Run Debug", 30 | "type": "go", 31 | "request": "launch", 32 | "mode": "debug", 33 | "cwd": "${workspaceRoot}", 34 | "program": "cmd/dcgm-exporter/main.go", 35 | "args": [ 36 | "-f", 37 | "./etc/default-counters.csv", 38 | "--debug", 39 | "--dump-enabled", 40 | "-r", 41 | "localhost:5555" 42 | ] 43 | } 44 | ] 45 | } -------------------------------------------------------------------------------- /deployment/templates/service.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.service.enable }} 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | apiVersion: v1 17 | kind: Service 18 | metadata: 19 | name: {{ include "dcgm-exporter.fullname" . 
// Exec creates commands. Its single method has the same shape as the
// Command function of os/exec but returns the Cmd interface, so command
// execution can be substituted (the directive below generates a mock).
//
//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/exec/mock_exec.go -package=exec -copyright_file=../../../hack/header.txt . Exec
type Exec interface {
	Command(name string, arg ...string) Cmd
}

// Cmd is the subset of *exec.Cmd behaviour exposed by this package: running
// the command and returning its output.
//
//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/exec/mock_cmd.go -package=exec -copyright_file=../../../hack/header.txt . Cmd
type Cmd interface {
	Output() ([]byte, error)
}

// Compile-time checks that the real implementations satisfy the interfaces.
var (
	_ Exec = (*RealExec)(nil)
	_ Cmd  = (*RealCmd)(nil)
)

// RealExec is the Exec implementation backed by the standard os/exec package.
type RealExec struct{}

// Command returns a Cmd that wraps exec.Command(name, arg...).
func (r RealExec) Command(name string, arg ...string) Cmd {
	return &RealCmd{cmd: exec.Command(name, arg...)}
}

// RealCmd is the Cmd implementation that wraps an *exec.Cmd.
type RealCmd struct {
	cmd *exec.Cmd
}

// Output delegates to the Output method of the wrapped *exec.Cmd.
func (r *RealCmd) Output() ([]byte, error) {
	return r.cmd.Output()
}
// MetricsServer holds the HTTP server together with the configuration and
// collaborators it needs to serve metrics.
//
// NOTE(review): the embedded sync.Mutex presumably guards the cached metrics
// string — confirm against the methods defined on this type.
type MetricsServer struct {
	sync.Mutex

	server                 *http.Server                   // underlying HTTP server
	webConfig              *web.FlagConfig                // exporter-toolkit web configuration
	metrics                string                         // metrics text held by the server
	metricsChan            chan string                    // channel carrying metrics text
	registry               *registry.Registry             // registry used by the server
	config                 *appconfig.Config              // application configuration
	transformations        []transformation.Transform     // transformations used by the server
	deviceWatchListManager devicewatchlistmanager.Manager // device watch list manager
	fileDumper             *debug.FileDumper              // debug file dumper
}
var (
	// os, exec and elf are package-level handles to the OS, command-execution
	// and ELF abstractions, initialised with the real implementations.
	// NOTE(review): presumably reassigned to mocks in tests — confirm.
	os osinterface.OS = osinterface.RealOS{}

	exec execinterface.Exec = execinterface.RealExec{}

	elf elfinterface.ELF = elfinterface.RealELF{}

	// rxLDCacheEntry matches the following library strings:
	// libdcgm.so.4 (libc6,x86-64) => /lib/x86_64-linux-gnu/libdcgm.so.4
	// ld-linux.so.2 (ELF) => /lib/ld-linux.so.2
	// ld-linux-x86-64.so.2 (libc6,x86-64) => /lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
	//
	// The expression is applied per line ((?m)). Capture group 1 is the text
	// before the parenthesised attributes and capture group 2 is the text
	// after "=>".
	rxLDCacheEntry = regexp.MustCompile(`(?m)^(.*)\s*\(.*\)\s*=>\s*(.*)$`)

	// errLibdcgmNotFound is the error reported when the library named by
	// libdcgmco was not found.
	errLibdcgmNotFound = fmt.Errorf("the %s library was not found. Install Data Center GPU Manager (DCGM).", libdcgmco)
)
| grep -v "/tests/e2e/") \ 21 | -count=1 \ 22 | -timeout 5m \ 23 | -covermode=count \ 24 | -coverprofile=unit_coverage.out \ 25 | -json > test_results.json 26 | 27 | if [ $? -ne 0 ]; then 28 | echo "Unit tests failed." 29 | exit 1 30 | fi 31 | 32 | echo "Running integration tests..." 33 | go test ./internal/pkg/integration_test/... \ 34 | -count=1 \ 35 | -timeout 5m \ 36 | -covermode=count \ 37 | -coverpkg=./internal/pkg/... \ 38 | -coverprofile=integration_coverage.out \ 39 | -json >> test_results.json 40 | 41 | if [ $? -ne 0 ]; then 42 | echo "Integration tests failed." 43 | exit 1 44 | fi 45 | 46 | echo "Merging coverage profiles..." 47 | gocovmerge unit_coverage.out integration_coverage.out > combined_coverage.out.tmp 48 | 49 | # Remove mocks from coverage 50 | cat combined_coverage.out.tmp | grep -v "mock_" > tests.cov 51 | 52 | # Cleanup 53 | rm combined_coverage.out.tmp integration_coverage.out unit_coverage.out -------------------------------------------------------------------------------- /deployment/templates/service-monitor.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceMonitor.enabled }} 2 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | apiVersion: {{ .Values.serviceMonitor.apiVersion }} 17 | kind: ServiceMonitor 18 | metadata: 19 | name: {{ include "dcgm-exporter.fullname" . 
}} 20 | namespace: {{ include "dcgm-exporter.namespace" . }} 21 | labels: 22 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 23 | {{- if .Values.serviceMonitor.additionalLabels }} 24 | {{- toYaml .Values.serviceMonitor.additionalLabels | nindent 4 }} 25 | {{- end }} 26 | spec: 27 | selector: 28 | matchLabels: 29 | {{- include "dcgm-exporter.selectorLabels" . | nindent 6 }} 30 | namespaceSelector: 31 | matchNames: 32 | - "{{ include "dcgm-exporter.namespace" . }}" 33 | endpoints: 34 | - port: "metrics" 35 | path: "/metrics" 36 | interval: "{{ .Values.serviceMonitor.interval }}" 37 | honorLabels: {{ .Values.serviceMonitor.honorLabels }} 38 | relabelings: 39 | {{ toYaml .Values.serviceMonitor.relabelings | nindent 6 }} 40 | metricRelabelings: 41 | {{ toYaml .Values.serviceMonitor.metricRelabelings | nindent 6 }} 42 | {{- end -}} 43 | -------------------------------------------------------------------------------- /tests/integration/testdata/tlsCertificate.key: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDOnlyjmRp1lsB3 3 | y01ovOquhcLW6afisNzRh5ImU/RW3r7/gO5RAR5IrK//SMmvQVuL9NXmmwOhFMRU 4 | /d7PL9Jm/03UotbdkXM1krX0PRFVKGBzMEQVroIchGAUZhIDLP/XWIks2JgFeEI6 5 | yochX/vUsgnewuC3NUtVl9hkbflRDLwHSRgQ4cwJtXya7DatRYXEHerJG7G0YoVQ 6 | 2F3FWSvKMtic6dRP11cMW6x4Cac6EZlrQpX79aYrLPnD+enqU2+u3xoO0P2gylbm 7 | 3OBS9oayLfojuqTK810BGZNFSZnZlxS8ZM9Z29NvYzaoMV2iXG0PlRL7N69Mj8o5 8 | uj1zjMyTAgMBAAECggEAAp0UNOmR4RzoEFvc/c8t9MkyXr+T5RUxpeDeI2f6dkLZ 9 | DahW22V1OZwdmdAU5QehndKaBhG96QK3PQ/9CofjVHQ0StVIfIMDfxD9Bjrjhj09 10 | gxeANPoBIVU3Veq4nZurHNE84vlWavNLeyzmqrBpIhaic1PpvceJzWpfTV3qwnCt 11 | 8s9BwCnYAQeZkG89H+o89veh8Gg8KnIQgeYuIBr8OSVPjpN2mx2/2HcmxMSLQzLd 12 | 0WoGUqBe7INhsH8Ftr5Ft1esbVzVGuHYNpqzDON+ltkgHOXLsZZUFdZKWTvVcMl+ 13 | DZJx60gLvS0gW1Pl+HUbfm9E+h7d87nydZHkyJqS6QKBgQD+4D1goP3sL7SGnZlD 14 | 8y/y3EKulW9l5FluNcKaT8iAsb1CqFrH04bP9d6Dnf1FeugysUbq2+/DIuB4VgEH 15 | 
k2Ddd6uIOJqkFyQPFcueifjomsc5MnEsJUt4sK/ZuFFB9/MDberornS17s03GBlj 16 | bnQYcwwNwqay3kdDQSLH+4Jk7wKBgQDPh6N52ty4XQ9Kl3qchci35iC+0H4bbcOf 17 | m/QYmfhTWrAKuAbPKcDx3kmESIEZ39r1ne4piM2DpbIKRK5lp4zVCWC5+GjZeK5H 18 | rZTqNiPDCtQVPUgNrpq23lj98ZWgQlFbZmjD2+VpF1MhszgARN4nqtA6NxILYQkq 19 | K2+DhTV6nQKBgQCR1cbo0lVYXleCKwz2V2DZKnSxJmouOAF/0Rz/ZKVKFelACcqf 20 | ygW6Wmmuj6jKp6/SyaLVYXEIC5263SMLraVgM0gf0puRRnhodUUj+rhEh8N0k8sz 21 | wkcz0I1eG1vLTk9rZ+pgpum669/aJVzpitq83p9IBFeTwyGtdsiq+ts6WwKBgGKZ 22 | PfkqG83YIOi+mXl+MoP06T/fTNr5RoUxkkKR/KKVpqghENMiMlJ32xPnYrcJXT1u 23 | 4HWJUqMmeTVZtkYD2expfTQ+KoqQRLU3alXwjQMgxOymZfJiL42MjFlyizVN7Ntx 24 | B3tuoUtR0qRjQssV7TqJdE8EVSVk3/bSFjQ1eNgdAoGAauCpps2W0yALIUWFRfDU 25 | RVOZWua079wbfvBvTNsQ0x9KDnfv7+3fZL7qoKyB+vLwuqgm1cj9IjyXDLppGVzJ 26 | Mk3zSbO9DwmOcM5IXnXwSd/zFFpUqH7FrGqUvfncMYgUrys1jmdY2yjr3iXRp/AT 27 | zP/AX7Fte5ABLuHZfxPkX6c= 28 | -----END PRIVATE KEY----- 29 | -------------------------------------------------------------------------------- /internal/pkg/counters/exporter_counters_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package counters 18 | 19 | import ( 20 | "testing" 21 | 22 | "github.com/stretchr/testify/assert" 23 | ) 24 | 25 | func TestIdentifyMetricType(t *testing.T) { 26 | tests := []struct { 27 | name string 28 | field string 29 | output ExporterCounter 30 | valid bool 31 | }{ 32 | { 33 | name: "Valid Input DCGM_EXP_XID_ERRORS_COUNT", 34 | field: "DCGM_EXP_XID_ERRORS_COUNT", 35 | output: DCGMXIDErrorsCount, 36 | valid: true, 37 | }, 38 | { 39 | name: "Valid Input DCGM_FI_UNKNOWN", 40 | field: "DCGM_FI_UNKNOWN", 41 | output: DCGMFIUnknown, 42 | valid: true, 43 | }, 44 | { 45 | name: "Invalid Input DCGM_EXP_XID_ERRORS_COUNTXXX", 46 | field: "DCGM_EXP_XID_ERRORS_COUNTXXX", 47 | output: DCGMFIUnknown, 48 | valid: false, 49 | }, 50 | } 51 | 52 | for _, tt := range tests { 53 | t.Run(tt.name, func(t *testing.T) { 54 | output, err := IdentifyMetricType(tt.field) 55 | if tt.valid { 56 | assert.NoError(t, err, "Expected metrics to be found.") 57 | assert.Equal(t, output, tt.output, "Invalid output") 58 | } else { 59 | assert.Errorf(t, err, "Expected metrics to be not found.") 60 | } 61 | }) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /internal/pkg/stdout/stdoutprocessor.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// outputEntry represents the structured form of the parsed log entry.
type outputEntry struct {
	Timestamp   time.Time // time parsed from the first two fields; zero for raw entries
	Level       string    // third field of the entry; empty for raw entries
	Message     string    // text from the fifth field onwards, or the whole entry for raw entries
	IsRawString bool      // true when the entry could not be parsed and Message holds it verbatim
}

// parseOutputEntry takes a log entry string and returns a structured outputEntry object.
//
// An entry is treated as structured when it has at least three
// whitespace-separated fields and the first two parse as a timestamp in the
// layout "2006-01-02 15:04:05.000". The third field becomes Level and
// everything from the fifth field onwards becomes Message, with fields joined
// by single spaces. The fourth field is dropped, as in the original
// implementation (NOTE(review): presumably a process/thread token of the
// log format — confirm).
//
// Any other entry is returned unchanged in Message with IsRawString set.
// An entry with a valid timestamp but fewer than five fields yields an empty
// Message instead of slicing past the end of the field list.
func parseOutputEntry(entry string) outputEntry {
	raw := outputEntry{
		Message:     entry,
		IsRawString: true,
	}

	// Split the entry on whitespace; runs of spaces collapse.
	fields := strings.Fields(entry)
	if len(fields) < 3 {
		return raw
	}

	// Parse the timestamp from the date and time fields.
	timestamp, err := time.Parse("2006-01-02 15:04:05.000", fields[0]+" "+fields[1])
	if err != nil {
		return raw
	}

	// Rebuild the message only when there is something after the fourth
	// field; fields[4:] on a three-field entry would be out of range.
	var message string
	if len(fields) > 4 {
		message = strings.Join(fields[4:], " ")
	}

	return outputEntry{
		Timestamp: timestamp,
		Level:     fields[2],
		Message:   message,
	}
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package counters 18 | 19 | import ( 20 | "strings" 21 | 22 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 23 | ) 24 | 25 | type Counter struct { 26 | FieldID dcgm.Short `json:"field_id"` 27 | FieldName string `json:"field_name"` 28 | PromType string `json:"prom_type"` 29 | Help string `json:"help"` 30 | } 31 | 32 | func (c Counter) IsLabel() bool { 33 | return c.PromType == "label" 34 | } 35 | 36 | func (c Counter) IsProfilingMetric() bool { 37 | return strings.HasPrefix(c.FieldName, "DCGM_FI_PROF_") 38 | } 39 | 40 | type CounterList []Counter 41 | 42 | func (c CounterList) LabelCounters() CounterList { 43 | var labelsCounters CounterList 44 | for _, counter := range c { 45 | if counter.IsLabel() { 46 | labelsCounters = append(labelsCounters, counter) 47 | } 48 | } 49 | 50 | return labelsCounters 51 | } 52 | 53 | func (c CounterList) HasProfilingMetrics() bool { 54 | for _, counter := range c { 55 | if counter.IsProfilingMetric() { 56 | return true 57 | } 58 | } 59 | return false 60 | } 61 | 62 | type CounterSet struct { 63 | DCGMCounters CounterList 64 | ExporterCounters CounterList 65 | } 66 | 67 | func (cs *CounterSet) HasProfilingMetrics() bool { 68 | return cs.DCGMCounters.HasProfilingMetrics() || cs.ExporterCounters.HasProfilingMetrics() 69 | } 70 | -------------------------------------------------------------------------------- /deployment/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. 
Get the application URL by running these commands: 2 | {{- if contains "NodePort" .Values.service.type }} 3 | export NODE_PORT=$(kubectl get --namespace {{ include "dcgm-exporter.namespace" . }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "dcgm-exporter.fullname" . }}) 4 | export NODE_IP=$(kubectl get nodes --namespace {{ include "dcgm-exporter.namespace" . }} -o jsonpath="{.items[0].status.addresses[0].address}") 5 | echo http://$NODE_IP:$NODE_PORT/metrics 6 | {{- else if contains "LoadBalancer" .Values.service.type }} 7 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 8 | You can watch the status by running 'kubectl get --namespace {{ include "dcgm-exporter.namespace" . }} svc -w {{ include "dcgm-exporter.fullname" . }}' 9 | export SERVICE_IP=$(kubectl get svc --namespace {{ include "dcgm-exporter.namespace" . }} {{ include "dcgm-exporter.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") 10 | echo http://$SERVICE_IP:{{ .Values.service.port }} 11 | {{- else if contains "ClusterIP" .Values.service.type }} 12 | export POD_NAME=$(kubectl get pods -n {{ include "dcgm-exporter.namespace" . }} -l "app.kubernetes.io/name={{ include "dcgm-exporter.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 13 | kubectl -n {{ include "dcgm-exporter.namespace" . }} port-forward $POD_NAME 8080:{{ .Values.service.port }} & 14 | echo "Visit http://127.0.0.1:8080/metrics to use your application" 15 | {{- end }} 16 | 17 | {{- if .Values.debugDump.enabled }} 18 | 2. Debug dump functionality is enabled: 19 | - Debug files are stored in: {{ .Values.debugDump.directory }} 20 | - Retention period: {{ .Values.debugDump.retention }} hours 21 | - Compression: {{ .Values.debugDump.compression }} 22 | 23 | To access debug files on a node: 24 | kubectl exec -n {{ include "dcgm-exporter.namespace" . 
}} <pod-name> -- ls -la {{ .Values.debugDump.directory }} 25 | {{- end }} 26 | -------------------------------------------------------------------------------- /internal/pkg/transformation/transformer_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package transformation 18 | 19 | import ( 20 | "testing" 21 | 22 | "github.com/stretchr/testify/assert" 23 | 24 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" 25 | ) 26 | 27 | func TestGetTransformations(t *testing.T) { 28 | tests := []struct { 29 | name string 30 | config *appconfig.Config 31 | assert func(*testing.T, []Transform) 32 | }{ 33 | { 34 | name: "The environment is not kubernetes", 35 | config: &appconfig.Config{ 36 | Kubernetes: false, 37 | }, 38 | assert: func(t *testing.T, transforms []Transform) { 39 | assert.Len(t, transforms, 0) 40 | }, 41 | }, 42 | { 43 | name: "The environment is kubernetes", 44 | config: &appconfig.Config{ 45 | Kubernetes: true, 46 | }, 47 | assert: func(t *testing.T, transforms []Transform) { 48 | assert.Len(t, transforms, 1) 49 | }, 50 | }, 51 | { 52 | name: "The environment is HPC cluster", 53 | config: &appconfig.Config{ 54 | HPCJobMappingDir: "/var/run/nvidia/slurm", 55 | }, 56 | assert: func(t *testing.T, transforms []Transform) { 57 | assert.Len(t, transforms, 
1) 58 | }, 59 | }, 60 | } 61 | for _, tt := range tests { 62 | t.Run(tt.name, func(t *testing.T) { 63 | transformations := GetTransformations(tt.config) 64 | tt.assert(t, transformations) 65 | }) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /internal/pkg/stdout/capture.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package stdout 18 | 19 | import ( 20 | "bufio" 21 | "context" 22 | "log/slog" 23 | "os" 24 | "syscall" 25 | ) 26 | 27 | // Capture go and C stdout and stderr and writes to std output 28 | func Capture(ctx context.Context, inner func() error) error { 29 | stdout, err := syscall.Dup(syscall.Stdout) 30 | if err != nil { 31 | return err 32 | } 33 | 34 | r, w, err := os.Pipe() 35 | if err != nil { 36 | return err 37 | } 38 | 39 | err = syscall.Dup3(int(w.Fd()), syscall.Stdout, 0) 40 | if err != nil { 41 | return err 42 | } 43 | 44 | defer func() { 45 | ierr := syscall.Close(syscall.Stdout) 46 | if ierr != nil { 47 | err = ierr 48 | } 49 | 50 | ierr = syscall.Dup3(stdout, syscall.Stdout, 0) 51 | if ierr != nil { 52 | err = ierr 53 | } 54 | }() 55 | 56 | scanner := bufio.NewScanner(r) 57 | go func() { 58 | for scanner.Scan() { 59 | if ctx.Err() != nil { 60 | return 61 | } 62 | logEntry := scanner.Text() 63 | parsedLogEntry := parseOutputEntry(logEntry) 64 | if parsedLogEntry.IsRawString { 65 | _, err := os.Stdout.Write([]byte(parsedLogEntry.Message + "\n")) 66 | if err != nil { 67 | return 68 | } 69 | continue 70 | } 71 | slog.LogAttrs(ctx, slog.LevelInfo, parsedLogEntry.Message, slog.String("dcgm_level", parsedLogEntry.Level)) 72 | } 73 | }() 74 | 75 | // Call function here 76 | return inner() 77 | } 78 | -------------------------------------------------------------------------------- /deployment/templates/web-config-configmap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | {{- if or .Values.tlsServerConfig.enabled .Values.basicAuth.users }} 15 | apiVersion: v1 16 | kind: ConfigMap 17 | metadata: 18 | name: {{ include "dcgm-exporter.webConfigConfigMap" . }} 19 | namespace: {{ include "dcgm-exporter.namespace" . }} 20 | labels: 21 | {{- include "dcgm-exporter.labels" . | nindent 4 }} 22 | data: 23 | web-config.yaml: | 24 | {{- if .Values.tlsServerConfig.enabled }} 25 | tls_server_config: 26 | cert_file: {{ required "'tlsServerConfig.certFilename' is required when 'tlsServerConfig.enabled=true'" .Values.tlsServerConfig.certFilename | printf "/etc/dcgm-exporter/tls/%s" | quote }} 27 | key_file: {{ required "'tlsServerConfig.keyFilename' is required when 'tlsServerConfig.enabled=true'" .Values.tlsServerConfig.keyFilename | printf "/etc/dcgm-exporter/tls/%s" | quote }} 28 | {{- if .Values.tlsServerConfig.clientAuthType }} 29 | client_auth_type: {{ .Values.tlsServerConfig.clientAuthType }} 30 | client_ca_file: {{ required "'tlsServerConfig.caFilename' is required when 'tlsServerConfig.clientAuthType' is provided" .Values.tlsServerConfig.caFilename | printf "/etc/dcgm-exporter/tls/%s" | quote }} 31 | {{- end }} 32 | {{- end }} 33 | {{- if .Values.basicAuth.users }} 34 | basic_auth_users: 35 | {{- range $user, $password := .Values.basicAuth.users }} 36 | {{ $user }}: {{ (split ":" (htpasswd $user $password))._1 }} 37 | {{- end }} 38 | {{- end }} 39 | {{- end }} 40 | -------------------------------------------------------------------------------- /internal/pkg/counters/exporter_counters.go: 
// ExporterCounter is the numeric identifier of an exporter counter.
type ExporterCounter uint16

// Known ExporterCounter values. DCGMFIUnknown is the zero value. iota is 1
// on the DCGMXIDErrorsCount line, so the remaining counters take the values
// 9001, 9002, 9003 and 9004 in declaration order; reordering the lines
// changes those values.
const (
	DCGMFIUnknown        ExporterCounter = 0
	DCGMXIDErrorsCount   ExporterCounter = iota + 9000
	DCGMClockEventsCount ExporterCounter = iota + 9000
	DCGMGPUHealthStatus  ExporterCounter = iota + 9000
	DCGMP2PStatus        ExporterCounter = iota + 9000
)

// String method to convert the enum value to a string.
// Values without a dedicated name, including DCGMFIUnknown, yield
// "DCGM_FI_UNKNOWN".
func (enm ExporterCounter) String() string {
	switch enm {
	case DCGMXIDErrorsCount:
		return DCGMExpXIDErrorsCount
	case DCGMClockEventsCount:
		return DCGMExpClockEventsCount
	case DCGMGPUHealthStatus:
		return DCGMExpGPUHealthStatus
	case DCGMP2PStatus:
		return DCGMExpP2PStatus
	default:
		return "DCGM_FI_UNKNOWN"
	}
}

// DCGMFields maps the string form of each ExporterCounter (as returned by
// its String method) back to the enum value, including the
// "DCGM_FI_UNKNOWN" entry for DCGMFIUnknown.
var DCGMFields = map[string]ExporterCounter{
	DCGMXIDErrorsCount.String():   DCGMXIDErrorsCount,
	DCGMClockEventsCount.String(): DCGMClockEventsCount,
	DCGMGPUHealthStatus.String():  DCGMGPUHealthStatus,
	DCGMP2PStatus.String():        DCGMP2PStatus,
	DCGMFIUnknown.String():        DCGMFIUnknown,
}
(ExporterCounter, error) { 57 | mv, ok := DCGMFields[s] 58 | if !ok { 59 | return mv, fmt.Errorf("unknown ExporterCounter field '%s'", s) 60 | } 61 | return mv, nil 62 | } 63 | -------------------------------------------------------------------------------- /internal/pkg/dcgmprovider/smart_init.go: -------------------------------------------------------------------------------- 1 | package dcgmprovider 2 | 3 | import ( 4 | "fmt" 5 | "log/slog" 6 | "os" 7 | "testing" 8 | 9 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" 10 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 11 | ) 12 | 13 | // SmartDCGMInit tries to initialize DCGM with embedded mode first, then falls back to remote if it fails 14 | // This function is intended for test use only 15 | func SmartDCGMInit(t *testing.T, config *appconfig.Config) { 16 | t.Helper() 17 | 18 | // Check if a DCGM client already exists and return it if so. 19 | if Client() != nil { 20 | slog.Info("DCGM already initialized") 21 | return 22 | } 23 | 24 | client := dcgmProvider{} 25 | 26 | // Try embedded mode first 27 | config.UseRemoteHE = false 28 | if config.EnableDCGMLog { 29 | os.Setenv("__DCGM_DBG_FILE", "-") 30 | os.Setenv("__DCGM_DBG_LVL", config.DCGMLogLevel) 31 | } 32 | 33 | slog.Info("Attempting to initialize DCGM in embedded mode.") 34 | cleanup, err := dcgm.Init(dcgm.Embedded) 35 | if err != nil { 36 | slog.Info("Embedded DCGM failed, trying remote host engine") 37 | // Try remote mode as fallback 38 | config.UseRemoteHE = true 39 | config.RemoteHEInfo = "localhost:5555" 40 | 41 | slog.Info("Attempting to connect to remote hostengine at " + config.RemoteHEInfo) 42 | cleanup, err = dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0") 43 | if err != nil { 44 | if cleanup != nil { 45 | cleanup() 46 | } 47 | slog.Error(fmt.Sprintf("Both embedded and remote DCGM failed: %v", err)) 48 | t.Skip("Skipping test - DCGM initialization failed for both embedded and remote modes") 49 | return 50 | } 51 | } else { 52 | 
slog.Info("Embedded DCGM initialized successfully") 53 | } 54 | 55 | client.shutdown = cleanup 56 | 57 | // Initialize the DcgmFields module 58 | if val := dcgm.FieldsInit(); val < 0 { 59 | slog.Error(fmt.Sprintf("Failed to initialize DCGM Fields module; err: %d", val)) 60 | client.shutdown() 61 | t.Skip("Skipping test - DCGM Fields module initialization failed") 62 | return 63 | } else { 64 | slog.Info("Initialized DCGM Fields module.") 65 | } 66 | 67 | // Set the client 68 | SetClient(client) 69 | } 70 | -------------------------------------------------------------------------------- /internal/pkg/hostname/hostname.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package hostname 18 | 19 | import ( 20 | "net" 21 | 22 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" 23 | osinterface "github.com/NVIDIA/dcgm-exporter/internal/pkg/os" 24 | ) 25 | 26 | var os osinterface.OS = osinterface.RealOS{} 27 | 28 | // GetHostname return a hostname where metric was collected. 
29 | func GetHostname(config *appconfig.Config) (string, error) { 30 | if config.Kubernetes { 31 | /* in kubernetes, the remote hostname is generic and local, so it's not useful */ 32 | return getLocalHostname() 33 | } 34 | if config.UseRemoteHE { 35 | return parseRemoteHostname(config) 36 | } 37 | return getLocalHostname() 38 | } 39 | 40 | func parseRemoteHostname(config *appconfig.Config) (string, error) { 41 | // Extract the hostname or IP address part from the appconfig.RemoteHEInfo 42 | // This handles inputs like "localhost:5555", "example.com:5555", or "192.168.1.1:5555" 43 | host, _, err := net.SplitHostPort(config.RemoteHEInfo) 44 | if err != nil { 45 | // If there's an error, it might be because there's no port in the appconfig.RemoteHEInfo 46 | // In that case, use the appconfig.RemoteHEInfo as is 47 | host = config.RemoteHEInfo 48 | } 49 | return host, nil 50 | } 51 | 52 | func getLocalHostname() (string, error) { 53 | if nodeName := os.Getenv("NODE_NAME"); nodeName != "" { 54 | return nodeName, nil 55 | } 56 | hostname, err := os.Hostname() 57 | if err != nil { 58 | return "", err 59 | } 60 | return hostname, nil 61 | } 62 | -------------------------------------------------------------------------------- /internal/pkg/deviceinfo/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Provider gives access to the GPU, switch and CPU inventory, the device
// options for each device class, and predicates telling whether an individual
// entity is watched.
// NOTE(review): only the signatures are visible in this file; the notes below
// are read off names and types — confirm against the implementation.
type Provider interface {
	// GPU inventory. NOTE(review): GPU(i) presumably indexes into GPUs() — confirm.
	GPUCount() uint
	GPUs() []GPUInfo
	GPU(i uint) GPUInfo
	// Switch inventory.
	Switches() []SwitchInfo
	Switch(i uint) SwitchInfo
	// CPU inventory.
	CPUs() []CPUInfo
	CPU(i uint) CPUInfo
	// Device options. NOTE(review): G/S/C presumably stand for GPU, switch and CPU — confirm.
	GOpts() appconfig.DeviceOptions
	SOpts() appconfig.DeviceOptions
	COpts() appconfig.DeviceOptions
	// InfoType reports the DCGM entity group this provider describes.
	InfoType() dcgm.Field_Entity_Group
	// Watch predicates for CPUs, CPU cores, switches and switch links.
	IsCPUWatched(cpuID uint) bool
	IsCoreWatched(coreID uint, cpuID uint) bool
	IsSwitchWatched(switchID uint) bool
	IsLinkWatched(linkIndex uint, switchID uint) bool
}

// GPUInfo bundles the DCGM device description of one GPU with its GPU
// instances, a MIG flag and its NvLink status entries.
type GPUInfo struct {
	DeviceInfo   dcgm.Device
	GPUInstances []GPUInstanceInfo
	MigEnabled   bool
	NvLinks      []dcgm.NvLinkStatus
}

// GPUInstanceInfo describes one GPU instance and the compute instances
// attached to it.
type GPUInstanceInfo struct {
	Info             dcgm.MigEntityInfo
	ProfileName      string
	EntityId         uint
	ComputeInstances []ComputeInstanceInfo
}

// ComputeInstanceInfo describes one compute instance of a GPU instance.
type ComputeInstanceInfo struct {
	InstanceInfo dcgm.MigEntityInfo
	ProfileName  string
	EntityId     uint
}

// CPUInfo identifies a CPU entity and lists its cores.
// NOTE(review): Cores presumably holds core IDs — confirm.
type CPUInfo struct {
	EntityId uint
	Cores    []uint
}

// SwitchInfo identifies a switch entity and carries its NvLink status entries.
type SwitchInfo struct {
	EntityId uint
	NvLinks  []dcgm.NvLinkStatus
}
{{- /*
Secret holding the TLS material for the exporter. It is rendered only when
tlsServerConfig.enabled is set and no tlsServerConfig.existingSecret is given.
The data keys are the configured file names (certFilename, keyFilename and,
when clientAuthType is set, caFilename).
*/}}
{{- if and .Values.tlsServerConfig.enabled (not .Values.tlsServerConfig.existingSecret) }}
apiVersion: v1
kind: Secret
metadata:
  name: {{ (include "dcgm-exporter.tlsCertsSecretName" .) }}
  namespace: {{ include "dcgm-exporter.namespace" . }}
  labels:
    {{- include "dcgm-exporter.labels" . | nindent 4 }}
type: Opaque
data:
  {{- if .Values.tlsServerConfig.autoGenerated }}
  {{- /*
  Auto-generated path: a CA named "dcgm-exporter-ca" and a server certificate
  signed by it, both requested for 365 days. The chart fullname is used as the
  certificate's common name and as its only DNS alternate name.
  NOTE(review): nothing here looks up previously generated material, so the
  certificate presumably changes on every render — confirm.
  */}}
  {{- $ca := genCA "dcgm-exporter-ca" 365 }}
  {{- $hostname := printf "%s" (include "dcgm-exporter.fullname" .) }}
  {{- $cert := genSignedCert $hostname nil (list $hostname) 365 $ca }}
  {{ .Values.tlsServerConfig.certFilename }}: {{ $cert.Cert | b64enc | quote }}
  {{ .Values.tlsServerConfig.keyFilename }}: {{ $cert.Key | b64enc | quote }}
  {{- /* The CA certificate is stored only when client authentication is configured. */}}
  {{- if .Values.tlsServerConfig.clientAuthType }}
  {{ .Values.tlsServerConfig.caFilename }}: {{ $ca.Cert | b64enc | quote }}
  {{- end }}
  {{- else }}
  {{- /* User-supplied path: cert and key are mandatory; ca is mandatory only with clientAuthType. */}}
  {{ .Values.tlsServerConfig.certFilename }}: {{ required "'tlsServerConfig.cert' is required when 'tlsServerConfig.enabled=true'" .Values.tlsServerConfig.cert | b64enc | quote }}
  {{ .Values.tlsServerConfig.keyFilename }}: {{ required "'tlsServerConfig.key' is required when 'tlsServerConfig.enabled=true'" .Values.tlsServerConfig.key | b64enc | quote }}
  {{- if .Values.tlsServerConfig.clientAuthType }}
  {{ .Values.tlsServerConfig.caFilename }}: {{ required "'tlsServerConfig.ca' is required when 'tlsServerConfig.clientAuthType' is provided" .Values.tlsServerConfig.ca | b64enc | quote }}
  {{- end }}
  {{- end }}
{{- end }}
15 | */ 16 | 17 | package stdout 18 | 19 | /* 20 | #include 21 | void printBoom() { 22 | printf("Boom\n"); 23 | fflush(stdout); 24 | } 25 | */ 26 | import "C" 27 | 28 | import ( 29 | "bytes" 30 | "context" 31 | "os" 32 | "strings" 33 | "testing" 34 | 35 | "github.com/stretchr/testify/assert" 36 | "github.com/stretchr/testify/require" 37 | ) 38 | 39 | func testCaptureWithCGO(t *testing.T) { 40 | t.Helper() 41 | // Create a buffer to capture stdout output 42 | var buf bytes.Buffer 43 | 44 | // Save the original stdout 45 | stdout := os.Stdout 46 | 47 | // Create a pipe to redirect stdout 48 | r, w, err := os.Pipe() 49 | assert.NoError(t, err) 50 | 51 | os.Stdout = w // Redirect stdout to the write end of the pipe 52 | 53 | ctx, cancel := context.WithCancel(context.Background()) 54 | 55 | err = Capture(ctx, func() error { 56 | C.printBoom() 57 | return nil 58 | }) 59 | assert.NoError(t, err) 60 | // It takes a time before CGO flushes logs to the std output 61 | // We need to wait until we start to receive the data 62 | // Create temporary buffer to detect data 63 | var tempBuf [1]byte 64 | // Read from the pipe to ensure data is available 65 | _, err = r.Read(tempBuf[:]) // Block until data is written 66 | assert.NoError(t, err) 67 | buf.Write(tempBuf[:]) // Start capturing the data 68 | // Close the write end of the pipe to allow reading all data 69 | _ = w.Close() 70 | _, err = buf.ReadFrom(r) // Read the remaining data 71 | assert.NoError(t, err) 72 | require.Equal(t, "Boom", strings.TrimSpace(buf.String())) 73 | os.Stdout = stdout // Restore original stdout 74 | cancel() 75 | } 76 | -------------------------------------------------------------------------------- /tests/e2e/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
# Tunables; each one can be overridden from the environment or the make
# command line.
GO_CMD ?= go
NAMESPACE ?= "dcgm-exporter"
CHART ?= "./../../deployment/"
IMAGE_REPOSITORY ?= "nvcr.io/nvidia/k8s/dcgm-exporter"
IMAGE_TAG ?= "4.4.2-4.7.1-distroless"
KUBECONFIG ?= "~/.kube/config"
RUNTIME_CLASS ?= ""
NO_CLEANUP ?= "false"

# TEST_CMD first fails if KUBECONFIG is empty and then runs the e2e suite with
# the settings above. Targets append extra test arguments after it with a
# trailing backslash.
define TEST_CMD
	@if [ -z ${KUBECONFIG} ]; then \
		echo "[ERR] KUBECONFIG is missing, must be set"; \
		exit 1; \
	fi
	$(GO_CMD) test --tags=e2e -v . \
		-args \
		--ginkgo.v \
		--ginkgo.no-color \
		-kubeconfig=$(KUBECONFIG) \
		-chart="$(CHART)" \
		-namespace=$(NAMESPACE) \
		-image-repository=$(IMAGE_REPOSITORY) \
		-image-tag=$(IMAGE_TAG) \
		-runtime-class=$(RUNTIME_CLASS) \
		-no-cleanup=$(NO_CLEANUP)
endef

# Run the whole suite with the chart's default exporter arguments.
.PHONY: e2e-test
e2e-test:
	@$(TEST_CMD)


# Run the whole suite with the default counters file passed to the exporter.
.PHONY: e2e-test-no-profiling
e2e-test-no-profiling:
	@$(TEST_CMD) \
		-arguments="{-f=/etc/dcgm-exporter/default-counters.csv}"

# The targets below run only the specs carrying the given ginkgo label.
.PHONY: e2e-basic-auth
e2e-basic-auth:
	@$(TEST_CMD) \
		-arguments="{-f=/etc/dcgm-exporter/default-counters.csv}" \
		--ginkgo.label-filter=basicAuth

.PHONY: e2e-tls
e2e-tls:
	@$(TEST_CMD) \
		-arguments="{-f=/etc/dcgm-exporter/default-counters.csv}" \
		--ginkgo.label-filter=tls

.PHONY: e2e-default
e2e-default:
	@$(TEST_CMD) \
		-arguments="{-f=/etc/dcgm-exporter/default-counters.csv}" \
		--ginkgo.label-filter=default

.PHONY: e2e-labels
e2e-labels:
	@$(TEST_CMD) \
		-arguments="{-f=/etc/dcgm-exporter/default-counters.csv}" \
		--ginkgo.label-filter=labels

# Compile the e2e test binary without running it.
.PHONY: binary
binary:
	$(GO_CMD) test -c --tags="e2e" .
// MockCmd is a mock of Cmd interface.
// Every mocked method is routed through the gomock controller held in ctrl.
type MockCmd struct {
	ctrl     *gomock.Controller
	recorder *MockCmdMockRecorder // handed out by EXPECT
	isgomock struct{}             // NOTE(review): presumably a marker field emitted by mockgen — confirm
}

// MockCmdMockRecorder is the mock recorder for MockCmd.
// It keeps a back-reference to the mock so recorded expectations reach the
// same controller.
type MockCmdMockRecorder struct {
	mock *MockCmd
}

// NewMockCmd creates a new mock instance.
// The mock and its recorder are wired to each other and to ctrl.
func NewMockCmd(ctrl *gomock.Controller) *MockCmd {
	mock := &MockCmd{ctrl: ctrl}
	mock.recorder = &MockCmdMockRecorder{mock}
	return mock
}

// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockCmd) EXPECT() *MockCmdMockRecorder {
	return m.recorder
}

// Output mocks base method.
// The call is forwarded to the controller and its results are returned.
func (m *MockCmd) Output() ([]byte, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Output")
	// Comma-ok assertions: a result of another type (for example an untyped
	// nil) yields the zero value instead of a panic.
	ret0, _ := ret[0].([]byte)
	ret1, _ := ret[1].(error)
	return ret0, ret1
}

// Output indicates an expected call of Output.
// The returned *gomock.Call comes straight from the controller.
func (mr *MockCmdMockRecorder) Output() *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Output", reflect.TypeOf((*MockCmd)(nil).Output))
}
15 | */ 16 | 17 | package integration 18 | 19 | import ( 20 | "context" 21 | "errors" 22 | "fmt" 23 | "os" 24 | "strings" 25 | "testing" 26 | "time" 27 | 28 | "github.com/avast/retry-go/v4" 29 | "github.com/prometheus/common/expfmt" 30 | "github.com/stretchr/testify/require" 31 | 32 | "github.com/NVIDIA/dcgm-exporter/pkg/cmd" 33 | ) 34 | 35 | func TestStartAndReadMetrics(t *testing.T) { 36 | if testing.Short() { 37 | t.Skip("skipping test in short mode.") 38 | } 39 | app := cmd.NewApp() 40 | args := os.Args[0:1] 41 | args = append(args, "-f=./testdata/default-counters.csv") // Append a file with default counters 42 | port := getRandomAvailablePort(t) 43 | args = append(args, fmt.Sprintf("-a=:%d", port)) 44 | ctx, cancel := context.WithCancel(context.Background()) 45 | go func(ctx context.Context) { 46 | err := app.Run(args) 47 | require.NoError(t, err) 48 | }(ctx) 49 | 50 | t.Logf("Read metrics from http://localhost:%d/metrics", port) 51 | 52 | metricsResp, _ := retry.DoWithData( 53 | func() (string, error) { 54 | metricsResp, _, err := httpGet(t, fmt.Sprintf("http://localhost:%d/metrics", port)) 55 | if err != nil { 56 | return "", err 57 | } 58 | 59 | if len(metricsResp) == 0 { 60 | return "", errors.New("empty response") 61 | } 62 | return metricsResp, nil 63 | }, 64 | retry.Attempts(10), 65 | retry.MaxDelay(10*time.Second), 66 | ) 67 | 68 | require.NotEmpty(t, metricsResp) 69 | var parser expfmt.TextParser 70 | mf, err := parser.TextToMetricFamilies(strings.NewReader(metricsResp)) 71 | require.NoError(t, err) 72 | require.Greater(t, len(mf), 0, "expected number of metrics more than 0") 73 | cancel() 74 | } 75 | -------------------------------------------------------------------------------- /internal/mocks/pkg/elf/mock_elf.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
// MockELF is a mock of ELF interface.
// Every mocked method is routed through the gomock controller held in ctrl.
type MockELF struct {
	ctrl     *gomock.Controller
	recorder *MockELFMockRecorder // handed out by EXPECT
	isgomock struct{}             // NOTE(review): presumably a marker field emitted by mockgen — confirm
}

// MockELFMockRecorder is the mock recorder for MockELF.
// It keeps a back-reference to the mock so recorded expectations reach the
// same controller.
type MockELFMockRecorder struct {
	mock *MockELF
}

// NewMockELF creates a new mock instance.
// The mock and its recorder are wired to each other and to ctrl.
func NewMockELF(ctrl *gomock.Controller) *MockELF {
	mock := &MockELF{ctrl: ctrl}
	mock.recorder = &MockELFMockRecorder{mock}
	return mock
}

// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockELF) EXPECT() *MockELFMockRecorder {
	return m.recorder
}

// Open mocks base method.
// The call, including name, is forwarded to the controller and its results
// are returned.
func (m *MockELF) Open(name string) (*elf.File, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Open", name)
	// Comma-ok assertions: a result of another type (for example an untyped
	// nil) yields the zero value instead of a panic.
	ret0, _ := ret[0].(*elf.File)
	ret1, _ := ret[1].(error)
	return ret0, ret1
}

// Open indicates an expected call of Open.
// name is typed any, so it is not restricted to a string value.
func (mr *MockELFMockRecorder) Open(name any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Open", reflect.TypeOf((*MockELF)(nil).Open), name)
}
15 | */ 16 | 17 | package integration 18 | 19 | import ( 20 | "encoding/base64" 21 | "io" 22 | "net" 23 | "net/http" 24 | "sync" 25 | "testing" 26 | 27 | "github.com/stretchr/testify/require" 28 | ) 29 | 30 | var randomPortMutex sync.Mutex 31 | 32 | var usedPorts = map[int]struct{}{} 33 | 34 | func getRandomAvailablePort(t *testing.T) int { 35 | randomPortMutex.Lock() 36 | defer randomPortMutex.Unlock() 37 | t.Helper() 38 | retry: 39 | addr, err := net.ResolveTCPAddr("tcp", ":0") 40 | require.NoError(t, err) 41 | l, err := net.ListenTCP("tcp", addr) 42 | require.NoError(t, err) 43 | defer l.Close() 44 | port := l.Addr().(*net.TCPAddr).Port 45 | if _, exist := usedPorts[port]; exist { 46 | goto retry 47 | } 48 | usedPorts[port] = struct{}{} 49 | return port 50 | } 51 | 52 | func httpGet(t *testing.T, url string, customClient ...*http.Client) (string, int, error) { 53 | t.Helper() 54 | 55 | client := http.DefaultClient 56 | 57 | if len(customClient) > 0 { 58 | client = customClient[0] 59 | } 60 | 61 | resp, err := client.Get(url) 62 | if err != nil { 63 | return "", -1, err 64 | } 65 | defer resp.Body.Close() 66 | body, err := io.ReadAll(resp.Body) 67 | if err != nil { 68 | return "", -1, err 69 | } 70 | return string(body), resp.StatusCode, nil 71 | } 72 | 73 | func newRequestWithBasicAuth(t *testing.T, username, password, method string, url string, body io.Reader) *http.Request { 74 | t.Helper() 75 | auth := username + ":" + password 76 | authorizationValue := base64.StdEncoding.EncodeToString([]byte(auth)) 77 | req, err := http.NewRequest(method, url, body) 78 | require.NoError(t, err) 79 | req.Header.Add("Authorization", "Basic "+authorizationValue) 80 | return req 81 | } 82 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribute to the DCGM-Exporter Project 2 | 3 | Want to hack on the NVIDIA DCGM-Exporter Project? 
Awesome!
We only require you to sign your work; the section below describes how.
# End-to-End tests

The end-to-end tests are required to maintain confidence that the dcgm-exporter works correctly
after changes. The tests aim to reproduce a typical deployment scenario in a Kubernetes (K8s) environment and verify
how the following components work together:
* Helm package - the Helm package can deploy the specified dcgm-exporter image;
* Docker image - the Docker image contains all necessary components to run the dcgm-exporter;
* dcgm-exporter - the binary executable starts, reads GPU metrics and produces the expected results.

The basic test executes the following scenario:

1. Connect to the Kubernetes cluster;
2. Create a namespace;
3. Install the dcgm-exporter helm package;
4. The E2E test waits until the dcgm-exporter is up and running;
5. When the dcgm-exporter is up and running, the test deploys a pod that runs a workload on the GPU;
6. The test reads the `/metrics` endpoint output and verifies that the GPU metrics are available and contain labels such as
`namespace`, `container` and `pod`.
If there aren't any errors during the execution of steps 1 to 6, the end-to-end test is considered passed.

New e2e tests can be added in the future.

## Prerequisites

1. NVIDIA GPU-compatible hardware for use with DCGM (Requirements: https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/getting-started.html)
2. A Kubernetes cluster with the NVIDIA Container Toolkit configured (https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/index.html).
For development or local environments, it is recommended to use [minikube](https://minikube.sigs.k8s.io/) with GPU support configured: https://minikube.sigs.k8s.io/docs/tutorials/nvidia/.

## How to run E2E tests

### Scenario: Test the current DCGM-exporter release

The scenario installs the dcgm-exporter with the default configuration defined in the helm package [values](https://github.com/NVIDIA/dcgm-exporter/blob/main/deployment/values.yaml).

```shell
KUBECONFIG="~/.kube/config" make e2e-test
```

### Scenario: Build images, deploy and test DCGM-exporter after changes

1. Build local images;

```shell
cd ../../ # go to the project root directory
make local
```

2. Run tests

```shell
cd tests/e2e # back to the e2e test directory

KUBECONFIG="~/.kube/config" IMAGE_REPOSITORY="nvidia/dcgm-exporter" make e2e-test
```
// invalidLabelCharRE is a regular expression that matches any character that is not a letter, digit, or underscore.
var invalidLabelCharRE = regexp.MustCompile(`[^a-zA-Z0-9_]`)

// WaitWithTimeout blocks until wg is done or timeout elapses, whichever comes
// first. It returns nil when the WaitGroup finished in time and an error
// otherwise.
//
// When the timeout wins, the helper goroutine stays blocked in wg.Wait until
// the WaitGroup eventually completes.
func WaitWithTimeout(wg *sync.WaitGroup, timeout time.Duration) error {
	done := make(chan struct{})
	go func() {
		defer close(done)
		wg.Wait()
	}()

	// An explicit timer (rather than time.After) is released as soon as the
	// function returns instead of lingering until it fires.
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	select {
	case <-done:
		return nil
	case <-timer.C:
		return fmt.Errorf("timeout waiting for WaitGroup")
	}
}

// RandUint64 returns a 64-bit number read from the cryptographic random
// source, or an error when the source cannot be read.
func RandUint64() (uint64, error) {
	var num uint64
	err := binary.Read(rand.Reader, binary.BigEndian, &num)
	if err != nil {
		return 0, fmt.Errorf("failed to generate random 64-bit number; err: %w", err)
	}

	return num, nil
}

// DeepCopy returns a copy of src produced by encoding it with gob and decoding
// the result into a fresh value of the same type. Encoding or decoding
// failures are returned as errors, and a panic raised along the way is
// converted into an error; in all failure cases dst is the zero value of T.
//
// NOTE(review): gob works on exported struct fields, so unexported fields are
// presumably not carried over — confirm before using it on such types.
func DeepCopy[T any](src T) (dst T, err error) {
	var buf bytes.Buffer

	defer func() {
		if r := recover(); r != nil {
			// If there was a panic, return the zero value of T and the error.
			dst = *new(T)
			err = fmt.Errorf("panic occurred: %v", r)
		}
	}()

	// Create an encoder and send a value.
	err = gob.NewEncoder(&buf).Encode(src)
	if err != nil {
		return *new(T), err
	}

	// Decode into the (still zero) named result.
	err = gob.NewDecoder(&buf).Decode(&dst)
	if err != nil {
		return *new(T), err
	}

	return dst, nil
}

// CleanupOnError runs every cleanup function in slice order and returns nil,
// so the result can be assigned back to the variable that held the cleanups.
// Nil entries are skipped instead of causing a panic.
func CleanupOnError(cleanups []func()) []func() {
	for _, cleanup := range cleanups {
		if cleanup == nil {
			continue
		}
		cleanup()
	}

	return nil
}

// SanitizeLabelName replaces every character of s that is not an ASCII letter,
// digit or underscore with an underscore.
func SanitizeLabelName(s string) string {
	return invalidLabelCharRE.ReplaceAllString(s, "_")
}
34 | type MockExec struct { 35 | ctrl *gomock.Controller 36 | recorder *MockExecMockRecorder 37 | isgomock struct{} 38 | } 39 | 40 | // MockExecMockRecorder is the mock recorder for MockExec. 41 | type MockExecMockRecorder struct { 42 | mock *MockExec 43 | } 44 | 45 | // NewMockExec creates a new mock instance. 46 | func NewMockExec(ctrl *gomock.Controller) *MockExec { 47 | mock := &MockExec{ctrl: ctrl} 48 | mock.recorder = &MockExecMockRecorder{mock} 49 | return mock 50 | } 51 | 52 | // EXPECT returns an object that allows the caller to indicate expected use. 53 | func (m *MockExec) EXPECT() *MockExecMockRecorder { 54 | return m.recorder 55 | } 56 | 57 | // Command mocks base method. 58 | func (m *MockExec) Command(name string, arg ...string) exec.Cmd { 59 | m.ctrl.T.Helper() 60 | varargs := []any{name} 61 | for _, a := range arg { 62 | varargs = append(varargs, a) 63 | } 64 | ret := m.ctrl.Call(m, "Command", varargs...) 65 | ret0, _ := ret[0].(exec.Cmd) 66 | return ret0 67 | } 68 | 69 | // Command indicates an expected call of Command. 70 | func (mr *MockExecMockRecorder) Command(name any, arg ...any) *gomock.Call { 71 | mr.mock.ctrl.T.Helper() 72 | varargs := append([]any{name}, arg...) 73 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Command", reflect.TypeOf((*MockExec)(nil).Command), varargs...) 74 | } 75 | -------------------------------------------------------------------------------- /internal/pkg/collector/xid_collector.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package collector 18 | 19 | import ( 20 | "fmt" 21 | "log/slog" 22 | "slices" 23 | 24 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 25 | 26 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" 27 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" 28 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" 29 | ) 30 | 31 | type xidCollector struct { 32 | expCollector 33 | } 34 | 35 | func (c *xidCollector) GetMetrics() (MetricsByCounter, error) { 36 | return c.expCollector.getMetrics() 37 | } 38 | 39 | func NewXIDCollector( 40 | counterList counters.CounterList, 41 | hostname string, 42 | config *appconfig.Config, 43 | deviceWatchList devicewatchlistmanager.WatchList, 44 | ) (Collector, error) { 45 | if !IsDCGMExpXIDErrorsCountEnabled(counterList) { 46 | slog.Error(counters.DCGMExpXIDErrorsCount + " collector is disabled") 47 | return nil, fmt.Errorf(counters.DCGMExpXIDErrorsCount + " collector is disabled") 48 | } 49 | 50 | collector := xidCollector{} 51 | var err error 52 | deviceWatchList.SetDeviceFields([]dcgm.Short{dcgm.DCGM_FI_DEV_XID_ERRORS}) 53 | 54 | collector.expCollector, err = newExpCollector( 55 | counterList.LabelCounters(), 56 | hostname, 57 | config, 58 | deviceWatchList, 59 | ) 60 | if err != nil { 61 | return nil, err 62 | } 63 | 64 | collector.counter = counterList[slices.IndexFunc(counterList, func(c counters.Counter) bool { 65 | return c.FieldName == counters.DCGMExpXIDErrorsCount 66 | })] 67 | 68 | collector.labelFiller = func(metricValueLabels map[string]string, 
entityValue int64) { 69 | metricValueLabels["xid"] = fmt.Sprint(entityValue) 70 | } 71 | 72 | collector.windowSize = config.XIDCountWindowSize 73 | 74 | return &collector, nil 75 | } 76 | 77 | func IsDCGMExpXIDErrorsCountEnabled(counterList counters.CounterList) bool { 78 | return slices.ContainsFunc(counterList, func(c counters.Counter) bool { 79 | return c.FieldName == counters.DCGMExpXIDErrorsCount 80 | }) 81 | } 82 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report_form.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | name: Bug Report 16 | description: File a bug report 17 | labels: ["bug"] 18 | body: 19 | - type: markdown 20 | attributes: 21 | value: | 22 | Thanks for taking the time to fill out this bug report! 23 | - type: input 24 | id: version 25 | attributes: 26 | label: What is the version? 27 | description: Tell us about DCGM-exporter version. 28 | placeholder: "example: 3.3.5-3.4.0" 29 | validations: 30 | required: true 31 | - type: textarea 32 | id: description 33 | attributes: 34 | label: What happened? 35 | description: Tell us what happened and provide as many details as possible, including logs. 
36 | validations: 37 | required: true 38 | - type: textarea 39 | id: expectations 40 | attributes: 41 | label: What did you expect to happen? 42 | description: Tell us about expected behaviour. 43 | validations: 44 | required: true 45 | - type: textarea 46 | id: model 47 | attributes: 48 | label: What is the GPU model? 49 | description: Tell us about the hardware configuration of the GPU, including the output of 'nvidia-smi' 50 | - type: textarea 51 | id: environment 52 | attributes: 53 | label: What is the environment? 54 | description: Is DCGM-Exporter running on bare metal or in a virtual environment, container, pod, etc? 55 | - type: textarea 56 | id: deployment 57 | attributes: 58 | label: How did you deploy the dcgm-exporter and what is the configuration? 59 | description: Tell us how you deployed DCGM-Exporter. Did you use helm, build from source or use the GPU Operator? 60 | - type: textarea 61 | id: steps-to-reproduce 62 | attributes: 63 | label: How to reproduce the issue? 64 | description: Clear and concise steps to reproduce an issue can help everyone by allowing us to identify and fix problems more quickly. 65 | - type: textarea 66 | id: misc 67 | attributes: 68 | label: Anything else we need to know? 69 | description: Any small detail can help. 70 | -------------------------------------------------------------------------------- /internal/pkg/os/os.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
// OS is the subset of the standard library "os" package used by this project,
// expressed as an interface so it can be replaced by a generated mock.
//
//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/mock_os.go -package=os -copyright_file=../../../hack/header.txt . OS
//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/mock_dir_entry.go -package=os -copyright_file=../../../hack/header.txt os DirEntry
//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/os/mock_file_info.go -package=os -copyright_file=../../../hack/header.txt io/fs FileInfo
type OS interface {
	CreateTemp(dir, pattern string) (*os.File, error)
	Getenv(key string) string
	Hostname() (string, error)
	IsNotExist(err error) bool
	MkdirTemp(dir, pattern string) (string, error)
	Open(name string) (*os.File, error)
	Remove(name string) error
	RemoveAll(path string) error
	Stat(name string) (os.FileInfo, error)
	TempDir() string
	ReadDir(name string) ([]os.DirEntry, error)
	Exit(code int)
}

// RealOS implements OS by delegating every call to the standard library
// "os" package. It carries no state.
type RealOS struct{}

// Hostname delegates to os.Hostname.
func (RealOS) Hostname() (string, error) {
	return os.Hostname()
}

// Getenv delegates to os.Getenv.
func (RealOS) Getenv(key string) string {
	return os.Getenv(key)
}

// Stat delegates to os.Stat.
func (RealOS) Stat(name string) (os.FileInfo, error) {
	return os.Stat(name)
}

// IsNotExist delegates to os.IsNotExist.
func (RealOS) IsNotExist(err error) bool {
	return os.IsNotExist(err)
}

// Open delegates to os.Open.
func (RealOS) Open(name string) (*os.File, error) {
	return os.Open(name)
}

// MkdirTemp delegates to os.MkdirTemp.
func (RealOS) MkdirTemp(dir, pattern string) (string, error) {
	return os.MkdirTemp(dir, pattern)
}

// RemoveAll delegates to os.RemoveAll.
func (RealOS) RemoveAll(path string) error {
	return os.RemoveAll(path)
}

// CreateTemp delegates to os.CreateTemp.
func (RealOS) CreateTemp(dir, pattern string) (*os.File, error) {
	return os.CreateTemp(dir, pattern)
}

// TempDir delegates to os.TempDir.
func (RealOS) TempDir() string {
	return os.TempDir()
}

// Remove delegates to os.Remove.
func (RealOS) Remove(name string) error {
	return os.Remove(name)
}

// ReadDir delegates to os.ReadDir.
func (RealOS) ReadDir(name string) ([]os.DirEntry, error) {
	return os.ReadDir(name)
}

// Exit delegates to os.Exit and therefore does not return.
func (RealOS) Exit(code int) { os.Exit(code) }
36 | func (c dcgmLibExistsRule) Validate() error { 37 | // On Ubuntu, ldconfig is a wrapper around ldconfig.real 38 | ldconfigPath := fmt.Sprintf("/sbin/%s.real", ldconfig) 39 | if _, err := os.Stat(ldconfigPath); err != nil { 40 | ldconfigPath = "/sbin/" + ldconfig 41 | } 42 | // Get list of shared libraries. See: man ldconfig 43 | out, err := exec.Command(ldconfigPath, ldconfigParam).Output() 44 | if err != nil { 45 | return err 46 | } 47 | 48 | for _, match := range rxLDCacheEntry.FindAllSubmatch(out, -1) { 49 | libName := strings.TrimSpace(string(match[1])) 50 | if libName == libdcgmco { 51 | libPath := strings.TrimSpace(string(match[2])) 52 | selfMachine, err := c.readELF(procSelfExe) 53 | if err != nil { 54 | return err 55 | } 56 | libMachine, err := c.readELF(libPath) 57 | if err != nil { 58 | // When datacenter-gpu-manager uninstalled, the ldconfig -p may return that the libdcgm.so.4 is present, 59 | // but the library file was removed. 60 | slog.Error(err.Error()) 61 | return errLibdcgmNotFound 62 | } 63 | 64 | if selfMachine != libMachine { 65 | return fmt.Errorf("the %s library architecture mismatch with the system; wanted: %s, received: %s", 66 | libdcgmco, selfMachine, libMachine) 67 | } 68 | 69 | return nil 70 | } 71 | } 72 | 73 | return errLibdcgmNotFound 74 | } 75 | 76 | func (c dcgmLibExistsRule) readELF(name string) (debugelf.Machine, error) { 77 | elfFile, err := elf.Open(name) 78 | if err != nil { 79 | return 0, fmt.Errorf("could not open %s: %v", name, err) 80 | } 81 | if err := elfFile.Close(); err != nil { 82 | slog.Warn(fmt.Sprintf("could not close ELF: %v", err)) 83 | } 84 | 85 | return elfFile.Machine, nil 86 | } 87 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request_form.yml: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Feature Request Form 17 | description: Request new or improved functionality or changes to existing functionality 18 | labels: ["enhancement"] 19 | 20 | body: 21 | - type: markdown 22 | attributes: 23 | value: | 24 | Thanks for taking the time to fill out this feature request! 25 | 26 | - type: dropdown 27 | id: new_or_improvement 28 | attributes: 29 | label: Is this a new feature, an improvement, or a change to existing functionality? 30 | options: 31 | - New Feature 32 | - Improvement 33 | - Change 34 | validations: 35 | required: true 36 | 37 | - type: textarea 38 | id: problem 39 | attributes: 40 | label: Please provide a clear description of the problem this feature solves 41 | description: Real usage examples are especially helpful, non-code. 42 | validations: 43 | required: true 44 | 45 | - type: textarea 46 | id: Feature_Description 47 | attributes: 48 | label: Feature Description 49 | description: Provide a clear description of the requested feature. 50 | placeholder: > 51 | For new feature requests, please use one of the following formats to describe the feature 52 | 1. From End-user perspective, use the following user story format 53 | As a , I , . 54 | 2. 
From System perspective, use the following EARS format 55 | shall 56 | validations: 57 | required: true 58 | - type: textarea 59 | id: solution 60 | attributes: 61 | label: Describe your ideal solution 62 | description: Please describe the functionality you would like added. 63 | placeholder: > 64 | How would you see an ideal solution? 65 | validations: 66 | required: true 67 | - type: textarea 68 | id: misc 69 | attributes: 70 | label: Additional context 71 | description: Add any other context, code examples, or references to existing implementations about the feature request here. 72 | -------------------------------------------------------------------------------- /tests/e2e/main_test.go: -------------------------------------------------------------------------------- 1 | //go:build e2e 2 | 3 | /* 4 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
// runID is a UUID generated once per test binary execution.
// NOTE(review): presumably used to tag resources created by this run; its
// consumers are not visible in this file.
var runID = uuid.New()

// testContext holds the command-line configuration of the e2e suite. Its
// fields are populated by the flags registered in TestMain.
var testContext = testContextType{}

// TestMain registers the e2e command-line flags, parses them into testContext,
// runs the tests and exits the process with the code returned by m.Run.
func TestMain(m *testing.M) {
	flag.StringVar(&testContext.kubeconfig,
		"kubeconfig",
		"~/.kube/config",
		"path to the kubeconfig file.")

	flag.StringVar(&testContext.namespace,
		"namespace",
		"dcgm-exporter",
		"Namespace name to use for the DCGM-exporter deployment")

	flag.StringVar(&testContext.chart,
		"chart",
		"",
		"Helm chart to use")

	flag.StringVar(&testContext.imageRepository,
		"image-repository",
		"",
		"DCGM-exporter image repository")

	flag.StringVar(&testContext.imageTag,
		"image-tag",
		"",
		"DCGM-exporter image tag to use")

	flag.StringVar(&testContext.arguments,
		"arguments",
		"",
		`DCGM-exporter command line arguments. Example: -arguments="{-f=/etc/dcgm-exporter/default-counters.csv}"`)

	flag.BoolVar(&testContext.noCleanup,
		"no-cleanup",
		false,
		`Skip clean up after tests execution`)

	flag.StringVar(&testContext.runtimeClass,
		"runtime-class",
		"",
		"Runtime Class to use for the DCGM-exporter deployment and workload pods")

	// Flags must be parsed before m.Run so the specs see the final values.
	flag.Parse()

	os.Exit(m.Run())
}

// createGinkgoConfig returns Ginkgo's current suite and reporter
// configuration with spec randomization enabled for all specs.
func createGinkgoConfig() (types.SuiteConfig, types.ReporterConfig) {
	// fetch the current config
	suiteConfig, reporterConfig := ginkgo.GinkgoConfiguration()
	// Randomize specs as well as suites
	suiteConfig.RandomizeAllSpecs = true
	return suiteConfig, reporterConfig
}

// TestE2E is the go test entry point of the suite: it wires Gomega failures
// into Ginkgo and runs all registered specs with the configuration returned by
// createGinkgoConfig.
func TestE2E(t *testing.T) {
	gomega.RegisterFailHandler(ginkgo.Fail)

	// Run tests through the Ginkgo runner with output to console + JUnit for Jenkins
	suiteConfig, reporterConfig := createGinkgoConfig()
	ginkgo.RunSpecs(t, "DCGM-exporter e2e suite", suiteConfig, reporterConfig)
}
14 | 15 | apiVersion: apps/v1 16 | kind: DaemonSet 17 | metadata: 18 | name: "dcgm-exporter" 19 | labels: 20 | app.kubernetes.io/name: "dcgm-exporter" 21 | app.kubernetes.io/version: "4.7.1" 22 | spec: 23 | updateStrategy: 24 | type: RollingUpdate 25 | selector: 26 | matchLabels: 27 | app.kubernetes.io/name: "dcgm-exporter" 28 | app.kubernetes.io/version: "4.7.1" 29 | template: 30 | metadata: 31 | labels: 32 | app.kubernetes.io/name: "dcgm-exporter" 33 | app.kubernetes.io/version: "4.7.1" 34 | name: "dcgm-exporter" 35 | spec: 36 | automountServiceAccountToken: false 37 | containers: 38 | - image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.4.2-4.7.1-ubuntu22.04" 39 | env: 40 | - name: "DCGM_EXPORTER_LISTEN" 41 | value: ":9400" 42 | - name: "DCGM_EXPORTER_KUBERNETES" 43 | value: "true" 44 | name: "dcgm-exporter" 45 | ports: 46 | - name: "metrics" 47 | containerPort: 9400 48 | securityContext: 49 | runAsNonRoot: false 50 | runAsUser: 0 51 | capabilities: 52 | add: ["SYS_ADMIN"] # Required for profiling metrics (DCGM_FI_PROF_*) 53 | drop: ["ALL"] 54 | allowPrivilegeEscalation: false 55 | # readOnlyRootFilesystem: true # Enable if tmpfs volumes are configured 56 | # Note: For non-root without profiling metrics, use: 57 | # runAsNonRoot: true, runAsUser: 1000, and remove SYS_ADMIN from capabilities.add 58 | volumeMounts: 59 | - name: "pod-gpu-resources" 60 | readOnly: true 61 | mountPath: "/var/lib/kubelet/pod-resources" 62 | resources: 63 | limits: 64 | cpu: 200m 65 | memory: 256Mi 66 | requests: 67 | cpu: 100m 68 | memory: 128Mi 69 | volumes: 70 | - name: "pod-gpu-resources" 71 | hostPath: 72 | path: "/var/lib/kubelet/pod-resources" 73 | 74 | --- 75 | 76 | kind: Service 77 | apiVersion: v1 78 | metadata: 79 | name: "dcgm-exporter" 80 | labels: 81 | app.kubernetes.io/name: "dcgm-exporter" 82 | app.kubernetes.io/version: "4.7.1" 83 | spec: 84 | selector: 85 | app.kubernetes.io/name: "dcgm-exporter" 86 | app.kubernetes.io/version: "4.7.1" 87 | ports: 88 | - name: "metrics" 89 | 
port: 9400 90 | -------------------------------------------------------------------------------- /internal/mocks/pkg/collector/mock_collector.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by MockGen. DO NOT EDIT. 16 | // Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/collector (interfaces: Collector) 17 | // 18 | // Generated by this command: 19 | // 20 | // mockgen -destination=../../mocks/pkg/collector/mock_collector.go -package=collector -copyright_file=../../../hack/header.txt . Collector 21 | // 22 | 23 | // Package collector is a generated GoMock package. 24 | package collector 25 | 26 | import ( 27 | reflect "reflect" 28 | 29 | collector "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" 30 | gomock "go.uber.org/mock/gomock" 31 | ) 32 | 33 | // MockCollector is a mock of Collector interface. 34 | type MockCollector struct { 35 | ctrl *gomock.Controller 36 | recorder *MockCollectorMockRecorder 37 | isgomock struct{} 38 | } 39 | 40 | // MockCollectorMockRecorder is the mock recorder for MockCollector. 41 | type MockCollectorMockRecorder struct { 42 | mock *MockCollector 43 | } 44 | 45 | // NewMockCollector creates a new mock instance. 
46 | func NewMockCollector(ctrl *gomock.Controller) *MockCollector { 47 | mock := &MockCollector{ctrl: ctrl} 48 | mock.recorder = &MockCollectorMockRecorder{mock} 49 | return mock 50 | } 51 | 52 | // EXPECT returns an object that allows the caller to indicate expected use. 53 | func (m *MockCollector) EXPECT() *MockCollectorMockRecorder { 54 | return m.recorder 55 | } 56 | 57 | // Cleanup mocks base method. 58 | func (m *MockCollector) Cleanup() { 59 | m.ctrl.T.Helper() 60 | m.ctrl.Call(m, "Cleanup") 61 | } 62 | 63 | // Cleanup indicates an expected call of Cleanup. 64 | func (mr *MockCollectorMockRecorder) Cleanup() *gomock.Call { 65 | mr.mock.ctrl.T.Helper() 66 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Cleanup", reflect.TypeOf((*MockCollector)(nil).Cleanup)) 67 | } 68 | 69 | // GetMetrics mocks base method. 70 | func (m *MockCollector) GetMetrics() (collector.MetricsByCounter, error) { 71 | m.ctrl.T.Helper() 72 | ret := m.ctrl.Call(m, "GetMetrics") 73 | ret0, _ := ret[0].(collector.MetricsByCounter) 74 | ret1, _ := ret[1].(error) 75 | return ret0, ret1 76 | } 77 | 78 | // GetMetrics indicates an expected call of GetMetrics. 79 | func (mr *MockCollectorMockRecorder) GetMetrics() *gomock.Call { 80 | mr.mock.ctrl.T.Helper() 81 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetMetrics", reflect.TypeOf((*MockCollector)(nil).GetMetrics)) 82 | } 83 | -------------------------------------------------------------------------------- /internal/mocks/pkg/nvmlprovider/mock_client.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by MockGen. DO NOT EDIT. 16 | // Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider (interfaces: NVML) 17 | // 18 | // Generated by this command: 19 | // 20 | // mockgen -destination=../../mocks/pkg/nvmlprovider/mock_client.go -package=nvmlprovider -copyright_file=../../../hack/header.txt . NVML 21 | // 22 | 23 | // Package nvmlprovider is a generated GoMock package. 24 | package nvmlprovider 25 | 26 | import ( 27 | reflect "reflect" 28 | 29 | nvmlprovider "github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider" 30 | gomock "go.uber.org/mock/gomock" 31 | ) 32 | 33 | // MockNVML is a mock of NVML interface. 34 | type MockNVML struct { 35 | ctrl *gomock.Controller 36 | recorder *MockNVMLMockRecorder 37 | isgomock struct{} 38 | } 39 | 40 | // MockNVMLMockRecorder is the mock recorder for MockNVML. 41 | type MockNVMLMockRecorder struct { 42 | mock *MockNVML 43 | } 44 | 45 | // NewMockNVML creates a new mock instance. 46 | func NewMockNVML(ctrl *gomock.Controller) *MockNVML { 47 | mock := &MockNVML{ctrl: ctrl} 48 | mock.recorder = &MockNVMLMockRecorder{mock} 49 | return mock 50 | } 51 | 52 | // EXPECT returns an object that allows the caller to indicate expected use. 53 | func (m *MockNVML) EXPECT() *MockNVMLMockRecorder { 54 | return m.recorder 55 | } 56 | 57 | // Cleanup mocks base method. 58 | func (m *MockNVML) Cleanup() { 59 | m.ctrl.T.Helper() 60 | m.ctrl.Call(m, "Cleanup") 61 | } 62 | 63 | // Cleanup indicates an expected call of Cleanup. 
FROM nvcr.io/nvidia/cuda:12.9.1-base-ubuntu22.04
ARG GOLANG_VERSION=1.24.5
ARG USERNAME=developer
ARG USER_UID=1000
ARG USER_GID=1000
# Create the user $USERNAME (default 'developer') with UID=$USER_UID in a group of the
# same name with GID=$USER_GID, and add the user to the 'sudo' group.
# The UID must come from USER_UID (not USER_GID) so overriding either build arg takes effect.
RUN groupadd -g $USER_GID $USERNAME && \
    useradd -m -u $USER_UID -g $USERNAME -s /bin/bash $USERNAME && \
    usermod -aG sudo $USERNAME
# Allow 'developer' to use sudo without a password
RUN echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

RUN --mount=type=cache,target=/var/cache/apt \
    set -eux; \
    apt-get update; \
    apt-get install -y --no-install-recommends \
    git \
    ca-certificates \
    g++ \
    gcc \
    libc6-dev \
    make \
    pkg-config \
    wget \
    datacenter-gpu-manager-4-core \
    libcap2-bin \
    && install -m 0755 -d /etc/apt/keyrings \
    && wget -O /etc/apt/keyrings/docker.asc https://download.docker.com/linux/ubuntu/gpg \
    && chmod a+r /etc/apt/keyrings/docker.asc \
    && echo \
    "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
    $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
    tee /etc/apt/sources.list.d/docker.list > /dev/null \
    && apt-get update \
    && apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io docker-buildx-plugin \
    && apt-get autoremove -y \
    && rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \
    # DCGM exporter doesn't use libdcgm_cublas_proxy*.so.
    && rm -rf /usr/lib/x86_64-linux-gnu/libdcgm_cublas_proxy*.so \
    && rm -rf /var/lib/apt/lists/*

# Install the Go toolchain matching GOLANG_VERSION for the image architecture.
RUN set -eux; \
    arch="$(dpkg --print-architecture)"; arch="${arch##*-}"; \
    url=; \
    echo "$arch"; \
    case "$arch" in \
    'amd64') \
    url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \
    ;; \
    'arm64') \
    url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \
    ;; \
    *) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \
    esac; \
    build=; \
    if [ -z "$url" ]; then \
    # https://github.com/golang/go/issues/38536#issuecomment-616897960
    build=1; \
    url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \
    echo >&2; \
    echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \
    echo >&2; \
    fi; \
    wget -O go.tgz "$url" --progress=dot:giga; \
    tar -C /usr/local -xzf go.tgz; \
    rm go.tgz
ENV GOTOOLCHAIN=local
ENV GOPATH=/go
ENV PATH=$GOPATH/bin:$PATH
RUN mkdir -p "$GOPATH/src" "$GOPATH/bin" && chmod -R 1777 "$GOPATH"
ENV PATH=$PATH:/usr/local/go/bin

# Required for DCGM metrics
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,compat32
# disable all constraints on the configurations required by NVIDIA container toolkit
ENV NVIDIA_DISABLE_REQUIRE="true"
ENV NVIDIA_VISIBLE_DEVICES=all
DCGM 18 | 19 | package dcgmprovider 20 | 21 | import ( 22 | "time" 23 | 24 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 25 | ) 26 | 27 | var _ DCGM = &dcgmProvider{} 28 | 29 | type DCGM interface { 30 | AddEntityToGroup(dcgm.GroupHandle, dcgm.Field_Entity_Group, uint) error 31 | AddLinkEntityToGroup(dcgm.GroupHandle, uint, dcgm.Field_Entity_Group, uint) error 32 | CreateFakeEntities(entities []dcgm.MigHierarchyInfo) ([]uint, error) 33 | CreateGroup(string) (dcgm.GroupHandle, error) 34 | DestroyGroup(groupID dcgm.GroupHandle) error 35 | EntitiesGetLatestValues([]dcgm.GroupEntityPair, []dcgm.Short, uint) ([]dcgm.FieldValue_v2, error) 36 | EntityGetLatestValues(dcgm.Field_Entity_Group, uint, []dcgm.Short) ([]dcgm.FieldValue_v1, error) 37 | Fv2_String(fv dcgm.FieldValue_v2) string 38 | FieldGetByID(dcgm.Short) dcgm.FieldMeta 39 | FieldGroupCreate(string, []dcgm.Short) (dcgm.FieldHandle, error) 40 | FieldGroupDestroy(dcgm.FieldHandle) error 41 | GetAllDeviceCount() (uint, error) 42 | GetCPUHierarchy() (dcgm.CPUHierarchy_v1, error) 43 | GetDeviceInfo(uint) (dcgm.Device, error) 44 | GetEntityGroupEntities(entityGroup dcgm.Field_Entity_Group) ([]uint, error) 45 | GetGPUInstanceHierarchy() (dcgm.MigHierarchy_v2, error) 46 | GetNvLinkLinkStatus() ([]dcgm.NvLinkStatus, error) 47 | GetSupportedDevices() ([]uint, error) 48 | GetSupportedMetricGroups(uint) ([]dcgm.MetricGroup, error) 49 | GetValuesSince(dcgm.GroupHandle, dcgm.FieldHandle, time.Time) ([]dcgm.FieldValue_v2, time.Time, error) 50 | GroupAllGPUs() dcgm.GroupHandle 51 | InjectFieldValue(gpu uint, fieldID dcgm.Short, fieldType uint, status int, ts int64, value interface{}) error 52 | LinkGetLatestValues(uint, dcgm.Field_Entity_Group, uint, []dcgm.Short) ([]dcgm.FieldValue_v1, error) 53 | NewDefaultGroup(string) (dcgm.GroupHandle, error) 54 | UpdateAllFields() error 55 | WatchFieldsWithGroupEx(dcgm.FieldHandle, dcgm.GroupHandle, int64, float64, int32) error 56 | Cleanup() 57 | HealthSet(groupID dcgm.GroupHandle, systems 
dcgm.HealthSystem) error 58 | HealthGet(groupID dcgm.GroupHandle) (dcgm.HealthSystem, error) 59 | HealthCheck(groupID dcgm.GroupHandle) (dcgm.HealthResponse, error) 60 | GetGroupInfo(groupID dcgm.GroupHandle) (*dcgm.GroupInfo, error) 61 | GetNvLinkP2PStatus() (dcgm.NvLinkP2PStatus, error) 62 | } 63 | -------------------------------------------------------------------------------- /internal/mocks/pkg/transformation/mock_transformer.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by MockGen. DO NOT EDIT. 16 | // Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/transformations (interfaces: Transform) 17 | // 18 | // Generated by this command: 19 | // 20 | // mockgen -destination=../../mocks/pkg/transformations/mock_transformer.go -package=transformation -copyright_file=../../../hack/header.txt . Transform 21 | // 22 | 23 | // Package transformation is a generated GoMock package. 24 | package transformation 25 | 26 | import ( 27 | reflect "reflect" 28 | 29 | gomock "go.uber.org/mock/gomock" 30 | 31 | collector "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" 32 | deviceinfo "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" 33 | ) 34 | 35 | // MockTransform is a mock of Transform interface. 
type MockTransform struct {
	ctrl     *gomock.Controller         // every mocked call is dispatched through this controller
	recorder *MockTransformMockRecorder // handed out by EXPECT to register expected calls
}

// MockTransformMockRecorder is the mock recorder for MockTransform.
type MockTransformMockRecorder struct {
	mock *MockTransform
}

// NewMockTransform creates a new mock instance.
// The returned mock and its recorder point at each other, so expectations
// recorded through EXPECT are matched against calls made on the same mock.
func NewMockTransform(ctrl *gomock.Controller) *MockTransform {
	mock := &MockTransform{ctrl: ctrl}
	mock.recorder = &MockTransformMockRecorder{mock}
	return mock
}

// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockTransform) EXPECT() *MockTransformMockRecorder {
	return m.recorder
}

// Name mocks base method.
// It returns whatever string the matching expectation supplies; a missing or
// non-string return value yields "".
func (m *MockTransform) Name() string {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Name")
	ret0, _ := ret[0].(string)
	return ret0
}

// Name indicates an expected call of Name.
func (mr *MockTransformMockRecorder) Name() *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockTransform)(nil).Name))
}

// Process mocks base method.
// It forwards both arguments to the controller and returns the error supplied
// by the matching expectation (nil when the expectation returns no error).
func (m *MockTransform) Process(arg0 collector.MetricsByCounter, arg1 deviceinfo.Provider) error {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Process", arg0, arg1)
	ret0, _ := ret[0].(error)
	return ret0
}

// Process indicates an expected call of Process.
// arg0 and arg1 may be concrete values or gomock matchers.
func (mr *MockTransformMockRecorder) Process(arg0, arg1 any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Process", reflect.TypeOf((*MockTransform)(nil).Process), arg0, arg1)
}
All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package stdout 18 | 19 | import ( 20 | "bytes" 21 | "context" 22 | "fmt" 23 | "os" 24 | "strings" 25 | "testing" 26 | 27 | "github.com/stretchr/testify/assert" 28 | ) 29 | 30 | func TestCapture(t *testing.T) { 31 | type testCase struct { 32 | name string 33 | logMessage string 34 | assert func(t *testing.T, str string) 35 | } 36 | 37 | testCases := []testCase{ 38 | { 39 | name: "function writes an arbitrary string into /dev/stdout", 40 | logMessage: "hello from dcgm", 41 | assert: func(t *testing.T, str string) { 42 | assert.Equal(t, "hello from dcgm", strings.TrimSpace(str)) 43 | }, 44 | }, 45 | { 46 | name: "function writes an DCGM log entry string into /dev/stdout", 47 | logMessage: "2024-02-07 18:01:05.641 INFO [517155:517155] Linux 4.15.0-180-generic [{anonymous}::StartEmbeddedV2]", 48 | assert: func(t *testing.T, str string) { 49 | assert.Contains(t, strings.TrimSpace(str), "Linux 4.15.0-180-generic") 50 | }, 51 | }, 52 | { 53 | name: "function writes an DCGM log entry string with a valid date only", 54 | logMessage: "2024-02-07 18:01:05.641", 55 | assert: func(t *testing.T, str string) { 56 | assert.Equal(t, "2024-02-07 18:01:05.641", strings.TrimSpace(str)) 57 | }, 58 | }, 59 | } 60 | 61 | for _, tc := range testCases { 62 | t.Run(tc.name, func(t *testing.T) { 63 | // Create a buffer to capture stdout output 64 | var buf bytes.Buffer 
65 | 66 | // Save the original stdout 67 | stdout := os.Stdout 68 | 69 | // Create a pipe to redirect stdout 70 | r, w, err := os.Pipe() 71 | assert.NoError(t, err) 72 | 73 | os.Stdout = w // Redirect stdout to the write end of the pipe 74 | 75 | ctx, cancel := context.WithCancel(context.Background()) 76 | err = Capture(ctx, func() error { 77 | fmt.Println(tc.logMessage) 78 | return nil 79 | }) 80 | 81 | assert.NoError(t, err) 82 | 83 | // Close the write end of the pipe to allow reading all data 84 | _ = w.Close() 85 | os.Stdout = stdout // Restore original stdout 86 | 87 | // Read from the pipe directly into the buffer 88 | _, err = buf.ReadFrom(r) 89 | assert.NoError(t, err) 90 | if tc.assert != nil { 91 | tc.assert(t, buf.String()) 92 | } 93 | cancel() 94 | }) 95 | } 96 | } 97 | 98 | func TestCaptureWithCGO(t *testing.T) { 99 | testCaptureWithCGO(t) 100 | } 101 | -------------------------------------------------------------------------------- /tests/docker/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# Go binary used to run the test suite; override to pin a toolchain.
GO_CMD ?= go

# Default image configuration
# Note: FULL_VERSION should match root Makefile's DCGM_VERSION-VERSION format
# This gets updated automatically by 'make update-version' from root
REGISTRY ?= nvidia
FULL_VERSION ?= 4.4.2-4.7.1

# Override specific images (optional)
# If not set, defaults to: $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-<variant>
# NOTE(review): the text after the trailing '-' was missing from this comment;
# presumably the test suite appends a per-image variant suffix - confirm against
# docker_suite_test.go.
IMAGE_UBUNTU ?=
IMAGE_UBI ?=
IMAGE_DISTROLESS ?=

# The docker-test* targets below pass the image selection to `go test` through
# the REGISTRY, VERSION and IMAGE_* environment variables. The single-image
# targets blank out the other two IMAGE_* variables.
.PHONY: docker-test
docker-test: ## Run tests for all configured images
	@echo "Running Docker image tests..."
	@REGISTRY=$(REGISTRY) VERSION=$(FULL_VERSION) \
	IMAGE_UBUNTU=$(IMAGE_UBUNTU) IMAGE_UBI=$(IMAGE_UBI) IMAGE_DISTROLESS=$(IMAGE_DISTROLESS) \
	$(GO_CMD) test --tags=docker -v . \
	-args \
	--ginkgo.v \
	--ginkgo.no-color

.PHONY: docker-test-ubuntu
docker-test-ubuntu: ## Test only Ubuntu image
	@echo "Testing Ubuntu image only..."
	@REGISTRY=$(REGISTRY) VERSION=$(FULL_VERSION) \
	IMAGE_UBUNTU=$(IMAGE_UBUNTU) IMAGE_UBI="" IMAGE_DISTROLESS="" \
	$(GO_CMD) test --tags=docker -v . \
	-args \
	--ginkgo.v \
	--ginkgo.no-color

.PHONY: docker-test-ubi
docker-test-ubi: ## Test only UBI image
	@echo "Testing UBI image only..."
	@REGISTRY=$(REGISTRY) VERSION=$(FULL_VERSION) \
	IMAGE_UBUNTU="" IMAGE_UBI=$(IMAGE_UBI) IMAGE_DISTROLESS="" \
	$(GO_CMD) test --tags=docker -v . \
	-args \
	--ginkgo.v \
	--ginkgo.no-color

.PHONY: docker-test-distroless
docker-test-distroless: ## Test only distroless image
	@echo "Testing distroless image only..."
	@REGISTRY=$(REGISTRY) VERSION=$(FULL_VERSION) \
	IMAGE_UBUNTU="" IMAGE_UBI="" IMAGE_DISTROLESS=$(IMAGE_DISTROLESS) \
	$(GO_CMD) test --tags=docker -v . \
	-args \
	--ginkgo.v \
	--ginkgo.no-color

# NOTE(review): unlike the targets above, docker-test-verbose and
# docker-test-focus do not set REGISTRY, VERSION or IMAGE_* for `go test`, so
# the defaults defined in this Makefile are not forwarded (only values given on
# the make command line or in the environment reach the tests). Confirm whether
# that is intentional.
.PHONY: docker-test-verbose
docker-test-verbose: ## Run Docker image tests with verbose output
	@$(GO_CMD) test --tags=docker -v . \
	-args \
	--ginkgo.v \
	--ginkgo.vv \
	--ginkgo.trace

.PHONY: docker-test-focus
docker-test-focus: ## Run specific test cases (use FOCUS="pattern")
	@$(GO_CMD) test --tags=docker -v . \
	-args \
	--ginkgo.v \
	--ginkgo.focus="$(FOCUS)"

# Prints every target that carries a '## description' suffix, sorted by name.
.PHONY: help
help: ## Display this help
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
Precedence: .Values.fullnameOverride wins; otherwise, if the release name already
contains the chart name (or .Values.nameOverride), the release name is used as is;
otherwise the result is "<release name>-<name>".
*/}}
{{- define "dcgm-exporter.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}


{{/*
Allow the release namespace to be overridden for multi-namespace deployments in combined charts.
Returns .Values.namespaceOverride when it is set, otherwise the release namespace.
*/}}
{{- define "dcgm-exporter.namespace" -}}
{{- if .Values.namespaceOverride -}}
{{- .Values.namespaceOverride -}}
{{- else -}}
{{- .Release.Namespace -}}
{{- end -}}
{{- end -}}

{{/*
Create chart name and version as used by the chart label.
"+" is replaced with "_" and the result is truncated to 63 characters.
*/}}
{{- define "dcgm-exporter.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Common labels: the chart label, the selector labels, the app version (only when
the chart defines one) and the managing service.
*/}}
{{- define "dcgm-exporter.labels" -}}
helm.sh/chart: {{ include "dcgm-exporter.chart" . }}
{{ include "dcgm-exporter.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end -}}

{{/*
Selector labels: name, instance (the release name) and component.
*/}}
{{- define "dcgm-exporter.selectorLabels" -}}
app.kubernetes.io/name: {{ include "dcgm-exporter.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/component: {{ include "dcgm-exporter.name" . }}
{{- end -}}

{{/*
Create the name of the service account to use.
When the chart creates the service account, the name defaults to the full name;
otherwise it defaults to "default". .Values.serviceAccount.name overrides both.
*/}}
{{- define "dcgm-exporter.serviceAccountName" -}}
{{- if .Values.serviceAccount.create -}}
    {{ default (include "dcgm-exporter.fullname" .) .Values.serviceAccount.name }}
{{- else -}}
    {{ default "default" .Values.serviceAccount.name }}
{{- end -}}
{{- end -}}


{{/*
Create the name of the TLS secret to use.
.Values.tlsServerConfig.existingSecret is rendered through tpl, so it may itself
contain template expressions; when it is unset the name is "<fullname>-tls".
*/}}
{{- define "dcgm-exporter.tlsCertsSecretName" -}}
{{- if .Values.tlsServerConfig.existingSecret -}}
    {{- printf "%s" (tpl .Values.tlsServerConfig.existingSecret $) -}}
{{- else -}}
    {{ printf "%s-tls" (include "dcgm-exporter.fullname" .) }}
{{- end -}}
{{- end -}}


{{/*
Create the name of the web-config ConfigMap to use ("<fullname>-web-config.yml").
*/}}
{{- define "dcgm-exporter.webConfigConfigMap" -}}
    {{ printf "%s-web-config.yml" (include "dcgm-exporter.fullname" .) }}
{{- end -}}
// MockTransform is a mock of Transform interface.
type MockTransform struct {
	ctrl     *gomock.Controller         // every mocked call is dispatched through this controller
	recorder *MockTransformMockRecorder // handed out by EXPECT to register expected calls
	isgomock struct{}                   // zero-size marker field emitted by mockgen
}

// MockTransformMockRecorder is the mock recorder for MockTransform.
type MockTransformMockRecorder struct {
	mock *MockTransform
}

// NewMockTransform creates a new mock instance.
// The returned mock and its recorder point at each other, so expectations
// recorded through EXPECT are matched against calls made on the same mock.
func NewMockTransform(ctrl *gomock.Controller) *MockTransform {
	mock := &MockTransform{ctrl: ctrl}
	mock.recorder = &MockTransformMockRecorder{mock}
	return mock
}

// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockTransform) EXPECT() *MockTransformMockRecorder {
	return m.recorder
}

// Name mocks base method.
// It returns whatever string the matching expectation supplies; a missing or
// non-string return value yields "".
func (m *MockTransform) Name() string {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Name")
	ret0, _ := ret[0].(string)
	return ret0
}

// Name indicates an expected call of Name.
func (mr *MockTransformMockRecorder) Name() *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockTransform)(nil).Name))
}
73 | func (m *MockTransform) Process(metrics collector.MetricsByCounter, deviceInfo deviceinfo.Provider) error { 74 | m.ctrl.T.Helper() 75 | ret := m.ctrl.Call(m, "Process", metrics, deviceInfo) 76 | ret0, _ := ret[0].(error) 77 | return ret0 78 | } 79 | 80 | // Process indicates an expected call of Process. 81 | func (mr *MockTransformMockRecorder) Process(metrics, deviceInfo any) *gomock.Call { 82 | mr.mock.ctrl.T.Helper() 83 | return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Process", reflect.TypeOf((*MockTransform)(nil).Process), metrics, deviceInfo) 84 | } 85 | -------------------------------------------------------------------------------- /tests/integration/start_with_tls_test.go: -------------------------------------------------------------------------------- 1 | package integration 2 | 3 | import ( 4 | "context" 5 | "crypto/tls" 6 | "fmt" 7 | "net/http" 8 | "os" 9 | "testing" 10 | "time" 11 | 12 | "github.com/avast/retry-go/v4" 13 | "github.com/stretchr/testify/require" 14 | 15 | "github.com/NVIDIA/dcgm-exporter/pkg/cmd" 16 | ) 17 | 18 | func TestStartWithTLSEnabledAndBasicAuth(t *testing.T) { 19 | if testing.Short() { 20 | t.Skip("skipping test in short mode.") 21 | } 22 | app := cmd.NewApp() 23 | args := os.Args[0:1] 24 | args = append(args, "-f=./testdata/default-counters.csv") // Append a file with default counters 25 | port := getRandomAvailablePort(t) 26 | args = append(args, fmt.Sprintf("-a=:%d", port)) 27 | args = append(args, "--web-config-file=./testdata/web-config.yml") 28 | ctx, cancel := context.WithCancel(context.Background()) 29 | go func(ctx context.Context) { 30 | err := app.Run(args) 31 | require.NoError(t, err) 32 | }(ctx) 33 | 34 | t.Run("server returns 400 if request uses HTTP and TLS enabled on the server", 35 | func(t *testing.T) { 36 | status, err := retry.DoWithData( 37 | func() (int, error) { 38 | _, status, err := httpGet(t, fmt.Sprintf("http://localhost:%d/metrics", port)) 39 | if err != nil { 40 | return -1, err 41 | } 42 
| return status, nil 43 | }, 44 | retry.Attempts(10), 45 | retry.MaxDelay(10*time.Second), 46 | ) 47 | require.NoError(t, err) 48 | require.Equal(t, http.StatusBadRequest, status) 49 | }) 50 | 51 | t.Run("server returns 200 when request uses HTTPS and valid password", func(t *testing.T) { 52 | // Create a custom client with TLS configuration 53 | client := &http.Client{ 54 | Transport: &http.Transport{ 55 | TLSClientConfig: &tls.Config{ 56 | InsecureSkipVerify: true, 57 | }, 58 | }, 59 | } 60 | status, err := retry.DoWithData( 61 | func() (int, error) { 62 | req := newRequestWithBasicAuth(t, "alice", "password", http.MethodGet, 63 | fmt.Sprintf("https://localhost:%d/metrics", port), nil) 64 | resp, err := client.Do(req) 65 | if err != nil { 66 | return -1, err 67 | } 68 | return resp.StatusCode, nil 69 | }, 70 | retry.Attempts(10), 71 | retry.MaxDelay(10*time.Second), 72 | ) 73 | require.NoError(t, err) 74 | require.Equal(t, http.StatusOK, status) 75 | }) 76 | 77 | t.Run("server returns 401 when request uses HTTPS and password is invalid", func(t *testing.T) { 78 | // Create a custom client with TLS configuration 79 | client := &http.Client{ 80 | Transport: &http.Transport{ 81 | TLSClientConfig: &tls.Config{ 82 | InsecureSkipVerify: true, 83 | }, 84 | }, 85 | } 86 | status, err := retry.DoWithData( 87 | func() (int, error) { 88 | req := newRequestWithBasicAuth(t, "alice", "bad password", http.MethodGet, 89 | fmt.Sprintf("https://localhost:%d/metrics", port), nil) 90 | resp, err := client.Do(req) 91 | if err != nil { 92 | return -1, err 93 | } 94 | return resp.StatusCode, nil 95 | }, 96 | retry.Attempts(10), 97 | retry.MaxDelay(10*time.Second), 98 | ) 99 | require.NoError(t, err) 100 | require.Equal(t, http.StatusUnauthorized, status) 101 | }) 102 | cancel() 103 | } 104 | -------------------------------------------------------------------------------- /internal/pkg/prerequisites/validation_test.go: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package prerequisites 18 | 19 | import ( 20 | debugelf "debug/elf" 21 | "testing" 22 | 23 | "github.com/stretchr/testify/require" 24 | "go.uber.org/mock/gomock" 25 | 26 | mockelf "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/elf" 27 | mockexec "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/exec" 28 | mockos "github.com/NVIDIA/dcgm-exporter/internal/mocks/pkg/os" 29 | 30 | realos "os" 31 | ) 32 | 33 | func TestValidate(t *testing.T) { 34 | type testCase struct { 35 | Name string 36 | OSMockExpectations func(*gomock.Controller, *mockos.MockOS) 37 | LDConfigPath string 38 | } 39 | 40 | tests := []testCase{ 41 | { 42 | Name: "Ubuntu-based system with /sbin/ldconfig.real", 43 | OSMockExpectations: func(ctrl *gomock.Controller, mo *mockos.MockOS) { 44 | mfi := mockos.NewMockFileInfo(ctrl) 45 | mo.EXPECT().Stat("/sbin/ldconfig.real").Return(mfi, nil) 46 | }, 47 | LDConfigPath: "/sbin/ldconfig.real", 48 | }, 49 | { 50 | Name: "Linux system without /sbin/ldconfig.real", 51 | OSMockExpectations: func(ctrl *gomock.Controller, mo *mockos.MockOS) { 52 | mo.EXPECT().Stat("/sbin/ldconfig.real").Return(nil, &realos.PathError{}) 53 | }, 54 | LDConfigPath: "/sbin/ldconfig", 55 | }, 56 | } 57 | 58 | for _, tc := range 
tests { 59 | 60 | ctrl := gomock.NewController(t) 61 | 62 | osinstance := mockos.NewMockOS(ctrl) 63 | tc.OSMockExpectations(ctrl, osinstance) 64 | 65 | os = osinstance 66 | 67 | executor := mockexec.NewMockExec(ctrl) 68 | 69 | output := `1211 libs found in cache '/etc/ld.so.cache' 70 | libdcgm.so.4 (libc6,x86-64) => /lib/x86_64-linux-gnu/libdcgm.so.4 71 | Cache generated by: ldconfig (Ubuntu GLIBC 2.35-0ubuntu3.7) stable release version 2.35` 72 | cmd := mockexec.NewMockCmd(ctrl) 73 | cmd.EXPECT().Output().AnyTimes().Return([]byte(output), nil) 74 | executor.EXPECT().Command(gomock.Eq(tc.LDConfigPath), gomock.Eq(ldconfigParam)).AnyTimes().Return(cmd) 75 | 76 | exec = executor 77 | 78 | elfreader := mockelf.NewMockELF(ctrl) 79 | 80 | self := &debugelf.File{ 81 | FileHeader: debugelf.FileHeader{ 82 | Machine: debugelf.EM_X86_64, 83 | }, 84 | } 85 | elfreader.EXPECT().Open(gomock.Eq("/proc/self/exe")).AnyTimes().Return(self, nil) 86 | 87 | libdcgm := &debugelf.File{ 88 | FileHeader: debugelf.FileHeader{ 89 | Machine: debugelf.EM_X86_64, 90 | }, 91 | } 92 | elfreader.EXPECT().Open(gomock.Eq("/lib/x86_64-linux-gnu/libdcgm.so.4")).AnyTimes().Return(libdcgm, nil) 93 | 94 | elf = elfreader 95 | 96 | err := Validate() 97 | require.NoError(t, err) 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /internal/pkg/appconfig/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
// KubernetesGPUIDType is the string-typed value stored in
// Config.KubernetesGPUIdType.
type KubernetesGPUIDType string

// DeviceOptions selects which entities of one device class are monitored.
type DeviceOptions struct {
	Flex       bool  // If true, then monitor all GPUs if MIG mode is disabled or all GPU instances if MIG is enabled.
	MajorRange []int // The indices of each GPU/NvSwitch to monitor, or -1 to monitor all
	MinorRange []int // The indices of each GPUInstance/NvLink to monitor, or -1 to monitor all
}

// DumpConfig controls file-based debugging dumps
type DumpConfig struct {
	Enabled     bool   `yaml:"enabled" json:"enabled"`         // Enable file-based dumps
	Directory   string `yaml:"directory" json:"directory"`     // Directory to store dump files
	Retention   int    `yaml:"retention" json:"retention"`     // Retention period in hours (0 = no cleanup)
	Compression bool   `yaml:"compression" json:"compression"` // Use gzip compression for dump files
}

// Config is the exporter's settings struct. It carries one DeviceOptions value
// each for GPUs, switches and CPUs, plus the DumpConfig for debugging dumps.
// NOTE(review): how each field is populated (CLI flag, environment, defaults)
// is not visible from this file - see the code that builds Config.
type Config struct {
	CollectorsFile                   string
	Address                          string
	CollectInterval                  int
	Kubernetes                       bool
	KubernetesEnablePodLabels        bool
	KubernetesEnablePodUID           bool
	KubernetesGPUIdType              KubernetesGPUIDType
	KubernetesPodLabelAllowlistRegex []string // Regex patterns for filtering pod labels
	KubernetesPodLabelCacheSize      int      // Maximum number of label keys to cache (<=0 means default size)
	CollectDCP                       bool
	UseOldNamespace                  bool
	UseRemoteHE                      bool
	RemoteHEInfo                     string
	GPUDeviceOptions                 DeviceOptions
	SwitchDeviceOptions              DeviceOptions
	CPUDeviceOptions                 DeviceOptions
	NoHostname                       bool
	UseFakeGPUs                      bool
	ConfigMapData                    string
	MetricGroups                     []dcgm.MetricGroup
	WebSystemdSocket                 bool
	WebConfigFile                    string
	XIDCountWindowSize               int
	ReplaceBlanksInModelName         bool
	Debug                            bool
	ClockEventsCountWindowSize       int
	EnableDCGMLog                    bool
	DCGMLogLevel                     string
	PodResourcesKubeletSocket        string
	HPCJobMappingDir                 string
	NvidiaResourceNames              []string
	KubernetesVirtualGPUs            bool
	DumpConfig                       DumpConfig // Configuration for file-based dumps
	KubernetesEnableDRA              bool
	DisableStartupValidate           bool
}
"github.com/onsi/gomega" 30 | corev1 "k8s.io/api/core/v1" 31 | 32 | "github.com/NVIDIA/dcgm-exporter/tests/e2e/internal/framework" 33 | ) 34 | 35 | // VerifyHelmConfigurationWhenTLSEnabled tests configuration when TLS is enabled 36 | var VerifyHelmConfigurationWhenTLSEnabled = func( 37 | kubeClient *framework.KubeClient, 38 | helmClient *framework.HelmClient, 39 | testRunLabels map[string]string, 40 | ) bool { 41 | return Context("and TLS is enabled", Label("tls"), func() { 42 | var dcgmExpPod *corev1.Pod 43 | 44 | AfterAll(func(ctx context.Context) { 45 | // Helm releases will be cleaned up in AfterSuite 46 | }) 47 | 48 | It("should install dcgm-exporter helm chart", func(ctx context.Context) { 49 | shouldInstallHelmChart(ctx, helmClient, []string{"tlsServerConfig.enabled=true"}) 50 | }) 51 | 52 | It("should create dcgm-exporter pod", func(ctx context.Context) { 53 | dcgmExpPod = shouldCheckIfPodCreated(ctx, kubeClient, dcgmExporterPodLabels) 54 | }) 55 | 56 | It("should ensure that the dcgm-exporter pod is ready", func(ctx context.Context) { 57 | shouldCheckIfPodIsReady(ctx, kubeClient, dcgmExpPod.Namespace, dcgmExpPod.Name) 58 | }) 59 | 60 | It("should check that the port accepts TLS", func(ctx context.Context) { 61 | ctx, cancel := context.WithCancel(ctx) 62 | defer cancel() 63 | kubeClient.ErrWriter = GinkgoWriter 64 | kubeClient.OutWriter = GinkgoWriter 65 | localPort, err := kubeClient.PortForward(ctx, dcgmExpPod.Namespace, dcgmExpPod.Name, 9400) 66 | Expect(err).ShouldNot(HaveOccurred()) 67 | Expect(localPort).Should(BeNumerically(">", 0)) 68 | httpClient := &http.Client{ 69 | Timeout: 5 * time.Second, 70 | Transport: &http.Transport{ 71 | TLSClientConfig: &tls.Config{ 72 | InsecureSkipVerify: true, 73 | }, 74 | }, 75 | } 76 | 77 | By("Ensure that HTTP request returns 400 error") 78 | resp, err := httpClient.Get(fmt.Sprintf("http://localhost:%d/metrics", localPort)) 79 | Expect(err).ShouldNot(HaveOccurred()) 80 | Expect(resp.StatusCode).To(Equal(400)) 81 | 
body, err := io.ReadAll(resp.Body) 82 | Expect(err).NotTo(HaveOccurred()) 83 | Expect(string(body)).To(ContainSubstring("Client sent an HTTP request to an HTTPS server")) 84 | 85 | By("Ensure that HTTP request returns 200 error") 86 | resp, err = httpClient.Get(fmt.Sprintf("https://localhost:%d/metrics", localPort)) 87 | Expect(err).ShouldNot(HaveOccurred()) 88 | Expect(resp.StatusCode).To(Equal(200)) 89 | _, err = io.ReadAll(resp.Body) 90 | Expect(err).NotTo(HaveOccurred()) 91 | }) 92 | }) 93 | } 94 | -------------------------------------------------------------------------------- /security.md: -------------------------------------------------------------------------------- 1 | # SECURITY 2 | 3 | ## Security 4 | 5 | NVIDIA takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations. 6 | 7 | If you believe you have found a security vulnerability in any NVIDIA-owned repository that meets [NVIDIA's definition of a security vulnerability](https://www.nvidia.com/en-us/security/psirt-policies/), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | To report any security vulnerabilities, please contact the NVIDIA Product Security Incident Response Team (PSIRT) at: 14 | 15 | 1. [https://www.nvidia.com/en-us/security/report-vulnerability/](https://www.nvidia.com/en-us/security/report-vulnerability/). 16 | 2. If you prefer to submit without logging in, please email [psirt@nvidia.com](mailto:psirt@nvidia.com). If you report a potential vulnerability via email, please encrypt your communication using NVIDIA's public PGP key ([see PGP Key page](https://www.nvidia.com/en-us/security/pgp-key/)). 17 | 3. 
Alternatively, you can report a security issue through GitHub using the GitHub Security Advisories feature at [https://github.com/NVIDIA/dcgm-exporter/security/advisories/new](https://github.com/NVIDIA/dcgm-exporter/security/advisories/new). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | - Type of issue (e.g., buffer overflow, remote code execution, etc.) 22 | - Permanent link of the source file(s) related to the manifestation of the issue 23 | - Any special configuration required to reproduce the issue 24 | - Step-by-step instructions to reproduce the issue 25 | - Proof-of-concept or exploit code (if possible) 26 | - Impact of the issue, including how an attacker might exploit the issue 27 | 28 | This information will help us triage your report more quickly. 29 | 30 | NVIDIA reserves the right to delete vulnerability reports until they're fixed. 31 | 32 | ## Preferred Languages 33 | 34 | We prefer all communications to be in English. 35 | 36 | ## **Coordinated Vulnerability Disclosure** 37 | 38 | NVIDIA strives to follow Coordinated Vulnerability Disclosure (CVD). CVD is a process by which independent reporters who discover a vulnerability in our product contact NVIDIA directly and allow us the opportunity to investigate and remediate the vulnerability before the reporter discloses the information to the public. 39 | 40 | NVIDIA PSIRT will coordinate with the reporter throughout the vulnerability investigation and provide the reporter with updates on progress as appropriate. With the reporter's agreement, NVIDIA PSIRT may recognize the reporter on our Acknowledgement page for finding a valid product vulnerability and privately reporting the issue. After an update or mitigation information is publicly released by NVIDIA, the reporter is welcome to discuss the vulnerability publicly. 
41 | 42 | Following NVIDIA's CVD allows us to protect our customers while coordinating public disclosures and appropriately acknowledging the reporter(s) for their findings. 43 | 44 | Occasionally NVIDIA will discover security vulnerabilities in products from other vendors. NVIDIA will follow its standard Coordinated Vulnerability Disclosure process and communicate the identified issue to the affected vendor or a third-party coordination center if this occurs. 45 | -------------------------------------------------------------------------------- /internal/pkg/collector/base_collector.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package collector 18 | 19 | import ( 20 | "fmt" 21 | 22 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" 23 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" 24 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/dcgmprovider" 25 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicemonitoring" 26 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" 27 | ) 28 |
// baseExpCollector holds the state shared by the collector helpers below:
// the watch list, the counter being exported, the counters that are turned
// into labels, and the cleanup callbacks run by Cleanup.
type baseExpCollector struct {
	deviceWatchList devicewatchlistmanager.WatchList // Device info and fields used for counters and labels
	counter         counters.Counter                 // Counter for a specific collector type
	labelsCounters  []counters.Counter               // Counters used for labels
	hostname        string                           // Hostname
	config          *appconfig.Config                // Configuration settings
	cleanups        []func()                         // Cleanup functions
}

// createMetric builds a Metric for the collector's counter from the given
// monitoring info. val is rendered as the metric value, uuid and labels are
// stored as passed in, and Attributes starts out as an empty map. The MIG
// profile and GPU instance ID are filled in only when mi carries instance
// info; otherwise they keep their zero value (the empty string).
func (c *baseExpCollector) createMetric(
	labels map[string]string, mi devicemonitoring.Info, uuid string, val int,
) Metric {
	modelName := getGPUModel(mi.DeviceInfo, c.config.ReplaceBlanksInModelName)

	metric := Metric{
		Counter:      c.counter,
		Value:        fmt.Sprint(val),
		UUID:         uuid,
		GPU:          fmt.Sprintf("%d", mi.DeviceInfo.GPU),
		GPUUUID:      mi.DeviceInfo.UUID,
		GPUDevice:    fmt.Sprintf("nvidia%d", mi.DeviceInfo.GPU),
		GPUModelName: modelName,
		GPUPCIBusID:  mi.DeviceInfo.PCI.BusID,
		Hostname:     c.hostname,

		Labels:     labels,
		Attributes: map[string]string{},
	}

	if instance := mi.InstanceInfo; instance != nil {
		metric.MigProfile = instance.ProfileName
		metric.GPUInstanceID = fmt.Sprintf("%d", instance.Info.NvmlInstanceId)
	}
	return metric
}

// getLabelsFromCounters reads the latest values of the watch list's label
// fields for the entity described by mi and writes one entry per label
// counter into labels, keyed by the counter's field name. It returns the
// error from the DCGM lookup, if any; values that cannot be matched to a
// label counter are skipped silently.
func (c *baseExpCollector) getLabelsFromCounters(mi devicemonitoring.Info, labels map[string]string) error {
	fieldValues, err := dcgmprovider.Client().EntityGetLatestValues(
		mi.Entity.EntityGroupId,
		mi.Entity.EntityId,
		c.deviceWatchList.LabelDeviceFields(),
	)
	if err != nil {
		return err
	}

	for _, fieldValue := range fieldValues {
		text := toString(fieldValue)
		// Counters with no value and fields ignored for this entity carry
		// the skip marker and contribute no label.
		if text == skipDCGMValue {
			continue
		}

		labelCounter, lookupErr := findCounterField(c.labelsCounters, fieldValue.FieldID)
		if lookupErr != nil || !labelCounter.IsLabel() {
			continue
		}
		labels[labelCounter.FieldName] = text
	}
	return nil
}

// Cleanup invokes every registered cleanup callback in registration order.
func (c *baseExpCollector) Cleanup() {
	for i := range c.cleanups {
		c.cleanups[i]()
	}
}
-------------------------------------------------------------------------------- /internal/mocks/pkg/devicewatcher/mock_device_watcher.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by MockGen. DO NOT EDIT. 16 | // Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher (interfaces: Watcher) 17 | // 18 | // Generated by this command: 19 | // 20 | // mockgen -destination=../../mocks/pkg/devicewatcher/mock_device_watcher.go -package=devicewatcher -copyright_file=../../../hack/header.txt . Watcher 21 | // 22 | 23 | // Package devicewatcher is a generated GoMock package.
24 | package devicewatcher 25 | 26 | import ( 27 | reflect "reflect" 28 | 29 | counters "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" 30 | deviceinfo "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" 31 | dcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm" 32 | gomock "go.uber.org/mock/gomock" 33 | ) 34 |
// MockWatcher is a mock of Watcher interface.
//
// Every mocked method forwards its arguments to the gomock controller and
// returns whatever the controller hands back for that call.
type MockWatcher struct {
	ctrl     *gomock.Controller       // controller that receives every mocked call
	recorder *MockWatcherMockRecorder // returned by EXPECT to register expectations
	isgomock struct{}                 // NOTE(review): presumably a marker for gomock tooling; not used in this file
}

// MockWatcherMockRecorder is the mock recorder for MockWatcher.
type MockWatcherMockRecorder struct {
	mock *MockWatcher // mock whose expected calls are recorded
}

// NewMockWatcher creates a new mock instance.
// The mock and its recorder are wired to each other and to ctrl.
func NewMockWatcher(ctrl *gomock.Controller) *MockWatcher {
	mock := &MockWatcher{ctrl: ctrl}
	mock.recorder = &MockWatcherMockRecorder{mock}
	return mock
}

// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockWatcher) EXPECT() *MockWatcherMockRecorder {
	return m.recorder
}

// GetDeviceFields mocks base method.
func (m *MockWatcher) GetDeviceFields(arg0 []counters.Counter, arg1 dcgm.Field_Entity_Group) []dcgm.Short {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "GetDeviceFields", arg0, arg1)
	// The comma-ok assertion leaves ret0 at its zero value (nil) when the
	// configured return value is nil or of a different type.
	ret0, _ := ret[0].([]dcgm.Short)
	return ret0
}

// GetDeviceFields indicates an expected call of GetDeviceFields.
func (mr *MockWatcherMockRecorder) GetDeviceFields(arg0, arg1 any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetDeviceFields", reflect.TypeOf((*MockWatcher)(nil).GetDeviceFields), arg0, arg1)
}

// WatchDeviceFields mocks base method.
func (m *MockWatcher) WatchDeviceFields(arg0 []dcgm.Short, arg1 deviceinfo.Provider, arg2 int64) ([]dcgm.GroupHandle, dcgm.FieldHandle, []func(), error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "WatchDeviceFields", arg0, arg1, arg2)
	// Each result falls back to its zero value when the configured return
	// value is nil or of a different type.
	ret0, _ := ret[0].([]dcgm.GroupHandle)
	ret1, _ := ret[1].(dcgm.FieldHandle)
	ret2, _ := ret[2].([]func())
	ret3, _ := ret[3].(error)
	return ret0, ret1, ret2, ret3
}

// WatchDeviceFields indicates an expected call of WatchDeviceFields.
func (mr *MockWatcherMockRecorder) WatchDeviceFields(arg0, arg1, arg2 any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WatchDeviceFields", reflect.TypeOf((*MockWatcher)(nil).WatchDeviceFields), arg0, arg1, arg2)
}
-------------------------------------------------------------------------------- /internal/mocks/pkg/devicewatchlistmanager/mock_device_watchlist_manager.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by MockGen. DO NOT EDIT.
16 | // Source: github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager (interfaces: Manager) 17 | // 18 | // Generated by this command: 19 | // 20 | // mockgen -destination=../../mocks/pkg/devicewatchlistmanager/mock_device_watchlist_manager.go -package=devicewatchlistmanager -copyright_file=../../../hack/header.txt . Manager 21 | // 22 | 23 | // Package devicewatchlistmanager is a generated GoMock package. 24 | package devicewatchlistmanager 25 | 26 | import ( 27 | reflect "reflect" 28 | 29 | devicewatcher "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatcher" 30 | devicewatchlistmanager "github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager" 31 | dcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm" 32 | gomock "go.uber.org/mock/gomock" 33 | ) 34 |
// MockManager is a mock of Manager interface.
//
// Every mocked method forwards its arguments to the gomock controller and
// returns whatever the controller hands back for that call.
type MockManager struct {
	ctrl     *gomock.Controller       // controller that receives every mocked call
	recorder *MockManagerMockRecorder // returned by EXPECT to register expectations
	isgomock struct{}                 // NOTE(review): presumably a marker for gomock tooling; not used in this file
}

// MockManagerMockRecorder is the mock recorder for MockManager.
type MockManagerMockRecorder struct {
	mock *MockManager // mock whose expected calls are recorded
}

// NewMockManager creates a new mock instance.
// The mock and its recorder are wired to each other and to ctrl.
func NewMockManager(ctrl *gomock.Controller) *MockManager {
	mock := &MockManager{ctrl: ctrl}
	mock.recorder = &MockManagerMockRecorder{mock}
	return mock
}

// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockManager) EXPECT() *MockManagerMockRecorder {
	return m.recorder
}

// CreateEntityWatchList mocks base method.
func (m *MockManager) CreateEntityWatchList(arg0 dcgm.Field_Entity_Group, arg1 devicewatcher.Watcher, arg2 int64) error {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "CreateEntityWatchList", arg0, arg1, arg2)
	// The comma-ok assertion yields a nil error when the configured return
	// value is nil or not an error.
	ret0, _ := ret[0].(error)
	return ret0
}

// CreateEntityWatchList indicates an expected call of CreateEntityWatchList.
func (mr *MockManagerMockRecorder) CreateEntityWatchList(arg0, arg1, arg2 any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateEntityWatchList", reflect.TypeOf((*MockManager)(nil).CreateEntityWatchList), arg0, arg1, arg2)
}

// EntityWatchList mocks base method.
func (m *MockManager) EntityWatchList(arg0 dcgm.Field_Entity_Group) (devicewatchlistmanager.WatchList, bool) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "EntityWatchList", arg0)
	// Each result falls back to its zero value when the configured return
	// value is nil or of a different type.
	ret0, _ := ret[0].(devicewatchlistmanager.WatchList)
	ret1, _ := ret[1].(bool)
	return ret0, ret1
}

// EntityWatchList indicates an expected call of EntityWatchList.
func (mr *MockManagerMockRecorder) EntityWatchList(arg0 any) *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EntityWatchList", reflect.TypeOf((*MockManager)(nil).EntityWatchList), arg0)
}
-------------------------------------------------------------------------------- /internal/pkg/transformation/types.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package transformation 18 | 19 | import ( 20 | "container/list" 21 | "context" 22 | "regexp" 23 | "sync" 24 | 25 | "k8s.io/client-go/informers" 26 | "k8s.io/client-go/kubernetes" 27 | "k8s.io/client-go/tools/cache" 28 | 29 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig" 30 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" 31 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo" 32 | ) 33 |
// NOTE(review): the directive below writes the mock to
// mocks/pkg/transformations/ (plural) while this package is "transformation"
// - confirm the destination directory is intentional.

//go:generate go run -v go.uber.org/mock/mockgen -destination=../../mocks/pkg/transformations/mock_transformer.go -package=transformation -copyright_file=../../../hack/header.txt . Transform

// Transform is the interface for a named processing step over collected
// metrics.
type Transform interface {
	// Process receives the metrics grouped by counter together with a device
	// info provider and reports failure through the returned error.
	Process(metrics collector.MetricsByCounter, deviceInfo deviceinfo.Provider) error
	// Name returns the transform's name.
	Name() string
}

// PodMapper bundles the application configuration, a Kubernetes client, an
// optional DRA resource slice manager and a label filter cache. Its methods
// are not part of this file.
type PodMapper struct {
	Config               *appconfig.Config        // Application configuration
	Client               kubernetes.Interface     // Kubernetes API client
	ResourceSliceManager *DRAResourceSliceManager // DRA device lookup tables (see DRAResourceSliceManager)
	labelFilterCache     *LabelFilterCache        // Cached label filtering decisions
}

// LabelFilterCache provides efficient caching for label filtering decisions
type LabelFilterCache struct {
	compiledPatterns []*regexp.Regexp         // Pre-compiled regex patterns
	cache            map[string]*list.Element // map[labelKey -> list element] - list element of key we've already checked
	lruList          *list.List               // Doubly-linked list for LRU ordering
	mu               sync.Mutex               // Protects cache and lruList
	maxSize          int                      // Maximum number of entries to cache
	enabled          bool                     // Whether filtering is enabled (has patterns)
}

// labelCacheEntry represents a cached label filtering result
type labelCacheEntry struct {
	key   string // Label key
	value bool   // Whether the label is allowed
}

// PodInfo describes a pod and container, together with the pod's labels and
// optional dynamic resource (DRA) details.
type PodInfo struct {
	Name             string
	Namespace        string
	Container        string
	UID              string
	VGPU             string
	Labels           map[string]string
	DynamicResources *DynamicResourceInfo // nil when no DRA information is attached
}

// DRAResourceSliceManager keeps lookup tables from "pool/device" keys to GPU
// UUIDs and MIG device info, alongside the informer factory, informer and
// cancel function it holds.
type DRAResourceSliceManager struct {
	factory       informers.SharedInformerFactory
	informer      cache.SharedIndexInformer
	cancelContext context.CancelFunc
	mu            sync.RWMutex                 // NOTE(review): presumably guards deviceToUUID and migDevices - confirm against the methods
	deviceToUUID  map[string]string            // pool/device -> UUID (for full GPUs)
	migDevices    map[string]*DRAMigDeviceInfo // pool/device -> MIG info (for MIG devices)
}

// PodMetadata holds pod metadata from API server
type PodMetadata struct {
	UID    string
	Labels map[string]string
}

// DynamicResourceInfo identifies a dynamically allocated device by claim,
// driver, pool and device name, with optional MIG details.
type DynamicResourceInfo struct {
	ClaimName      string
	ClaimNamespace string
	DriverName     string
	PoolName       string
	DeviceName     string
	// MIG-specific information
	MIGInfo *DRAMigDeviceInfo
}

// DRAMigDeviceInfo carries the UUID, profile and parent UUID of a MIG device.
type DRAMigDeviceInfo struct {
	MIGDeviceUUID string
	Profile       string
	ParentUUID    string
}
-------------------------------------------------------------------------------- /internal/mocks/pkg/os/mock_dir_entry.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Code generated by MockGen. DO NOT EDIT.
16 | // Source: os (interfaces: DirEntry) 17 | // 18 | // Generated by this command: 19 | // 20 | // mockgen -destination=../../mocks/pkg/os/mock_dir_entry.go -package=os -copyright_file=../../../hack/header.txt os DirEntry 21 | // 22 | 23 | // Package os is a generated GoMock package. 24 | package os 25 | 26 | import ( 27 | fs "io/fs" 28 | reflect "reflect" 29 | 30 | gomock "go.uber.org/mock/gomock" 31 | ) 32 |
// MockDirEntry is a mock of DirEntry interface.
//
// Every mocked method forwards the call to the gomock controller and returns
// whatever the controller hands back for that call.
type MockDirEntry struct {
	ctrl     *gomock.Controller        // controller that receives every mocked call
	recorder *MockDirEntryMockRecorder // returned by EXPECT to register expectations
	isgomock struct{}                  // NOTE(review): presumably a marker for gomock tooling; not used in this file
}

// MockDirEntryMockRecorder is the mock recorder for MockDirEntry.
type MockDirEntryMockRecorder struct {
	mock *MockDirEntry // mock whose expected calls are recorded
}

// NewMockDirEntry creates a new mock instance.
// The mock and its recorder are wired to each other and to ctrl.
func NewMockDirEntry(ctrl *gomock.Controller) *MockDirEntry {
	mock := &MockDirEntry{ctrl: ctrl}
	mock.recorder = &MockDirEntryMockRecorder{mock}
	return mock
}

// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockDirEntry) EXPECT() *MockDirEntryMockRecorder {
	return m.recorder
}

// Info mocks base method.
func (m *MockDirEntry) Info() (fs.FileInfo, error) {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Info")
	// Each result falls back to its zero value when the configured return
	// value is nil or of a different type.
	ret0, _ := ret[0].(fs.FileInfo)
	ret1, _ := ret[1].(error)
	return ret0, ret1
}

// Info indicates an expected call of Info.
func (mr *MockDirEntryMockRecorder) Info() *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Info", reflect.TypeOf((*MockDirEntry)(nil).Info))
}

// IsDir mocks base method.
func (m *MockDirEntry) IsDir() bool {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "IsDir")
	ret0, _ := ret[0].(bool)
	return ret0
}

// IsDir indicates an expected call of IsDir.
func (mr *MockDirEntryMockRecorder) IsDir() *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsDir", reflect.TypeOf((*MockDirEntry)(nil).IsDir))
}

// Name mocks base method.
func (m *MockDirEntry) Name() string {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Name")
	ret0, _ := ret[0].(string)
	return ret0
}

// Name indicates an expected call of Name.
func (mr *MockDirEntryMockRecorder) Name() *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockDirEntry)(nil).Name))
}

// Type mocks base method.
func (m *MockDirEntry) Type() fs.FileMode {
	m.ctrl.T.Helper()
	ret := m.ctrl.Call(m, "Type")
	ret0, _ := ret[0].(fs.FileMode)
	return ret0
}

// Type indicates an expected call of Type.
func (mr *MockDirEntryMockRecorder) Type() *gomock.Call {
	mr.mock.ctrl.T.Helper()
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Type", reflect.TypeOf((*MockDirEntry)(nil).Type))
}
-------------------------------------------------------------------------------- /tests/docker/docker_suite_test.go: -------------------------------------------------------------------------------- 1 | //go:build docker 2 | 3 | /* 4 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package docker 20 | 21 | import ( 22 | "context" 23 | "fmt" 24 | "os" 25 | "testing" 26 | "time" 27 | 28 | . "github.com/onsi/ginkgo/v2" 29 | . "github.com/onsi/gomega" 30 | ) 31 |
const (
	// Registry and version used to build the default image names when the
	// REGISTRY / VERSION environment variables are not set.
	defaultRegistry = "nvidia"
	defaultVersion  = "4.4.2-4.7.0"

	// Port stored in the suite configuration.
	testPort = 9400

	// Timeouts. The notes on DCGM timing are carried over from the original
	// comments: the first DCGM collection cycle is stated to take 30s.
	startupTimeout    = 45 * time.Second  // Increased to handle GPU initialization delays
	metricsTimeout    = 120 * time.Second // Increased for DCGM first collection cycle (30s) + processing
	httpClientTimeout = 45 * time.Second  // HTTP client timeout - must exceed DCGM collection interval (30s)
)

// testConfig is the suite-wide configuration; it is populated in BeforeSuite.
var testConfig TestConfig

// TestConfig lists the images under test and the port stored for the suite.
type TestConfig struct {
	Images   []ImageInfo
	TestPort int
}

// ImageInfo pairs a full image reference with the name of its variant.
type ImageInfo struct {
	FullName string
	Variant  string
}

// TestDockerImages is the `go test` entry point that hands control to Ginkgo.
func TestDockerImages(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "Docker Image Test Suite")
}

// Suite setup: resolve the image list from the environment (falling back to
// names derived from REGISTRY and VERSION), publish it through testConfig,
// check that Docker is usable and remove leftover test containers.
var _ = BeforeSuite(func(ctx context.Context) {
	registry := getEnvOrDefault("REGISTRY", defaultRegistry)
	version := getEnvOrDefault("VERSION", defaultVersion)

	// One candidate per variant; an IMAGE_* variable overrides the name
	// derived from registry and version.
	candidates := []ImageInfo{
		{
			Variant: "ubuntu22.04",
			FullName: getEnvOrDefault("IMAGE_UBUNTU",
				fmt.Sprintf("%s/dcgm-exporter:%s-ubuntu22.04", registry, version)),
		},
		{
			Variant: "ubi9",
			FullName: getEnvOrDefault("IMAGE_UBI",
				fmt.Sprintf("%s/dcgm-exporter:%s-ubi9", registry, version)),
		},
		{
			Variant: "distroless",
			FullName: getEnvOrDefault("IMAGE_DISTROLESS",
				fmt.Sprintf("%s/dcgm-exporter:%s-distroless", registry, version)),
		},
	}

	// Keep only the candidates that resolved to a non-empty image name.
	var images []ImageInfo
	for _, candidate := range candidates {
		if candidate.FullName == "" {
			continue
		}
		images = append(images, candidate)
	}

	testConfig = TestConfig{
		Images:   images,
		TestPort: testPort,
	}

	By(fmt.Sprintf("Testing %d image(s)", len(images)))
	for _, image := range images {
		By(fmt.Sprintf(" - %s [%s]", image.FullName, image.Variant))
	}

	By("Validating Docker is available")
	Expect(dockerAvailable()).To(BeTrue(), "Docker must be available to run tests")

	By("Cleaning up any leftover test containers")
	cleanupTestContainers(ctx)
})

// Suite teardown: remove whatever test containers are still around.
var _ = AfterSuite(func(ctx context.Context) {
	By("Final cleanup of test containers")
	cleanupTestContainers(ctx)
})

// getEnvOrDefault returns the value of the environment variable key, or
// defaultValue when the variable is unset or empty.
func getEnvOrDefault(key, defaultValue string) string {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}
	return value
}
-------------------------------------------------------------------------------- /etc/1.x-compatibility-metrics.csv: -------------------------------------------------------------------------------- 1 | # Format 2 | # If line starts with a '#' it is considered a comment 3 | # DCGM FIELD, Prometheus metric type, help message 4 | 5 | # Clocks 6 | dcgm_sm_clock, gauge, SM clock frequency (in MHz). 7 | dcgm_memory_clock, gauge, Memory clock frequency (in MHz). 8 | 9 | # Temperature 10 | dcgm_memory_temp, gauge, Memory temperature (in C). 11 | dcgm_gpu_temp, gauge, GPU temperature (in C). 12 | 13 | # Power 14 | dcgm_power_usage, gauge, Power draw (in W). 15 | dcgm_total_energy_consumption, counter, Total energy consumption since boot (in mJ).
16 | 17 | # PCIe 18 | dcgm_fi_prof_pcie_tx_bytes, counter, Total number of bytes transmitted through PCIe TX via NVML. 19 | dcgm_fi_prof_pcie_rx_bytes, counter, Total number of bytes received through PCIe RX via NVML. 20 | dcgm_pcie_replay_counter, counter, Total number of PCIe retries. 21 | 22 | # Utilization (the sample period varies depending on the product) 23 | dcgm_gpu_utilization, gauge, GPU utilization (in %). 24 | dcgm_mem_copy_utilization, gauge, Memory utilization (in %). 25 | dcgm_enc_utilization, gauge, Encoder utilization (in %). 26 | dcgm_dec_utilization, gauge, Decoder utilization (in %). 27 | 28 | # Errors and violations 29 | dcgm_xid_errors, gauge, Value of the last XID error encountered. 30 | # dcgm_power_violation, counter, Throttling duration due to power constraints (in ns). 31 | # dcgm_thermal_violation, counter, Throttling duration due to thermal constraints (in ns). 32 | # dcgm_sync_boost_violation, counter, Throttling duration due to sync-boost constraints (in ns). 33 | # dcgm_board_limit_violation, counter, Throttling duration due to board limit constraints (in ns). 34 | # dcgm_low_util_violation, counter, Throttling duration due to low utilization (in ns). 35 | # dcgm_reliability_violation, counter, Throttling duration due to reliability constraints (in ns). 36 | 37 | # Memory usage 38 | dcgm_fb_free, gauge, Framebuffer memory free (in MiB). 39 | dcgm_fb_used, gauge, Framebuffer memory used (in MiB). 40 | 41 | # ECC 42 | # dcgm_ecc_sbe_volatile_total, counter, Total number of single-bit volatile ECC errors. 43 | # dcgm_ecc_dbe_volatile_total, counter, Total number of double-bit volatile ECC errors. 44 | # dcgm_ecc_sbe_aggregate_total, counter, Total number of single-bit persistent ECC errors. 45 | # dcgm_ecc_dbe_aggregate_total, counter, Total number of double-bit persistent ECC errors. 46 | 47 | # Retired pages 48 | # dcgm_retired_pages_sbe, counter, Total number of retired pages due to single-bit errors. 
49 | # dcgm_retired_pages_dbe, counter, Total number of retired pages due to double-bit errors. 50 | # dcgm_retired_pages_pending, counter, Total number of pages pending retirement. 51 | 52 | # NVLink 53 | # dcgm_nvlink_flit_crc_error_count_total, counter, Total number of NVLink flow-control CRC errors. 54 | # dcgm_nvlink_data_crc_error_count_total, counter, Total number of NVLink data CRC errors. 55 | # dcgm_nvlink_replay_error_count_total, counter, Total number of NVLink retries. 56 | # dcgm_nvlink_recovery_error_count_total, counter, Total number of NVLink recovery errors. 57 | dcgm_nvlink_bandwidth_total, counter, Total number of NVLink bandwidth counters for all lanes 58 | 59 | # Add DCP metrics 60 | dcgm_fi_prof_gr_engine_active, gauge, Ratio of time the graphics engine is active (in %). 61 | # dcgm_fi_prof_sm_active, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). 62 | # dcgm_fi_prof_sm_occupancy, gauge, The ratio of number of warps resident on an SM (in %). 63 | dcgm_fi_prof_pipe_tensor_active, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). 64 | dcgm_fi_prof_dram_active, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). 65 | dcgm_fi_prof_pcie_tx_bytes, counter, The number of bytes of active pcie tx data including both header and payload. 66 | dcgm_fi_prof_pcie_rx_bytes, counter, The number of bytes of active pcie rx data including both header and payload. 67 | -------------------------------------------------------------------------------- /internal/pkg/nvmlprovider/provider_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package nvmlprovider 18 | 19 | import ( 20 | "testing" 21 | 22 | "github.com/stretchr/testify/assert" 23 | ) 24 |
// TestGetMIGDeviceInfoByID_When_NVML_Not_Initialized checks that a
// zero-value provider returns an error even for a well-formed MIG UUID.
func TestGetMIGDeviceInfoByID_When_NVML_Not_Initialized(t *testing.T) {
	const migUUID = "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5"

	provider := nvmlProvider{}
	info, err := provider.GetMIGDeviceInfoByID(migUUID)

	assert.Error(t, err, "uuid: %v, Device Info: %+v", migUUID, info)
}

// TestGetMIGDeviceInfoByID_When_DriverVersion_Below_R470 initializes the
// package-level client and checks how UUIDs of the form
// "MIG-GPU-<uuid>/<gi>/<ci>" are parsed, including malformed variants.
func TestGetMIGDeviceInfoByID_When_DriverVersion_Below_R470(t *testing.T) {
	Initialize()
	assert.NotNil(t, Client(), "expected NVML Client to be not nil")
	assert.True(t, Client().(nvmlProvider).initialized, "expected Client to be initialized")
	defer Client().Cleanup()

	type testCase struct {
		name            string
		uuid            string
		expectedMIGInfo *MIGDeviceInfo
		expectedError   bool
	}

	cases := []testCase{
		{
			name: "Successful Parsing",
			uuid: "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5",
			expectedMIGInfo: &MIGDeviceInfo{
				ParentUUID:        "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5",
				GPUInstanceID:     1,
				ComputeInstanceID: 5,
			},
		},
		{
			name:          "Fail, Missing MIG at the beginning of UUID",
			uuid:          "GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5",
			expectedError: true,
		},
		{
			name:          "Fail, Missing GPU at the beginning of GPU UUID",
			uuid:          "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/5",
			expectedError: true,
		},
		{
			name:          "Fail, GI not parsable",
			uuid:          "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/xx/5",
			expectedError: true,
		},
		{
			name:          "Fail, CI not a parsable",
			uuid:          "MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/xx",
			expectedError: true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := Client().GetMIGDeviceInfoByID(tc.uuid)
			if tc.expectedError {
				assert.Error(t, err, "uuid: %v, Device Info: %+v", tc.uuid, got)
				return
			}
			assert.Nil(t, err, "err: %v, uuid: %v", err, tc.uuid)
			assert.Equal(t, tc.expectedMIGInfo, got, "MIG uuid '%v' parsed incorrectly", tc.uuid)
		})
	}
}

// Test_newNVMLProvider checks that newNVMLProvider returns the expected
// provider both after a reset and when the package-level client has already
// been initialized.
func Test_newNVMLProvider(t *testing.T) {
	cases := []struct {
		name       string
		preRunFunc func() NVML
	}{
		{
			name: "NVML not initialized",
			preRunFunc: func() NVML {
				reset()
				return nvmlProvider{initialized: true}
			},
		},
		{
			name: "NVML already initialized",
			preRunFunc: func() NVML {
				Initialize()
				return Client()
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			expected := tc.preRunFunc()
			// Undo whatever the case (or newNVMLProvider) initialized.
			defer reset()

			var (
				got NVML
				err error
			)
			got, err = newNVMLProvider()

			assert.Nil(t, err)
			assert.Equalf(t, expected, got, "Unexpected Output")
		})
	}
}
-------------------------------------------------------------------------------- /internal/pkg/registry/registry.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package registry 18 | 19 | import ( 20 | "sync" 21 | 22 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 23 | 24 | "golang.org/x/sync/errgroup" 25 | 26 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/collector" 27 | "github.com/NVIDIA/dcgm-exporter/internal/pkg/counters" 28 | ) 29 | 30 | // groupCounterTuple represents a composite key, that consists Group and Counter. 31 | // The groupCounterTuple is necessary to maintain uniqueness of Group and Counter pairs. 32 | type groupCounterTuple struct { 33 | Group dcgm.Field_Entity_Group 34 | Counter counters.Counter 35 | } 36 | 37 | type Registry struct { 38 | collectorGroups map[dcgm.Field_Entity_Group][]collector.Collector 39 | collectorGroupsSeen map[collector.EntityCollectorTuple]struct{} 40 | mtx sync.RWMutex 41 | } 42 | 43 | // NewRegistry creates a new registry 44 | func NewRegistry() *Registry { 45 | return &Registry{ 46 | collectorGroups: map[dcgm.Field_Entity_Group][]collector.Collector{}, 47 | collectorGroupsSeen: map[collector.EntityCollectorTuple]struct{}{}, 48 | } 49 | } 50 | 51 | // Register registers a collector with the registry. 
52 | func (r *Registry) Register(entityCollectorTuples collector.EntityCollectorTuple) { 53 | if _, exists := r.collectorGroupsSeen[entityCollectorTuples]; exists { 54 | return 55 | } 56 | r.collectorGroups[entityCollectorTuples.Entity()] = append(r.collectorGroups[entityCollectorTuples.Entity()], 57 | entityCollectorTuples.Collector()) 58 | r.collectorGroupsSeen[entityCollectorTuples] = struct{}{} 59 | } 60 | 61 | // Gather gathers metrics from all registered collectors. 62 | func (r *Registry) Gather() (MetricsByCounterGroup, error) { 63 | r.mtx.Lock() 64 | defer r.mtx.Unlock() 65 | 66 | var wg sync.WaitGroup 67 | 68 | g := new(errgroup.Group) 69 | 70 | var sm sync.Map 71 | 72 | for group, collectors := range r.collectorGroups { 73 | for _, c := range collectors { 74 | c := c // creates new c, see https://golang.org/doc/faq#closures_and_goroutines 75 | group := group 76 | wg.Add(1) 77 | g.Go(func() error { 78 | metrics, err := c.GetMetrics() 79 | if err != nil { 80 | return err 81 | } 82 | 83 | for counter, metricVals := range metrics { 84 | val, _ := sm.LoadOrStore(groupCounterTuple{Group: group, Counter: counter}, []collector.Metric{}) 85 | out := val.([]collector.Metric) 86 | out = append(out, metricVals...) 
87 | sm.Store(groupCounterTuple{Group: group, Counter: counter}, out) 88 | } 89 | 90 | return nil 91 | }) 92 | } 93 | } 94 | 95 | if err := g.Wait(); err != nil { 96 | return nil, err 97 | } 98 | 99 | output := MetricsByCounterGroup{} 100 | 101 | sm.Range(func(key, value interface{}) bool { 102 | tuple := key.(groupCounterTuple) 103 | if _, exists := output[tuple.Group]; !exists { 104 | output[tuple.Group] = map[counters.Counter][]collector.Metric{} 105 | } 106 | output[tuple.Group][tuple.Counter] = value.([]collector.Metric) 107 | return true // continue iteration 108 | }) 109 | 110 | return output, nil 111 | } 112 | 113 | // Cleanup resources of registered collectors 114 | func (r *Registry) Cleanup() { 115 | for _, collectors := range r.collectorGroups { 116 | for _, c := range collectors { 117 | c.Cleanup() 118 | } 119 | } 120 | } 121 | --------------------------------------------------------------------------------