├── .gitmodules ├── samples ├── diag │ ├── .gitignore │ └── main.go ├── dmon │ ├── .gitignore │ └── main.go ├── health │ ├── .gitignore │ └── main.go ├── policy │ ├── .gitignore │ └── main.go ├── restApi │ ├── .gitignore │ ├── main.go │ ├── handlers │ │ ├── byUuids.go │ │ ├── byIds.go │ │ └── dcgm.go │ ├── server.go │ └── README.md ├── topology │ ├── .gitignore │ └── main.go ├── deviceInfo │ ├── .gitignore │ └── main.go ├── processInfo │ ├── .gitignore │ └── main.go ├── hostengineStatus │ ├── .gitignore │ └── main.go └── README.md ├── .gitignore ├── .hadolint.yaml ├── pkg └── dcgm │ ├── dcgm_nvml.h │ ├── callback.c │ ├── error.go │ ├── field_values_cb.h │ ├── dcgm_api_export.h │ ├── field_values_cb.c │ ├── api_test.go │ ├── profile.go │ ├── hostengine_status.go │ ├── testdata │ └── one_switch.yaml │ ├── cpu.go │ ├── gpu_group_test.go │ ├── utils.go │ ├── instances_test.go │ ├── diag_test_helpers.go │ ├── structs.go │ ├── field_values_test.go │ ├── field_values.go │ ├── internal.go │ ├── gpu_group.go │ ├── fields_test.go │ ├── mig.go │ ├── device_status.go │ ├── api.go │ ├── topology.go │ ├── test_utils.go │ ├── admin.go │ ├── health_test.go │ ├── health.go │ └── diag.go ├── .markdownlint.yaml ├── scripts └── lint │ └── go-mod-tidy.sh ├── go.mod ├── .github └── workflows │ └── go.yml ├── docker-bake.hcl ├── README.md ├── .yamllint.yaml ├── go.sum ├── tests ├── nvsmi.go ├── processinfo_test.go ├── deviceinfo_test.go ├── health_test.go ├── hostengine_test.go ├── diag_test.go ├── dmon_test.go ├── dcgm_test.go ├── policy_test.go └── README.md ├── .golangci.yml ├── Makefile ├── .pre-commit-config.yaml ├── Dockerfile ├── CONTRIBUTING.md └── .gitlab-ci.yml /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/diag/.gitignore: -------------------------------------------------------------------------------- 1 | diag 2 | 
-------------------------------------------------------------------------------- /samples/dmon/.gitignore: -------------------------------------------------------------------------------- 1 | dmon 2 | -------------------------------------------------------------------------------- /samples/health/.gitignore: -------------------------------------------------------------------------------- 1 | health 2 | -------------------------------------------------------------------------------- /samples/policy/.gitignore: -------------------------------------------------------------------------------- 1 | policy 2 | -------------------------------------------------------------------------------- /samples/restApi/.gitignore: -------------------------------------------------------------------------------- 1 | restApi 2 | -------------------------------------------------------------------------------- /samples/topology/.gitignore: -------------------------------------------------------------------------------- 1 | topology 2 | -------------------------------------------------------------------------------- /samples/deviceInfo/.gitignore: -------------------------------------------------------------------------------- 1 | deviceInfo 2 | -------------------------------------------------------------------------------- /samples/processInfo/.gitignore: -------------------------------------------------------------------------------- 1 | processInfo 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | tags 4 | .idea/ 5 | -------------------------------------------------------------------------------- /samples/hostengineStatus/.gitignore: -------------------------------------------------------------------------------- 1 | hostengineStatus 2 | -------------------------------------------------------------------------------- /.hadolint.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | ignored: [] 3 | trustedRegistries: [] 4 | -------------------------------------------------------------------------------- /pkg/dcgm/dcgm_nvml.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define NVML_NO_UNVERSIONED_FUNC_DEFS 4 | #include "nvml.h" 5 | -------------------------------------------------------------------------------- /pkg/dcgm/callback.c: -------------------------------------------------------------------------------- 1 | int violationNotify(void* p) { 2 | int ViolationRegistration(void*); 3 | return ViolationRegistration(p); 4 | } 5 | -------------------------------------------------------------------------------- /.markdownlint.yaml: -------------------------------------------------------------------------------- 1 | default: true 2 | 3 | # MD013/line-length - Line length 4 | MD013: 5 | # eventually set line_length to 80 6 | line_length: 500 7 | tables: false 8 | code_blocks: false 9 | -------------------------------------------------------------------------------- /pkg/dcgm/error.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | import "errors" 4 | 5 | // ErrInvalidMode represents an error indicating that an invalid mode was used 6 | var ErrInvalidMode = errors.New("invalid mode") 7 | -------------------------------------------------------------------------------- /scripts/lint/go-mod-tidy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | go mod tidy -v $@ 4 | if [ $? -ne 0 ]; then 5 | exit 2 6 | fi 7 | 8 | git diff --exit-code go.* &> /dev/null 9 | if [ $? 
-ne 0 ]; then 10 | echo "go.mod or go.sum differs, please re-add it to your commit" 11 | exit 3 12 | fi 13 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/NVIDIA/go-dcgm 2 | 3 | go 1.23 4 | 5 | require ( 6 | github.com/bits-and-blooms/bitset v1.22.0 7 | github.com/gorilla/mux v1.8.1 8 | github.com/stretchr/testify v1.10.0 9 | ) 10 | 11 | require ( 12 | github.com/davecgh/go-spew v1.1.1 // indirect 13 | github.com/pmezard/go-difflib v1.0.0 // indirect 14 | gopkg.in/yaml.v3 v3.0.1 // indirect 15 | ) 16 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Go 16 | uses: actions/setup-go@v2 17 | with: 18 | go-version: 1.21 19 | - name: Build 20 | run: make binary 21 | - name: Lint 22 | run: make check-format 23 | -------------------------------------------------------------------------------- /samples/hostengineStatus/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 8 | ) 9 | 10 | // dcgmi introspect --enable 11 | // dcgmi introspect -s -H 12 | func main() { 13 | cleanup, err := dcgm.Init(dcgm.Embedded) 14 | if err != nil { 15 | log.Panicln(err) 16 | } 17 | defer cleanup() 18 | 19 | st, err := dcgm.Introspect() 20 | if err != nil { 21 | log.Panicln(err) 22 | } 23 | 24 | fmt.Printf("Memory %2s %v KB\nCPU %5s %.2f %s\n", ":", st.Memory, ":", st.CPU, "%") 25 | } 26 | 
-------------------------------------------------------------------------------- /docker-bake.hcl: -------------------------------------------------------------------------------- 1 | target "default" { 2 | name = "go-dcgm-${replace(distro, ".", "-")}-${replace(go, ".", "-")}-${replace(cuda, ".", "-")}" 3 | tags = ["go-dcgm:${distro}-go${go}-cuda${cuda}-dcgm${dcgm}"] 4 | platforms = ["linux/amd64"] 5 | matrix = { 6 | go = ["1.24.4"] 7 | distro = ["ubuntu24.04", "ubuntu22.04", "ubuntu20.04"] 8 | cuda = ["12.9.1", "12.5.1"] 9 | dcgm = ["4.2.3-2"] 10 | } 11 | args = { 12 | GO_VERSION = go 13 | DISTRO_FLAVOR = distro 14 | CUDA_VERSION = cuda 15 | DCGM_VERSION = dcgm 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /samples/restApi/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "os/signal" 7 | "syscall" 8 | 9 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 10 | ) 11 | 12 | // res: curl localhost:8070/dcgm/device/info/id/0 13 | 14 | func main() { 15 | stopSig := make(chan os.Signal, 1) 16 | signal.Notify(stopSig, syscall.SIGINT, syscall.SIGTERM) 17 | 18 | cleanup, err := dcgm.Init(dcgm.Embedded) 19 | if err != nil { 20 | log.Panicln(err) 21 | } 22 | defer cleanup() 23 | 24 | addr := ":8070" 25 | server := newHttpServer(addr) 26 | 27 | go func() { 28 | log.Printf("Running http server on localhost%s", addr) 29 | server.serve() 30 | }() 31 | 32 | defer server.stop() 33 | 34 | <-stopSig 35 | } 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Golang bindings are provided for [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/dcgm). DCGM is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. 
It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting. 4 | 5 | You will also find samples for these bindings in this repository. 6 | 7 | ## Issues and Contributing 8 | 9 | [Check out the Contributing document!](CONTRIBUTING.md) 10 | 11 | * Please let us know by [filing a new issue](https://github.com/NVIDIA/go-dcgm/issues/new) 12 | * You can contribute by opening a [pull request](https://github.com/NVIDIA/go-dcgm) 13 | -------------------------------------------------------------------------------- /samples/diag/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "html/template" 5 | "log" 6 | "os" 7 | 8 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 9 | ) 10 | 11 | const diagOutput = `Software: 12 | {{range $t := .Software}} 13 | {{printf "%-50s" $t.TestName}} {{$t.Status}} {{$t.TestOutput}} 14 | {{- end}} 15 | {{range $g := .PerGpu}} 16 | GPU : {{$g.GPU}} 17 | {{range $t := $g.DiagResults}} 18 | {{printf "%-20s" $t.TestName}} {{$t.Status}} {{$t.TestOutput}} 19 | {{- end}} 20 | {{- end}} 21 | ` 22 | 23 | func main() { 24 | cleanup, err := dcgm.Init(dcgm.Embedded) 25 | if err != nil { 26 | log.Panicln(err) 27 | } 28 | defer cleanup() 29 | 30 | dr, err := dcgm.RunDiag(dcgm.DiagQuick, dcgm.GroupAllGPUs()) 31 | if err != nil { 32 | log.Panicln(err) 33 | } 34 | 35 | t := template.Must(template.New("Diag").Parse(diagOutput)) 36 | if err = t.Execute(os.Stdout, dr); err != nil { 37 | log.Panicln("Template error:", err) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /.yamllint.yaml: -------------------------------------------------------------------------------- 1 | yaml-files: 2 | - "*.yaml" 3 | - "*.yml" 4 | 5 | rules: 6 | anchors: 7 | forbid-undeclared-aliases: true 8 | 
forbid-duplicated-anchors: true 9 | forbid-unused-anchors: true 10 | braces: 11 | min-spaces-inside: 0 12 | max-spaces-inside: 1 13 | min-spaces-inside-empty: 0 14 | max-spaces-inside-empty: 0 15 | brackets: 16 | min-spaces-inside: 0 17 | max-spaces-inside: 1 18 | min-spaces-inside-empty: 0 19 | max-spaces-inside-empty: 0 20 | colons: enable 21 | commas: enable 22 | comments: enable 23 | comments-indentation: enable 24 | document-end: disable 25 | document-start: disable 26 | empty-lines: 27 | max: 1 28 | empty-values: enable 29 | float-values: disable 30 | hyphens: enable 31 | indentation: enable 32 | key-duplicates: enable 33 | key-ordering: disable 34 | line-length: disable 35 | new-line-at-end-of-file: enable 36 | new-lines: disable 37 | octal-values: disable 38 | quoted-strings: disable 39 | trailing-spaces: enable 40 | truthy: 41 | check-keys: false 42 | -------------------------------------------------------------------------------- /pkg/dcgm/field_values_cb.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #ifndef FIELD_VALUES 18 | #define FIELD_VALUES 19 | 20 | #include "dcgm_agent.h" 21 | #include "dcgm_structs.h" 22 | 23 | int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId, 24 | dcgm_field_eid_t entityId, 25 | dcgmFieldValue_v1 *values, 26 | int numValues, 27 | void *userData); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /pkg/dcgm/dcgm_api_export.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | #ifndef DCGM_DCGM_API_EXPORT_H 17 | #define DCGM_DCGM_API_EXPORT_H 18 | 19 | #undef DCGM_PUBLIC_API 20 | #undef DCGM_PRIVATE_API 21 | 22 | #if defined(DCGM_API_EXPORT) 23 | #define DCGM_PUBLIC_API __attribute((visibility("default"))) 24 | #else 25 | #define DCGM_PUBLIC_API 26 | #if defined(ERROR_IF_NOT_PUBLIC) 27 | #error(Should be public) 28 | #endif 29 | #endif 30 | 31 | #define DCGM_PRIVATE_API __attribute((visibility("hidden"))) 32 | 33 | 34 | #endif // DCGM_DCGM_API_EXPORT_H 35 | -------------------------------------------------------------------------------- /pkg/dcgm/field_values_cb.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "dcgm_agent.h" 18 | #include "dcgm_structs.h" 19 | #include "_cgo_export.h" 20 | 21 | int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId, 22 | dcgm_field_eid_t entityId, 23 | dcgmFieldValue_v1 *values, 24 | int numValues, 25 | void *userData) { 26 | return go_dcgmFieldValueEntityEnumeration(entityGroupId, entityId, values, numValues, userData); 27 | } 28 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4= 2 | github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= 3 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 4 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= 6 | github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= 7 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 8 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 9 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 10 | 
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 11 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 12 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 13 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 14 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 15 | -------------------------------------------------------------------------------- /pkg/dcgm/api_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dcgm 18 | 19 | import ( 20 | "testing" 21 | 22 | "github.com/stretchr/testify/require" 23 | ) 24 | 25 | func TestGetEntityGroupEntities(t *testing.T) { 26 | withNvsdmMockConfig(t, "testdata/one_switch.yaml", func(t *testing.T) { 27 | teardownTest := setupTest(t) 28 | defer teardownTest(t) 29 | 30 | runOnlyWithLiveGPUs(t) 31 | 32 | // Get switch entities 33 | entities, err := GetEntityGroupEntities(FE_SWITCH) 34 | require.NoError(t, err) 35 | require.NotEmpty(t, entities) 36 | 37 | // Get nvlink entities 38 | nvlinkEntities, err := GetEntityGroupEntities(FE_LINK) 39 | require.NoError(t, err) 40 | require.NotEmpty(t, nvlinkEntities) 41 | }) 42 | } 43 | -------------------------------------------------------------------------------- /samples/dmon/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "os/signal" 8 | "syscall" 9 | "time" 10 | 11 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 12 | ) 13 | 14 | const ( 15 | header = `# gpu pwr temp sm mem enc dec mclk pclk 16 | # Idx W C % % % % MHz MHz` 17 | ) 18 | 19 | // modelled on nvidia-smi dmon 20 | // dcgmi dmon -e 155,150,203,204,206,207,100,101 21 | func main() { 22 | sigs := make(chan os.Signal, 1) 23 | signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) 24 | 25 | cleanup, err := dcgm.Init(dcgm.Embedded) 26 | if err != nil { 27 | log.Panicln(err) 28 | } 29 | defer cleanup() 30 | 31 | gpus, err := dcgm.GetSupportedDevices() 32 | if err != nil { 33 | log.Panicln(err) 34 | } 35 | 36 | ticker := time.NewTicker(time.Second * 1) 37 | defer ticker.Stop() 38 | 39 | fmt.Println(header) 40 | 41 | for { 42 | select { 43 | case <-ticker.C: 44 | for _, gpu := range gpus { 45 | st, err := dcgm.GetDeviceStatus(gpu) 46 | if err != nil { 47 | log.Panicln(err) 48 | } 49 | 50 | fmt.Printf("%5d %5d %5d %5d %5d %5d %5d %5d %5d\n", 51 | gpu, int64(st.Power), st.Temperature, st.Utilization.GPU, 
st.Utilization.Memory, 52 | st.Utilization.Encoder, st.Utilization.Decoder, st.Clocks.Memory, st.Clocks.Cores) 53 | } 54 | 55 | case <-sigs: 56 | return 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /pkg/dcgm/profile.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "unsafe" 11 | ) 12 | 13 | // MetricGroup represents a group of metrics for a specific GPU 14 | type MetricGroup struct { 15 | Major uint 16 | Minor uint 17 | FieldIds []uint 18 | } 19 | 20 | func getSupportedMetricGroups(gpuID uint) ([]MetricGroup, error) { 21 | var ( 22 | groupInfo C.dcgmProfGetMetricGroups_t 23 | err error 24 | groups []MetricGroup 25 | ) 26 | 27 | groupInfo.version = makeVersion3(unsafe.Sizeof(groupInfo)) 28 | 29 | groupInfo.gpuId = C.uint(gpuID) 30 | 31 | result := C.dcgmProfGetSupportedMetricGroups(handle.handle, &groupInfo) 32 | 33 | if err = errorString(result); err != nil { 34 | return nil, &Error{msg: C.GoString(C.errorString(result)), Code: result} 35 | } 36 | 37 | count := uint(groupInfo.numMetricGroups) 38 | 39 | groups = make([]MetricGroup, count) 40 | for i := uint(0); i < count; i++ { 41 | groups[i].Major = uint(groupInfo.metricGroups[i].majorId) 42 | groups[i].Minor = uint(groupInfo.metricGroups[i].minorId) 43 | 44 | fieldCount := uint(groupInfo.metricGroups[i].numFieldIds) 45 | 46 | groups[i].FieldIds = make([]uint, fieldCount) 47 | for j := uint(0); j < fieldCount; j++ { 48 | groups[i].FieldIds[j] = uint(groupInfo.metricGroups[i].fieldIds[j]) 49 | } 50 | } 51 | 52 | return groups, nil 53 | } 54 | -------------------------------------------------------------------------------- /samples/health/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "os/signal" 
7 | "syscall" 8 | "text/template" 9 | "time" 10 | 11 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 12 | ) 13 | 14 | const ( 15 | healthStatus = `GPU : {{.GPU}} 16 | Status : {{.Status}} 17 | {{range .Watches}} 18 | Type : {{.Type}} 19 | Status : {{.Status}} 20 | Error : {{.Error}} 21 | {{end}} 22 | ` 23 | ) 24 | 25 | // create group: dcgmi group -c "name" --default 26 | // enable watches: dcgmi health -s a 27 | // check: dcgmi health -g 1 -c 28 | func main() { 29 | sigs := make(chan os.Signal, 1) 30 | signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) 31 | 32 | cleanup, err := dcgm.Init(dcgm.Embedded) 33 | if err != nil { 34 | log.Panicln(err) 35 | } 36 | defer cleanup() 37 | 38 | gpus, err := dcgm.GetSupportedDevices() 39 | if err != nil { 40 | log.Panicln(err) 41 | } 42 | 43 | ticker := time.NewTicker(time.Second * 1) 44 | defer ticker.Stop() 45 | 46 | t := template.Must(template.New("Health").Parse(healthStatus)) 47 | 48 | for { 49 | select { 50 | case <-ticker.C: 51 | for _, gpu := range gpus { 52 | h, err := dcgm.HealthCheckByGpuId(gpu) 53 | if err != nil { 54 | log.Panicln(err) 55 | } 56 | 57 | if err = t.Execute(os.Stdout, h); err != nil { 58 | log.Panicln("Template error:", err) 59 | } 60 | } 61 | case <-sigs: 62 | return 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /pkg/dcgm/hostengine_status.go: -------------------------------------------------------------------------------- 1 | // Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM) 2 | package dcgm 3 | 4 | /* 5 | #include "dcgm_agent.h" 6 | #include "dcgm_structs.h" 7 | */ 8 | import "C" 9 | 10 | import ( 11 | "unsafe" 12 | ) 13 | 14 | // Status represents the current resource utilization of the DCGM hostengine process 15 | type Status struct { 16 | // Memory represents the current memory usage of the DCGM hostengine in kilobytes 17 | Memory int64 18 | // CPU represents the current CPU utilization of the DCGM hostengine as a 
percentage (0-100) 19 | CPU float64 20 | } 21 | 22 | func introspect() (engine Status, err error) { 23 | var memory C.dcgmIntrospectMemory_t 24 | memory.version = makeVersion1(unsafe.Sizeof(memory)) 25 | waitIfNoData := 1 26 | result := C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData)) 27 | 28 | if err = errorString(result); err != nil { 29 | return engine, &Error{msg: C.GoString(C.errorString(result)), Code: result} 30 | } 31 | 32 | var cpu C.dcgmIntrospectCpuUtil_t 33 | 34 | cpu.version = makeVersion1(unsafe.Sizeof(cpu)) 35 | result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData)) 36 | 37 | if err = errorString(result); err != nil { 38 | return engine, &Error{msg: C.GoString(C.errorString(result)), Code: result} 39 | } 40 | 41 | engine = Status{ 42 | Memory: toInt64(memory.bytesUsed) / 1024, 43 | CPU: *dblToFloat(cpu.total) * 100, 44 | } 45 | return 46 | } 47 | -------------------------------------------------------------------------------- /samples/policy/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "log" 6 | "os" 7 | "os/signal" 8 | "syscall" 9 | 10 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 11 | ) 12 | 13 | // dcgmi group -c "name" --default 14 | // dcgmi policy -g GROUPID --set 0,0 -x -n -p -e -P 250 -T 100 -M 10 15 | // dcgmi policy -g GROUPID --reg 16 | func main() { 17 | ctx, done := context.WithCancel(context.Background()) 18 | // Handle SIGINT (Ctrl+C) and SIGTERM (termination signal) 19 | sigs := make(chan os.Signal, 1) 20 | signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) 21 | 22 | go func() { 23 | <-sigs 24 | log.Println("Received termination signal, exiting...") 25 | done() 26 | }() 27 | 28 | cleanup, err := dcgm.Init(dcgm.Embedded) 29 | if err != nil { 30 | log.Panicln(err) 31 | } 32 | defer cleanup() 33 | 34 | // Choose policy conditions to register violation callback. 
35 | // Note: Need to be root for some options 36 | // Available options are: 37 | // 1. dcgm.DbePolicy 38 | // 2. dcgm.PCIePolicy 39 | // 3. dcgm.MaxRtPgPolicy 40 | // 4. dcgm.ThermalPolicy 41 | // 5. dcgm.PowerPolicy 42 | // 6. dcgm.NvlinkPolicy 43 | // 7. dcgm.XidPolicy 44 | c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy, dcgm.XidPolicy) 45 | if err != nil { 46 | log.Panicln(err) 47 | } 48 | 49 | for { 50 | select { 51 | case pe := <-c: 52 | log.Printf("PolicyViolation %6s %v\nTimestamp %2s %v\nData %7s %v", 53 | ":", pe.Condition, ":", pe.Timestamp, ":", pe.Data) 54 | case <-ctx.Done(): 55 | return 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /tests/nvsmi.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "os/exec" 7 | "strings" 8 | ) 9 | 10 | const ( 11 | bin = "nvidia-smi" 12 | gpuArg = "--id=" 13 | queryArg = "--query-gpu=" 14 | formatArg = "--format=csv,noheader,nounits" 15 | ) 16 | 17 | // Query executes nvidia-smi with the specified GPU ID and query parameters. 18 | // It returns the query result as a trimmed string. 
19 | // 20 | // Parameters: 21 | // - id: The GPU ID to query (e.g., "0" for the first GPU) 22 | // - query: The nvidia-smi query parameter (e.g., "temperature.gpu") 23 | // 24 | // Returns: 25 | // 26 | // A string containing the query result with whitespace trimmed 27 | func Query(id, query string) string { 28 | var out bytes.Buffer 29 | 30 | gpu_args := gpuArg + id 31 | query_args := queryArg + query 32 | 33 | cmd := exec.Command(bin, gpu_args, query_args, formatArg) 34 | cmd.Stdout = &out 35 | 36 | err := cmd.Run() 37 | if err != nil { 38 | fmt.Printf("nvsmi exec error: %v\n", err) 39 | } 40 | 41 | return strings.TrimSpace(out.String()) 42 | } 43 | 44 | // DeviceCount returns the number of NVIDIA GPU devices available in the system 45 | // by executing nvidia-smi with the specified query parameter. 46 | // 47 | // Parameters: 48 | // - query: The nvidia-smi query parameter to execute 49 | // 50 | // Returns: 51 | // 52 | // The number of GPU devices as an unsigned integer 53 | func DeviceCount(query string) uint { 54 | var out bytes.Buffer 55 | 56 | query_arg := queryArg + query 57 | cmd := exec.Command(bin, query_arg, formatArg) 58 | cmd.Stdout = &out 59 | 60 | err := cmd.Run() 61 | if err != nil { 62 | fmt.Printf("nvsmi exec error: %v\n", err) 63 | } 64 | 65 | nvSmi := strings.Split(strings.TrimSuffix(out.String(), "\n"), "\n") 66 | 67 | return uint(len(nvSmi)) 68 | } 69 | -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | linters: 3 | default: none 4 | enable: 5 | - bodyclose 6 | - durationcheck 7 | - errcheck 8 | - gocritic 9 | - gosec 10 | - govet 11 | - ineffassign 12 | - mirror 13 | - misspell 14 | - nolintlint 15 | - perfsprint 16 | - prealloc 17 | - predeclared 18 | - revive 19 | - rowserrcheck 20 | - staticcheck 21 | - testifylint 22 | - unconvert 23 | - unused 24 | - wastedassign 25 | settings: 26 | 
gocritic: 27 | disabled-checks: 28 | - hugeParam 29 | - ifElseChain 30 | - ptrToRefParam 31 | - dupImport 32 | - uncheckedInlineErr 33 | enabled-tags: 34 | - diagnostic 35 | - experimental 36 | - opinionated 37 | - performance 38 | - style 39 | govet: 40 | disable: 41 | - fieldalignment 42 | - deepequalerrors 43 | enable-all: true 44 | revive: 45 | enable-all-rules: false 46 | rules: 47 | - name: superfluous-else 48 | - name: exported 49 | testifylint: 50 | disable-all: true 51 | enable: 52 | - nil-compare 53 | - compares 54 | - error-is-as 55 | - bool-compare 56 | - empty 57 | - len 58 | - expected-actual 59 | - error-nil 60 | exclusions: 61 | generated: lax 62 | presets: 63 | - common-false-positives 64 | - legacy 65 | - std-error-handling 66 | rules: 67 | - linters: 68 | - bodyclose 69 | path: _test.go 70 | paths: 71 | - third_party$ 72 | - builtin$ 73 | - examples$ 74 | issues: 75 | max-issues-per-linter: 0 76 | max-same-issues: 0 77 | formatters: 78 | enable: 79 | - gofmt 80 | - goimports 81 | exclusions: 82 | generated: lax 83 | paths: 84 | - third_party$ 85 | - builtin$ 86 | - examples$ 87 | -------------------------------------------------------------------------------- /samples/topology/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | 7 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 8 | ) 9 | 10 | const ( 11 | legend = ` 12 | Legend: 13 | X = Self 14 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 15 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 16 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 17 | PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) 18 | PIX = Connection traversing a single PCIe switch 19 | PSB = Connection traversing a single on-board PCIe switch 20 | NV# 
= Connection traversing a bonded set of # NVLinks` 21 | ) 22 | 23 | // based on nvidia-smi topo -m 24 | // dcgmi topo 25 | func main() { 26 | // choose dcgm hostengine running mode 27 | // 1. dcgm.Embedded 28 | // 2. dcgm.Standalone 29 | // 3. dcgm.StartHostengine 30 | cleanup, err := dcgm.Init(dcgm.Embedded) 31 | if err != nil { 32 | log.Panicln(err) 33 | } 34 | defer cleanup() 35 | 36 | gpus, err := dcgm.GetSupportedDevices() 37 | if err != nil { 38 | log.Panicln(err) 39 | } 40 | 41 | for _, gpu := range gpus { 42 | fmt.Printf("%9s%d", "GPU", gpu) 43 | } 44 | 45 | fmt.Printf("%5s\n", "CPUAffinity") 46 | 47 | numGpus := len(gpus) 48 | gpuTopo := make([]string, numGpus) 49 | 50 | for i := 0; i < numGpus; i++ { 51 | topo, err := dcgm.GetDeviceTopology(gpus[i]) 52 | if err != nil { 53 | log.Panicln(err) 54 | } 55 | 56 | fmt.Printf("GPU%d", gpus[i]) 57 | 58 | for j := 0; j < len(topo); j++ { 59 | // skip current GPU 60 | gpuTopo[topo[j].GPU] = topo[j].Link.PCIPaths() 61 | } 62 | 63 | gpuTopo[i] = "X" 64 | for j := 0; j < numGpus; j++ { 65 | fmt.Printf("%5s", gpuTopo[j]) 66 | } 67 | 68 | deviceInfo, err := dcgm.GetDeviceInfo(gpus[i]) 69 | if err != nil { 70 | log.Panicln(err) 71 | } 72 | 73 | fmt.Printf("%5s\n", deviceInfo.CPUAffinity) 74 | } 75 | 76 | fmt.Println(legend) 77 | } 78 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | GOLANGCILINT_TIMEOUT ?= 10m 16 | 17 | .PHONY: all binary check-format install install-pre-commit 18 | all: binary test-main check-format 19 | 20 | install-pre-commit: 21 | @echo "Installing pre-commit hooks..." 22 | pre-commit install --config .pre-commit-config.yaml 23 | @echo "Pre-commit hooks installed." 24 | 25 | binary: 26 | go build ./pkg/dcgm 27 | cd samples/deviceInfo; go build 28 | cd samples/dmon; go build 29 | cd samples/health; go build 30 | cd samples/hostengineStatus; go build 31 | cd samples/policy; go build 32 | cd samples/processInfo; go build 33 | cd samples/restApi; go build 34 | cd samples/topology; go build 35 | cd samples/diag; go build 36 | 37 | docker: 38 | docker buildx bake default --load 39 | 40 | test-main: 41 | go test -race -v ./tests 42 | go test -v ./tests 43 | 44 | check-format: 45 | test $$(gofumpt -l -w . | tee /dev/stderr | wc -l) -eq 0 46 | 47 | clean: 48 | rm -f samples/deviceInfo/deviceInfo 49 | rm -f samples/dmon/dmon 50 | rm -f samples/health/health 51 | rm -f samples/hostengineStatus/hostengineStatus 52 | rm -f samples/policy/policy 53 | rm -f samples/processInfo/processInfo 54 | rm -f samples/restApi/restApi 55 | rm -f samples/topology/topology 56 | 57 | lint: 58 | golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix 59 | 60 | lint-full: 61 | golangci-lint run ./... 
--timeout $(GOLANGCILINT_TIMEOUT) --fix 62 | -------------------------------------------------------------------------------- /pkg/dcgm/testdata/one_switch.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | devices: 3 | - type: 2 # nvsdmDevType 4 | dev_id: 0 5 | vendor_id: 0xbaca 6 | health_state: 0 7 | fields: [] 8 | ports: 9 | - lid: 1 10 | fields: 11 | - type: 1 # nvsdmTelemType_t 12 | field: 2 13 | value: 51984 14 | value_type: 2 # nvsdmValType_t 15 | status: 0 # nvsdmRet_t 16 | - type: 1 # nvsdmTelemType_t 17 | field: 10 18 | value: 56952 19 | value_type: 2 # nvsdmValType_t 20 | status: 0 # nvsdmRet_t 21 | - type: 2 # nvsdmTelemType_t 22 | field: 1 23 | value: 0 24 | value_type: 5 # nvsdmValType_t 25 | status: 0 # nvsdmRet_t 26 | - type: 2 # nvsdmTelemType_t 27 | field: 3 28 | value: 0 29 | value_type: 5 # nvsdmValType_t 30 | status: 0 # nvsdmRet_t 31 | - type: 2 # nvsdmTelemType_t 32 | field: 4 33 | value: 65 34 | value_type: 1 # nvsdmValType_t 35 | status: 0 # nvsdmRet_t 36 | - lid: 1 37 | fields: 38 | - type: 1 # nvsdmTelemType_t 39 | field: 2 40 | value: 51984 41 | value_type: 2 # nvsdmValType_t 42 | status: 0 # nvsdmRet_t 43 | - type: 1 # nvsdmTelemType_t 44 | field: 10 45 | value: 56952 46 | value_type: 2 # nvsdmValType_t 47 | status: 0 # nvsdmRet_t 48 | - type: 2 # nvsdmTelemType_t 49 | field: 1 50 | value: 0 51 | value_type: 5 # nvsdmValType_t 52 | status: 0 # nvsdmRet_t 53 | - type: 2 # nvsdmTelemType_t 54 | field: 3 55 | value: 0 56 | value_type: 5 # nvsdmValType_t 57 | status: 0 # nvsdmRet_t 58 | - type: 2 # nvsdmTelemType_t 59 | field: 4 60 | value: 65 61 | value_type: 1 # nvsdmValType_t 62 | status: 0 # nvsdmRet_t 63 | -------------------------------------------------------------------------------- /samples/restApi/handlers/byUuids.go: -------------------------------------------------------------------------------- 1 | package handlers 2 | 3 | import ( 4 | "log" 5 | "net/http" 6 | 7 | 
"github.com/NVIDIA/go-dcgm/pkg/dcgm" 8 | ) 9 | 10 | // map of uuids and device id 11 | var uuids map[string]uint 12 | 13 | // DevicesUuids initializes a global map of GPU UUIDs to device IDs 14 | // This must be called before using UUID-based endpoints 15 | func DevicesUuids() { 16 | uuids = make(map[string]uint) 17 | 18 | count, err := dcgm.GetAllDeviceCount() 19 | if err != nil { 20 | log.Printf("(DCGM) Error getting devices: %s", err) 21 | return 22 | } 23 | 24 | for i := uint(0); i < count; i++ { 25 | deviceInfo, err := dcgm.GetDeviceInfo(i) 26 | if err != nil { 27 | log.Printf("(DCGM) Error getting device information: %s", err) 28 | return 29 | } 30 | 31 | uuids[deviceInfo.UUID] = i 32 | } 33 | } 34 | 35 | // DeviceInfoByUuid handles HTTP requests for device information by GPU UUID 36 | // It returns either JSON or formatted text output based on the request URL 37 | func DeviceInfoByUuid(resp http.ResponseWriter, req *http.Request) { 38 | device := getDeviceInfo(resp, req) 39 | if device == nil { 40 | return 41 | } 42 | 43 | if isJson(req) { 44 | encode(resp, req, device) 45 | return 46 | } 47 | 48 | printer(resp, req, device, deviceInfo) 49 | } 50 | 51 | // DeviceStatusByUuid handles HTTP requests for device status by GPU UUID 52 | // It returns either JSON or formatted text output based on the request URL 53 | func DeviceStatusByUuid(resp http.ResponseWriter, req *http.Request) { 54 | st := getDeviceStatus(resp, req) 55 | if st == nil { 56 | return 57 | } 58 | 59 | if isJson(req) { 60 | encode(resp, req, st) 61 | return 62 | } 63 | 64 | printer(resp, req, st, deviceStatus) 65 | } 66 | 67 | // HealthByUuid handles HTTP requests for device health status by GPU UUID 68 | // It returns either JSON or formatted text output based on the request URL 69 | func HealthByUuid(resp http.ResponseWriter, req *http.Request) { 70 | h := getHealth(resp, req) 71 | if h == nil { 72 | return 73 | } 74 | 75 | if isJson(req) { 76 | encode(resp, req, h) 77 | return 78 | } 79 | 80 
| printer(resp, req, h, healthStatus) 81 | } 82 | -------------------------------------------------------------------------------- /samples/restApi/handlers/byIds.go: -------------------------------------------------------------------------------- 1 | package handlers 2 | 3 | import ( 4 | "net/http" 5 | ) 6 | 7 | // DeviceInfo handles HTTP requests for device information by device ID 8 | // It returns either JSON or formatted text output based on the request URL 9 | func DeviceInfo(resp http.ResponseWriter, req *http.Request) { 10 | device := getDeviceInfo(resp, req) 11 | if device == nil { 12 | return 13 | } 14 | 15 | if isJson(req) { 16 | encode(resp, req, device) 17 | return 18 | } 19 | 20 | printer(resp, req, device, deviceInfo) 21 | } 22 | 23 | // DeviceStatus handles HTTP requests for device status by device ID 24 | // It returns either JSON or formatted text output based on the request URL 25 | func DeviceStatus(resp http.ResponseWriter, req *http.Request) { 26 | st := getDeviceStatus(resp, req) 27 | if st == nil { 28 | return 29 | } 30 | 31 | if isJson(req) { 32 | encode(resp, req, st) 33 | return 34 | } 35 | 36 | printer(resp, req, st, deviceStatus) 37 | } 38 | 39 | // ProcessInfo handles HTTP requests for process information by PID 40 | // It returns either JSON or formatted text output based on the request URL 41 | func ProcessInfo(resp http.ResponseWriter, req *http.Request) { 42 | pInfo := getProcessInfo(resp, req) 43 | if len(pInfo) == 0 { 44 | return 45 | } 46 | 47 | if isJson(req) { 48 | encode(resp, req, pInfo) 49 | return 50 | } 51 | 52 | processPrint(resp, req, pInfo) 53 | } 54 | 55 | // Health handles HTTP requests for device health status by device ID 56 | // It returns either JSON or formatted text output based on the request URL 57 | func Health(resp http.ResponseWriter, req *http.Request) { 58 | h := getHealth(resp, req) 59 | if h == nil { 60 | return 61 | } 62 | 63 | if isJson(req) { 64 | encode(resp, req, h) 65 | return 66 | } 67 | 68 | 
printer(resp, req, h, healthStatus) 69 | } 70 | 71 | // Status handles HTTP requests for DCGM daemon status 72 | // It returns either JSON or formatted text output based on the request URL 73 | func Status(resp http.ResponseWriter, req *http.Request) { 74 | st := getStatus(resp, req) 75 | if st == nil { 76 | return 77 | } 78 | 79 | if isJson(req) { 80 | encode(resp, req, st) 81 | return 82 | } 83 | 84 | printer(resp, req, st, hostengine) 85 | } 86 | -------------------------------------------------------------------------------- /samples/deviceInfo/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "log" 6 | "os" 7 | "text/template" 8 | 9 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 10 | ) 11 | 12 | const ( 13 | deviceInfo = `Driver Version : {{.Identifiers.DriverVersion}} 14 | GPU : {{.GPU}} 15 | DCGMSupported : {{.DCGMSupported}} 16 | UUID : {{.UUID}} 17 | Brand : {{.Identifiers.Brand}} 18 | Model : {{.Identifiers.Model}} 19 | Serial Number : {{.Identifiers.Serial}} 20 | Vbios : {{or .Identifiers.Vbios "N/A"}} 21 | InforomImage Version : {{.Identifiers.InforomImageVersion}} 22 | Bus ID : {{.PCI.BusID}} 23 | BAR1 (MB) : {{or .PCI.BAR1 "N/A"}} 24 | FrameBuffer Memory (MB): {{or .PCI.FBTotal "N/A"}} 25 | Bandwidth (MB/s) : {{or .PCI.Bandwidth "N/A"}} 26 | Power (W) : {{or .Power "N/A"}} 27 | CPUAffinity : {{or .CPUAffinity "N/A"}} 28 | P2P Available : {{if not .Topology}}None{{else}}{{range .Topology}} 29 | GPU{{.GPU}} - (BusID){{.BusID}} - {{.Link.PCIPaths}}{{end}}{{end}} 30 | --------------------------------------------------------------------- 31 | ` 32 | ) 33 | 34 | var ( 35 | connectAddr = flag.String("connect", "localhost", "Provide nv-hostengine connection address.") 36 | isSocket = flag.String("socket", "0", "Connecting to Unix socket?") 37 | ) 38 | 39 | // mini version of nvidia-smi -q 40 | // dcgmi discovery -i apc 41 | func main() { 42 | // choose dcgm hostengine running 
mode 43 | // 1. dcgm.Embedded 44 | // 2. dcgm.Standalone -connect "addr", -socket "isSocket" 45 | // 3. dcgm.StartHostengine 46 | flag.Parse() 47 | 48 | cleanup, err := dcgm.Init(dcgm.Standalone, *connectAddr, *isSocket) 49 | if err != nil { 50 | log.Panicln(err) 51 | } 52 | 53 | defer cleanup() 54 | 55 | count, err := dcgm.GetAllDeviceCount() 56 | if err != nil { 57 | log.Panicln(err) 58 | } 59 | 60 | t := template.Must(template.New("Device").Parse(deviceInfo)) 61 | 62 | for i := uint(0); i < count; i++ { 63 | deviceInfo, err := dcgm.GetDeviceInfo(i) 64 | if err != nil { 65 | log.Panicln(err) 66 | } 67 | 68 | if err = t.Execute(os.Stdout, deviceInfo); err != nil { 69 | log.Panicln("Template error:", err) 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v5.0.0 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: end-of-file-fixer 8 | - id: check-added-large-files 9 | - repo: https://github.com/adrienverge/yamllint 10 | rev: v1.37.1 11 | hooks: 12 | - id: yamllint 13 | args: [--strict, -c=.yamllint.yaml] 14 | entry: yamllint 15 | files: \.ya?ml$ 16 | - repo: https://github.com/igorshubovych/markdownlint-cli 17 | rev: v0.45.0 18 | hooks: 19 | - id: markdownlint 20 | name: markdownlint 21 | description: "Checks the style of Markdown/Commonmark files." 
22 | entry: ghcr.io/igorshubovych/markdownlint-cli 23 | language: docker_image 24 | types: [markdown] 25 | minimum_pre_commit_version: 0.15.0 26 | - repo: https://github.com/hadolint/hadolint 27 | rev: v2.13.1-beta 28 | hooks: 29 | - id: hadolint 30 | name: Lint Dockerfiles 31 | args: [hadolint, --config, .hadolint.yaml] 32 | description: Runs hadolint to lint Dockerfiles 33 | language: docker_image 34 | types: ["dockerfile"] 35 | entry: hadolint/hadolint:v2.12.0-alpine 36 | - repo: local 37 | hooks: 38 | - id: goimports-nvidia 39 | name: goimports-nvidia 40 | description: run goimports 41 | entry: goimports -w -local nvidia.com/NVIDIA/go-dcgm 42 | language: golang 43 | types: [go] 44 | exclude: '(\.pb|\.sql|mock_.*)\.go' 45 | - id: go-mod-tidy 46 | name: 'go-mod-tidy' 47 | entry: scripts/lint/go-mod-tidy.sh 48 | pass_filenames: false 49 | language: 'script' 50 | description: "Runs `go mod tidy -v`, requires golang" 51 | - repo: https://github.com/golangci/golangci-lint 52 | rev: v2.1.6 53 | hooks: 54 | - id: golangci-lint-config-verify 55 | name: golangci-lint-config-verify 56 | description: Verifies the configuration file 57 | entry: golangci-lint config verify 58 | files: .golangci.yml 59 | language: golang 60 | pass_filenames: false 61 | - id: golangci-lint 62 | name: golangci-lint 63 | description: Fast linters runner for Go. 64 | entry: golangci-lint run --new-from-rev origin/main --fix --allow-parallel-runners --timeout 5m 65 | types: [go] 66 | language: golang 67 | require_serial: true 68 | pass_filenames: false 69 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # CUDA_VERSION and DISTRO_FLAVOR are used to select a docker image tag from the upstream 2 | # docker registry for nvidia/cuda. 
The variation of DISTRO_FLAVOR and CUDA_VERSION must 3 | # point to an image that exists, see here for list: https://hub.docker.com/r/nvidia/cuda/tags 4 | 5 | # CUDA_VERSION 6 | ARG CUDA_VERSION=12.5.1 7 | # cuda image supports these images rockylinux9, rockylinux8, ubi9, ubi8, ubuntu24.04, ubuntu22.04, ubuntu20.04 8 | # Note: Testing has only been done with the ubuntu variants. 9 | ARG DISTRO_FLAVOR=ubuntu24.04 10 | 11 | # Use build arguments to select our base image or just stick with the defaults above. 12 | FROM nvidia/cuda:$CUDA_VERSION-base-$DISTRO_FLAVOR AS base 13 | ARG DCGM_VERSION=4.2.3-2 14 | ARG GO_VERSION=1.24.4 15 | ENV DEBIAN_FRONTEND=noninteractive 16 | 17 | SHELL ["/bin/bash", "-o", "pipefail", "-c"] 18 | 19 | # Setup our apt environment and install the necessary keyrings and repositories to install dcgm. Note that this strategy doesn't 20 | # support dcgm 3.x. 21 | # We want recommended packages for dcgm and we dont want to enforce version pinning...yet 22 | # hadolint ignore=DL3015,DL3008 23 | RUN apt-get update && apt-get install -y --no-install-recommends \ 24 | gnupg2 curl ca-certificates && \ 25 | curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb | apt-get install -y --no-install-recommends && \ 26 | curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/nvidia-machine-learning-repo-ubuntu2004_1.0.0-1_amd64.deb | apt-get install -y --no-install-recommends && \ 27 | curl -fsSL https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz | tar -C /usr/local -xz && \ 28 | apt-get purge --autoremove -y curl && \ 29 | apt-get install -y datacenter-gpu-manager-4-dev=1:${DCGM_VERSION} && \ 30 | rm -rf /var/lib/apt/lists/* 31 | 32 | ENV PATH=$PATH:/usr/local/go/bin 33 | 34 | # build go-dcgm and samples inside docker environment 35 | FROM base AS samples 36 | # hadolint ignore=DL3008,DL3015 37 | RUN apt-get update && apt-get install -y build-essential 
nvidia-utils-555 && rm -rf /var/lib/apt/lists/* 38 | COPY . /src 39 | WORKDIR /src 40 | RUN make binary && \ 41 | cp ./samples/restApi/restApi \ 42 | ./samples/processInfo/processInfo \ 43 | ./samples/diag/diag \ 44 | ./samples/hostengineStatus/hostengineStatus \ 45 | ./samples/dmon/dmon \ 46 | ./samples/health/health \ 47 | ./samples/topology/topology \ 48 | ./samples/deviceInfo/deviceInfo \ 49 | ./samples/policy/policy \ 50 | /usr/local/go/bin/ 51 | WORKDIR / 52 | -------------------------------------------------------------------------------- /pkg/dcgm/cpu.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "fmt" 11 | "unsafe" 12 | ) 13 | 14 | /* 15 | *See dcgm_structs.h 16 | * DCGM_CPU_CORE_BITMASK_COUNT_V1 (DCGM_MAX_NUM_CPU_CORES / sizeof(uint64_t) / CHAR_BIT) 17 | * or 18 | * 1024 / 8 / 8 19 | */ 20 | 21 | const ( 22 | // MAX_NUM_CPU_CORES represents the maximum number of CPU cores supported 23 | MAX_NUM_CPU_CORES = uint(C.DCGM_MAX_NUM_CPU_CORES) 24 | 25 | // MAX_NUM_CPUS represents the maximum number of CPUs supported 26 | MAX_NUM_CPUS = uint(C.DCGM_MAX_NUM_CPUS) 27 | 28 | // CHAR_BIT represents the number of bits in a byte 29 | CHAR_BIT = uint(C.CHAR_BIT) 30 | 31 | // MAX_CPU_CORE_BITMASK_COUNT represents the maximum count of CPU core bitmasks 32 | MAX_CPU_CORE_BITMASK_COUNT = uint(1024 / 8 / 8) 33 | ) 34 | 35 | // CPUHierarchyCPU_v1 represents information about a single CPU and its owned cores 36 | type CPUHierarchyCPU_v1 struct { 37 | // CPUID is the unique identifier for this CPU 38 | CPUID uint 39 | // OwnedCores is a bitmask array representing the cores owned by this CPU 40 | OwnedCores []uint64 41 | } 42 | 43 | // CPUHierarchy_v1 represents version 1 of the CPU hierarchy information 44 | type CPUHierarchy_v1 struct { 45 | // Version is the version number of the hierarchy structure 46 | 
Version uint 47 | // NumCPUs is the number of CPUs in the system 48 | NumCPUs uint 49 | // CPUs contains information about each CPU in the system 50 | CPUs [MAX_NUM_CPUS]CPUHierarchyCPU_v1 51 | } 52 | 53 | // GetCPUHierarchy retrieves the CPU hierarchy information from DCGM 54 | func GetCPUHierarchy() (hierarchy CPUHierarchy_v1, err error) { 55 | var c_hierarchy C.dcgmCpuHierarchy_v1 56 | c_hierarchy.version = C.dcgmCpuHierarchy_version1 57 | ptr_hierarchy := (*C.dcgmCpuHierarchy_v1)(unsafe.Pointer(&c_hierarchy)) 58 | result := C.dcgmGetCpuHierarchy(handle.handle, ptr_hierarchy) 59 | 60 | if err = errorString(result); err != nil { 61 | return toCpuHierarchy(c_hierarchy), fmt.Errorf("error retrieving DCGM CPU hierarchy: %s", err) 62 | } 63 | 64 | return toCpuHierarchy(c_hierarchy), nil 65 | } 66 | 67 | func toCpuHierarchy(c_hierarchy C.dcgmCpuHierarchy_v1) CPUHierarchy_v1 { 68 | var hierarchy CPUHierarchy_v1 69 | hierarchy.Version = uint(c_hierarchy.version) 70 | hierarchy.NumCPUs = uint(c_hierarchy.numCpus) 71 | for i := uint(0); i < hierarchy.NumCPUs; i++ { 72 | bits := make([]uint64, MAX_CPU_CORE_BITMASK_COUNT) 73 | 74 | for j := uint(0); j < MAX_CPU_CORE_BITMASK_COUNT; j++ { 75 | bits[j] = uint64(c_hierarchy.cpus[i].ownedCores.bitmask[j]) 76 | } 77 | 78 | hierarchy.CPUs[i] = CPUHierarchyCPU_v1{ 79 | CPUID: uint(c_hierarchy.cpus[i].cpuId), 80 | OwnedCores: bits, 81 | } 82 | } 83 | 84 | return hierarchy 85 | } 86 | -------------------------------------------------------------------------------- /pkg/dcgm/gpu_group_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dcgm 18 | 19 | import ( 20 | "context" 21 | "testing" 22 | 23 | "github.com/stretchr/testify/assert" 24 | "github.com/stretchr/testify/require" 25 | ) 26 | 27 | func TestGroupHandle(t *testing.T) { 28 | gh := GroupHandle{} 29 | assert.Equal(t, uintptr(0), gh.GetHandle(), "value mismatch") 30 | 31 | inputs := []uintptr{1000, 0, 1, 10, 11, 50, 100, 1939902, 9992932938239, 999999999999999999} 32 | 33 | for _, input := range inputs { 34 | gh.SetHandle(input) 35 | assert.Equal(t, input, gh.GetHandle(), "values mismatch") 36 | } 37 | } 38 | 39 | func TestGetGroupInfo(t *testing.T) { 40 | teardownTest := setupTest(t) 41 | defer teardownTest(t) 42 | 43 | runOnlyWithLiveGPUs(t) 44 | gpus, err := withInjectionGPUs(t, 1) 45 | require.NoError(t, err) 46 | 47 | gpuID := gpus[0] 48 | 49 | groupID, err := CreateGroup("test1") 50 | require.NoError(t, err) 51 | 52 | defer func() { 53 | _ = DestroyGroup(groupID) 54 | }() 55 | 56 | err = AddEntityToGroup(groupID, FE_GPU, gpuID) 57 | require.NoError(t, err) 58 | 59 | grInfo, err := GetGroupInfo(groupID) 60 | require.NoError(t, err) 61 | 62 | assert.Equal(t, "test1", grInfo.GroupName) 63 | assert.Len(t, grInfo.EntityList, 1) 64 | assert.Equal(t, FE_GPU, grInfo.EntityList[0].EntityGroupId) 65 | assert.Equal(t, gpuID, grInfo.EntityList[0].EntityId) 66 | } 67 | 68 | func TestCreateGroupWithContext(t *testing.T) { 69 | teardownTest := setupTest(t) 70 | defer teardownTest(t) 71 | 72 | runOnlyWithLiveGPUs(t) 73 | 74 | t.Run("successful creation", func(t *testing.T) { 75 | ctx 
:= context.Background() 76 | groupName := "test_group" 77 | 78 | group, err := CreateGroupWithContext(ctx, groupName) 79 | require.NoError(t, err) 80 | require.NotZero(t, group.GetHandle()) 81 | 82 | // Clean up 83 | err = DestroyGroup(group) 84 | require.NoError(t, err) 85 | }) 86 | 87 | t.Run("context cancellation", func(t *testing.T) { 88 | ctx, cancel := context.WithCancel(context.Background()) 89 | cancel() // Cancel immediately 90 | 91 | group, err := CreateGroupWithContext(ctx, "test_group") 92 | require.Error(t, err) 93 | require.Equal(t, context.Canceled, err) 94 | require.Zero(t, group.GetHandle()) 95 | }) 96 | } 97 | -------------------------------------------------------------------------------- /pkg/dcgm/utils.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "fmt" 11 | "math" 12 | "unsafe" 13 | ) 14 | 15 | const ( 16 | dcgmInt32Blank = 0x7ffffff0 // 2147483632 17 | dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792 18 | ) 19 | 20 | func uintPtr(c C.uint) *uint { 21 | i := uint(c) 22 | return &i 23 | } 24 | 25 | func uint64Ptr(c C.longlong) *uint64 { 26 | i := uint64(c) 27 | return &i 28 | } 29 | 30 | func int64Ptr(c C.longlong) *int64 { 31 | i := int64(c) 32 | return &i 33 | } 34 | 35 | func toInt64(c C.longlong) int64 { 36 | i := int64(c) 37 | return i 38 | } 39 | 40 | func dblToFloat(val C.double) *float64 { 41 | i := float64(val) 42 | return &i 43 | } 44 | 45 | func stringPtr(c *C.char) *string { 46 | s := C.GoString(c) 47 | return &s 48 | } 49 | 50 | // Error represents an error returned by the DCGM library 51 | type Error struct { 52 | msg string // description of error 53 | Code C.dcgmReturn_t // dcgmReturn_t value of error 54 | } 55 | 56 | func (e *Error) Error() string { return e.msg } 57 | 58 | func errorString(result C.dcgmReturn_t) error { 59 | if result == C.DCGM_ST_OK { 60 | return 
nil 61 | } 62 | err := C.GoString(C.errorString(result)) 63 | return fmt.Errorf("%v", err) 64 | } 65 | 66 | func freeCString(cStr *C.char) { 67 | C.free(unsafe.Pointer(cStr)) 68 | } 69 | 70 | // IsInt32Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffff0). 71 | // These values indicate that no valid data is available for the field. 72 | func IsInt32Blank(value int) bool { 73 | return value >= dcgmInt32Blank 74 | } 75 | 76 | // IsInt64Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffffffffffff0). 77 | // These values indicate that no valid data is available for the field. 78 | func IsInt64Blank(value int64) bool { 79 | return value >= dcgmInt64Blank 80 | } 81 | 82 | func makeVersion1(struct_type uintptr) C.uint { 83 | version := C.uint(struct_type | 1<<24) 84 | return version 85 | } 86 | 87 | func makeVersion2(struct_type uintptr) C.uint { 88 | version := C.uint(struct_type | 2<<24) 89 | return version 90 | } 91 | 92 | func makeVersion3(struct_type uintptr) C.uint { 93 | version := C.uint(struct_type | 3<<24) 94 | return version 95 | } 96 | 97 | func makeVersion4(struct_type uintptr) C.uint { 98 | version := C.uint(struct_type | 4<<24) 99 | return version 100 | } 101 | 102 | func makeVersion5(struct_type uintptr) C.uint { 103 | version := C.uint(struct_type | 5<<24) 104 | return version 105 | } 106 | 107 | func makeVersion12(struct_type uintptr) C.uint { 108 | version := C.uint(struct_type | 12<<24) 109 | return version 110 | } 111 | 112 | func roundFloat(f *float64) *float64 { 113 | var val float64 114 | if f != nil { 115 | val = math.Round(*f) 116 | } 117 | return &val 118 | } 119 | -------------------------------------------------------------------------------- /samples/restApi/server.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "log" 6 | "net/http" 7 | "time" 8 | 9 | h 
"github.com/NVIDIA/go-dcgm/samples/restApi/handlers" 10 | "github.com/gorilla/mux" 11 | ) 12 | 13 | const timeout = 5 * time.Second 14 | 15 | // httpServer represents an HTTP server instance that handles DCGM REST API endpoints 16 | type httpServer struct { 17 | router *mux.Router 18 | server *http.Server 19 | } 20 | 21 | // newHttpServer creates and configures a new HTTP server instance 22 | // addr specifies the address:port to listen on 23 | func newHttpServer(addr string) *httpServer { 24 | r := mux.NewRouter() 25 | 26 | s := &httpServer{ 27 | router: r, 28 | server: &http.Server{ 29 | Addr: addr, 30 | Handler: r, 31 | ReadTimeout: timeout, 32 | WriteTimeout: timeout, 33 | }, 34 | } 35 | 36 | // make a global map of device uuids and ids 37 | h.DevicesUuids() 38 | 39 | s.handler() 40 | 41 | return s 42 | } 43 | 44 | func (s *httpServer) handler() { 45 | deviceInfo := "/dcgm/device/info" 46 | subrouter := s.router.PathPrefix(deviceInfo).Subrouter() 47 | subrouter.HandleFunc("/id/{id}", h.DeviceInfo).Methods("GET") 48 | subrouter.HandleFunc("/id/{id}/json", h.DeviceInfo).Methods("GET") 49 | subrouter.HandleFunc("/uuid/{uuid}", h.DeviceInfoByUuid).Methods("GET") 50 | subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceInfoByUuid).Methods("GET") 51 | 52 | deviceStatus := "/dcgm/device/status" 53 | subrouter = s.router.PathPrefix(deviceStatus).Subrouter() 54 | subrouter.HandleFunc("/id/{id}", h.DeviceStatus).Methods("GET") 55 | subrouter.HandleFunc("/id/{id}/json", h.DeviceStatus).Methods("GET") 56 | subrouter.HandleFunc("/uuid/{uuid}", h.DeviceStatusByUuid).Methods("GET") 57 | subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceStatusByUuid).Methods("GET") 58 | 59 | processInfo := "/dcgm/process/info/pid/{pid}" 60 | subrouter = s.router.PathPrefix(processInfo).Subrouter() 61 | subrouter.HandleFunc("", h.ProcessInfo).Methods("GET") 62 | subrouter.HandleFunc("/json", h.ProcessInfo).Methods("GET") 63 | 64 | health := "/dcgm/health" 65 | subrouter = 
s.router.PathPrefix(health).Subrouter() 66 | subrouter.HandleFunc("/id/{id}", h.Health).Methods("GET") 67 | subrouter.HandleFunc("/id/{id}/json", h.Health).Methods("GET") 68 | subrouter.HandleFunc("/uuid/{uuid}", h.HealthByUuid).Methods("GET") 69 | subrouter.HandleFunc("/uuid/{uuid}/json", h.HealthByUuid).Methods("GET") 70 | 71 | dcgmStatus := "/dcgm/status" 72 | subrouter = s.router.PathPrefix(dcgmStatus).Subrouter() 73 | subrouter.HandleFunc("", h.Status).Methods("GET") 74 | subrouter.HandleFunc("/json", h.Status).Methods("GET") 75 | } 76 | 77 | func (s *httpServer) serve() { 78 | if err := s.server.ListenAndServe(); err != http.ErrServerClosed { 79 | log.Printf("Error: %v", err) 80 | } 81 | } 82 | 83 | func (s *httpServer) stop() { 84 | ctx, cancel := context.WithTimeout(context.Background(), timeout) 85 | defer cancel() 86 | 87 | if err := s.server.Shutdown(ctx); err != nil { 88 | log.Printf("Error: %v", err) 89 | } else { 90 | log.Println("http server stopped") 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribute to the DCGM Golang Bindings 2 | 3 | Want to hack on the NVIDIA DCGM Golang Bindings Project? Awesome! 4 | We only require you to sign your work; the "Sign your work" section below describes how. 5 | 6 | ## Validate your work 7 | 8 | All changes need to be able to pass all linting and pre-commit checks. All tests 9 | must pass, including `make lint-full`, `pre-commit run --all-files`, and `make test-main`. 10 | 11 | Note: There is a race in `make test-main` and it will occasionally fail due to the race. 
12 | 13 | ### Setting up pre-commit 14 | 15 | You can install pre-commit via brew, apt/dnf, or via pip: 16 | 17 | ```bash 18 | pip install pre-commit 19 | ``` 20 | 21 | Once installed, you can run: 22 | 23 | ```bash 24 | make install-pre-commit 25 | pre-commit autoupdate 26 | ``` 27 | 28 | Once you've completed this step, pre-commit is set up and ready to go. The pre-commit hooks 29 | will be executed when you run `git commit`. 30 | 31 | ## Sign your work 32 | 33 | The sign-off is a simple line at the end of the explanation for the patch. Your 34 | signature certifies that you wrote the patch or otherwise have the right to pass 35 | it on as an open-source patch. The rules are pretty simple: if you can certify 36 | the below (from [developercertificate.org](http://developercertificate.org/)): 37 | 38 | ```bash 39 | Developer Certificate of Origin 40 | Version 1.1 41 | 42 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 43 | 1 Letterman Drive 44 | Suite D4700 45 | San Francisco, CA, 94129 46 | 47 | Everyone is permitted to copy and distribute verbatim copies of this 48 | license document, but changing it is not allowed. 
49 | 50 | Developer's Certificate of Origin 1.1 51 | 52 | By making a contribution to this project, I certify that: 53 | 54 | (a) The contribution was created in whole or in part by me and I 55 | have the right to submit it under the open source license 56 | indicated in the file; or 57 | 58 | (b) The contribution is based upon previous work that, to the best 59 | of my knowledge, is covered under an appropriate open source 60 | license and I have the right under that license to submit that 61 | work with modifications, whether created in whole or in part 62 | by me, under the same open source license (unless I am 63 | permitted to submit under a different license), as indicated 64 | in the file; or 65 | 66 | (c) The contribution was provided directly to me by some other 67 | person who certified (a), (b) or (c) and I have not modified 68 | it. 69 | 70 | (d) I understand and agree that this project and the contribution 71 | are public and that a record of the contribution (including all 72 | personal information I submit with it, including my sign-off) is 73 | maintained indefinitely and may be redistributed consistent with 74 | this project or the open source license(s) involved. 75 | ``` 76 | 77 | Then you just add a line to every git commit message: 78 | 79 | ```bash 80 | Signed-off-by: Joe Smith <joe.smith@example.com> 81 | ``` 82 | 83 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 84 | 85 | If you set your `user.name` and `user.email` git configs, you can sign your 86 | commit automatically with `git commit -s`. 87 | -------------------------------------------------------------------------------- /pkg/dcgm/instances_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dcgm 18 | 19 | import ( 20 | "testing" 21 | 22 | "github.com/stretchr/testify/assert" 23 | "github.com/stretchr/testify/require" 24 | ) 25 | 26 | func TestMigDeviceProfileNamesStandalone(t *testing.T) { 27 | // Setup test environment 28 | teardown := setupTest(t) 29 | defer teardown(t) 30 | 31 | // Create one fake GPU 32 | gpuIDs, err := withInjectionGPUs(t, 1) 33 | require.NoError(t, err) 34 | require.Len(t, gpuIDs, 1, "Expected 1 fake GPU to be created") 35 | 36 | // Create one GPU instance on the fake GPU 37 | gpuInstanceMap, err := withInjectionGPUInstances(t, gpuIDs[0], 1) 38 | require.NoError(t, err) 39 | require.Len(t, gpuInstanceMap, 1, "Expected 1 fake GPU instance to be created") 40 | 41 | // Get the GPU instance IDs 42 | gpuInstanceIDs := make([]uint, 0, len(gpuInstanceMap)) 43 | for instanceID := range gpuInstanceMap { 44 | gpuInstanceIDs = append(gpuInstanceIDs, instanceID) 45 | } 46 | 47 | // Create one compute instance per GPU instance 48 | ciToGiMap, err := withInjectionComputeInstances(t, gpuInstanceIDs, len(gpuInstanceIDs)) 49 | require.NoError(t, err) 50 | require.Len(t, ciToGiMap, len(gpuInstanceIDs), "Expected one compute instance per GPU instance") 51 | 52 | // Get the compute instance IDs 53 | computeInstanceIds := make([]uint, 0, len(ciToGiMap)) 54 | for ciId := range ciToGiMap { 55 | computeInstanceIds = append(computeInstanceIds, ciId) 56 | } 57 | 58 | // Verify profile names for both GPU instances and compute instances 59 | verifyProfileNames(t, gpuInstanceIDs, true) // 
verify GPU instances 60 | verifyProfileNames(t, computeInstanceIds, false) // verify compute instances 61 | } 62 | 63 | // verifyProfileNames verifies that the MIG profile names exist for the given entities 64 | func verifyProfileNames(tb testing.TB, entityIds []uint, isGpuInstance bool) { 65 | tb.Helper() 66 | 67 | // Create entity list for the query 68 | entities := make([]GroupEntityPair, 0, len(entityIds)) 69 | for _, entityId := range entityIds { 70 | entity := GroupEntityPair{ 71 | EntityId: entityId, 72 | } 73 | if isGpuInstance { 74 | entity.EntityGroupId = FE_GPU_I 75 | } else { 76 | entity.EntityGroupId = FE_GPU_CI 77 | } 78 | entities = append(entities, entity) 79 | } 80 | 81 | // Get the latest values for DCGM_FI_DEV_NAME field 82 | values, err := EntitiesGetLatestValues(entities, []Short{DCGM_FI_DEV_NAME}, DCGM_FV_FLAG_LIVE_DATA) 83 | require.NoError(tb, err) 84 | 85 | // Define expected profile names 86 | expectedFakeName := "1fc.1g.4gb" 87 | if isGpuInstance { 88 | expectedFakeName = "1fg.4gb" 89 | } 90 | 91 | // Verify each entity has the correct profile name 92 | for i := range values { 93 | assert.Equal(tb, expectedFakeName, values[i].String(), 94 | "Fake profile name appears to be wrong for entity %d. Expected '%s', found '%s'", 95 | values[i].EntityID, expectedFakeName, values[i].String()) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /pkg/dcgm/diag_test_helpers.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dcgm 18 | 19 | /* 20 | #include 21 | #include 22 | #include "dcgm_agent.h" 23 | #include "dcgm_structs.h" 24 | */ 25 | import "C" 26 | 27 | import ( 28 | "unsafe" 29 | ) 30 | 31 | // createTestDiagResponse creates a dcgmDiagResponse_v12 for testing 32 | func createTestDiagResponse() C.dcgmDiagResponse_v12 { 33 | var response C.dcgmDiagResponse_v12 34 | response.version = C.dcgmDiagResponse_version12 35 | return response 36 | } 37 | 38 | // addInfoMessage adds an info message to a dcgmDiagResponse_v12 for testing 39 | func addInfoMessage(response *C.dcgmDiagResponse_v12, entityID uint, testID uint, message string) { 40 | idx := response.numInfo 41 | cStr := C.CString(message) 42 | defer C.free(unsafe.Pointer(cStr)) 43 | C.strcpy(&response.info[idx].msg[0], cStr) 44 | response.info[idx].entity.entityId = C.uint(entityID) 45 | response.info[idx].entity.entityGroupId = C.DCGM_FE_GPU 46 | response.info[idx].testId = C.uint(testID) 47 | response.numInfo++ 48 | } 49 | 50 | // addDiagResult adds a diagnostic result to a dcgmDiagResponse_v12 for testing 51 | func addDiagResult(response *C.dcgmDiagResponse_v12, entityID uint, testID uint, result int) { 52 | idx := response.numResults 53 | response.results[idx].entity.entityId = C.uint(entityID) 54 | response.results[idx].entity.entityGroupId = C.DCGM_FE_GPU 55 | response.results[idx].testId = C.uint(testID) 56 | response.results[idx].result = C.dcgmDiagResult_t(result) 57 | response.numResults++ 58 | } 59 | 60 | // addEntityWithSerial adds an entity with 
serial number to a dcgmDiagResponse_v12 for testing 61 | func addEntityWithSerial(response *C.dcgmDiagResponse_v12, entityID uint, serialNumber string) { 62 | idx := response.numEntities 63 | cStr := C.CString(serialNumber) 64 | defer C.free(unsafe.Pointer(cStr)) 65 | C.strcpy(&response.entities[idx].serialNum[0], cStr) 66 | response.entities[idx].entity.entityId = C.uint(entityID) 67 | response.entities[idx].entity.entityGroupId = C.DCGM_FE_GPU 68 | response.numEntities++ 69 | } 70 | 71 | // Test constants exposed for testing 72 | const ( 73 | testDiagResultPass = C.DCGM_DIAG_RESULT_PASS 74 | testDiagResultSkip = C.DCGM_DIAG_RESULT_SKIP 75 | testDiagResultWarn = C.DCGM_DIAG_RESULT_WARN 76 | testDiagResultFail = C.DCGM_DIAG_RESULT_FAIL 77 | testDiagResultNotRun = C.DCGM_DIAG_RESULT_NOT_RUN 78 | 79 | testMemoryIndex = C.DCGM_MEMORY_INDEX 80 | testDiagnosticIndex = C.DCGM_DIAGNOSTIC_INDEX 81 | testPCIIndex = C.DCGM_PCI_INDEX 82 | testSMStressIndex = C.DCGM_SM_STRESS_INDEX 83 | testTargetedStressIndex = C.DCGM_TARGETED_STRESS_INDEX 84 | testTargetedPowerIndex = C.DCGM_TARGETED_POWER_INDEX 85 | testMemoryBandwidthIndex = C.DCGM_MEMORY_BANDWIDTH_INDEX 86 | testMemtestIndex = C.DCGM_MEMTEST_INDEX 87 | testPulseTestIndex = C.DCGM_PULSE_TEST_INDEX 88 | testEUDTestIndex = C.DCGM_EUD_TEST_INDEX 89 | testSoftwareIndex = C.DCGM_SOFTWARE_INDEX 90 | testContextCreateIndex = C.DCGM_CONTEXT_CREATE_INDEX 91 | ) 92 | -------------------------------------------------------------------------------- /samples/processInfo/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "log" 6 | "os" 7 | "text/template" 8 | "time" 9 | 10 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 11 | ) 12 | 13 | const ( 14 | processInfo = `---------------------------------------------------------------------- 15 | GPU ID : {{.GPU}} 16 | ----------Execution Stats--------------------------------------------- 17 | PID : {{.PID}} 18 | 
Name : {{or .Name "N/A"}} 19 | Start Time : {{.ProcessUtilization.StartTime.String}} 20 | End Time : {{.ProcessUtilization.EndTime.String}} 21 | ----------Performance Stats------------------------------------------- 22 | Energy Consumed (Joules) : {{or .ProcessUtilization.EnergyConsumed "N/A"}} 23 | Max GPU Memory Used (bytes) : {{or .Memory.GlobalUsed "N/A"}} 24 | Avg SM Clock (MHz) : {{or .Clocks.Cores "N/A"}} 25 | Avg Memory Clock (MHz) : {{or .Clocks.Memory "N/A"}} 26 | Avg SM Utilization (%) : {{or .GpuUtilization.GPU "N/A"}} 27 | Avg Memory Utilization (%) : {{or .GpuUtilization.Memory "N/A"}} 28 | Avg PCIe Rx Bandwidth (MB) : {{or .PCI.Throughput.Rx "N/A"}} 29 | Avg PCIe Tx Bandwidth (MB) : {{or .PCI.Throughput.Tx "N/A"}} 30 | ----------Event Stats------------------------------------------------- 31 | Single Bit ECC Errors : {{or .Memory.ECCErrors.SingleBit "N/A"}} 32 | Double Bit ECC Errors : {{or .Memory.ECCErrors.DoubleBit "N/A"}} 33 | Critical XID Errors : {{.XIDErrors.NumErrors}} 34 | ----------Slowdown Stats---------------------------------------------- 35 | Due to - Power (%) : {{or .Violations.Power "N/A"}} 36 | - Thermal (%) : {{or .Violations.Thermal "N/A"}} 37 | - Reliability (%) : {{or .Violations.Reliability "N/A"}} 38 | - Board Limit (%) : {{or .Violations.BoardLimit "N/A"}} 39 | - Low Utilization (%) : {{or .Violations.LowUtilization "N/A"}} 40 | - Sync Boost (%) : {{or .Violations.SyncBoost "N/A"}} 41 | ----------Process Utilization----------------------------------------- 42 | Avg SM Utilization (%) : {{or .ProcessUtilization.SmUtil "N/A"}} 43 | Avg Memory Utilization (%) : {{or .ProcessUtilization.MemUtil "N/A"}} 44 | ---------------------------------------------------------------------- 45 | ` 46 | ) 47 | 48 | // NOTE: The "WatchPidFields()" function must be initially called (as root) BEFORE starting the process to be monitored: 49 | // 1. Run as root, for enabling health watches 50 | // sudo dcgmi stats -e 51 | // 2. 
Start process to be monitored 52 | // 3. Run processInfo. This is equivalent to "dcgmi stats --pid ENTERPID -v" 53 | // go build && ./processInfo -pid PID 54 | func main() { 55 | process := flag.Uint("pid", 0, "Provide pid to get this process information.") 56 | 57 | cleanup, err := dcgm.Init(dcgm.Embedded) 58 | if err != nil { 59 | log.Panicln(err) 60 | } 61 | defer cleanup() 62 | 63 | // Request DCGM to start recording stats for GPU process fields 64 | group, err := dcgm.WatchPidFields() 65 | if err != nil { 66 | log.Panicln(err) 67 | } 68 | 69 | // Before retrieving process stats, wait few seconds for watches to be enabled and collect data 70 | log.Println("Enabling DCGM watches to start collecting process stats. This may take a few seconds....") 71 | time.Sleep(3000 * time.Millisecond) 72 | 73 | flag.Parse() 74 | 75 | pidInfo, err := dcgm.GetProcessInfo(group, *process) 76 | if err != nil { 77 | log.Panicln(err) 78 | } 79 | 80 | t := template.Must(template.New("Process").Parse(processInfo)) 81 | for i := range pidInfo { 82 | if err = t.Execute(os.Stdout, pidInfo[i]); err != nil { 83 | log.Panicln("Template error:", err) 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /samples/restApi/handlers/dcgm.go: -------------------------------------------------------------------------------- 1 | package handlers 2 | 3 | import ( 4 | "log" 5 | "math" 6 | "net/http" 7 | "time" 8 | 9 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 10 | 11 | "github.com/gorilla/mux" 12 | ) 13 | 14 | func getStatus(resp http.ResponseWriter, req *http.Request) (status *dcgm.Status) { 15 | st, err := dcgm.Introspect() 16 | if err != nil { 17 | http.Error(resp, err.Error(), http.StatusInternalServerError) 18 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 19 | 20 | return 21 | } 22 | 23 | return &st 24 | } 25 | 26 | func getDeviceInfo(resp http.ResponseWriter, req *http.Request) (device *dcgm.Device) { 27 | var id uint 28 | 29 | 
params := mux.Vars(req) 30 | for k, v := range params { 31 | switch k { 32 | case "id": 33 | id = getId(resp, req, v) 34 | case "uuid": 35 | id = getIdByUuid(resp, req, v) 36 | } 37 | } 38 | 39 | if id == math.MaxUint32 { 40 | return 41 | } 42 | 43 | if !isValidId(id, resp, req) { 44 | return 45 | } 46 | 47 | d, err := dcgm.GetDeviceInfo(id) 48 | if err != nil { 49 | http.Error(resp, err.Error(), http.StatusInternalServerError) 50 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 51 | 52 | return 53 | } 54 | 55 | return &d 56 | } 57 | 58 | func getDeviceStatus(resp http.ResponseWriter, req *http.Request) (status *dcgm.DeviceStatus) { 59 | var id uint 60 | 61 | params := mux.Vars(req) 62 | for k, v := range params { 63 | switch k { 64 | case "id": 65 | id = getId(resp, req, v) 66 | case "uuid": 67 | id = getIdByUuid(resp, req, v) 68 | } 69 | } 70 | 71 | if id == math.MaxUint32 { 72 | return 73 | } 74 | 75 | if !isValidId(id, resp, req) { 76 | return 77 | } 78 | 79 | if !isDcgmSupported(id, resp, req) { 80 | return 81 | } 82 | 83 | st, err := dcgm.GetDeviceStatus(id) 84 | if err != nil { 85 | http.Error(resp, err.Error(), http.StatusInternalServerError) 86 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 87 | 88 | return 89 | } 90 | 91 | return &st 92 | } 93 | 94 | func getHealth(resp http.ResponseWriter, req *http.Request) (health *dcgm.DeviceHealth) { 95 | var id uint 96 | 97 | params := mux.Vars(req) 98 | for k, v := range params { 99 | switch k { 100 | case "id": 101 | id = getId(resp, req, v) 102 | case "uuid": 103 | id = getIdByUuid(resp, req, v) 104 | } 105 | } 106 | 107 | if id == math.MaxUint32 { 108 | return 109 | } 110 | 111 | if !isValidId(id, resp, req) { 112 | return 113 | } 114 | 115 | h, err := dcgm.HealthCheckByGpuId(id) 116 | if err != nil { 117 | http.Error(resp, err.Error(), http.StatusInternalServerError) 118 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 119 | 120 | return 121 | } 122 | 123 | return 
&h 124 | } 125 | 126 | func getProcessInfo(resp http.ResponseWriter, req *http.Request) (pInfo []dcgm.ProcessInfo) { 127 | params := mux.Vars(req) 128 | 129 | pid := getId(resp, req, params["pid"]) 130 | if pid == math.MaxUint32 { 131 | return 132 | } 133 | 134 | group, err := dcgm.WatchPidFields() 135 | if err != nil { 136 | http.Error(resp, err.Error(), http.StatusInternalServerError) 137 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 138 | 139 | return 140 | } 141 | 142 | // wait for watches to be enabled 143 | log.Printf("Enabling DCGM watches to start collecting process stats. This may take a few seconds....") 144 | time.Sleep(3000 * time.Millisecond) 145 | 146 | pInfo, err = dcgm.GetProcessInfo(group, pid) 147 | if err != nil { 148 | http.Error(resp, err.Error(), http.StatusInternalServerError) 149 | log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error()) 150 | } 151 | 152 | return 153 | } 154 | -------------------------------------------------------------------------------- /pkg/dcgm/structs.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
// MigProfile identifies a Multi-Instance GPU (MIG) profile.
//
// The numeric values are spelled out explicitly rather than generated with
// iota. Note that they do not follow slice size: for example slice 7 is 5,
// slice 8 is 6 and slice 6 is 7.
type MigProfile int

// GPU instance profiles.
const (
	// MigProfileNone means no MIG profile is set (used for whole GPUs).
	MigProfileNone MigProfile = 0
	// MigProfileGPUInstanceSlice1 is GPU instance slice 1.
	MigProfileGPUInstanceSlice1 MigProfile = 1
	// MigProfileGPUInstanceSlice2 is GPU instance slice 2.
	MigProfileGPUInstanceSlice2 MigProfile = 2
	// MigProfileGPUInstanceSlice3 is GPU instance slice 3.
	MigProfileGPUInstanceSlice3 MigProfile = 3
	// MigProfileGPUInstanceSlice4 is GPU instance slice 4.
	MigProfileGPUInstanceSlice4 MigProfile = 4
	// MigProfileGPUInstanceSlice7 is GPU instance slice 7.
	MigProfileGPUInstanceSlice7 MigProfile = 5
	// MigProfileGPUInstanceSlice8 is GPU instance slice 8.
	MigProfileGPUInstanceSlice8 MigProfile = 6
	// MigProfileGPUInstanceSlice6 is GPU instance slice 6.
	MigProfileGPUInstanceSlice6 MigProfile = 7
	// MigProfileGPUInstanceSlice1Rev1 is GPU instance slice 1, revision 1.
	MigProfileGPUInstanceSlice1Rev1 MigProfile = 8
	// MigProfileGPUInstanceSlice2Rev1 is GPU instance slice 2, revision 1.
	MigProfileGPUInstanceSlice2Rev1 MigProfile = 9
	// MigProfileGPUInstanceSlice1Rev2 is GPU instance slice 1, revision 2.
	MigProfileGPUInstanceSlice1Rev2 MigProfile = 10
)

// Compute instance profiles.
const (
	// MigProfileComputeInstanceSlice1 is compute instance slice 1.
	MigProfileComputeInstanceSlice1 MigProfile = 30
	// MigProfileComputeInstanceSlice2 is compute instance slice 2.
	MigProfileComputeInstanceSlice2 MigProfile = 31
	// MigProfileComputeInstanceSlice3 is compute instance slice 3.
	MigProfileComputeInstanceSlice3 MigProfile = 32
	// MigProfileComputeInstanceSlice4 is compute instance slice 4.
	MigProfileComputeInstanceSlice4 MigProfile = 33
	// MigProfileComputeInstanceSlice7 is compute instance slice 7.
	MigProfileComputeInstanceSlice7 MigProfile = 34
	// MigProfileComputeInstanceSlice8 is compute instance slice 8.
	MigProfileComputeInstanceSlice8 MigProfile = 35
	// MigProfileComputeInstanceSlice6 is compute instance slice 6.
	MigProfileComputeInstanceSlice6 MigProfile = 36
	// MigProfileComputeInstanceSlice1Rev1 is compute instance slice 1, revision 1.
	MigProfileComputeInstanceSlice1Rev1 MigProfile = 37
)
| } 25 | 26 | // Wait for watches to be enabled and collect data 27 | t.Log("Enabling DCGM watches to start collecting process stats. This may take a few seconds...") 28 | time.Sleep(3000 * time.Millisecond) 29 | 30 | // Get current process ID as an example 31 | //nolint:gosec // disable G115 32 | currentPid := uint(os.Getpid()) 33 | t.Logf("Testing with current process PID: %d", currentPid) 34 | 35 | pidInfo, err := dcgm.GetProcessInfo(group, currentPid) 36 | if err != nil { 37 | t.Logf("Failed to get process info for PID %d: %v", currentPid, err) 38 | t.Log("This is expected if the current process is not using GPU") 39 | return 40 | } 41 | 42 | if len(pidInfo) == 0 { 43 | t.Logf("No process information found for PID %d", currentPid) 44 | return 45 | } 46 | 47 | // Log basic process information 48 | for i, info := range pidInfo { 49 | t.Logf("Process Info %d:", i+1) 50 | t.Logf(" GPU ID: %d", info.GPU) 51 | t.Logf(" PID: %d", info.PID) 52 | if info.Name != "" { 53 | t.Logf(" Name: %s", info.Name) 54 | } 55 | t.Logf(" Start Time: %s", info.ProcessUtilization.StartTime.String()) 56 | t.Logf(" End Time: %s", info.ProcessUtilization.EndTime.String()) 57 | t.Logf(" Critical XID Errors: %d", info.XIDErrors.NumErrors) 58 | } 59 | } 60 | 61 | // TestProcessInfoWithSpecificPID demonstrates getting process info for a specific PID 62 | func TestProcessInfoWithSpecificPID(t *testing.T) { 63 | if testing.Short() { 64 | t.Skip("Skipping specific PID test in short mode") 65 | } 66 | 67 | cleanup, err := dcgm.Init(dcgm.Embedded) 68 | if err != nil { 69 | t.Fatalf("Failed to initialize DCGM: %v", err) 70 | } 71 | defer cleanup() 72 | 73 | // Request DCGM to start recording stats for GPU process fields 74 | group, err := dcgm.WatchPidFields() 75 | if err != nil { 76 | t.Fatalf("Failed to watch PID fields: %v", err) 77 | } 78 | 79 | // Wait for watches to be enabled and collect data 80 | time.Sleep(3000 * time.Millisecond) 81 | 82 | // Test with PID 1 (init process) - should not 
have GPU usage 83 | testPid := uint(1) 84 | pidInfo, err := dcgm.GetProcessInfo(group, testPid) 85 | if err != nil { 86 | t.Logf("Expected: No process info found for PID %d: %v", testPid, err) 87 | } else if len(pidInfo) == 0 { 88 | t.Logf("Expected: No GPU usage found for PID %d", testPid) 89 | } else { 90 | t.Logf("Unexpected: Found GPU usage for PID %d", testPid) 91 | } 92 | } 93 | 94 | // TestWatchPidFields demonstrates the WatchPidFields functionality 95 | func TestWatchPidFields(t *testing.T) { 96 | cleanup, err := dcgm.Init(dcgm.Embedded) 97 | if err != nil { 98 | t.Fatalf("Failed to initialize DCGM: %v", err) 99 | } 100 | defer cleanup() 101 | 102 | // Test WatchPidFields function 103 | group, err := dcgm.WatchPidFields() 104 | if err != nil { 105 | t.Fatalf("Failed to watch PID fields: %v", err) 106 | } 107 | 108 | t.Logf("Successfully created group for watching PID fields: %v", group) 109 | 110 | // Wait a bit to ensure watches are properly set up 111 | time.Sleep(1000 * time.Millisecond) 112 | t.Log("PID field watches enabled successfully") 113 | } 114 | -------------------------------------------------------------------------------- /pkg/dcgm/field_values_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dcgm 18 | 19 | import ( 20 | "crypto/rand" 21 | "encoding/binary" 22 | "fmt" 23 | "testing" 24 | "time" 25 | 26 | "github.com/stretchr/testify/assert" 27 | "github.com/stretchr/testify/require" 28 | ) 29 | 30 | // secureRandomUint64 returns a cryptographically secure random uint64 31 | func secureRandomUint64() (uint64, error) { 32 | var buf [8]byte 33 | _, err := rand.Read(buf[:]) 34 | if err != nil { 35 | return 0, err 36 | } 37 | return binary.BigEndian.Uint64(buf[:]), nil 38 | } 39 | 40 | func TestGetValuesSince(t *testing.T) { 41 | teardownTest := setupTest(t) 42 | defer teardownTest(t) 43 | runOnlyWithLiveGPUs(t) 44 | 45 | const gpu uint = 0 46 | 47 | // Create a group of fields 48 | const ( 49 | xid int = iota 50 | ) 51 | 52 | deviceFields := make([]Short, 1) 53 | deviceFields[xid] = DCGM_FI_DEV_XID_ERRORS 54 | 55 | randID, err := secureRandomUint64() 56 | require.NoError(t, err) 57 | fieldGroupName := fmt.Sprintf("fieldGroupName%d", randID) 58 | 59 | fieldsGroup, err := FieldGroupCreate(fieldGroupName, deviceFields) 60 | require.NoError(t, err) 61 | 62 | defer func() { 63 | _ = FieldGroupDestroy(fieldsGroup) 64 | }() 65 | 66 | var values []FieldValue_v2 67 | var nextTime time.Time 68 | 69 | t.Run("When there is no data return error", func(t *testing.T) { 70 | values, nextTime, err = GetValuesSince(GroupAllGPUs(), 71 | fieldsGroup, time.Time{}) 72 | require.Error(t, err) 73 | require.Empty(t, nextTime) 74 | require.Empty(t, values) 75 | }) 76 | 77 | t.Run("When there are a few entries", func(t *testing.T) { 78 | expectedNumberOfErrors := int64(43) 79 | expectedInjectedValuesCount := 0 80 | 81 | t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu) 82 | err = InjectFieldValue(gpu, 83 | DCGM_FI_DEV_XID_ERRORS, 84 | DCGM_FT_INT64, 85 | 0, 86 | time.Now().Add(-time.Duration(5)*time.Second).UnixMicro(), 87 | expectedNumberOfErrors, 88 | ) 89 | require.NoError(t, err) 90 | 91 | expectedInjectedValuesCount++ 92 | 93 | for i := 
4; i > 0; i-- { 94 | err = InjectFieldValue(gpu, 95 | DCGM_FI_DEV_XID_ERRORS, 96 | DCGM_FT_INT64, 97 | 0, 98 | time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(), 99 | int64(i), 100 | ) 101 | require.NoError(t, err) 102 | 103 | expectedInjectedValuesCount++ 104 | } 105 | // Force an update of the fields so that we can fetch initial values. 106 | err = UpdateAllFields() 107 | require.NoError(t, err) 108 | values, nextTime, err = GetValuesSince(GroupAllGPUs(), fieldsGroup, time.Time{}) 109 | require.NoError(t, err) 110 | assert.Greater(t, nextTime, time.Time{}) 111 | assert.Len(t, values, expectedInjectedValuesCount) 112 | assert.Equal(t, FE_GPU, values[0].EntityGroupId) 113 | assert.Equal(t, gpu, values[0].EntityID) 114 | assert.Equal(t, DCGM_FI_DEV_XID_ERRORS, values[0].FieldID) 115 | assert.Equal(t, expectedNumberOfErrors, values[0].Int64()) 116 | 117 | for i := 1; i < 5; i++ { 118 | assert.Equal(t, FE_GPU, values[i].EntityGroupId) 119 | assert.Equal(t, gpu, values[i].EntityID) 120 | assert.Equal(t, DCGM_FI_DEV_XID_ERRORS, values[i].FieldID) 121 | assert.Equal(t, int64(5-i), values[i].Int64()) 122 | } 123 | }) 124 | } 125 | -------------------------------------------------------------------------------- /tests/deviceinfo_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 7 | ) 8 | 9 | // TestDeviceInfo demonstrates getting device information from all GPUs 10 | // This is equivalent to the deviceInfo sample 11 | func TestDeviceInfoTest(t *testing.T) { 12 | cleanup, err := dcgm.Init(dcgm.Embedded) 13 | if err != nil { 14 | t.Fatalf("Failed to initialize DCGM: %v", err) 15 | } 16 | defer cleanup() 17 | 18 | count, err := dcgm.GetAllDeviceCount() 19 | if err != nil { 20 | t.Fatalf("Failed to get device count: %v", err) 21 | } 22 | 23 | t.Logf("Found %d devices", count) 24 | 25 | for i := uint(0); i < count; i++ { 26 | deviceInfo, 
err := dcgm.GetDeviceInfo(i) 27 | if err != nil { 28 | t.Errorf("Failed to get device info for GPU %d: %v", i, err) 29 | continue 30 | } 31 | 32 | // Log device information 33 | t.Logf("Device %d Information:", i) 34 | t.Logf(" Driver Version: %s", deviceInfo.Identifiers.DriverVersion) 35 | t.Logf(" GPU: %d", deviceInfo.GPU) 36 | t.Logf(" DCGMSupported: %v", deviceInfo.DCGMSupported) 37 | t.Logf(" UUID: %s", deviceInfo.UUID) 38 | t.Logf(" Brand: %s", deviceInfo.Identifiers.Brand) 39 | t.Logf(" Model: %s", deviceInfo.Identifiers.Model) 40 | t.Logf(" Serial Number: %s", deviceInfo.Identifiers.Serial) 41 | 42 | if deviceInfo.Identifiers.Vbios != "" { 43 | t.Logf(" Vbios: %s", deviceInfo.Identifiers.Vbios) 44 | } 45 | 46 | t.Logf(" InforomImage Version: %s", deviceInfo.Identifiers.InforomImageVersion) 47 | t.Logf(" Bus ID: %s", deviceInfo.PCI.BusID) 48 | 49 | if deviceInfo.PCI.BAR1 != 0 { 50 | t.Logf(" BAR1 (MB): %d", deviceInfo.PCI.BAR1) 51 | } 52 | 53 | if deviceInfo.PCI.FBTotal != 0 { 54 | t.Logf(" FrameBuffer Memory (MB): %d", deviceInfo.PCI.FBTotal) 55 | } 56 | 57 | if deviceInfo.PCI.Bandwidth != 0 { 58 | t.Logf(" Bandwidth (MB/s): %d", deviceInfo.PCI.Bandwidth) 59 | } 60 | 61 | if deviceInfo.Power != 0 { 62 | t.Logf(" Power (W): %d", deviceInfo.Power) 63 | } 64 | 65 | if deviceInfo.CPUAffinity != "" { 66 | t.Logf(" CPUAffinity: %s", deviceInfo.CPUAffinity) 67 | } 68 | 69 | // Log P2P topology if available 70 | if len(deviceInfo.Topology) > 0 { 71 | t.Logf(" P2P Available:") 72 | for _, topo := range deviceInfo.Topology { 73 | t.Logf(" GPU%d - (BusID)%s - %p", topo.GPU, topo.BusID, topo.Link.PCIPaths) 74 | } 75 | } else { 76 | t.Logf(" P2P Available: None") 77 | } 78 | 79 | // Basic assertions to ensure we got valid data 80 | if deviceInfo.UUID == "" { 81 | t.Errorf("Device %d has empty UUID", i) 82 | } 83 | if deviceInfo.Identifiers.Brand == "" { 84 | t.Errorf("Device %d has empty brand", i) 85 | } 86 | if deviceInfo.PCI.BusID == "" { 87 | t.Errorf("Device %d has 
empty bus ID", i) 88 | } 89 | } 90 | } 91 | 92 | // TestDeviceInfoWithConnection demonstrates connecting to a standalone hostengine 93 | func TestDeviceInfoWithConnection(t *testing.T) { 94 | // Skip this test if we're not testing with a specific connection 95 | if testing.Short() { 96 | t.Skip("Skipping connection test in short mode") 97 | } 98 | 99 | connectAddr := "localhost" 100 | isSocket := "0" 101 | 102 | cleanup, err := dcgm.Init(dcgm.Standalone, connectAddr, isSocket) 103 | if err != nil { 104 | t.Skipf("Failed to connect to standalone hostengine at %s: %v", connectAddr, err) 105 | } 106 | defer cleanup() 107 | 108 | count, err := dcgm.GetAllDeviceCount() 109 | if err != nil { 110 | t.Fatalf("Failed to get device count: %v", err) 111 | } 112 | 113 | t.Logf("Connected to standalone hostengine, found %d devices", count) 114 | 115 | // Just test first device if available 116 | if count > 0 { 117 | deviceInfo, err := dcgm.GetDeviceInfo(0) 118 | if err != nil { 119 | t.Errorf("Failed to get device info for GPU 0: %v", err) 120 | } else { 121 | t.Logf("First device UUID: %s", deviceInfo.UUID) 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /pkg/dcgm/field_values.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dcgm 18 | 19 | /* 20 | #include "dcgm_agent.h" 21 | #include "dcgm_structs.h" 22 | #include "field_values_cb.h" 23 | extern int go_dcgmFieldValueEntityEnumeration(dcgm_field_entity_group_t entityGroupId, 24 | dcgm_field_eid_t entityId, 25 | dcgmFieldValue_v1 *values, 26 | int numValues, 27 | void *userData); 28 | */ 29 | import "C" 30 | 31 | import ( 32 | "fmt" 33 | "sync" 34 | "time" 35 | "unsafe" 36 | ) 37 | 38 | type callback struct { 39 | mu sync.Mutex 40 | Values []FieldValue_v2 41 | } 42 | 43 | func (cb *callback) processValues(entityGroup Field_Entity_Group, entityID uint, cvalues []C.dcgmFieldValue_v1) { 44 | values := dcgmFieldValue_v1ToFieldValue_v2(entityGroup, entityID, cvalues) 45 | 46 | cb.mu.Lock() 47 | cb.Values = append(cb.Values, values...) 48 | cb.mu.Unlock() 49 | } 50 | 51 | //export go_dcgmFieldValueEntityEnumeration 52 | func go_dcgmFieldValueEntityEnumeration( 53 | entityGroup C.dcgm_field_entity_group_t, 54 | entityID C.dcgm_field_eid_t, 55 | values *C.dcgmFieldValue_v1, 56 | numValues C.int, 57 | userData unsafe.Pointer, 58 | ) C.int { 59 | ptrValues := unsafe.Pointer(values) 60 | if ptrValues != nil { 61 | valuesSlice := (*[1 << 30]C.dcgmFieldValue_v1)(ptrValues)[0:numValues] 62 | 63 | if userData != nil { 64 | processor := (*callback)(userData) 65 | processor.processValues(Field_Entity_Group(entityGroup), uint(entityID), valuesSlice) 66 | } 67 | } 68 | return 0 69 | } 70 | 71 | // GetValuesSince reads and returns field values for a specified group of entities, such as GPUs, 72 | // that have been updated since a given timestamp. It allows for targeted data retrieval based on time criteria. 73 | // 74 | // GPUGroup is a GroupHandle that identifies the group of entities to operate on. It can be obtained from CreateGroup 75 | // for a specific group of GPUs or use GroupAllGPUs() to target all GPUs. 
76 | // 77 | // fieldGroup is a FieldHandle representing the group of fields for which data is requested. 78 | // 79 | // sinceTime is a time.Time value representing the timestamp from which to request updated values. 80 | // A zero value (time.Time{}) requests all available data. 81 | // 82 | // Returns []FieldValue_v2 slice containing the requested field values, a time.Time indicating the time 83 | // of the latest data retrieval, and an error if there is any issue during the operation. 84 | func GetValuesSince(gpuGroup GroupHandle, fieldGroup FieldHandle, sinceTime time.Time) ([]FieldValue_v2, time.Time, error) { 85 | var nextSinceTimestamp C.longlong 86 | cbResult := &callback{} 87 | result := C.dcgmGetValuesSince_v2(handle.handle, 88 | gpuGroup.handle, 89 | fieldGroup.handle, 90 | C.longlong(sinceTime.UnixMicro()), 91 | &nextSinceTimestamp, 92 | C.dcgmFieldValueEnumeration_f(C.fieldValueEntityCallback), 93 | unsafe.Pointer(cbResult)) 94 | if result != C.DCGM_ST_OK { 95 | return nil, time.Time{}, fmt.Errorf("dcgmGetValuesSince_v2 failed with error code %d", int(result)) 96 | } 97 | 98 | return cbResult.Values, timestampUSECToTime(int64(nextSinceTimestamp)), nil 99 | } 100 | 101 | func timestampUSECToTime(timestampUSEC int64) time.Time { 102 | // Convert microseconds to seconds and nanoseconds 103 | sec := timestampUSEC / 1000000 // Convert microseconds to seconds 104 | nsec := (timestampUSEC % 1000000) * 1000 // Convert the remaining microseconds to nanoseconds 105 | // Use time.Unix to get a time.Time object 106 | return time.Unix(sec, nsec) 107 | } 108 | -------------------------------------------------------------------------------- /samples/restApi/README.md: -------------------------------------------------------------------------------- 1 | # DCGM REST API 2 | 3 | A sample REST API is provided, demonstrating various endpoints for getting GPU metrics via DCGM. 
4 | 5 | ```bash 6 | # Start the http server 7 | # By default the http server is started at localhost:8070 8 | 9 | $ go build && ./restApi 10 | 11 | # Query GPU 0 info 12 | $ GPUID=0 13 | $ curl localhost:8070/dcgm/device/info/id/$GPUID 14 | 15 | # sample output 16 | 17 | Driver Version : 384.130 18 | GPU : 0 19 | DCGMSupported : Yes 20 | UUID : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51 21 | Brand : GeForce 22 | Model : GeForce GTX 980 23 | Serial Number : 0324414056639 24 | Vbios : 84.04.1F.00.02 25 | InforomImage Version : G001.0000.01.03 26 | Bus ID : 00000000:01:00.0 27 | BAR1 (MB) : 256 28 | FrameBuffer Memory (MB): 4036 29 | Bandwidth (MB/s) : 15760 30 | Cores (MHz) : 1392 31 | Memory (MHz) : 3505 32 | Power (W) : 180 33 | CPUAffinity : 0-11 34 | P2P Available : None 35 | --------------------------------------------------------------------- 36 | 37 | $ curl localhost:8070/dcgm/device/info/id/$GPUID/json 38 | 39 | # Query GPU info using its UUID 40 | 41 | $ UUID=$(curl -s localhost:8070/dcgm/device/info/id/$GPUID | grep -i uuid | cut -d ":" -f2 ) 42 | $ curl localhost:8070/dcgm/device/info/uuid/$UUID 43 | $ curl localhost:8070/dcgm/device/info/uuid/$UUID/json 44 | 45 | # sample output 46 | 47 | {"GPU":0,"DCGMSupported":"Yes","UUID":"GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51","Power":180,"PCI":{"BusID":"00000000:01:00.0","BAR1":256,"FBTotal":4036,"Bandwidth":15760},"Clocks":{"Cores":1392,"Memory":3505},"Identifiers":{"Brand":"GeForce","Model":"GeForce GTX 980","Serial":"0324414056639","Vbios":"84.04.1F.00.02","InforomImageVersion":"G001.0000.01.03","DriverVersion":"384.130"},"Topology":null,"CPUAffinity":"0-11"} 48 | 49 | # Query GPU status 50 | 51 | $ curl localhost:8070/dcgm/device/status/id/$GPUID 52 | $ curl localhost:8070/dcgm/device/status/id/$GPUID/json 53 | 54 | # sample output 55 | 56 | Power (W) : 20.985 57 | Temperature (°C) : 47 58 | Sm Utilization (%) : 2 59 | Memory Utilization (%) : 8 60 | Encoder Utilization (%) : 0 61 | Decoder Utilization (%) : 0 
62 | Memory Clock (MHz : 324 63 | SM Clock (MHz) : 135 64 | 65 | $ curl localhost:8070/dcgm/device/status/uuid/$UUID 66 | 67 | # sample output 68 | 69 | {"Power":20.793,"Temperature":43,"Utilization":{"GPU":0,"Memory":8,"Encoder":0,"Decoder":0},"Memory":{"GlobalUsed":null,"ECCErrors":{"SingleBit":9223372036854775794,"DoubleBit":9223372036854775794}},"Clocks":{"Cores":135,"Memory":324},"PCI":{"BAR1Used":9,"Throughput":{"Rx":129,"Tx":47,"Replays":0},"FBUsed":423},"Performance":8,"FanSpeed":29} 70 | 71 | $ curl localhost:8070/dcgm/device/status/uuid/$UUID/json 72 | 73 | # Query GPU process info 74 | 75 | # Run CUDA nbody sample and get its PID 76 | # NOTE: The "WatchPidFields()" function must be initially called (as root) BEFORE starting the process to be monitored: 77 | # 1. Run as root, for enabling health watches 78 | $ sudo dcgmi stats -e 79 | # 2. Start process to be monitored 80 | $ nbody -benchmark -numbodies=1000192 81 | # 3. Start restApi 82 | $ go build && ./restApi 83 | $ PID=$(pgrep nbody) 84 | 85 | $ curl localhost:8070/dcgm/process/info/pid/$PID 86 | $ curl localhost:8070/dcgm/process/info/pid/$PID/json 87 | 88 | # sample output 89 | 90 | {"GPU":0,"PID":19132,"Name":"nbody","ProcessUtilization":{"StartTime":1529980640,"EndTime":0,"EnergyConsumed":1346,"SmUtil":0,"MemUtil":0},"PCI":{"BAR1Used":null,"Throughput":{"Rx":null,"Tx":null,"Replays":0},"FBUsed":null},"Memory":{"GlobalUsed":84279296,"ECCErrors":{"SingleBit":0,"DoubleBit":0}},"GpuUtilization":{"GPU":null,"Memory":null,"Encoder":null,"Decoder":null},"Clocks":{"Cores":null,"Memory":null},"Violations":{"Power":0,"Thermal":0,"Reliability":0,"BoardLimit":0,"LowUtilization":0,"SyncBoost":0},"XIDErrors":{"NumErrors":0,"TimeStamp":[]}} 91 | 92 | # Query GPU health 93 | 94 | $ curl localhost:8070/dcgm/health/id/$GPUID 95 | $ curl localhost:8070/dcgm/health/id/$GPUID/json 96 | $ curl localhost:8070/dcgm/health/uuid/$UUID 97 | $ curl localhost:8070/dcgm/health/uuid/$UUID/json 98 | 99 | # sample output 100 | 
101 | {"GPU":0,"Status":"Healthy","Watches":[]} 102 | 103 | # Query DCGM hostengine memory and CPU usage 104 | 105 | $ curl localhost:8070/dcgm/status 106 | $ curl localhost:8070/dcgm/status/json 107 | 108 | # sample output 109 | 110 | {"Memory":18380,"CPU":0.16482222745467387} 111 | ``` 112 | -------------------------------------------------------------------------------- /tests/health_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 8 | ) 9 | 10 | // TestHealthCheck demonstrates GPU health checking functionality 11 | // This is equivalent to the health sample but runs for a limited time 12 | func TestHealthCheck(t *testing.T) { 13 | cleanup, err := dcgm.Init(dcgm.Embedded) 14 | if err != nil { 15 | t.Fatalf("Failed to initialize DCGM: %v", err) 16 | } 17 | defer cleanup() 18 | 19 | gpus, err := dcgm.GetSupportedDevices() 20 | if err != nil { 21 | t.Fatalf("Failed to get supported devices: %v", err) 22 | } 23 | 24 | if len(gpus) == 0 { 25 | t.Skip("No supported GPUs found for health checking") 26 | } 27 | 28 | // Monitor health for a few seconds instead of indefinitely 29 | ticker := time.NewTicker(time.Second * 2) 30 | defer ticker.Stop() 31 | 32 | timeout := time.After(6 * time.Second) 33 | checkCount := 0 34 | 35 | for { 36 | select { 37 | case <-ticker.C: 38 | for _, gpu := range gpus { 39 | h, err := dcgm.HealthCheckByGpuId(gpu) 40 | if err != nil { 41 | t.Errorf("Failed to get health status for GPU %d: %v", gpu, err) 42 | continue 43 | } 44 | 45 | t.Logf("GPU %d Health Check:", gpu) 46 | t.Logf(" Status: %s", h.Status) 47 | 48 | for _, watch := range h.Watches { 49 | t.Logf(" Watch Type: %s", watch.Type) 50 | t.Logf(" Watch Status: %s", watch.Status) 51 | if watch.Error != "" { 52 | t.Logf(" Watch Error: %s", watch.Error) 53 | } 54 | } 55 | 56 | // Basic validation 57 | if h.Status == "" { 58 | t.Errorf("GPU %d has 
empty health status", gpu) 59 | } 60 | } 61 | checkCount++ 62 | 63 | case <-timeout: 64 | t.Logf("Health monitoring completed after %d checks", checkCount) 65 | return 66 | } 67 | } 68 | } 69 | 70 | // TestHealthCheckSingle demonstrates a single health check 71 | func TestHealthCheckSingle(t *testing.T) { 72 | cleanup, err := dcgm.Init(dcgm.Embedded) 73 | if err != nil { 74 | t.Fatalf("Failed to initialize DCGM: %v", err) 75 | } 76 | defer cleanup() 77 | 78 | gpus, err := dcgm.GetSupportedDevices() 79 | if err != nil { 80 | t.Fatalf("Failed to get supported devices: %v", err) 81 | } 82 | 83 | if len(gpus) == 0 { 84 | t.Skip("No supported GPUs found") 85 | } 86 | 87 | // Test first GPU 88 | gpu := gpus[0] 89 | h, err := dcgm.HealthCheckByGpuId(gpu) 90 | if err != nil { 91 | t.Fatalf("Failed to get health status for GPU %d: %v", gpu, err) 92 | } 93 | 94 | t.Logf("GPU %d Health Status: %s", gpu, h.Status) 95 | 96 | if len(h.Watches) == 0 { 97 | t.Logf("No health watches configured for GPU %d", gpu) 98 | } else { 99 | t.Logf("Health watches for GPU %d:", gpu) 100 | for i, watch := range h.Watches { 101 | t.Logf(" Watch %d:", i+1) 102 | t.Logf(" Type: %s", watch.Type) 103 | t.Logf(" Status: %s", watch.Status) 104 | if watch.Error != "" { 105 | t.Logf(" Error: %s", watch.Error) 106 | } 107 | } 108 | } 109 | 110 | // Basic assertions 111 | if h.Status == "" { 112 | t.Error("Health status is empty") 113 | } 114 | } 115 | 116 | // TestHealthCheckAllGPUs demonstrates health checking for all GPUs 117 | func TestHealthCheckAllGPUs(t *testing.T) { 118 | cleanup, err := dcgm.Init(dcgm.Embedded) 119 | if err != nil { 120 | t.Fatalf("Failed to initialize DCGM: %v", err) 121 | } 122 | defer cleanup() 123 | 124 | gpus, err := dcgm.GetSupportedDevices() 125 | if err != nil { 126 | t.Fatalf("Failed to get supported devices: %v", err) 127 | } 128 | 129 | if len(gpus) == 0 { 130 | t.Skip("No supported GPUs found") 131 | } 132 | 133 | healthyGPUs := 0 134 | unhealthyGPUs := 0 135 | 136 | 
for _, gpu := range gpus { 137 | h, err := dcgm.HealthCheckByGpuId(gpu) 138 | if err != nil { 139 | t.Errorf("Failed to get health status for GPU %d: %v", gpu, err) 140 | continue 141 | } 142 | 143 | t.Logf("GPU %d: %s", gpu, h.Status) 144 | 145 | // Count healthy vs unhealthy 146 | if h.Status == "Healthy" || h.Status == "OK" { 147 | healthyGPUs++ 148 | } else { 149 | unhealthyGPUs++ 150 | t.Logf("GPU %d is not healthy: %s", gpu, h.Status) 151 | 152 | // Log any watch errors 153 | for _, watch := range h.Watches { 154 | if watch.Error != "" { 155 | t.Logf(" Watch %s error: %s", watch.Type, watch.Error) 156 | } 157 | } 158 | } 159 | } 160 | 161 | t.Logf("Health summary: %d healthy, %d unhealthy GPUs", healthyGPUs, unhealthyGPUs) 162 | 163 | // We expect at least some GPUs to be available 164 | if healthyGPUs == 0 && unhealthyGPUs == 0 { 165 | t.Error("No GPU health status could be determined") 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /pkg/dcgm/internal.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | // Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM) 18 | package dcgm 19 | 20 | /* 21 | #cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files 22 | #cgo darwin LDFLAGS: -ldl -Wl,--export-dynamic -Wl,-undefined,dynamic_lookup 23 | 24 | #include "dcgm_test_apis.h" 25 | #include "dcgm_test_structs.h" 26 | #include "dcgm_structs_internal.h" 27 | */ 28 | import "C" 29 | 30 | import ( 31 | "unsafe" 32 | ) 33 | 34 | // MigHierarchyInfo represents the Multi-Instance GPU (MIG) hierarchy information 35 | // for a GPU entity and its relationship to other entities 36 | type MigHierarchyInfo struct { 37 | // Entity represents the current GPU entity in the hierarchy 38 | Entity GroupEntityPair 39 | // Parent represents the parent GPU entity in the hierarchy 40 | Parent GroupEntityPair 41 | // SliceProfile defines the MIG profile configuration for this entity 42 | SliceProfile MigProfile 43 | } 44 | 45 | // CreateFakeEntities creates test entities with the specified MIG hierarchy information. 46 | // This function is intended for testing purposes only. 47 | // Returns a slice of Entity IDs for the created entities and any error encountered. 
48 | func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error) { 49 | ccfe := C.dcgmCreateFakeEntities_v2{ 50 | version: C.dcgmCreateFakeEntities_version2, 51 | numToCreate: C.uint(len(entities)), 52 | entityList: [C.DCGM_MAX_HIERARCHY_INFO]C.dcgmMigHierarchyInfo_t{}, 53 | } 54 | 55 | for i := range entities { 56 | if i >= C.DCGM_MAX_HIERARCHY_INFO { 57 | break 58 | } 59 | entity := entities[i] 60 | ccfe.entityList[i] = C.dcgmMigHierarchyInfo_t{ 61 | entity: C.dcgmGroupEntityPair_t{ 62 | entityGroupId: C.dcgm_field_entity_group_t(entity.Entity.EntityGroupId), 63 | entityId: C.uint(entity.Entity.EntityId), 64 | }, 65 | parent: C.dcgmGroupEntityPair_t{ 66 | entityGroupId: C.dcgm_field_entity_group_t(entity.Parent.EntityGroupId), 67 | entityId: C.uint(entity.Parent.EntityId), 68 | }, 69 | sliceProfile: C.dcgmMigProfile_t(entity.SliceProfile), 70 | } 71 | } 72 | result := C.dcgmCreateFakeEntities(handle.handle, &ccfe) 73 | 74 | if err := errorString(result); err != nil { 75 | return nil, &Error{msg: C.GoString(C.errorString(result)), Code: result} 76 | } 77 | entityIDs := make([]uint, ccfe.numToCreate) 78 | for i := 0; i < int(ccfe.numToCreate); i++ { 79 | entityIDs[i] = uint(ccfe.entityList[i].entity.entityId) 80 | } 81 | 82 | return entityIDs, nil 83 | } 84 | 85 | // InjectFieldValue injects a test value for a specific field into DCGM's field manager. 86 | // This function is intended for testing purposes only. 
87 | // 88 | // Parameters: 89 | // - gpu: The GPU ID to inject the field value for 90 | // - fieldID: The DCGM field identifier 91 | // - fieldType: The type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE) 92 | // - status: The status code for the field 93 | // - ts: The timestamp for the field value 94 | // - value: The value to inject (must match fieldType) 95 | // 96 | // Returns an error if the injection fails 97 | func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error { 98 | field := C.dcgmInjectFieldValue_t{ 99 | version: C.dcgmInjectFieldValue_version1, 100 | fieldId: C.ushort(fieldID), 101 | fieldType: C.ushort(fieldType), 102 | status: C.int(status), 103 | ts: C.long(ts), 104 | } 105 | 106 | switch fieldType { 107 | case DCGM_FT_INT64: 108 | i64Val := value.(int64) 109 | ptr := (*C.int64_t)(unsafe.Pointer(&field.value[0])) 110 | *ptr = C.int64_t(i64Val) 111 | case DCGM_FT_DOUBLE: 112 | dbVal := value.(float64) 113 | ptr := (*C.double)(unsafe.Pointer(&field.value[0])) 114 | *ptr = C.double(dbVal) 115 | } 116 | 117 | result := C.dcgmInjectFieldValue(handle.handle, C.uint(gpu), &field) 118 | 119 | if err := errorString(result); err != nil { 120 | return &Error{msg: C.GoString(C.errorString(result)), Code: result} 121 | } 122 | 123 | return nil 124 | } 125 | -------------------------------------------------------------------------------- /tests/hostengine_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 7 | ) 8 | 9 | // TestHostEngineStatus demonstrates DCGM host engine introspection 10 | // This is equivalent to the hostengineStatus sample 11 | func TestHostEngineStatus(t *testing.T) { 12 | cleanup, err := dcgm.Init(dcgm.Embedded) 13 | if err != nil { 14 | t.Fatalf("Failed to initialize DCGM: %v", err) 15 | } 16 | defer cleanup() 17 | 18 | st, err := dcgm.Introspect() 19 | if 
err != nil { 20 | t.Fatalf("Failed to introspect host engine: %v", err) 21 | } 22 | 23 | t.Logf("Host Engine Status:") 24 | t.Logf(" Memory: %v KB", st.Memory) 25 | t.Logf(" CPU: %.2f%%", st.CPU) 26 | 27 | // Basic validation 28 | if st.Memory < 0 { 29 | t.Error("Memory usage cannot be negative") 30 | } 31 | if st.CPU < 0 || st.CPU > 100 { 32 | t.Errorf("CPU usage out of expected range: %.2f%%", st.CPU) 33 | } 34 | 35 | // Log some insights 36 | if st.Memory > 100000 { // > 100MB 37 | t.Logf("Host engine is using significant memory: %v KB", st.Memory) 38 | } 39 | if st.CPU > 50 { 40 | t.Logf("Host engine is using significant CPU: %.2f%%", st.CPU) 41 | } 42 | } 43 | 44 | // TestHostEngineStatusMultipleSamples demonstrates taking multiple introspection samples 45 | func TestHostEngineStatusMultipleSamples(t *testing.T) { 46 | if testing.Short() { 47 | t.Skip("Skipping multiple samples test in short mode") 48 | } 49 | 50 | cleanup, err := dcgm.Init(dcgm.Embedded) 51 | if err != nil { 52 | t.Fatalf("Failed to initialize DCGM: %v", err) 53 | } 54 | defer cleanup() 55 | 56 | samples := 3 57 | memoryUsages := make([]int64, 0, samples) 58 | cpuUsages := make([]float64, 0, samples) 59 | 60 | for i := 0; i < samples; i++ { 61 | st, err := dcgm.Introspect() 62 | if err != nil { 63 | t.Errorf("Failed to introspect host engine sample %d: %v", i+1, err) 64 | continue 65 | } 66 | 67 | memoryUsages = append(memoryUsages, st.Memory) 68 | cpuUsages = append(cpuUsages, st.CPU) 69 | 70 | t.Logf("Sample %d - Memory: %v KB, CPU: %.2f%%", i+1, st.Memory, st.CPU) 71 | } 72 | 73 | if len(memoryUsages) > 1 { 74 | // Check for significant memory changes 75 | minMem := memoryUsages[0] 76 | maxMem := memoryUsages[0] 77 | 78 | for _, mem := range memoryUsages[1:] { 79 | if mem < minMem { 80 | minMem = mem 81 | } 82 | if mem > maxMem { 83 | maxMem = mem 84 | } 85 | } 86 | 87 | if maxMem-minMem > 1000 { // More than 1MB difference 88 | t.Logf("Memory usage varied significantly: %v KB to %v KB", 
minMem, maxMem) 89 | } else { 90 | t.Logf("Memory usage remained stable around %v KB", memoryUsages[0]) 91 | } 92 | } 93 | 94 | if len(cpuUsages) > 1 { 95 | // Check for significant CPU changes 96 | minCPU := cpuUsages[0] 97 | maxCPU := cpuUsages[0] 98 | 99 | for _, cpu := range cpuUsages[1:] { 100 | if cpu < minCPU { 101 | minCPU = cpu 102 | } 103 | if cpu > maxCPU { 104 | maxCPU = cpu 105 | } 106 | } 107 | 108 | if maxCPU-minCPU > 10 { // More than 10% difference 109 | t.Logf("CPU usage varied significantly: %.2f%% to %.2f%%", minCPU, maxCPU) 110 | } else { 111 | t.Logf("CPU usage remained stable around %.2f%%", cpuUsages[0]) 112 | } 113 | } 114 | } 115 | 116 | // TestHostEngineStatusWithLoad demonstrates introspection while performing operations 117 | func TestHostEngineStatusWithLoad(t *testing.T) { 118 | cleanup, err := dcgm.Init(dcgm.Embedded) 119 | if err != nil { 120 | t.Fatalf("Failed to initialize DCGM: %v", err) 121 | } 122 | defer cleanup() 123 | 124 | // Get baseline status 125 | baselineSt, err := dcgm.Introspect() 126 | if err != nil { 127 | t.Fatalf("Failed to get baseline introspection: %v", err) 128 | } 129 | 130 | t.Logf("Baseline - Memory: %v KB, CPU: %.2f%%", baselineSt.Memory, baselineSt.CPU) 131 | 132 | // Perform some operations to potentially increase load 133 | gpus, err := dcgm.GetSupportedDevices() 134 | if err != nil { 135 | t.Logf("Failed to get supported devices: %v", err) 136 | } else { 137 | // Get device info for all GPUs 138 | for _, gpu := range gpus { 139 | _, err = dcgm.GetDeviceInfo(gpu) 140 | if err != nil { 141 | t.Logf("Failed to get device info for GPU %d: %v", gpu, err) 142 | } 143 | } 144 | } 145 | 146 | // Get status after operations 147 | loadedSt, err := dcgm.Introspect() 148 | if err != nil { 149 | t.Fatalf("Failed to get loaded introspection: %v", err) 150 | } 151 | 152 | t.Logf("After load - Memory: %v KB, CPU: %.2f%%", loadedSt.Memory, loadedSt.CPU) 153 | 154 | // Compare baseline vs loaded 155 | memoryDiff := 
loadedSt.Memory - baselineSt.Memory 156 | cpuDiff := loadedSt.CPU - baselineSt.CPU 157 | 158 | t.Logf("Differences - Memory: %+d KB, CPU: %+.2f%%", memoryDiff, cpuDiff) 159 | 160 | // Basic checks 161 | if loadedSt.Memory == 0 { 162 | t.Error("Memory usage should not be zero") 163 | } 164 | if loadedSt.CPU < 0 { 165 | t.Error("CPU usage should not be negative") 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /pkg/dcgm/gpu_group.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "context" 11 | "encoding/binary" 12 | "fmt" 13 | ) 14 | 15 | // DCGM_GROUP_MAX_ENTITIES represents the maximum number of entities allowed in a group 16 | const ( 17 | DCGM_GROUP_MAX_ENTITIES int = C.DCGM_GROUP_MAX_ENTITIES_V2 18 | ) 19 | 20 | // GroupHandle represents a handle to a DCGM GPU group 21 | type GroupHandle struct{ handle C.dcgmGpuGrp_t } 22 | 23 | // SetHandle sets the internal group handle value 24 | func (g *GroupHandle) SetHandle(val uintptr) { 25 | g.handle = C.dcgmGpuGrp_t(val) 26 | } 27 | 28 | // GetHandle returns the internal group handle value 29 | func (g *GroupHandle) GetHandle() uintptr { 30 | return uintptr(g.handle) 31 | } 32 | 33 | // GroupAllGPUs returns a GroupHandle representing all GPUs in the system 34 | func GroupAllGPUs() GroupHandle { 35 | return GroupHandle{C.DCGM_GROUP_ALL_GPUS} 36 | } 37 | 38 | // CreateGroup creates a new empty GPU group with the specified name 39 | func CreateGroup(groupName string) (goGroupId GroupHandle, err error) { 40 | var cGroupID C.dcgmGpuGrp_t 41 | cname := C.CString(groupName) 42 | defer freeCString(cname) 43 | 44 | result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_EMPTY, cname, &cGroupID) 45 | if err = errorString(result); err != nil { 46 | return goGroupId, fmt.Errorf("error creating group: %s", err) 47 | 
} 48 | 49 | goGroupId = GroupHandle{cGroupID} 50 | return 51 | } 52 | 53 | // NewDefaultGroup creates a new group with default GPUs and the specified name 54 | func NewDefaultGroup(groupName string) (GroupHandle, error) { 55 | var cGroupID C.dcgmGpuGrp_t 56 | 57 | cname := C.CString(groupName) 58 | defer freeCString(cname) 59 | 60 | result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_DEFAULT, cname, &cGroupID) 61 | if err := errorString(result); err != nil { 62 | return GroupHandle{}, fmt.Errorf("error creating group: %s", err) 63 | } 64 | 65 | return GroupHandle{cGroupID}, nil 66 | } 67 | 68 | // AddToGroup adds a GPU to an existing group 69 | func AddToGroup(groupID GroupHandle, gpuID uint) (err error) { 70 | result := C.dcgmGroupAddDevice(handle.handle, groupID.handle, C.uint(gpuID)) 71 | if err = errorString(result); err != nil { 72 | return fmt.Errorf("error adding GPU %v to group: %s", gpuID, err) 73 | } 74 | 75 | return 76 | } 77 | 78 | // AddLinkEntityToGroup adds a link entity to the group 79 | func AddLinkEntityToGroup(groupID GroupHandle, index uint, entityGroupID Field_Entity_Group, parentID uint) (err error) { 80 | /* Only supported on little-endian systems currently */ 81 | slice := make([]byte, 4) 82 | slice[0] = uint8(entityGroupID) 83 | binary.LittleEndian.PutUint16(slice[1:3], uint16(index)) 84 | slice[3] = uint8(parentID) 85 | 86 | entityId := binary.LittleEndian.Uint32(slice) 87 | 88 | return AddEntityToGroup(groupID, FE_LINK, uint(entityId)) 89 | } 90 | 91 | // AddEntityToGroup adds an entity to an existing group 92 | func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error) { 93 | result := C.dcgmGroupAddEntity(handle.handle, groupID.handle, C.dcgm_field_entity_group_t(entityGroupID), 94 | C.uint(entityID)) 95 | if err = errorString(result); err != nil { 96 | return fmt.Errorf("error adding entity group type %v, entity %v to group: %s", entityGroupID, entityID, err) 97 | } 98 | 99 | return 100 
| } 101 | 102 | // DestroyGroup destroys an existing GPU group 103 | func DestroyGroup(groupID GroupHandle) (err error) { 104 | result := C.dcgmGroupDestroy(handle.handle, groupID.handle) 105 | if err = errorString(result); err != nil { 106 | return fmt.Errorf("error destroying group: %s", err) 107 | } 108 | 109 | return 110 | } 111 | 112 | // GroupInfo contains information about a DCGM group 113 | type GroupInfo struct { 114 | Version uint32 115 | GroupName string 116 | EntityList []GroupEntityPair 117 | } 118 | 119 | // GetGroupInfo retrieves information about a DCGM group 120 | func GetGroupInfo(groupID GroupHandle) (*GroupInfo, error) { 121 | response := C.dcgmGroupInfo_v3{ 122 | version: C.dcgmGroupInfo_version3, 123 | } 124 | 125 | result := C.dcgmGroupGetInfo(handle.handle, groupID.handle, &response) 126 | if err := errorString(result); err != nil { 127 | return nil, err 128 | } 129 | 130 | ret := GroupInfo{ 131 | Version: uint32(response.version), 132 | GroupName: C.GoString(&response.groupName[0]), 133 | EntityList: make([]GroupEntityPair, response.count), 134 | } 135 | 136 | for i := 0; i < int(response.count); i++ { 137 | ret.EntityList[i].EntityId = uint(response.entityList[i].entityId) 138 | ret.EntityList[i].EntityGroupId = Field_Entity_Group(response.entityList[i].entityGroupId) 139 | } 140 | 141 | return &ret, nil 142 | } 143 | 144 | // CreateGroupWithContext creates a new group with a context 145 | func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error) { 146 | select { 147 | case <-ctx.Done(): 148 | return GroupHandle{}, ctx.Err() 149 | default: 150 | return CreateGroup(groupName) 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /tests/diag_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | 7 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 8 | 
"github.com/stretchr/testify/assert" 9 | ) 10 | 11 | // TestDiagnostics demonstrates running DCGM diagnostics 12 | // This is equivalent to the diag sample 13 | func TestDiagnostics(t *testing.T) { 14 | cleanup, err := dcgm.Init(dcgm.Embedded) 15 | if err != nil { 16 | t.Fatalf("Failed to initialize DCGM: %v", err) 17 | } 18 | defer cleanup() 19 | 20 | // Run quick diagnostics on all GPUs 21 | dr, err := dcgm.RunDiag(dcgm.DiagQuick, dcgm.GroupAllGPUs()) 22 | if err != nil { 23 | t.Fatalf("Failed to run diagnostics: %v", err) 24 | } 25 | 26 | // Log software test results 27 | t.Logf("Software Tests:") 28 | for _, test := range dr.Software { 29 | t.Logf(" %-50s %s\t%s", test.TestName, test.Status, test.TestOutput) 30 | } 31 | 32 | // Basic validation - we should have some results 33 | if len(dr.Software) == 0 { 34 | t.Error("No diagnostic results returned") 35 | } 36 | 37 | // Check for any failed tests 38 | failedTests := 0 39 | for _, test := range dr.Software { 40 | if test.Status == "fail" { 41 | failedTests++ 42 | t.Logf("Software test failed: %s - %s", test.TestName, test.TestOutput) 43 | } 44 | } 45 | 46 | if failedTests > 0 { 47 | t.Logf("Total failed tests: %d", failedTests) 48 | } else { 49 | t.Log("All diagnostic tests passed") 50 | } 51 | } 52 | 53 | // TestDiagnosticsLong demonstrates running longer diagnostics 54 | func TestDiagnosticsLong(t *testing.T) { 55 | if testing.Short() { 56 | t.Skip("Skipping long diagnostics test in short mode") 57 | } 58 | 59 | cleanup, err := dcgm.Init(dcgm.Embedded) 60 | if err != nil { 61 | t.Fatalf("Failed to initialize DCGM: %v", err) 62 | } 63 | defer cleanup() 64 | 65 | // Get supported devices first 66 | gpus, err := dcgm.GetSupportedDevices() 67 | if err != nil { 68 | t.Fatalf("Failed to get supported devices: %v", err) 69 | } 70 | 71 | if len(gpus) == 0 { 72 | t.Skip("No supported GPUs found for diagnostics") 73 | } 74 | 75 | // Run diagnostics on first GPU only for time efficiency 76 | group, err := 
dcgm.CreateGroup("test-group") 77 | if err != nil { 78 | t.Fatalf("Failed to create group: %v", err) 79 | } 80 | defer func() { 81 | if err = dcgm.DestroyGroup(group); err != nil { 82 | t.Logf("Failed to destroy group: %v", err) 83 | } 84 | }() 85 | 86 | err = dcgm.AddToGroup(group, gpus[0]) 87 | if err != nil { 88 | t.Fatalf("Failed to add GPU to group: %v", err) 89 | } 90 | 91 | // Run medium-level diagnostics 92 | dr, err := dcgm.RunDiag(dcgm.DiagMedium, group) 93 | if err != nil { 94 | t.Fatalf("Failed to run medium diagnostics: %v", err) 95 | } 96 | 97 | t.Logf("Medium diagnostics completed for GPU %d", gpus[0]) 98 | 99 | // Log results 100 | for _, test := range dr.Software { 101 | t.Logf(" %s: %s", test.TestName, test.Status) 102 | } 103 | } 104 | 105 | // TestDiagTestNameFormat validates that TestName field contains category names, 106 | // not detailed test descriptions (issue #97) 107 | func TestDiagTestNameFormat(t *testing.T) { 108 | cleanup, err := dcgm.Init(dcgm.Embedded) 109 | if err != nil { 110 | t.Fatalf("Failed to initialize DCGM: %v", err) 111 | } 112 | defer cleanup() 113 | 114 | dr, err := dcgm.RunDiag(dcgm.DiagQuick, dcgm.GroupAllGPUs()) 115 | if err != nil { 116 | t.Fatalf("Failed to run diagnostics: %v", err) 117 | } 118 | 119 | assert.NotEmpty(t, dr.Software, "diagnostic results should not be empty") 120 | 121 | // Valid test category names that should appear (lowercase) 122 | validTestNames := []string{ 123 | "software", 124 | "memory", 125 | "pcie", 126 | "diagnostic", 127 | "sm stress", 128 | "targeted stress", 129 | "targeted power", 130 | "memory bandwidth", 131 | "memtest", 132 | "pulse", 133 | "eud", 134 | "context create", 135 | } 136 | 137 | // Invalid strings that should NOT appear in TestName 138 | // These are detailed descriptions that were incorrectly returned before fix 139 | invalidPatterns := []string{ 140 | "presence of drivers on the denylist", 141 | "(e.g. 
nouveau)", 142 | "Allocated", 143 | "bytes", 144 | "presence (and version)", 145 | } 146 | 147 | for i, test := range dr.Software { 148 | t.Logf("Result %d: TestName=%q, Status=%s", i, test.TestName, test.Status) 149 | 150 | // TestName should be one of the valid category names 151 | assert.Contains( 152 | t, 153 | validTestNames, 154 | test.TestName, 155 | "TestName should be a category name like 'software', 'memory', 'pcie', got: %q", 156 | test.TestName, 157 | ) 158 | 159 | // TestName should NOT contain detailed descriptions 160 | for _, invalid := range invalidPatterns { 161 | assert.NotContains( 162 | t, 163 | test.TestName, 164 | invalid, 165 | "TestName should not contain detailed descriptions, got: %q", 166 | test.TestName, 167 | ) 168 | } 169 | 170 | // TestName should be lowercase 171 | assert.Equal( 172 | t, 173 | strings.ToLower(test.TestName), 174 | test.TestName, 175 | "TestName should be lowercase, got: %q", 176 | test.TestName, 177 | ) 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /pkg/dcgm/fields_test.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | import ( 4 | crand "crypto/rand" 5 | "fmt" 6 | "math/big" 7 | "runtime" 8 | "testing" 9 | "time" 10 | 11 | "github.com/stretchr/testify/assert" 12 | "github.com/stretchr/testify/require" 13 | ) 14 | 15 | func TestFieldHandle(t *testing.T) { 16 | fh := FieldHandle{} 17 | assert.Equal(t, uintptr(0), fh.GetHandle(), "value mismatch") 18 | 19 | inputs := []uintptr{1000, 0, 1, 10, 11, 50, 100, 1939902, 9992932938239, 999999999999999999} 20 | 21 | for _, input := range inputs { 22 | fh.SetHandle(input) 23 | assert.Equal(t, input, fh.GetHandle(), "values mismatch") 24 | } 25 | } 26 | 27 | func TestGetLatestValuesForFields(t *testing.T) { 28 | teardownTest := setupTest(t) 29 | defer teardownTest(t) 30 | 31 | runOnlyWithLiveGPUs(t) 32 | 33 | // Setup test GPU 34 | gpus, err := 
withInjectionGPUs(t, 1) 35 | require.NoError(t, err) 36 | gpuId := gpus[0] 37 | 38 | // Setup test group 39 | groupId, err := NewDefaultGroup("mygroup") 40 | require.NoError(t, err) 41 | defer func() { 42 | destroyGroupErr := DestroyGroup(groupId) 43 | require.NoError(t, destroyGroupErr) 44 | }() 45 | 46 | // Setup field group 47 | fieldId := DCGM_FI_DEV_XID_ERRORS 48 | n, err := crand.Int(crand.Reader, big.NewInt(1000000)) 49 | require.NoError(t, err) 50 | fieldGroupName := fmt.Sprintf("fieldGroupName%d", n.Int64()) 51 | fieldsGroup, err := FieldGroupCreate(fieldGroupName, []Short{fieldId}) 52 | require.NoError(t, err) 53 | defer func() { 54 | destroyFieldsGroupErr := FieldGroupDestroy(fieldsGroup) 55 | require.NoError(t, destroyFieldsGroupErr) 56 | }() 57 | 58 | // Inject test value 59 | err = InjectFieldValue(gpuId, 60 | DCGM_FI_DEV_XID_ERRORS, 61 | DCGM_FT_INT64, 62 | 0, 63 | time.Now().Add(-time.Duration(5)*time.Second).UnixMicro(), 64 | int64(10), 65 | ) 66 | require.NoError(t, err) 67 | 68 | // Setup field watching 69 | err = WatchFieldsWithGroupEx( 70 | fieldsGroup, 71 | groupId, 72 | defaultUpdateFreq, 73 | defaultMaxKeepAge, 74 | defaultMaxKeepSamples, 75 | ) 76 | require.NoError(t, err) 77 | 78 | err = UpdateAllFields() 79 | require.NoError(t, err) 80 | 81 | // Test 82 | values, err := GetLatestValuesForFields(gpuId, []Short{fieldId}) 83 | require.NoError(t, err) 84 | 85 | // Verify results 86 | assert.Len(t, values, 1) 87 | assert.NotEmpty(t, values[0].String()) 88 | assert.Equal(t, int64(10), values[0].Int64()) 89 | } 90 | 91 | func BenchmarkGetLatestValuesForFieldsVariousSize(b *testing.B) { 92 | teardownTest := setupTest(b) 93 | defer teardownTest(b) 94 | 95 | // Setup test GPU 96 | gpus, err := withInjectionGPUs(b, 1) 97 | require.NoError(b, err) 98 | gpuId := gpus[0] 99 | 100 | // Setup test group 101 | groupId, err := NewDefaultGroup("mygroup") 102 | require.NoError(b, err) 103 | defer func() { 104 | err := DestroyGroup(groupId) 105 | 
require.NoError(b, err) 106 | }() 107 | 108 | // Use the same fields as in the main benchmark 109 | allFieldIds := []Short{ 110 | DCGM_FI_DEV_XID_ERRORS, 111 | DCGM_FI_DEV_DIAG_MEMORY_RESULT, 112 | DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, 113 | DCGM_FI_DEV_GPU_TEMP, 114 | DCGM_FI_DEV_MEMORY_TEMP, 115 | DCGM_FI_DEV_GPU_UTIL, 116 | DCGM_FI_DEV_MEM_COPY_UTIL, 117 | DCGM_FI_DEV_ENC_UTIL, 118 | DCGM_FI_DEV_DEC_UTIL, 119 | DCGM_FI_DEV_FB_FREE, 120 | DCGM_FI_DEV_FB_USED, 121 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, 122 | DCGM_FI_DEV_SM_CLOCK, 123 | DCGM_FI_DEV_RETIRED_PENDING, 124 | DCGM_FI_DEV_RETIRED_SBE, 125 | DCGM_FI_DEV_RETIRED_DBE, 126 | DCGM_FI_DEV_POWER_VIOLATION, 127 | DCGM_FI_DEV_THERMAL_VIOLATION, 128 | } 129 | 130 | // Test different field counts 131 | fieldCounts := []int{1, 5, 10, len(allFieldIds)} 132 | 133 | for _, count := range fieldCounts { 134 | b.Run(fmt.Sprintf("Fields-%d", count), func(b *testing.B) { 135 | fieldIds := allFieldIds[:count] // Take first 'count' fields 136 | 137 | // Setup field group 138 | fieldGroupName := fmt.Sprintf("fieldGroup-%d", count) 139 | fieldsGroup, err := FieldGroupCreate(fieldGroupName, fieldIds) 140 | require.NoError(b, err) 141 | defer func() { 142 | destroyFieldsGroupErr := FieldGroupDestroy(fieldsGroup) 143 | require.NoError(b, destroyFieldsGroupErr) 144 | }() 145 | 146 | // Setup field watching 147 | err = WatchFieldsWithGroupEx( 148 | fieldsGroup, 149 | groupId, 150 | defaultUpdateFreq, 151 | defaultMaxKeepAge, 152 | defaultMaxKeepSamples, 153 | ) 154 | require.NoError(b, err) 155 | 156 | // Inject values for all fields 157 | for _, fieldId := range fieldIds { 158 | err = InjectFieldValue(gpuId, 159 | fieldId, 160 | DCGM_FT_INT64, 161 | 0, 162 | time.Now().Add(-time.Duration(5)*time.Second).UnixMicro(), 163 | int64(10), 164 | ) 165 | require.NoError(b, err) 166 | } 167 | 168 | err = UpdateAllFields() 169 | require.NoError(b, err) 170 | 171 | b.ResetTimer() 172 | b.ReportAllocs() 173 | 174 | for i := 0; i < b.N; i++ { 175 | 
// Field_Entity_Group identifies the kind of DCGM entity a value belongs to.
type Field_Entity_Group uint

const (
	// FE_NONE means no entity type.
	FE_NONE Field_Entity_Group = iota
	// FE_GPU is a GPU device.
	FE_GPU
	// FE_VGPU is a virtual GPU.
	FE_VGPU
	// FE_SWITCH is an NVSwitch.
	FE_SWITCH
	// FE_GPU_I is a GPU instance.
	FE_GPU_I
	// FE_GPU_CI is a GPU compute instance.
	FE_GPU_CI
	// FE_LINK is an NVLink.
	FE_LINK
	// FE_CPU is a CPU.
	FE_CPU
	// FE_CPU_CORE is a CPU core.
	FE_CPU_CORE
	// FE_COUNT is the number of entity types defined above.
	FE_COUNT
)

// String returns the display name of the entity group, or "unknown" for
// FE_NONE, FE_COUNT and any value outside the defined range.
func (e Field_Entity_Group) String() string {
	// Entries left empty (FE_NONE) have no display name.
	names := [FE_COUNT]string{
		FE_GPU:      "GPU",
		FE_VGPU:     "vGPU",
		FE_SWITCH:   "NvSwitch",
		FE_GPU_I:    "GPU Instance",
		FE_GPU_CI:   "GPU Compute Instance",
		FE_LINK:     "NvLink",
		FE_CPU:      "CPU",
		FE_CPU_CORE: "CPU Core",
	}

	if e < FE_COUNT && names[e] != "" {
		return names[e]
	}
	return "unknown"
}
GroupEntityPair struct { 65 | // EntityGroupId specifies the type of the entity 66 | EntityGroupId Field_Entity_Group 67 | // EntityId is the unique identifier for this entity 68 | EntityId uint 69 | } 70 | 71 | // MigEntityInfo contains information about a MIG entity 72 | type MigEntityInfo struct { 73 | // GpuUuid is the UUID of the parent GPU 74 | GpuUuid string 75 | // NvmlGpuIndex is the NVML index of the parent GPU 76 | NvmlGpuIndex uint 77 | // NvmlInstanceId is the NVML GPU instance ID 78 | NvmlInstanceId uint 79 | // NvmlComputeInstanceId is the NVML compute instance ID 80 | NvmlComputeInstanceId uint 81 | // NvmlMigProfileId is the NVML MIG profile ID 82 | NvmlMigProfileId uint 83 | // NvmlProfileSlices is the number of slices in the MIG profile 84 | NvmlProfileSlices uint 85 | } 86 | 87 | // MigHierarchyInfo_v2 represents version 2 of MIG hierarchy information 88 | type MigHierarchyInfo_v2 struct { 89 | // Entity contains the entity information 90 | Entity GroupEntityPair 91 | // Parent contains the parent entity information 92 | Parent GroupEntityPair 93 | // Info contains detailed MIG entity information 94 | Info MigEntityInfo 95 | } 96 | 97 | const ( 98 | // MAX_NUM_DEVICES represents the maximum number of GPU devices supported 99 | MAX_NUM_DEVICES = uint(C.DCGM_MAX_NUM_DEVICES) 100 | 101 | // MAX_HIERARCHY_INFO represents the maximum size of the MIG hierarchy information 102 | MAX_HIERARCHY_INFO = uint(C.DCGM_MAX_HIERARCHY_INFO) 103 | ) 104 | 105 | // MigHierarchy_v2 represents version 2 of the complete MIG hierarchy 106 | type MigHierarchy_v2 struct { 107 | // Version is the version number of the hierarchy structure 108 | Version uint 109 | // Count is the number of valid entries in EntityList 110 | Count uint 111 | // EntityList contains the MIG hierarchy information for each entity 112 | EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 113 | } 114 | 115 | // GetGPUInstanceHierarchy retrieves the complete MIG hierarchy information 116 | 
func GetGPUInstanceHierarchy() (hierarchy MigHierarchy_v2, err error) { 117 | var c_hierarchy C.dcgmMigHierarchy_v2 118 | c_hierarchy.version = C.dcgmMigHierarchy_version2 119 | ptr_hierarchy := (*C.dcgmMigHierarchy_v2)(unsafe.Pointer(&c_hierarchy)) 120 | result := C.dcgmGetGpuInstanceHierarchy(handle.handle, ptr_hierarchy) 121 | 122 | if err = errorString(result); err != nil { 123 | return toMigHierarchy(c_hierarchy), fmt.Errorf("error retrieving DCGM MIG hierarchy: %s", err) 124 | } 125 | 126 | return toMigHierarchy(c_hierarchy), nil 127 | } 128 | 129 | func toMigHierarchy(c_hierarchy C.dcgmMigHierarchy_v2) MigHierarchy_v2 { 130 | var hierarchy MigHierarchy_v2 131 | hierarchy.Version = uint(c_hierarchy.version) 132 | hierarchy.Count = uint(c_hierarchy.count) 133 | for i := uint(0); i < hierarchy.Count; i++ { 134 | hierarchy.EntityList[i] = MigHierarchyInfo_v2{ 135 | Entity: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].entity.entityGroupId), uint(c_hierarchy.entityList[i].entity.entityId)}, 136 | Parent: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].parent.entityGroupId), uint(c_hierarchy.entityList[i].parent.entityId)}, 137 | Info: MigEntityInfo{ 138 | GpuUuid: *stringPtr(&c_hierarchy.entityList[i].info.gpuUuid[0]), 139 | NvmlGpuIndex: uint(c_hierarchy.entityList[i].info.nvmlGpuIndex), 140 | NvmlInstanceId: uint(c_hierarchy.entityList[i].info.nvmlInstanceId), 141 | NvmlComputeInstanceId: uint(c_hierarchy.entityList[i].info.nvmlComputeInstanceId), 142 | NvmlMigProfileId: uint(c_hierarchy.entityList[i].info.nvmlMigProfileId), 143 | NvmlProfileSlices: uint(c_hierarchy.entityList[i].info.nvmlProfileSlices), 144 | }, 145 | } 146 | } 147 | 148 | return hierarchy 149 | } 150 | -------------------------------------------------------------------------------- /pkg/dcgm/device_status.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include 
"dcgm_structs.h" 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "fmt" 11 | "math/rand" 12 | ) 13 | 14 | // PerfState represents the performance state (P-state) of a GPU 15 | type PerfState uint 16 | 17 | const ( 18 | // PerfStateMax represents the highest performance state (P0) 19 | PerfStateMax = 0 20 | 21 | // PerfStateMin represents the lowest performance state (P15) 22 | PerfStateMin = 15 23 | 24 | // PerfStateUnknown represents an unknown performance state 25 | PerfStateUnknown = 32 26 | ) 27 | 28 | // String returns a string representation of the performance state 29 | func (p PerfState) String() string { 30 | if p >= PerfStateMax && p <= PerfStateMin { 31 | return fmt.Sprintf("P%d", p) 32 | } 33 | return "Unknown" 34 | } 35 | 36 | // UtilizationInfo contains GPU utilization metrics 37 | type UtilizationInfo struct { 38 | GPU int64 // % 39 | Memory int64 // % 40 | Encoder int64 // % 41 | Decoder int64 // % 42 | } 43 | 44 | // ECCErrorsInfo contains ECC memory error counts 45 | type ECCErrorsInfo struct { 46 | SingleBit int64 47 | DoubleBit int64 48 | } 49 | 50 | // MemoryInfo contains GPU memory usage and error information 51 | type MemoryInfo struct { 52 | GlobalUsed int64 53 | ECCErrors ECCErrorsInfo 54 | } 55 | 56 | // ClockInfo contains GPU clock frequencies 57 | type ClockInfo struct { 58 | Cores int64 // MHz 59 | Memory int64 // MHz 60 | } 61 | 62 | // PCIThroughputInfo contains PCI bus transfer metrics 63 | type PCIThroughputInfo struct { 64 | Rx int64 // MB 65 | Tx int64 // MB 66 | Replays int64 67 | } 68 | 69 | // PCIStatusInfo contains PCI bus status information 70 | type PCIStatusInfo struct { 71 | BAR1Used int64 // MB 72 | Throughput PCIThroughputInfo 73 | FBUsed int64 74 | } 75 | 76 | // DeviceStatus contains comprehensive GPU device status information 77 | type DeviceStatus struct { 78 | Power float64 // W 79 | Temperature int64 // °C 80 | Utilization UtilizationInfo 81 | Memory MemoryInfo 82 | Clocks ClockInfo 83 | PCI PCIStatusInfo 84 | 
Performance PerfState 85 | FanSpeed int64 // % 86 | } 87 | 88 | func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) { 89 | const ( 90 | pwr int = iota 91 | temp 92 | sm 93 | mem 94 | enc 95 | dec 96 | smClock 97 | memClock 98 | bar1Used 99 | pcieRxThroughput 100 | pcieTxThroughput 101 | pcieReplay 102 | fbUsed 103 | sbe 104 | dbe 105 | pstate 106 | fanSpeed 107 | fieldsCount 108 | ) 109 | 110 | deviceFields := make([]Short, fieldsCount) 111 | deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE 112 | deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP 113 | deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL 114 | deviceFields[mem] = C.DCGM_FI_DEV_MEM_COPY_UTIL 115 | deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL 116 | deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL 117 | deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK 118 | deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK 119 | deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED 120 | deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT 121 | deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT 122 | deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER 123 | deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED 124 | deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL 125 | deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL 126 | deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE 127 | deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED 128 | 129 | fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64()) 130 | fieldsId, err := FieldGroupCreate(fieldsName, deviceFields) 131 | if err != nil { 132 | return 133 | } 134 | 135 | groupName := fmt.Sprintf("devStatus%d", rand.Uint64()) 136 | groupId, err := WatchFields(gpuId, fieldsId, groupName) 137 | if err != nil { 138 | _ = FieldGroupDestroy(fieldsId) 139 | return 140 | } 141 | 142 | values, err := GetLatestValuesForFields(gpuId, deviceFields) 143 | if err != nil { 144 | _ = FieldGroupDestroy(fieldsId) 145 | _ = DestroyGroup(groupId) 146 | return status, err 147 | } 
148 | 149 | power := values[pwr].Float64() 150 | 151 | gpuUtil := UtilizationInfo{ 152 | GPU: values[sm].Int64(), 153 | Memory: values[mem].Int64(), 154 | Encoder: values[enc].Int64(), 155 | Decoder: values[dec].Int64(), 156 | } 157 | 158 | memory := MemoryInfo{ 159 | ECCErrors: ECCErrorsInfo{ 160 | SingleBit: values[sbe].Int64(), 161 | DoubleBit: values[dbe].Int64(), 162 | }, 163 | } 164 | 165 | clocks := ClockInfo{ 166 | Cores: values[smClock].Int64(), 167 | Memory: values[memClock].Int64(), 168 | } 169 | 170 | pci := PCIStatusInfo{ 171 | BAR1Used: values[bar1Used].Int64(), 172 | Throughput: PCIThroughputInfo{ 173 | Rx: values[pcieRxThroughput].Int64(), 174 | Tx: values[pcieTxThroughput].Int64(), 175 | Replays: values[pcieReplay].Int64(), 176 | }, 177 | FBUsed: values[fbUsed].Int64(), 178 | } 179 | 180 | status = DeviceStatus{ 181 | Power: power, 182 | Temperature: values[temp].Int64(), 183 | Utilization: gpuUtil, 184 | Memory: memory, 185 | Clocks: clocks, 186 | PCI: pci, 187 | Performance: PerfState(values[pstate].Int64()), 188 | FanSpeed: values[fanSpeed].Int64(), 189 | } 190 | 191 | _ = FieldGroupDestroy(fieldsId) 192 | _ = DestroyGroup(groupId) 193 | return 194 | } 195 | -------------------------------------------------------------------------------- /tests/dmon_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 8 | ) 9 | 10 | // TestDeviceMonitoring demonstrates device monitoring functionality 11 | // This is equivalent to the dmon sample but runs for a limited time 12 | func TestDeviceMonitoring(t *testing.T) { 13 | cleanup, err := dcgm.Init(dcgm.Embedded) 14 | if err != nil { 15 | t.Fatalf("Failed to initialize DCGM: %v", err) 16 | } 17 | defer cleanup() 18 | 19 | gpus, err := dcgm.GetSupportedDevices() 20 | if err != nil { 21 | t.Fatalf("Failed to get supported devices: %v", err) 22 | } 23 | 24 | if len(gpus) == 
0 { 25 | t.Skip("No supported GPUs found for monitoring") 26 | } 27 | 28 | t.Log("# gpu pwr temp sm mem enc dec mclk pclk") 29 | t.Log("# Idx W C % % % % MHz MHz") 30 | 31 | // Monitor for a few seconds instead of indefinitely 32 | ticker := time.NewTicker(time.Second * 1) 33 | defer ticker.Stop() 34 | 35 | timeout := time.After(5 * time.Second) 36 | sampleCount := 0 37 | 38 | for { 39 | select { 40 | case <-ticker.C: 41 | for _, gpu := range gpus { 42 | st, err := dcgm.GetDeviceStatus(gpu) 43 | if err != nil { 44 | t.Errorf("Failed to get device status for GPU %d: %v", gpu, err) 45 | continue 46 | } 47 | 48 | t.Logf("%5d %5d %5d %5d %5d %5d %5d %5d %5d", 49 | gpu, int64(st.Power), st.Temperature, st.Utilization.GPU, st.Utilization.Memory, 50 | st.Utilization.Encoder, st.Utilization.Decoder, st.Clocks.Memory, st.Clocks.Cores) 51 | 52 | // Basic validation 53 | if st.Temperature < 0 || st.Temperature > 150 { 54 | t.Errorf("GPU %d temperature out of expected range: %d°C", gpu, st.Temperature) 55 | } 56 | if st.Utilization.GPU < 0 || st.Utilization.GPU > 100 { 57 | t.Errorf("GPU %d utilization out of range: %d%%", gpu, st.Utilization.GPU) 58 | } 59 | } 60 | sampleCount++ 61 | 62 | case <-timeout: 63 | t.Logf("Monitoring completed after %d samples", sampleCount) 64 | return 65 | } 66 | } 67 | } 68 | 69 | // TestDeviceStatusSingle demonstrates getting device status for a single GPU 70 | func TestDeviceStatusSingle(t *testing.T) { 71 | cleanup, err := dcgm.Init(dcgm.Embedded) 72 | if err != nil { 73 | t.Fatalf("Failed to initialize DCGM: %v", err) 74 | } 75 | defer cleanup() 76 | 77 | gpus, err := dcgm.GetSupportedDevices() 78 | if err != nil { 79 | t.Fatalf("Failed to get supported devices: %v", err) 80 | } 81 | 82 | if len(gpus) == 0 { 83 | t.Skip("No supported GPUs found") 84 | } 85 | 86 | // Test first GPU 87 | gpu := gpus[0] 88 | st, err := dcgm.GetDeviceStatus(gpu) 89 | if err != nil { 90 | t.Fatalf("Failed to get device status for GPU %d: %v", gpu, err) 91 | } 92 
| 93 | t.Logf("GPU %d Status:", gpu) 94 | t.Logf(" Power: %d W", int64(st.Power)) 95 | t.Logf(" Temperature: %d°C", st.Temperature) 96 | t.Logf(" GPU Utilization: %d%%", st.Utilization.GPU) 97 | t.Logf(" Memory Utilization: %d%%", st.Utilization.Memory) 98 | t.Logf(" Encoder Utilization: %d%%", st.Utilization.Encoder) 99 | t.Logf(" Decoder Utilization: %d%%", st.Utilization.Decoder) 100 | t.Logf(" Memory Clock: %d MHz", st.Clocks.Memory) 101 | t.Logf(" Core Clock: %d MHz", st.Clocks.Cores) 102 | 103 | // Validate ranges 104 | if st.Temperature < 0 || st.Temperature > 150 { 105 | t.Errorf("Temperature out of expected range: %d°C", st.Temperature) 106 | } 107 | if st.Utilization.GPU < 0 || st.Utilization.GPU > 100 { 108 | t.Errorf("GPU utilization out of range: %d%%", st.Utilization.GPU) 109 | } 110 | if st.Utilization.Memory < 0 || st.Utilization.Memory > 100 { 111 | t.Errorf("Memory utilization out of range: %d%%", st.Utilization.Memory) 112 | } 113 | } 114 | 115 | // TestDeviceStatusMultipleSamples demonstrates taking multiple samples over time 116 | func TestDeviceStatusMultipleSamples(t *testing.T) { 117 | if testing.Short() { 118 | t.Skip("Skipping multiple samples test in short mode") 119 | } 120 | 121 | cleanup, err := dcgm.Init(dcgm.Embedded) 122 | if err != nil { 123 | t.Fatalf("Failed to initialize DCGM: %v", err) 124 | } 125 | defer cleanup() 126 | 127 | gpus, err := dcgm.GetSupportedDevices() 128 | if err != nil { 129 | t.Fatalf("Failed to get supported devices: %v", err) 130 | } 131 | 132 | if len(gpus) == 0 { 133 | t.Skip("No supported GPUs found") 134 | } 135 | 136 | // Take samples every 500ms for 3 seconds 137 | gpu := gpus[0] 138 | samples := make([]dcgm.DeviceStatus, 0, 6) 139 | 140 | for i := 0; i < 6; i++ { 141 | st, err := dcgm.GetDeviceStatus(gpu) 142 | if err != nil { 143 | t.Errorf("Failed to get device status sample %d: %v", i, err) 144 | continue 145 | } 146 | samples = append(samples, st) 147 | time.Sleep(500 * time.Millisecond) 148 | } 
# GitLab CI configuration for go-dcgm
# Uses Docker for building and testing based on the existing Dockerfile

# Pipeline stages, executed in order
stages:
  - build
  - test

# Global variables shared by all jobs
variables:
  DOCKER_DRIVER: overlay2
  DOCKER_TLS_CERTDIR: "/certs"
  # Build arguments for the Dockerfile
  CUDA_VERSION: "12.5.1"
  DISTRO_FLAVOR: "ubuntu24.04"
  GO_VERSION: "1.24.4"
  DCGM_VERSION: "4.2.3-2"
  # Image names
  BUILD_IMAGE: "$CI_REGISTRY_IMAGE/build:$CI_COMMIT_SHA"
  TEST_IMAGE: "$CI_REGISTRY_IMAGE/test:$CI_COMMIT_SHA"

# Use Docker-in-Docker service
services:
  - docker:dind

# Default image; jobs that run inside the built image override it
image: docker:latest

# Default before_script: log in to the registry. Values are quoted so that
# whitespace or glob characters in the credentials are passed through intact.
before_script:
  - docker info
  - echo "$CI_REGISTRY_PASSWORD" | docker login -u "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY"

# Build Docker image and compile Go code
build:
  stage: build
  script:
    - echo "Building Docker image with go-dcgm..."
    # Build the samples stage which includes the compiled binaries.
    # The registry login already happened in the default before_script.
    - docker build
      --target samples
      --build-arg CUDA_VERSION=$CUDA_VERSION
      --build-arg DISTRO_FLAVOR=$DISTRO_FLAVOR
      --build-arg GO_VERSION=$GO_VERSION
      --build-arg DCGM_VERSION=$DCGM_VERSION
      --tag $BUILD_IMAGE
      .
    # Push the built image for use in test stage
    - docker push $BUILD_IMAGE
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"

# Run tests inside the built Docker container
test:
  stage: test
  tags:
    - docker
    - gpu-enabled
  image: $BUILD_IMAGE
  needs: ["build"]
  before_script: []
  script:
    - echo '=== Installing test dependencies ==='
    - go install github.com/jstemmer/go-junit-report/v2@latest
    - echo '=== Running Go tests ==='
    # Create test reports directory
    - mkdir -p test-reports
    # go-junit-report -set-exit-code makes the job fail when any test failed
    - >
      go test -v ./tests 2>&1
      | /root/go/bin/go-junit-report -set-exit-code
      > test-reports/go-tests.xml
    - echo '=== Tests completed successfully! ==='
  artifacts:
    reports:
      junit:
        - test-reports/go-tests.xml
    paths:
      - test-reports/
    expire_in: 1 week
    when: always
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"

# Run race tests inside the built Docker container
test-race:
  stage: test
  tags:
    - docker
    - gpu-enabled
  image: $BUILD_IMAGE
  needs: ["build"]
  before_script: []
  script:
    - echo '=== Installing test dependencies ==='
    - go install github.com/jstemmer/go-junit-report/v2@latest
    - echo '=== Running Go race tests ==='
    # Create test reports directory
    - mkdir -p test-reports
    - >
      go test -race -v ./tests 2>&1
      | /root/go/bin/go-junit-report -set-exit-code
      > test-reports/go-race-tests.xml
    - echo '=== Race Tests completed successfully! ==='
  artifacts:
    reports:
      junit:
        - test-reports/go-race-tests.xml
    paths:
      - test-reports/
    expire_in: 1 week
    when: always
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"

# Code format check inside Docker
format-check:
  stage: test
  needs: ["build"]
  before_script: []
  tags:
    - docker
    - gpu-disabled
  image: $BUILD_IMAGE
  script:
    - echo "Checking code format in Docker container..."
    # Install gofumpt
    - echo 'Installing gofumpt...'
    - go install mvdan.cc/gofumpt@latest
    # Run format check
    - echo 'Checking code format...'
    - make check-format
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"

# Hidden template for jobs that only need a plain Go toolchain
.go:
  tags:
    - docker
    - gpu-disabled
  image: golang

# Full linting (optional, allowed to fail)
lint-full:
  extends:
    - .go
  stage: test
  before_script: []
  needs: ["build"]
  script:
    - echo "Running full linting..."
    # Install golangci-lint and run full linting
    - echo 'Installing golangci-lint...'
    - wget -O- -nv https://raw.githubusercontent.com/golangci/golangci-lint/HEAD/install.sh | sh -s v2.1.6
    - echo 'Running full linting...'
    # No --fix here: in CI it would rewrite a throwaway checkout and hide the
    # auto-fixed findings instead of reporting them.
    - ./bin/golangci-lint run ./... --timeout 10m
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
  allow_failure: true

# Build matrix for different configurations (optional)
build-matrix:
  stage: build
  script:
    - echo "Building Docker images with docker-bake for matrix configurations..."
    - docker context create go-dcgm
    - docker buildx create --use go-dcgm
    - docker buildx bake --load
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
171 | - docker context create go-dcgm 172 | - docker buildx create --use go-dcgm 173 | - docker buildx bake --load 174 | rules: 175 | - if: $CI_COMMIT_BRANCH == "main" 176 | - if: $CI_PIPELINE_SOURCE == "merge_request_event" 177 | -------------------------------------------------------------------------------- /pkg/dcgm/api.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "os" 8 | "strconv" 9 | "sync" 10 | "time" 11 | ) 12 | 13 | var ( 14 | dcgmInitCounter int 15 | mux sync.Mutex 16 | ) 17 | 18 | // Init starts DCGM in the specified mode 19 | // Mode can be: 20 | // - Embedded: Start hostengine within this process 21 | // - Standalone: Connect to an already running nv-hostengine 22 | // - StartHostengine: Start and connect to nv-hostengine, terminate before exiting 23 | // Returns a cleanup function and any error encountered 24 | func Init(m mode, args ...string) (cleanup func(), err error) { 25 | mux.Lock() 26 | defer mux.Unlock() 27 | 28 | if dcgmInitCounter < 0 { 29 | count := strconv.Itoa(dcgmInitCounter) 30 | err = fmt.Errorf("shutdown() is called %s times, before init()", count[1:]) 31 | } 32 | 33 | if dcgmInitCounter == 0 { 34 | err = initDCGM(m, args...) 
35 | if err != nil { 36 | return nil, err 37 | } 38 | } 39 | 40 | dcgmInitCounter += 1 41 | 42 | return func() { 43 | if shutdownErr := Shutdown(); shutdownErr != nil { 44 | fmt.Fprintf(os.Stderr, "Failed to shutdown DCGM with error: `%v`", shutdownErr) 45 | } 46 | }, err 47 | } 48 | 49 | // Shutdown stops DCGM and destroys all connections 50 | // Returns an error if DCGM is not initialized 51 | func Shutdown() (err error) { 52 | mux.Lock() 53 | defer mux.Unlock() 54 | 55 | if dcgmInitCounter <= 0 { 56 | err = errors.New("init() needs to be called before shutdown()") 57 | } 58 | 59 | if dcgmInitCounter == 1 { 60 | err = shutdown() 61 | } 62 | 63 | dcgmInitCounter -= 1 64 | 65 | return 66 | } 67 | 68 | // GetAllDeviceCount returns the count of all GPUs in the system 69 | func GetAllDeviceCount() (uint, error) { 70 | return getAllDeviceCount() 71 | } 72 | 73 | // GetEntityGroupEntities returns all entities of the specified group type 74 | func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error) { 75 | return getEntityGroupEntities(entityGroup) 76 | } 77 | 78 | // GetSupportedDevices returns a list of DCGM-supported GPU IDs 79 | func GetSupportedDevices() ([]uint, error) { 80 | return getSupportedDevices() 81 | } 82 | 83 | // GetDeviceInfo returns detailed information about the specified GPU 84 | func GetDeviceInfo(gpuID uint) (Device, error) { 85 | return getDeviceInfo(gpuID) 86 | } 87 | 88 | // GetDeviceStatus returns current status information about the specified GPU 89 | func GetDeviceStatus(gpuID uint) (DeviceStatus, error) { 90 | return latestValuesForDevice(gpuID) 91 | } 92 | 93 | // GetDeviceTopology returns the topology (connectivity) information for the specified GPU 94 | func GetDeviceTopology(gpuID uint) ([]P2PLink, error) { 95 | return getDeviceTopology(gpuID) 96 | } 97 | 98 | // WatchPidFields configures DCGM to start recording stats for GPU processes 99 | // Must be called before GetProcessInfo 100 | func WatchPidFields() 
(GroupHandle, error) { 101 | return watchPidFields(time.Microsecond*time.Duration(defaultUpdateFreq), time.Second*time.Duration(defaultMaxKeepAge), defaultMaxKeepSamples) 102 | } 103 | 104 | // GetProcessInfo returns detailed per-GPU statistics for the specified process 105 | func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error) { 106 | return getProcessInfo(group, pid) 107 | } 108 | 109 | // HealthCheckByGpuId performs a health check on the specified GPU 110 | func HealthCheckByGpuId(gpuID uint) (DeviceHealth, error) { 111 | return healthCheckByGpuId(gpuID) 112 | } 113 | 114 | // ListenForPolicyViolations sets up monitoring for the specified policy conditions on all GPUs 115 | // Returns a channel that receives policy violations and any error encountered 116 | func ListenForPolicyViolations(ctx context.Context, typ ...PolicyCondition) (<-chan PolicyViolation, error) { 117 | groupID := GroupAllGPUs() 118 | return ListenForPolicyViolationsForGroup(ctx, groupID, typ...) 119 | } 120 | 121 | // ListenForPolicyViolationsForGroup sets up policy monitoring for the specified GPU group 122 | // Returns a channel that receives policy violations and any error encountered 123 | func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...PolicyCondition) (<-chan PolicyViolation, error) { 124 | return registerPolicy(ctx, group, typ...) 
125 | } 126 | 127 | // Introspect returns memory and CPU usage statistics for the DCGM hostengine 128 | func Introspect() (Status, error) { 129 | return introspect() 130 | } 131 | 132 | // GetSupportedMetricGroups returns all supported metric groups for the specified GPU 133 | func GetSupportedMetricGroups(gpuID uint) ([]MetricGroup, error) { 134 | return getSupportedMetricGroups(gpuID) 135 | } 136 | 137 | // GetNvLinkLinkStatus returns the status of all NVLink connections 138 | func GetNvLinkLinkStatus() ([]NvLinkStatus, error) { 139 | return getNvLinkLinkStatus() 140 | } 141 | 142 | // GetNvLinkP2PStatus returns the status of NvLinks between GPU pairs 143 | func GetNvLinkP2PStatus() (NvLinkP2PStatus, error) { 144 | return getNvLinkP2PStatus() 145 | } 146 | 147 | // SetPolicyForGroup configures policies with optional custom thresholds and actions for a GPU group 148 | func SetPolicyForGroup(group GroupHandle, configs ...PolicyConfig) error { 149 | return setPolicyForGroupWithConfig(group, configs...) 150 | } 151 | 152 | // GetPolicyForGroup retrieves the current policy configuration for a GPU group 153 | func GetPolicyForGroup(group GroupHandle) (*PolicyStatus, error) { 154 | return getPolicyForGroup(group) 155 | } 156 | 157 | // ClearPolicyForGroup clears all policy conditions for a GPU group 158 | func ClearPolicyForGroup(group GroupHandle) error { 159 | return clearPolicyForGroup(group) 160 | } 161 | 162 | // WatchPolicyViolationsForGroup registers to receive violation notifications for a specific GPU group 163 | func WatchPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...PolicyCondition) (<-chan PolicyViolation, error) { 164 | return registerPolicyOnly(ctx, group, typ...) 
165 | } 166 | -------------------------------------------------------------------------------- /tests/dcgm_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "math" 5 | "strconv" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 10 | ) 11 | 12 | func check(t *testing.T, err error) { 13 | if err != nil { 14 | t.Errorf("%v\n", err) 15 | } 16 | } 17 | 18 | func TestDeviceCount(t *testing.T) { 19 | cleanup, err := dcgm.Init(dcgm.Embedded) 20 | check(t, err) 21 | defer cleanup() 22 | 23 | count, err := dcgm.GetAllDeviceCount() 24 | check(t, err) 25 | 26 | query := "count" 27 | c := DeviceCount(query) 28 | 29 | if c != count { 30 | t.Errorf("Device Count from dcgm is wrong, got %d, want: %d", count, c) 31 | } 32 | } 33 | 34 | func BenchmarkDeviceCount1(b *testing.B) { 35 | _, _ = dcgm.Init(dcgm.Embedded) 36 | 37 | b.StartTimer() 38 | 39 | for n := 0; n < b.N; n++ { 40 | _, _ = dcgm.GetAllDeviceCount() 41 | } 42 | 43 | b.StopTimer() 44 | 45 | _ = dcgm.Shutdown() 46 | } 47 | 48 | func TestCpuQuery(t *testing.T) { 49 | t.Setenv("DCGM_SKIP_SYSMON_HARDWARE_CHECK", "1") 50 | 51 | cleanup, err := dcgm.Init(dcgm.Embedded) 52 | check(t, err) 53 | 54 | defer cleanup() 55 | 56 | hierarchy, err := dcgm.GetCPUHierarchy() 57 | check(t, err) 58 | 59 | if hierarchy.NumCPUs == 0 { 60 | t.Errorf("Found no CPUs") 61 | } 62 | 63 | for i := uint(0); i < hierarchy.NumCPUs; i++ { 64 | coresFound := false 65 | 66 | for j := uint(0); j < dcgm.MAX_CPU_CORE_BITMASK_COUNT; j++ { 67 | if hierarchy.CPUs[i].OwnedCores[j] != 0 { 68 | coresFound = true 69 | } 70 | } 71 | 72 | if coresFound == false { 73 | t.Errorf("Cpu %d has no cores", i) 74 | } 75 | } 76 | } 77 | 78 | func TestDeviceInfo(t *testing.T) { 79 | cleanup, err := dcgm.Init(dcgm.Embedded) 80 | check(t, err) 81 | defer cleanup() 82 | 83 | fields := []string{ 84 | "driver_version", 85 | "name", 86 | "serial", 87 | "uuid", 88 | "pci.bus_id", 89 
| "vbios_version", 90 | "inforom.img", 91 | "power.limit", 92 | } 93 | 94 | gpus, err := dcgm.GetSupportedDevices() 95 | check(t, err) 96 | 97 | for _, gpu := range gpus { 98 | info, err := dcgm.GetDeviceInfo(gpu) 99 | check(t, err) 100 | 101 | id := strconv.FormatUint(uint64(gpu), 10) 102 | 103 | for _, val := range fields { 104 | var msg, output string 105 | 106 | res := Query(id, val) 107 | if res == "[N/A]" { 108 | continue 109 | } 110 | 111 | switch val { 112 | case "driver_version": 113 | msg = "Driver version" 114 | output = info.Identifiers.DriverVersion 115 | case "name": 116 | msg = "Device name" 117 | output = info.Identifiers.Model 118 | case "serial": 119 | msg = "Device Serial number" 120 | output = info.Identifiers.Serial 121 | case "uuid": 122 | msg = "Device UUID" 123 | output = info.UUID 124 | case "pci.bus_id": 125 | msg = "Device PCI busId" 126 | output = info.PCI.BusID 127 | case "vbios_version": 128 | msg = "Device vbios version" 129 | output = info.Identifiers.Vbios 130 | case "inforom.img": 131 | msg = "Device inforom image" 132 | output = info.Identifiers.InforomImageVersion 133 | case "power.limit": 134 | msg = "Device power limit" 135 | output = strconv.FormatUint(uint64(info.Power), 10) 136 | power, err := strconv.ParseFloat(res, 64) 137 | check(t, err) 138 | 139 | res = strconv.FormatUint(uint64(math.Round(power)), 10) 140 | } 141 | 142 | if strings.Compare(res, output) != 0 { 143 | if strings.Contains(output, "NOT_SUPPORTED") { 144 | continue 145 | } 146 | 147 | t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res) 148 | } 149 | } 150 | } 151 | } 152 | 153 | func BenchmarkDeviceInfo1(b *testing.B) { 154 | _, _ = dcgm.Init(dcgm.Embedded) 155 | 156 | b.StartTimer() 157 | 158 | for n := 0; n < b.N; n++ { 159 | // assuming there will be atleast 1 GPU attached 160 | _, _ = dcgm.GetDeviceInfo(uint(0)) 161 | } 162 | 163 | b.StopTimer() 164 | 165 | _ = dcgm.Shutdown() 166 | } 167 | 168 | func TestDeviceStatus(t *testing.T) { 
169 | cleanup, err := dcgm.Init(dcgm.Embedded) 170 | check(t, err) 171 | defer cleanup() 172 | 173 | gpus, err := dcgm.GetSupportedDevices() 174 | check(t, err) 175 | 176 | fields := []string{ 177 | "power.draw", 178 | "temperature.gpu", 179 | "utilization.gpu", 180 | "utilization.memory", 181 | "encoder.stats.averageFps", 182 | "clocks.current.sm", 183 | "clocks.current.memory", 184 | } 185 | 186 | for _, gpu := range gpus { 187 | status, err := dcgm.GetDeviceStatus(gpu) 188 | check(t, err) 189 | 190 | id := strconv.FormatUint(uint64(gpu), 10) 191 | 192 | for _, val := range fields { 193 | var msg, output string 194 | 195 | res := Query(id, val) 196 | if res == "[N/A]" { 197 | continue 198 | } 199 | 200 | switch val { 201 | case "power.draw": 202 | msg = "Device power utilization" 203 | output = strconv.FormatFloat(math.Round(status.Power), 'f', -1, 64) 204 | power, err := strconv.ParseFloat(res, 64) 205 | check(t, err) 206 | 207 | res = strconv.FormatFloat(math.Round(power), 'f', -1, 64) 208 | case "temperature.gpu": 209 | msg = "Device temperature" 210 | output = strconv.FormatInt(status.Temperature, 10) 211 | case "utilization.gpu": 212 | msg = "Device gpu utilization" 213 | output = strconv.FormatInt(status.Utilization.GPU, 10) 214 | case "utilization.memory": 215 | msg = "Device memory utilization" 216 | output = strconv.FormatInt(status.Utilization.Memory, 10) 217 | case "encoder.stats.averageFps": 218 | msg = "Device encoder utilization" 219 | output = strconv.FormatInt(status.Utilization.Encoder, 10) 220 | case "clocks.current.sm": 221 | msg = "Device sm clock" 222 | output = strconv.FormatInt(status.Clocks.Cores, 10) 223 | case "clocks.current.memory": 224 | msg = "Device mem clock" 225 | output = strconv.FormatInt(status.Clocks.Memory, 10) 226 | } 227 | 228 | if strings.Compare(res, output) != 0 { 229 | t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res) 230 | } 231 | } 232 | } 233 | } 234 | 
// P2PLinkType identifies how two GPUs are connected to each other.
type P2PLinkType uint

const (
	// P2PLinkUnknown is the zero value: the link type could not be determined.
	P2PLinkUnknown P2PLinkType = iota
	// P2PLinkCrossCPU means the path between the GPUs crosses CPUs.
	P2PLinkCrossCPU
	// P2PLinkSameCPU means both GPUs hang off the same CPU.
	P2PLinkSameCPU
	// P2PLinkHostBridge means the GPUs are connected through the host bridge.
	P2PLinkHostBridge
	// P2PLinkMultiSwitch means the path traverses multiple PCIe switches.
	P2PLinkMultiSwitch
	// P2PLinkSingleSwitch means the path traverses a single PCIe switch.
	P2PLinkSingleSwitch
	// P2PLinkSameBoard means both GPUs sit on the same board.
	P2PLinkSameBoard
	// SingleNVLINKLink means the GPUs share one NVLINK connection.
	SingleNVLINKLink
	// TwoNVLINKLinks means the GPUs share two NVLINK connections.
	TwoNVLINKLinks
	// ThreeNVLINKLinks means the GPUs share three NVLINK connections.
	ThreeNVLINKLinks
	// FourNVLINKLinks means the GPUs share four NVLINK connections.
	FourNVLINKLinks
)

// PCIPaths returns the short textual code for the link type, for example
// "PIX" for a single PCIe switch or "NV2" for two NVLINK connections.
// P2PLinkUnknown and any value outside the declared constants yield "N/A".
func (l P2PLinkType) PCIPaths() string {
	// Indexed by link type; slot 0 (P2PLinkUnknown) is intentionally left
	// empty so it falls through to the "N/A" default below.
	codes := [...]string{
		P2PLinkCrossCPU:     "SYS",
		P2PLinkSameCPU:      "NODE",
		P2PLinkHostBridge:   "PHB",
		P2PLinkMultiSwitch:  "PXB",
		P2PLinkSingleSwitch: "PIX",
		P2PLinkSameBoard:    "PSB",
		SingleNVLINKLink:    "NV1",
		TwoNVLINKLinks:      "NV2",
		ThreeNVLINKLinks:    "NV3",
		FourNVLINKLinks:     "NV4",
	}

	if l < P2PLinkType(len(codes)) && codes[l] != "" {
		return codes[l]
	}

	return "N/A"
}
| return links, &Error{msg: C.GoString(C.errorString(result)), Code: result} 127 | } 128 | 129 | busid, err := getBusID(gpuID) 130 | if err != nil { 131 | return 132 | } 133 | links = make([]P2PLink, topology.numGpus) 134 | for i := uint(0); i < uint(topology.numGpus); i++ { 135 | links[i].GPU = uint(topology.gpuPaths[i].gpuId) 136 | links[i].BusID = busid 137 | links[i].Link = getP2PLink(uint(topology.gpuPaths[i].path)) 138 | } 139 | return 140 | } 141 | 142 | // Link_State represents the state of an NVLINK connection 143 | type Link_State uint 144 | 145 | const ( 146 | // LS_NOT_SUPPORTED indicates the link is unsupported (Default for GPUs) 147 | LS_NOT_SUPPORTED Link_State = iota 148 | // LS_DISABLED indicates the link is supported but disabled (Default for NvSwitches) 149 | LS_DISABLED 150 | // LS_DOWN indicates the link is down (inactive) 151 | LS_DOWN 152 | // LS_UP indicates the link is up (active) 153 | LS_UP 154 | ) 155 | 156 | // NvLinkStatus contains information about an NVLINK connection status 157 | type NvLinkStatus struct { 158 | // ParentId is the ID of the parent entity (GPU or NVSwitch) 159 | ParentId uint 160 | // ParentType is the type of the parent entity 161 | ParentType Field_Entity_Group 162 | // State is the current state of the NVLINK 163 | State Link_State 164 | // Index is the link index number 165 | Index uint 166 | } 167 | 168 | func getNvLinkLinkStatus() ([]NvLinkStatus, error) { 169 | var linkStatus C.dcgmNvLinkStatus_v4 170 | linkStatus.version = makeVersion4(unsafe.Sizeof(linkStatus)) 171 | 172 | result := C.dcgmGetNvLinkLinkStatus(handle.handle, &linkStatus) 173 | if result == C.DCGM_ST_NOT_SUPPORTED { 174 | return nil, nil 175 | } 176 | 177 | if result != C.DCGM_ST_OK { 178 | return nil, &Error{msg: C.GoString(C.errorString(result)), Code: result} 179 | } 180 | 181 | links := make([]NvLinkStatus, linkStatus.numGpus*C.DCGM_NVLINK_MAX_LINKS_PER_GPU+linkStatus.numNvSwitches*C.DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH) 182 | 183 | idx := 0 
184 | for i := uint(0); i < uint(linkStatus.numGpus); i++ { 185 | for j := 0; j < int(C.DCGM_NVLINK_MAX_LINKS_PER_GPU); j++ { 186 | link := NvLinkStatus{ 187 | uint(linkStatus.gpus[i].entityId), 188 | FE_GPU, 189 | Link_State(linkStatus.gpus[i].linkState[j]), 190 | uint(j), 191 | } 192 | 193 | links[idx] = link 194 | idx++ 195 | } 196 | } 197 | 198 | for i := uint(0); i < uint(linkStatus.numNvSwitches); i++ { 199 | for j := 0; j < C.DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH; j++ { 200 | link := NvLinkStatus{ 201 | uint(linkStatus.nvSwitches[i].entityId), 202 | FE_SWITCH, 203 | Link_State(linkStatus.nvSwitches[i].linkState[j]), 204 | uint(j), 205 | } 206 | 207 | links[idx] = link 208 | idx++ 209 | } 210 | } 211 | 212 | return links, nil 213 | } 214 | -------------------------------------------------------------------------------- /pkg/dcgm/test_utils.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dcgm 18 | 19 | import ( 20 | "os" 21 | "path/filepath" 22 | "testing" 23 | 24 | "github.com/stretchr/testify/assert" 25 | "github.com/stretchr/testify/require" 26 | ) 27 | 28 | const ( 29 | // DCGM_NVSDM_MOCK_YAML environment variable for enabling NVSDM mock configuration 30 | DCGM_NVSDM_MOCK_YAML = "DCGM_NVSDM_MOCK_YAML" 31 | // DCGM_DBG_FILE is environment variables which enables DCGM to write debug logs to a specific file 32 | DCGM_DBG_FILE = "__DCGM_DBG_FILE" 33 | // DCGM_DBG_LVL is environment variables which enables DCGM logging level 34 | DCGM_DBG_LVL = "__DCGM_DBG_LVL" 35 | ) 36 | 37 | func setupTest(tb testing.TB) func(testing.TB) { 38 | // Store original debug settings 39 | originalDebugLevel, hasDebugLevel := os.LookupEnv(DCGM_DBG_LVL) 40 | originalDebugFile, hasDebugFile := os.LookupEnv(DCGM_DBG_FILE) 41 | 42 | // Enable debug output to stdout 43 | err := os.Setenv(DCGM_DBG_LVL, "6") 44 | require.NoError(tb, err) 45 | err = os.Setenv(DCGM_DBG_FILE, "/dev/stdout") 46 | require.NoError(tb, err) 47 | 48 | // Initialize DCGM 49 | cleanup, err := Init(Embedded) 50 | assert.NoError(tb, err) 51 | 52 | return func(tb testing.TB) { 53 | defer cleanup() 54 | 55 | // Restore original debug settings 56 | if hasDebugLevel { 57 | _ = os.Setenv(DCGM_DBG_LVL, originalDebugLevel) 58 | } else { 59 | _ = os.Unsetenv(DCGM_DBG_LVL) 60 | } 61 | 62 | if hasDebugFile { 63 | _ = os.Setenv(DCGM_DBG_FILE, originalDebugFile) 64 | } else { 65 | _ = os.Unsetenv(DCGM_DBG_FILE) 66 | } 67 | } 68 | } 69 | 70 | func runOnlyWithLiveGPUs(t *testing.T) { 71 | t.Helper() 72 | 73 | gpus, err := getSupportedDevices() 74 | require.NoError(t, err) 75 | 76 | if len(gpus) < 1 { 77 | t.Skip("Skipping test that requires live GPUs. 
None were found") 78 | } 79 | } 80 | 81 | func withInjectionGPUs(tb testing.TB, count int) ([]uint, error) { 82 | tb.Helper() 83 | numGPUs, err := GetAllDeviceCount() 84 | require.NoError(tb, err) 85 | 86 | if numGPUs+1 > MAX_NUM_DEVICES { 87 | tb.Skipf("Unable to add fake GPU with more than %d gpus", MAX_NUM_DEVICES) 88 | } 89 | 90 | entityList := make([]MigHierarchyInfo, count) 91 | for i := range entityList { 92 | entityList[i] = MigHierarchyInfo{ 93 | Entity: GroupEntityPair{EntityGroupId: FE_GPU}, 94 | } 95 | } 96 | 97 | return CreateFakeEntities(entityList) 98 | } 99 | 100 | // withInjectionGPUInstances creates fake GPU instances on the specified GPU. 101 | // It returns a map of fake GPU instance IDs to their parent GPU ID. 102 | func withInjectionGPUInstances(tb testing.TB, gpuId uint, instanceCount int) (map[uint]uint, error) { 103 | tb.Helper() 104 | 105 | if instanceCount <= 0 { 106 | return nil, nil 107 | } 108 | 109 | entities := make([]MigHierarchyInfo, 0, instanceCount) 110 | for i := 0; i < instanceCount; i++ { 111 | entities = append(entities, MigHierarchyInfo{ 112 | Parent: GroupEntityPair{ 113 | EntityGroupId: FE_GPU, 114 | EntityId: gpuId, 115 | }, 116 | Entity: GroupEntityPair{ 117 | EntityGroupId: FE_GPU_I, 118 | }, 119 | }) 120 | } 121 | 122 | createdIDs, err := CreateFakeEntities(entities) 123 | if err != nil { 124 | return nil, err 125 | } 126 | 127 | result := make(map[uint]uint, len(createdIDs)) 128 | for _, id := range createdIDs { 129 | result[id] = gpuId 130 | } 131 | 132 | return result, nil 133 | } 134 | 135 | // withInjectionComputeInstances creates fake compute instances on the specified GPU instances. 136 | // It returns a mapping of compute instance IDs to their parent GPU instance IDs. 137 | // If count is 0 or parentIDs is empty, it returns an empty map. 
138 | func withInjectionComputeInstances(tb testing.TB, parentIDs []uint, count int) (map[uint]uint, error) { 139 | tb.Helper() 140 | 141 | if count <= 0 { 142 | return nil, nil 143 | } 144 | 145 | if len(parentIDs) == 0 { 146 | return nil, nil 147 | } 148 | 149 | entities := make([]MigHierarchyInfo, 0, count) 150 | instanceIndex := 0 151 | for i := 0; i < count; i++ { 152 | if instanceIndex >= len(parentIDs) { 153 | instanceIndex = 0 154 | } 155 | entities = append(entities, MigHierarchyInfo{ 156 | Parent: GroupEntityPair{ 157 | EntityGroupId: FE_GPU_I, 158 | EntityId: parentIDs[instanceIndex], 159 | }, 160 | Entity: GroupEntityPair{ 161 | EntityGroupId: FE_GPU_CI, 162 | }, 163 | }) 164 | instanceIndex++ 165 | } 166 | 167 | createdIDs, err := CreateFakeEntities(entities) 168 | if err != nil { 169 | return nil, err 170 | } 171 | 172 | result := make(map[uint]uint, len(createdIDs)) 173 | instanceIndex = 0 174 | for _, id := range createdIDs { 175 | if instanceIndex >= len(parentIDs) { 176 | instanceIndex = 0 177 | } 178 | result[id] = parentIDs[instanceIndex] 179 | instanceIndex++ 180 | } 181 | 182 | return result, nil 183 | } 184 | 185 | // withNvsdmMockConfig runs a test with a specified NVSDM mock configuration 186 | // It handles setting up and tearing down the environment variable for the mock config 187 | func withNvsdmMockConfig(t *testing.T, configYamlPath string, testFunc func(t *testing.T)) { 188 | t.Helper() 189 | 190 | // Get absolute path for the config file 191 | 192 | absPath, err := filepath.Abs(configYamlPath) 193 | require.NoError(t, err, "Failed to get absolute path for config file") 194 | 195 | // Check if config file exists 196 | if _, err = os.Stat(absPath); os.IsNotExist(err) { 197 | t.Skipf("Skip test due to missing config YAML file [%s]", absPath) 198 | return 199 | } 200 | 201 | // Store original env var value if it exists 202 | originalValue, hasOriginal := os.LookupEnv(DCGM_NVSDM_MOCK_YAML) 203 | 204 | // Set the environment variable 205 
| err = os.Setenv(DCGM_NVSDM_MOCK_YAML, absPath) 206 | require.NoError(t, err, "Failed to set mock config environment variable") 207 | 208 | // Cleanup function to restore original state 209 | defer func() { 210 | if hasOriginal { 211 | _ = os.Setenv(DCGM_NVSDM_MOCK_YAML, originalValue) 212 | } else { 213 | _ = os.Unsetenv(DCGM_NVSDM_MOCK_YAML) 214 | } 215 | }() 216 | 217 | // Run the test 218 | testFunc(t) 219 | } 220 | -------------------------------------------------------------------------------- /tests/policy_test.go: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | "time" 7 | 8 | "github.com/NVIDIA/go-dcgm/pkg/dcgm" 9 | ) 10 | 11 | // TestPolicyViolations demonstrates listening for policy violations 12 | // This is equivalent to the policy sample but runs for a limited time 13 | func TestPolicyViolations(t *testing.T) { 14 | cleanup, err := dcgm.Init(dcgm.Embedded) 15 | if err != nil { 16 | t.Fatalf("Failed to initialize DCGM: %v", err) 17 | } 18 | defer cleanup() 19 | 20 | ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 21 | defer cancel() 22 | 23 | // Listen for policy violations (DBE and XID errors) 24 | c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy, dcgm.XidPolicy) 25 | if err != nil { 26 | t.Fatalf("Failed to start listening for policy violations: %v", err) 27 | } 28 | 29 | t.Log("Listening for policy violations (DBE and XID errors) for 10 seconds...") 30 | 31 | violationCount := 0 32 | timeout := time.After(10 * time.Second) 33 | 34 | for { 35 | select { 36 | case pe := <-c: 37 | violationCount++ 38 | t.Logf("Policy Violation %d:", violationCount) 39 | t.Logf(" Condition: %v", pe.Condition) 40 | t.Logf(" Timestamp: %v", pe.Timestamp) 41 | t.Logf(" Data: %v", pe.Data) 42 | 43 | case <-ctx.Done(): 44 | t.Logf("Policy violation monitoring completed") 45 | t.Logf("Total violations detected: %d", violationCount) 46 | 
return 47 | 48 | case <-timeout: 49 | t.Logf("Policy violation monitoring timed out") 50 | t.Logf("Total violations detected: %d", violationCount) 51 | return 52 | } 53 | } 54 | } 55 | 56 | // TestPolicyViolationsSingleType demonstrates listening for a specific type of policy violation 57 | func TestPolicyViolationsSingleType(t *testing.T) { 58 | if testing.Short() { 59 | t.Skip("Skipping single type policy test in short mode") 60 | } 61 | 62 | cleanup, err := dcgm.Init(dcgm.Embedded) 63 | if err != nil { 64 | t.Fatalf("Failed to initialize DCGM: %v", err) 65 | } 66 | defer cleanup() 67 | 68 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 69 | defer cancel() 70 | 71 | // Listen for only XID policy violations 72 | c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.XidPolicy) 73 | if err != nil { 74 | t.Fatalf("Failed to start listening for XID policy violations: %v", err) 75 | } 76 | 77 | t.Log("Listening for XID policy violations for 5 seconds...") 78 | 79 | xidCount := 0 80 | timeout := time.After(5 * time.Second) 81 | 82 | for { 83 | select { 84 | case pe := <-c: 85 | xidCount++ 86 | t.Logf("XID Policy Violation %d:", xidCount) 87 | t.Logf(" Condition: %v", pe.Condition) 88 | t.Logf(" Timestamp: %v", pe.Timestamp) 89 | t.Logf(" Data: %v", pe.Data) 90 | 91 | case <-ctx.Done(): 92 | t.Logf("XID policy violation monitoring completed") 93 | t.Logf("Total XID violations detected: %d", xidCount) 94 | return 95 | 96 | case <-timeout: 97 | t.Logf("XID policy violation monitoring timed out") 98 | t.Logf("Total XID violations detected: %d", xidCount) 99 | return 100 | } 101 | } 102 | } 103 | 104 | // TestPolicyViolationsMultipleTypes demonstrates listening for multiple types of policy violations 105 | func TestPolicyViolationsMultipleTypes(t *testing.T) { 106 | if testing.Short() { 107 | t.Skip("Skipping multiple types policy test in short mode") 108 | } 109 | 110 | cleanup, err := dcgm.Init(dcgm.Embedded) 111 | if err != nil { 112 | 
t.Fatalf("Failed to initialize DCGM: %v", err) 113 | } 114 | defer cleanup() 115 | 116 | ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second) 117 | defer cancel() 118 | 119 | // Listen for multiple types of policy violations 120 | // Note: Some policies may require root privileges 121 | 122 | c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy, dcgm.XidPolicy, dcgm.ThermalPolicy, dcgm.PowerPolicy) 123 | if err != nil { 124 | t.Logf("Failed to start listening for all policy violations (may need root): %v", err) 125 | // Try with just basic policies 126 | c, err = dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy, dcgm.XidPolicy) 127 | if err != nil { 128 | t.Fatalf("Failed to start listening for basic policy violations: %v", err) 129 | } 130 | t.Log("Listening for basic policy violations (DBE and XID) for 8 seconds...") 131 | } else { 132 | t.Log("Listening for multiple policy violations for 8 seconds...") 133 | } 134 | 135 | violationsByType := make(map[string]int) 136 | timeout := time.After(8 * time.Second) 137 | 138 | for { 139 | select { 140 | case pe := <-c: 141 | conditionStr := string(pe.Condition) 142 | violationsByType[conditionStr]++ 143 | 144 | t.Logf("Policy Violation:") 145 | t.Logf(" Type: %s", conditionStr) 146 | t.Logf(" Timestamp: %v", pe.Timestamp) 147 | t.Logf(" Data: %v", pe.Data) 148 | 149 | case <-ctx.Done(): 150 | t.Log("Multi-type policy violation monitoring completed") 151 | for policyType, count := range violationsByType { 152 | t.Logf(" %s violations: %d", policyType, count) 153 | } 154 | return 155 | 156 | case <-timeout: 157 | t.Log("Multi-type policy violation monitoring timed out") 158 | for policyType, count := range violationsByType { 159 | t.Logf(" %s violations: %d", policyType, count) 160 | } 161 | return 162 | } 163 | } 164 | } 165 | 166 | // TestPolicyViolationsContextCancellation demonstrates proper context cancellation 167 | func TestPolicyViolationsContextCancellation(t *testing.T) { 168 | 
cleanup, err := dcgm.Init(dcgm.Embedded) 169 | if err != nil { 170 | t.Fatalf("Failed to initialize DCGM: %v", err) 171 | } 172 | defer cleanup() 173 | 174 | ctx, cancel := context.WithCancel(context.Background()) 175 | 176 | c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy) 177 | if err != nil { 178 | t.Fatalf("Failed to start listening for policy violations: %v", err) 179 | } 180 | 181 | t.Log("Starting policy violation monitoring, will cancel after 2 seconds...") 182 | 183 | // Cancel after 2 seconds 184 | go func() { 185 | time.Sleep(2 * time.Second) 186 | t.Log("Cancelling policy violation monitoring...") 187 | cancel() 188 | }() 189 | 190 | violationCount := 0 191 | startTime := time.Now() 192 | 193 | for { 194 | select { 195 | case pe := <-c: 196 | violationCount++ 197 | t.Logf("Policy violation %d: %v", violationCount, pe.Condition) 198 | 199 | case <-ctx.Done(): 200 | elapsed := time.Since(startTime) 201 | t.Logf("Policy violation monitoring stopped after %v", elapsed) 202 | t.Logf("Total violations detected: %d", violationCount) 203 | 204 | // Should have stopped within reasonable time after cancellation 205 | if elapsed > 3*time.Second { 206 | t.Errorf("Context cancellation took too long: %v", elapsed) 207 | } 208 | return 209 | } 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /pkg/dcgm/admin.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dcgm 18 | 19 | /* 20 | #cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files 21 | #cgo darwin LDFLAGS: -ldl -Wl,-undefined,dynamic_lookup 22 | 23 | #include 24 | #include "dcgm_agent.h" 25 | #include "dcgm_structs.h" 26 | 27 | */ 28 | import "C" 29 | 30 | import ( 31 | "errors" 32 | "fmt" 33 | "log" 34 | "os" 35 | "os/exec" 36 | "strconv" 37 | "syscall" 38 | "unsafe" 39 | ) 40 | 41 | type mode int 42 | 43 | // const for DCGM hostengine running modes: Embedded, Standalone or StartHostengine 44 | const ( 45 | Embedded mode = iota 46 | Standalone 47 | StartHostengine 48 | ) 49 | 50 | type dcgmHandle struct{ handle C.dcgmHandle_t } 51 | 52 | var ( 53 | dcgmLibHandle unsafe.Pointer 54 | stopMode mode 55 | handle dcgmHandle 56 | hostengineAsChildPid int 57 | ) 58 | 59 | func initDCGM(m mode, args ...string) (err error) { 60 | const ( 61 | dcgmLib = "libdcgm.so.4" 62 | ) 63 | lib := C.CString(dcgmLib) 64 | defer freeCString(lib) 65 | 66 | dcgmLibHandle = C.dlopen(lib, C.RTLD_LAZY|C.RTLD_GLOBAL) 67 | if dcgmLibHandle == nil { 68 | return fmt.Errorf("%s not found", dcgmLib) 69 | } 70 | 71 | // set the stopMode for shutdown() 72 | stopMode = m 73 | 74 | switch m { 75 | case Embedded: 76 | return startEmbedded() 77 | case Standalone: 78 | return connectStandalone(args...) 
79 | case StartHostengine: 80 | return startHostengine() 81 | default: 82 | panic(ErrInvalidMode) 83 | } 84 | } 85 | 86 | func shutdown() (err error) { 87 | switch stopMode { 88 | case Embedded: 89 | err = stopEmbedded() 90 | case Standalone: 91 | err = disconnectStandalone() 92 | case StartHostengine: 93 | err = stopHostengine() 94 | } 95 | 96 | C.dlclose(dcgmLibHandle) 97 | return 98 | } 99 | 100 | func startEmbedded() (err error) { 101 | result := C.dcgmInit() 102 | if err = errorString(result); err != nil { 103 | return fmt.Errorf("error initializing DCGM: %s", err) 104 | } 105 | 106 | var cHandle C.dcgmHandle_t 107 | result = C.dcgmStartEmbedded(C.DCGM_OPERATION_MODE_AUTO, &cHandle) 108 | if err = errorString(result); err != nil { 109 | return fmt.Errorf("error starting nv-hostengine: %s", err) 110 | } 111 | handle = dcgmHandle{cHandle} 112 | return 113 | } 114 | 115 | func stopEmbedded() (err error) { 116 | result := C.dcgmStopEmbedded(handle.handle) 117 | if err = errorString(result); err != nil { 118 | return fmt.Errorf("error stopping nv-hostengine: %s", err) 119 | } 120 | 121 | result = C.dcgmShutdown() 122 | if err = errorString(result); err != nil { 123 | return fmt.Errorf("error shutting down DCGM: %s", err) 124 | } 125 | return 126 | } 127 | 128 | func connectStandalone(args ...string) (err error) { 129 | var ( 130 | cHandle C.dcgmHandle_t 131 | connectParams C.dcgmConnectV2Params_v2 132 | ) 133 | 134 | if len(args) < 2 { 135 | return errors.New("missing dcgm address and / or port") 136 | } 137 | 138 | result := C.dcgmInit() 139 | if err = errorString(result); err != nil { 140 | return fmt.Errorf("error initializing DCGM: %s", err) 141 | } 142 | 143 | addr := C.CString(args[0]) 144 | defer freeCString(addr) 145 | connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) 146 | 147 | sck, err := strconv.ParseUint(args[1], 10, 32) 148 | if err != nil { 149 | return fmt.Errorf("error parsing %s: %v", args[1], err) 150 | } 151 | 
connectParams.addressIsUnixSocket = C.uint(sck) 152 | 153 | result = C.dcgmConnect_v2(addr, &connectParams, &cHandle) 154 | if err = errorString(result); err != nil { 155 | return fmt.Errorf("error connecting to nv-hostengine: %s", err) 156 | } 157 | 158 | handle = dcgmHandle{cHandle} 159 | 160 | return 161 | } 162 | 163 | func disconnectStandalone() (err error) { 164 | result := C.dcgmDisconnect(handle.handle) 165 | if err = errorString(result); err != nil { 166 | return fmt.Errorf("error disconnecting from nv-hostengine: %s", err) 167 | } 168 | 169 | result = C.dcgmShutdown() 170 | if err = errorString(result); err != nil { 171 | return fmt.Errorf("error shutting down DCGM: %s", err) 172 | } 173 | return 174 | } 175 | 176 | func startHostengine() (err error) { 177 | var ( 178 | procAttr syscall.ProcAttr 179 | cHandle C.dcgmHandle_t 180 | connectParams C.dcgmConnectV2Params_v2 181 | ) 182 | 183 | bin, err := exec.LookPath("nv-hostengine") 184 | if err != nil { 185 | return fmt.Errorf("error finding nv-hostengine: %s", err) 186 | } 187 | procAttr.Files = []uintptr{ 188 | uintptr(syscall.Stdin), 189 | uintptr(syscall.Stdout), 190 | uintptr(syscall.Stderr), 191 | } 192 | procAttr.Sys = &syscall.SysProcAttr{Setpgid: true} 193 | 194 | dir := "/tmp" 195 | tmpfile, err := os.CreateTemp(dir, "dcgm") 196 | if err != nil { 197 | return fmt.Errorf("error creating temporary file in %s directory: %s", dir, err) 198 | } 199 | socketPath := tmpfile.Name() 200 | defer os.Remove(socketPath) 201 | 202 | connectArg := "--domain-socket" 203 | hostengineAsChildPid, err = syscall.ForkExec(bin, []string{bin, connectArg, socketPath}, &procAttr) 204 | if err != nil { 205 | return fmt.Errorf("error fork-execing nv-hostengine: %s", err) 206 | } 207 | 208 | result := C.dcgmInit() 209 | if err = errorString(result); err != nil { 210 | return fmt.Errorf("error initializing DCGM: %s", err) 211 | } 212 | 213 | connectParams.version = makeVersion2(unsafe.Sizeof(connectParams)) 214 | isSocket := 
C.uint(1) 215 | connectParams.addressIsUnixSocket = isSocket 216 | cSockPath := C.CString(socketPath) 217 | defer freeCString(cSockPath) 218 | result = C.dcgmConnect_v2(cSockPath, &connectParams, &cHandle) 219 | if err = errorString(result); err != nil { 220 | return fmt.Errorf("error connecting to nv-hostengine: %s", err) 221 | } 222 | 223 | handle = dcgmHandle{cHandle} 224 | return 225 | } 226 | 227 | func stopHostengine() (err error) { 228 | if err = disconnectStandalone(); err != nil { 229 | return 230 | } 231 | 232 | // terminate nv-hostengine 233 | cmd := exec.Command("nv-hostengine", "--term") 234 | if err = cmd.Run(); err != nil { 235 | return fmt.Errorf("error terminating nv-hostengine: %s", err) 236 | } 237 | 238 | log.Println("Successfully terminated nv-hostengine.") 239 | 240 | return syscall.Kill(hostengineAsChildPid, syscall.SIGKILL) 241 | } 242 | -------------------------------------------------------------------------------- /samples/README.md: -------------------------------------------------------------------------------- 1 | # DCGM Samples 2 | 3 | Modeled on [dcgmi (Data Center GPU Manager Interface)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) and [nvidia-smi (NVIDIA System Management Interface)](https://developer.nvidia.com/nvidia-system-management-interface), seven samples and a [REST API](https://github.com/NVIDIA/go-dcgm/samples/dcgm/restApi/README.md) have been provided to show how to use DCGM go bindings. 4 | 5 | ## DCGM running modes 6 | 7 | DCGM can be run in three different ways. 8 | 9 | ### Embedded Mode 10 | 11 | In embedded mode, hostengine is started as part of the running process and is loaded as a shared library. In this mode, metrics are also updated and collected automatically. This mode is recommended for users who wants to avoid managing an autonomous hostengine. 12 | 13 | ### Standalone Mode 14 | 15 | This mode lets you connect to an already running hostengine at a specified TCP/IP or Unix socket address. 
This mode is recommended for remote connections to the hostengine. By default, DCGM will assume a TCP connection and attempt to connect to localhost, unless specified otherwise. 16 | 17 | ```bash 18 | # If hostengine is running at a different address, pass it as 19 | 20 | IP - Valid IP address for the remote hostengine to connect to, at port 5555. 21 | 22 | IP:PORT - Valid IP address and port 23 | 24 | 0 - Given address is a TCP/IP address 25 | 26 | 1 - Given address is a Unix socket filename 27 | 28 | $ ./sample -connect "IP" -socket "0" 29 | 30 | ``` 31 | 32 | ### StartHostengine 33 | 34 | This is an add-on mode which opens a Unix socket for starting and connecting with hostengine. The hostengine is started as a child process of the running process and automatically terminated on exit. When operating in this mode, make sure to stop an already running hostengine to avoid any connection address conflicts. This mode is recommended for safely integrating DCGM in an already existing setup. 35 | 36 | ## Samples 37 | 38 | ### deviceInfo 39 | 40 | Provides detailed information about each GPU on the system, along with whether the given GPU is DCGM supported or not. 41 | 42 | ```bash 43 | $ go build && ./deviceInfo 44 | 45 | # sample output 46 | 47 | Driver Version : 384.130 48 | GPU : 0 49 | DCGMSupported : Yes 50 | UUID : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51 51 | Brand : GeForce 52 | Model : GeForce GTX 980 53 | Serial Number : 0324414056639 54 | Vbios : 84.04.1F.00.02 55 | InforomImage Version : G001.0000.01.03 56 | Bus ID : 00000000:01:00.0 57 | BAR1 (MB) : 256 58 | FrameBuffer Memory (MB): 4036 59 | Bandwidth (MB/s) : 15760 60 | Cores (MHz) : 1392 61 | Memory (MHz) : 3505 62 | Power (W) : 180 63 | CPUAffinity : 0-11 64 | P2P Available : None 65 | --------------------------------------------------------------------- 66 | ``` 67 | 68 | #### dmon 69 | 70 | Monitors each device status including its power, memory and GPU utilization. 
71 | 72 | ```bash 73 | $ go build && ./dmon 74 | 75 | # sample output 76 | 77 | Started host engine version 1.4.3 using socket path: /tmp/dcgmrxvqro.socket 78 | # gpu pwr temp sm mem enc dec mclk pclk 79 | # Idx W C % % % % MHz MHz 80 | 0 43 48 0 1 0 0 3505 936 81 | 0 43 48 0 1 0 0 3505 936 82 | ``` 83 | 84 | #### health 85 | 86 | Monitors the health of the given GPU every second, by checking the configured watches for any errors/failures/warnings. 87 | 88 | ```bash 89 | $ go build && ./health 90 | 91 | # sample output 92 | GPU : 0 93 | Status : Healthy 94 | ``` 95 | 96 | #### hostengineStatus 97 | 98 | Reports about DCGM hostengine memory and CPU usage. 99 | 100 | ```bash 101 | $ go build && ./hostengineStatus 102 | 103 | # sample output 104 | 105 | Memory : 11480 KB 106 | CPU : 0.08 % 107 | ``` 108 | 109 | #### policy 110 | 111 | Sets GPU usage and error policies and notifies in case of violations via callback functions. 112 | 113 | ```bash 114 | $ go build && ./policy 115 | 116 | # sample output 117 | 118 | 2018/06/25 23:48:34 Policy successfully set. 119 | 2018/06/25 23:48:34 Listening for violations... 120 | GPU : 0 121 | Error : XID Error 122 | Timestamp : 2018-06-25 18:55:30 +0000 UTC 123 | Data : {31} 124 | ``` 125 | 126 | #### processInfo 127 | 128 | Provides per GPU detailed stats for this process. 
129 | 130 | ```bash 131 | $ go build && ./processInfo -pid PID 132 | 133 | # sample output 134 | 135 | ---------------------------------------------------------------------- 136 | GPU ID : 0 137 | ----------Execution Stats--------------------------------------------- 138 | PID : 15074 139 | Name : nbody 140 | Start Time : 2018-06-25 16:50:28 -0700 PDT 141 | End Time : Still Running 142 | ----------Performance Stats------------------------------------------- 143 | Energy Consumed (Joules) : 181 144 | Max GPU Memory Used (bytes) : 84279296 145 | Avg SM Clock (MHz) : N/A 146 | Avg Memory Clock (MHz) : N/A 147 | Avg SM Utilization (%) : N/A 148 | Avg Memory Utilization (%) : N/A 149 | Avg PCIe Rx Bandwidth (MB) : N/A 150 | Avg PCIe Tx Bandwidth (MB) : N/A 151 | ----------Event Stats------------------------------------------------- 152 | Single Bit ECC Errors : 0 153 | Double Bit ECC Errors : 0 154 | Critical XID Errors : 0 155 | ----------Slowdown Stats---------------------------------------------- 156 | Due to - Power (%) : 0 157 | - Thermal (%) : 0 158 | - Reliability (%) : 0 159 | - Board Limit (%) : 0 160 | - Low Utilization (%) : 0 161 | - Sync Boost (%) : 0 162 | ----------Process Utilization----------------------------------------- 163 | Avg SM Utilization (%) : 0 164 | Avg Memory Utilization (%) : 0 165 | ---------------------------------------------------------------------- 166 | ``` 167 | 168 | #### topology 169 | 170 | Informs about GPU topology and its CPU affinity. 
171 | 172 | ```bash 173 | $ go build && ./topology 174 | 175 | # sample output 176 | 177 | Started host engine version 1.4.3 using socket path: /tmp/dcgmvjeqkh.socket 178 | GPU0CPUAffinity 179 | GPU0 X 0-11 180 | 181 | Legend: 182 | X = Self 183 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 184 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 185 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 186 | PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) 187 | PIX = Connection traversing a single PCIe switch 188 | PSB = Connection traversing a single on-board PCIe switch 189 | NV# = Connection traversing a bonded set of # NVLinks 190 | 2018/06/25 15:36:38 Successfully terminated nv-hostengine. 191 | ``` 192 | -------------------------------------------------------------------------------- /pkg/dcgm/health_test.go: -------------------------------------------------------------------------------- 1 | //go:build linux && cgo 2 | 3 | /* 4 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package dcgm 20 | 21 | import ( 22 | "crypto/rand" 23 | "fmt" 24 | "math" 25 | "math/big" 26 | "strings" 27 | "testing" 28 | "time" 29 | 30 | "github.com/stretchr/testify/assert" 31 | "github.com/stretchr/testify/require" 32 | ) 33 | 34 | func TestHealthWhenInvalidGroupID(t *testing.T) { 35 | teardownTest := setupTest(t) 36 | defer teardownTest(t) 37 | runOnlyWithLiveGPUs(t) 38 | 39 | var invalidGroupID uintptr = 99 40 | gh := GroupHandle{} 41 | gh.SetHandle(invalidGroupID) 42 | err := HealthSet(gh, DCGM_HEALTH_WATCH_PCIE) 43 | assert.Error(t, err) 44 | assert.Contains(t, err.Error(), "Setting not configured") 45 | 46 | _, err = HealthGet(gh) 47 | assert.Error(t, err) 48 | assert.Contains(t, err.Error(), "Setting not configured") 49 | 50 | _, err = HealthGet(gh) 51 | assert.Error(t, err) 52 | assert.Contains(t, err.Error(), "Setting not configured") 53 | } 54 | 55 | func TestHealthCheckPCIE(t *testing.T) { 56 | teardownTest := setupTest(t) 57 | defer teardownTest(t) 58 | 59 | runOnlyWithLiveGPUs(t) 60 | gpus, err := withInjectionGPUs(t, 1) 61 | require.NoError(t, err) 62 | 63 | type testCase struct { 64 | name string 65 | pcieGen int 66 | pcieGenSpeed float64 // in Gbps 67 | pcieLanes int 68 | pcieReplayCounter int 69 | expectingIncident bool 70 | } 71 | 72 | pcieGenSpeeds := []float64{ 73 | 2.5, // Gen1 speed in Gbps 74 | 5.0, // Gen2 75 | 8.0, // Gen3 76 | 16.0, // Gen4 77 | 32.0, // Gen5 78 | 64.0, // Gen6 79 | } 80 | 81 | var tests []testCase 82 | // Generate test cases 83 | for i := 0; i < 1; i++ { // Run multiple iterations 84 | for gen, speed := range pcieGenSpeeds { 85 | pcieGen := gen + 1 86 | // Generate random number between 1 and 16 87 | n, err := rand.Int(rand.Reader, big.NewInt(16)) 88 | require.NoError(t, err) 89 | pcieLanes := int(n.Int64()) + 1 90 | 91 | ratePerLane := speed / 1000 * 60 // Convert to errors/min per lane 92 | expectedLimit := math.Ceil(ratePerLane * float64(pcieLanes)) 93 | 94 | // Generate random number between 1 
and 2*expectedLimit 95 | n, err = rand.Int(rand.Reader, big.NewInt(2*int64(expectedLimit))) 96 | require.NoError(t, err) 97 | pcieReplayCounter := int(n.Int64()) + 1 98 | expectingIncident := pcieReplayCounter > int(expectedLimit) 99 | 100 | tests = append(tests, testCase{ 101 | name: fmt.Sprintf("PCIe_Gen%d_%dLanes_Counter%d", pcieGen, pcieLanes, pcieReplayCounter), 102 | pcieGen: pcieGen, 103 | pcieGenSpeed: speed, 104 | pcieLanes: pcieLanes, 105 | pcieReplayCounter: pcieReplayCounter, 106 | expectingIncident: expectingIncident, 107 | }) 108 | } 109 | } 110 | 111 | for _, tc := range tests { 112 | t.Run(tc.name, func(t *testing.T) { 113 | ratePerLane := tc.pcieGenSpeed / 1000 * 60 114 | expectedLimit := math.Ceil(ratePerLane * float64(tc.pcieLanes)) 115 | 116 | errMsg := fmt.Sprintf("pcieGen=%d pcieGenSpeed=%f pcieLanes=%d expectedLimit=%f pcieReplayCounter=%d expectingIncident=%v", 117 | tc.pcieGen, tc.pcieGenSpeed, tc.pcieLanes, expectedLimit, tc.pcieReplayCounter, tc.expectingIncident) 118 | 119 | healthCheckPCIE(t, gpus, tc.pcieGen, tc.pcieLanes, tc.pcieReplayCounter, tc.expectingIncident, errMsg) 120 | defer resetPCICReplayCounter(t, gpus) 121 | }) 122 | } 123 | } 124 | 125 | func resetPCICReplayCounter(t *testing.T, gpuIDs []uint) { 126 | gpuID := gpuIDs[0] 127 | err := InjectFieldValue(gpuID, 128 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, 129 | DCGM_FT_INT64, 130 | 0, 131 | time.Now().Add(100*time.Second).UnixMicro(), 132 | int64(0), 133 | ) 134 | require.NoError(t, err) 135 | } 136 | 137 | func healthCheckPCIE(t *testing.T, gpuIDs []uint, pcieGen, pcieLanes, pcieReplayCounter int, expectingPCIEIncident bool, errMessage string) { 138 | gpuID := gpuIDs[0] 139 | 140 | groupID, err := CreateGroup("test1") 141 | require.NoError(t, err) 142 | defer func() { 143 | _ = DestroyGroup(groupID) 144 | }() 145 | err = AddEntityToGroup(groupID, FE_GPU, gpuID) 146 | require.NoError(t, err) 147 | 148 | err = HealthSet(groupID, DCGM_HEALTH_WATCH_PCIE) 149 | require.NoError(t, err) 
150 | 151 | system, err := HealthGet(groupID) 152 | require.NoError(t, err) 153 | require.Equal(t, DCGM_HEALTH_WATCH_PCIE, system) 154 | 155 | skipTestIfUnhealthy(t, groupID) 156 | 157 | // inject PCIe Gen and width/lanes 158 | err = InjectFieldValue(gpuID, 159 | DCGM_FI_DEV_PCIE_LINK_GEN, 160 | DCGM_FT_INT64, 161 | 0, 162 | 0, 163 | int64(pcieGen), 164 | ) 165 | require.NoError(t, err) 166 | 167 | err = InjectFieldValue(gpuID, 168 | DCGM_FI_DEV_PCIE_LINK_WIDTH, 169 | DCGM_FT_INT64, 170 | 0, 171 | 0, 172 | int64(pcieLanes), 173 | ) 174 | require.NoError(t, err) 175 | 176 | err = InjectFieldValue(gpuID, 177 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, 178 | DCGM_FT_INT64, 179 | 0, 180 | time.Now().Add(-50*time.Second).UnixMicro(), 181 | int64(0), 182 | ) 183 | require.NoError(t, err) 184 | 185 | // we expect that there will be no data here 186 | response, err := HealthCheck(groupID) 187 | require.NoError(t, err) 188 | require.Equal(t, DCGM_HEALTH_RESULT_PASS, response.OverallHealth) 189 | 190 | // inject an error into PCIe 191 | err = InjectFieldValue(gpuID, 192 | DCGM_FI_DEV_PCIE_REPLAY_COUNTER, 193 | DCGM_FT_INT64, 194 | 0, 195 | time.Now().Add(100*time.Second).UnixMicro(), 196 | int64(pcieReplayCounter), 197 | ) // set the injected data into the future 198 | require.NoError(t, err) 199 | 200 | response, err = HealthCheck(groupID) 201 | require.NoError(t, err) 202 | if expectingPCIEIncident { 203 | require.Len(t, response.Incidents, 1, errMessage) 204 | require.Equal(t, gpuID, response.Incidents[0].EntityInfo.EntityId) 205 | require.Equal(t, DCGM_HEALTH_WATCH_PCIE, response.Incidents[0].System) 206 | require.Equal(t, DCGM_FR_PCI_REPLAY_RATE, response.Incidents[0].Error.Code) 207 | } else { 208 | require.Empty(t, response.Incidents, errMessage) 209 | } 210 | } 211 | 212 | func skipTestIfUnhealthy(t *testing.T, groupId GroupHandle) { 213 | health, err := HealthCheck(groupId) 214 | require.NoError(t, err) 215 | if health.OverallHealth != DCGM_HEALTH_RESULT_PASS { 216 | msg := 
"Skipping health check test because we are already unhealthy: " 217 | incidents := []string{} 218 | for _, incident := range health.Incidents { 219 | incidents = append(incidents, incident.Error.Message) 220 | } 221 | 222 | t.Skip(msg + strings.Join(incidents, ", ")) 223 | } 224 | } 225 | -------------------------------------------------------------------------------- /pkg/dcgm/health.go: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dcgm 18 | 19 | /* 20 | #include "dcgm_agent.h" 21 | #include "dcgm_structs.h" 22 | */ 23 | import "C" 24 | 25 | import ( 26 | "fmt" 27 | "math/rand" 28 | "unsafe" 29 | ) 30 | 31 | // SystemWatch represents a health watch system and its status 32 | type SystemWatch struct { 33 | // Type identifies the type of health watch system 34 | Type string 35 | // Status indicates the current health status 36 | Status string 37 | // Error contains any error message if status is not healthy 38 | Error string 39 | } 40 | 41 | // DeviceHealth represents the health status of a GPU device 42 | type DeviceHealth struct { 43 | // GPU is the ID of the GPU device 44 | GPU uint 45 | // Status indicates the overall health status of the GPU 46 | Status string 47 | // Watches contains the status of individual health watch systems 48 | Watches []SystemWatch 49 | } 50 | 51 | // HealthSet enables the DCGM health check system for the given systems. 52 | // It configures which health watch systems should be monitored for the specified group. 53 | func HealthSet(groupID GroupHandle, systems HealthSystem) (err error) { 54 | result := C.dcgmHealthSet(handle.handle, groupID.handle, C.dcgmHealthSystems_t(systems)) 55 | if err := errorString(result); err != nil { 56 | return fmt.Errorf("error setting health watches: %w", err) 57 | } 58 | return nil 59 | } 60 | 61 | // HealthGet retrieves the current state of the DCGM health check system. 62 | // It returns which health watch systems are currently enabled for the specified group. 
63 | func HealthGet(groupID GroupHandle) (HealthSystem, error) { 64 | var systems C.dcgmHealthSystems_t 65 | 66 | result := C.dcgmHealthGet(handle.handle, groupID.handle, (*C.dcgmHealthSystems_t)(unsafe.Pointer(&systems))) 67 | if err := errorString(result); err != nil { 68 | return HealthSystem(0), err 69 | } 70 | return HealthSystem(systems), nil 71 | } 72 | 73 | // DiagErrorDetail contains detailed information about a health check error 74 | type DiagErrorDetail struct { 75 | // Message contains a human-readable description of the error 76 | Message string 77 | // Code identifies the specific type of error 78 | Code HealthCheckErrorCode 79 | } 80 | 81 | // Incident represents a health check incident that occurred 82 | type Incident struct { 83 | // System identifies which health watch system detected the incident 84 | System HealthSystem 85 | // Health indicates the severity of the incident 86 | Health HealthResult 87 | // Error contains detailed information about the incident 88 | Error DiagErrorDetail 89 | // EntityInfo identifies the GPU or component where the incident occurred 90 | EntityInfo GroupEntityPair 91 | } 92 | 93 | // HealthResponse contains the results of a health check operation 94 | type HealthResponse struct { 95 | // OverallHealth indicates the aggregate health status across all watches 96 | OverallHealth HealthResult 97 | // Incidents contains details about any health issues detected 98 | Incidents []Incident 99 | } 100 | 101 | // HealthCheck checks the configured watches for any errors/failures/warnings that have occurred 102 | // since the last time this check was invoked. On the first call, stateful information 103 | // about all of the enabled watches within a group is created but no error results are 104 | // provided. On subsequent calls, any error information will be returned. 
105 | func HealthCheck(groupID GroupHandle) (HealthResponse, error) { 106 | var healthResults C.dcgmHealthResponse_v5 107 | healthResults.version = makeVersion5(unsafe.Sizeof(healthResults)) 108 | 109 | result := C.dcgmHealthCheck(handle.handle, groupID.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) 110 | 111 | if err := errorString(result); err != nil { 112 | return HealthResponse{}, &Error{msg: C.GoString(C.errorString(result)), Code: result} 113 | } 114 | 115 | response := HealthResponse{ 116 | OverallHealth: HealthResult(healthResults.overallHealth), 117 | } 118 | 119 | // number of watches that encountered error/warning 120 | incidents := uint(healthResults.incidentCount) 121 | 122 | response.Incidents = make([]Incident, incidents) 123 | 124 | for i := uint(0); i < incidents; i++ { 125 | response.Incidents[i] = Incident{ 126 | System: HealthSystem(healthResults.incidents[i].system), 127 | Health: HealthResult(healthResults.incidents[i].health), 128 | Error: DiagErrorDetail{ 129 | Message: *stringPtr(&healthResults.incidents[i].error.msg[0]), 130 | Code: HealthCheckErrorCode(healthResults.incidents[i].error.code), 131 | }, 132 | EntityInfo: GroupEntityPair{ 133 | EntityGroupId: Field_Entity_Group(healthResults.incidents[i].entityInfo.entityGroupId), 134 | EntityId: uint(healthResults.incidents[i].entityInfo.entityId), 135 | }, 136 | } 137 | } 138 | 139 | return response, nil 140 | } 141 | 142 | func healthCheckByGpuId(gpuID uint) (deviceHealth DeviceHealth, err error) { 143 | name := fmt.Sprintf("health%d", rand.Uint64()) 144 | groupID, err := CreateGroup(name) 145 | if err != nil { 146 | return 147 | } 148 | 149 | err = AddToGroup(groupID, gpuID) 150 | if err != nil { 151 | return 152 | } 153 | 154 | err = HealthSet(groupID, DCGM_HEALTH_WATCH_ALL) 155 | if err != nil { 156 | return 157 | } 158 | 159 | result, err := HealthCheck(groupID) 160 | if err != nil { 161 | return 162 | } 163 | 164 | status := healthStatus(result.OverallHealth) 165 | 
166 | // number of watches that encountered error/warning 167 | incidents := len(result.Incidents) 168 | watches := make([]SystemWatch, incidents) 169 | 170 | for j := 0; j < incidents; j++ { 171 | watches[j] = SystemWatch{ 172 | Type: systemWatch(result.Incidents[j].System), 173 | Status: healthStatus(result.Incidents[j].Health), 174 | Error: result.Incidents[j].Error.Message, 175 | } 176 | } 177 | 178 | deviceHealth = DeviceHealth{ 179 | GPU: gpuID, 180 | Status: status, 181 | Watches: watches, 182 | } 183 | _ = DestroyGroup(groupID) 184 | return 185 | } 186 | 187 | func healthStatus(status HealthResult) string { 188 | switch status { 189 | case 0: 190 | return "Healthy" 191 | case 10: 192 | return "Warning" 193 | case 20: 194 | return "Failure" 195 | } 196 | return "N/A" 197 | } 198 | 199 | func systemWatch(watch HealthSystem) string { 200 | switch watch { 201 | case 1: 202 | return "PCIe watches" 203 | case 2: 204 | return "NVLINK watches" 205 | case 4: 206 | return "Power Managemnt unit watches" 207 | case 8: 208 | return "Microcontroller unit watches" 209 | case 16: 210 | return "Memory watches" 211 | case 32: 212 | return "Streaming Multiprocessor watches" 213 | case 64: 214 | return "Inforom watches" 215 | case 128: 216 | return "Temperature watches" 217 | case 256: 218 | return "Power watches" 219 | case 512: 220 | return "Driver-related watches" 221 | } 222 | return "N/A" 223 | } 224 | -------------------------------------------------------------------------------- /pkg/dcgm/diag.go: -------------------------------------------------------------------------------- 1 | package dcgm 2 | 3 | /* 4 | #include "dcgm_agent.h" 5 | #include "dcgm_structs.h" 6 | */ 7 | import "C" 8 | 9 | import ( 10 | "strings" 11 | "unsafe" 12 | ) 13 | 14 | // Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM) 15 | 16 | // DIAG_RESULT_STRING_SIZE represents the maximum size of diagnostic result strings 17 | const DIAG_RESULT_STRING_SIZE = 1024 18 | 19 | // 
DiagType represents the type of diagnostic test to run 20 | type DiagType int 21 | 22 | const ( 23 | // DiagQuick represents a quick diagnostic test that performs basic health checks 24 | DiagQuick DiagType = 1 25 | 26 | // DiagMedium represents a medium-length diagnostic test that performs more comprehensive checks 27 | DiagMedium DiagType = 2 28 | 29 | // DiagLong represents a long diagnostic test that performs extensive health checks 30 | DiagLong DiagType = 3 31 | 32 | // DiagExtended represents an extended diagnostic test that performs the most thorough system checks 33 | DiagExtended DiagType = 4 34 | ) 35 | 36 | // DiagResult represents the result of a single diagnostic test 37 | type DiagResult struct { 38 | // Status indicates the test result: "pass", "fail", "warn", "skip", or "notrun" 39 | Status string 40 | // TestName is the name of the diagnostic test that was run 41 | TestName string 42 | // TestOutput contains any additional output or messages from the test 43 | TestOutput string 44 | // ErrorCode is the numeric error code if the test failed 45 | ErrorCode uint 46 | // ErrorMessage contains a detailed error message if the test failed 47 | ErrorMessage string 48 | // Serial number of the tested entity 49 | SerialNumber string 50 | // EntityID 51 | EntityID uint 52 | } 53 | 54 | // DiagResults contains the results of all diagnostic tests 55 | type DiagResults struct { 56 | // Software contains the results of software-related diagnostic tests 57 | Software []DiagResult 58 | } 59 | 60 | // diagResultString converts a diagnostic result code to its string representation 61 | func diagResultString(r int) string { 62 | switch r { 63 | case C.DCGM_DIAG_RESULT_PASS: 64 | return "pass" 65 | case C.DCGM_DIAG_RESULT_SKIP: 66 | return "skipped" 67 | case C.DCGM_DIAG_RESULT_WARN: 68 | return "warn" 69 | case C.DCGM_DIAG_RESULT_FAIL: 70 | return "fail" 71 | case C.DCGM_DIAG_RESULT_NOT_RUN: 72 | return "notrun" 73 | } 74 | return "" 75 | } 76 | 77 | // gpuTestName 
returns the category name for a diagnostic test based on its test ID. 78 | // This function handles all diagnostic test types including GPU tests and software tests. 79 | // Software tests (DCGM_SWTEST_*) all report under DCGM_SOFTWARE_INDEX and return "software". 80 | // Detailed test information is provided in TestOutput, not in the TestName. 81 | func gpuTestName(t int) string { 82 | switch t { 83 | case C.DCGM_MEMORY_INDEX: 84 | return "memory" 85 | case C.DCGM_DIAGNOSTIC_INDEX: 86 | return "diagnostic" 87 | case C.DCGM_PCI_INDEX: 88 | return "pcie" 89 | case C.DCGM_SM_STRESS_INDEX: 90 | return "sm stress" 91 | case C.DCGM_TARGETED_STRESS_INDEX: 92 | return "targeted stress" 93 | case C.DCGM_TARGETED_POWER_INDEX: 94 | return "targeted power" 95 | case C.DCGM_MEMORY_BANDWIDTH_INDEX: 96 | return "memory bandwidth" 97 | case C.DCGM_MEMTEST_INDEX: 98 | return "memtest" 99 | case C.DCGM_PULSE_TEST_INDEX: 100 | return "pulse" 101 | case C.DCGM_EUD_TEST_INDEX: 102 | return "eud" 103 | case C.DCGM_SOFTWARE_INDEX: 104 | return "software" 105 | case C.DCGM_CONTEXT_CREATE_INDEX: 106 | return "context create" 107 | } 108 | return "" 109 | } 110 | 111 | func getErrorMsg(entityId uint, testId uint, response C.dcgmDiagResponse_v12) (msg string, code uint) { 112 | for i := 0; i < int(response.numErrors); i++ { 113 | if uint(response.errors[i].entity.entityId) != entityId || uint(response.errors[i].testId) != testId { 114 | continue 115 | } 116 | 117 | msg = C.GoString((*C.char)(unsafe.Pointer(&response.errors[i].msg))) 118 | code = uint(response.errors[i].code) 119 | return 120 | } 121 | 122 | return 123 | } 124 | 125 | func getInfoMsg(entityId uint, testId uint, response C.dcgmDiagResponse_v12) string { 126 | var msgs []string 127 | for i := 0; i < int(response.numInfo); i++ { 128 | if uint(response.info[i].entity.entityId) != entityId || uint(response.info[i].testId) != testId { 129 | continue 130 | } 131 | msgs = append(msgs, 
C.GoString((*C.char)(unsafe.Pointer(&response.info[i].msg)))) 132 | } 133 | return strings.Join(msgs, " | ") 134 | } 135 | 136 | func getTestName(resultIdx uint, response C.dcgmDiagResponse_v12) string { 137 | for i := uint(0); i < uint(response.numTests); i++ { 138 | t := response.tests[i] 139 | for j := uint16(0); j < uint16(t.numResults); j++ { 140 | if uint16(t.resultIndices[j]) == uint16(resultIdx) { 141 | plugin := C.GoString((*C.char)(unsafe.Pointer(&t.pluginName))) 142 | if plugin != "" { 143 | plugin = "/" + plugin 144 | } 145 | return C.GoString((*C.char)(unsafe.Pointer(&t.name))) + plugin 146 | } 147 | } 148 | } 149 | return "" 150 | } 151 | 152 | func getSerial(resultIdx uint, response C.dcgmDiagResponse_v12) string { 153 | for i := 0; i < int(response.numEntities); i++ { 154 | if response.entities[i].entity.entityId == response.results[resultIdx].entity.entityId && 155 | response.entities[i].entity.entityGroupId == response.results[resultIdx].entity.entityGroupId { 156 | return C.GoString((*C.char)(unsafe.Pointer(&response.entities[i].serialNum))) 157 | } 158 | } 159 | return "" 160 | } 161 | 162 | func newDiagResult(resultIndex uint, response C.dcgmDiagResponse_v12) DiagResult { 163 | entityId := uint(response.results[resultIndex].entity.entityId) 164 | testId := uint(response.results[resultIndex].testId) 165 | 166 | msg, code := getErrorMsg(entityId, testId, response) 167 | info := getInfoMsg(entityId, testId, response) 168 | testName := gpuTestName(int(testId)) 169 | serial := getSerial(resultIndex, response) 170 | 171 | return DiagResult{ 172 | Status: diagResultString(int(response.results[resultIndex].result)), 173 | TestName: testName, 174 | TestOutput: info, 175 | ErrorCode: code, 176 | ErrorMessage: msg, 177 | SerialNumber: serial, 178 | EntityID: entityId, 179 | } 180 | } 181 | 182 | func diagLevel(diagType DiagType) C.dcgmDiagnosticLevel_t { 183 | switch diagType { 184 | case DiagQuick: 185 | return C.DCGM_DIAG_LVL_SHORT 186 | case 
DiagMedium: 187 | return C.DCGM_DIAG_LVL_MED 188 | case DiagLong: 189 | return C.DCGM_DIAG_LVL_LONG 190 | case DiagExtended: 191 | return C.DCGM_DIAG_LVL_XLONG 192 | } 193 | return C.DCGM_DIAG_LVL_INVALID 194 | } 195 | 196 | // RunDiag runs diagnostic tests on a group of GPUs with the specified diagnostic level. 197 | // Parameters: 198 | // - diagType: The type/level of diagnostic test to run (Quick, Medium, Long, or Extended) 199 | // - groupId: The group of GPUs to run diagnostics on 200 | // 201 | // Returns: 202 | // - DiagResults containing the results of all diagnostic tests 203 | // - error if the diagnostics failed to run 204 | func RunDiag(diagType DiagType, groupID GroupHandle) (DiagResults, error) { 205 | var diagResults C.dcgmDiagResponse_v12 206 | diagResults.version = C.dcgmDiagResponse_version12 207 | 208 | result := C.dcgmRunDiagnostic(handle.handle, groupID.handle, diagLevel(diagType), &diagResults) 209 | if err := errorString(result); err != nil { 210 | return DiagResults{}, &Error{msg: C.GoString(C.errorString(result)), Code: result} 211 | } 212 | 213 | var diagRun DiagResults 214 | diagRun.Software = make([]DiagResult, diagResults.numResults) 215 | for i := 0; i < int(diagResults.numResults); i++ { 216 | diagRun.Software[i] = newDiagResult(uint(i), diagResults) 217 | } 218 | 219 | return diagRun, nil 220 | } 221 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # DCGM Go Testing Samples 2 | 3 | This directory contains test versions of all the DCGM samples, reimplemented using the Go testing framework. These tests demonstrate the functionality of the NVIDIA Data Center GPU Manager (DCGM) Go bindings while being suitable for automated testing and CI/CD pipelines. 
4 | 5 | ## Test Files Overview 6 | 7 | ### Core Device Management 8 | 9 | - **`deviceinfo_test.go`** - Tests device information retrieval functionality 10 | - Equivalent to `samples/deviceInfo/main.go` 11 | - Tests GPU device properties, identification, and topology information 12 | - Includes tests for both embedded and standalone hostengine connections 13 | 14 | - **`dmon_test.go`** - Tests device monitoring capabilities 15 | - Equivalent to `samples/dmon/main.go` 16 | - Monitors GPU utilization, temperature, power, and clock speeds 17 | - Includes time-limited monitoring tests and sample consistency checks 18 | 19 | - **`device_status_test.go`** - Tests device status querying (part of dmon functionality) 20 | - Tests single and multiple GPU status queries 21 | - Validates utilization metrics and system health indicators 22 | 23 | ### Diagnostics and Health 24 | 25 | - **`diag_test.go`** - Tests DCGM diagnostic functionality 26 | - Equivalent to `samples/diag/main.go` 27 | - Runs quick and medium-level diagnostic tests 28 | - Validates software and hardware diagnostic results 29 | 30 | - **`health_test.go`** - Tests GPU health monitoring 31 | - Equivalent to `samples/health/main.go` 32 | - Performs single and continuous health checks 33 | - Tests health watch configuration and error reporting 34 | 35 | ### System Management 36 | 37 | - **`hostengine_test.go`** - Tests DCGM hostengine introspection 38 | - Equivalent to `samples/hostengineStatus/main.go` 39 | - Monitors hostengine memory and CPU usage 40 | - Tests introspection under different load conditions 41 | 42 | - **`policy_test.go`** - Tests policy violation monitoring 43 | - Equivalent to `samples/policy/main.go` 44 | - Tests various policy condition types (DBE, XID, thermal, power) 45 | - Includes context cancellation and timeout handling 46 | 47 | ### Process and Topology 48 | 49 | - **`processinfo_test.go`** - Tests GPU process monitoring 50 | - Equivalent to `samples/processInfo/main.go` 51 | - Tests 
process field watching and information retrieval 52 | - Includes PID-specific testing capabilities 53 | 54 | - **`topology_test.go`** - Tests GPU topology analysis 55 | - Equivalent to `samples/topology/main.go` 56 | - Tests inter-GPU connection discovery and analysis 57 | - Includes topology consistency validation 58 | 59 | ### REST API 60 | 61 | - **`restapi_test.go`** - Tests REST API endpoint functionality 62 | - Equivalent to `samples/restApi/` (complete implementation) 63 | - Uses `httptest` for testing HTTP endpoints without starting a real server 64 | - Tests JSON response formats and error handling 65 | 66 | ## Running the Tests 67 | 68 | ### Run All Tests 69 | 70 | ```bash 71 | go test ./tests/... -v 72 | ``` 73 | 74 | ### Run Specific Test Files 75 | 76 | ```bash 77 | # Run device information tests 78 | go test ./tests/deviceinfo_test.go -v 79 | 80 | # Run monitoring tests 81 | go test ./tests/dmon_test.go -v 82 | 83 | # Run diagnostic tests 84 | go test ./tests/diag_test.go -v 85 | ``` 86 | 87 | ### Run Tests with Different Modes 88 | 89 | ```bash 90 | # Run only quick tests (skip long-running tests) 91 | go test ./tests/... -v -short 92 | 93 | # Run tests with timeout 94 | go test ./tests/... -v -timeout 5m 95 | ``` 96 | 97 | ### Run Specific Test Functions 98 | 99 | ```bash 100 | # Run specific test function 101 | go test ./tests/deviceinfo_test.go -v -run TestDeviceInfo 102 | 103 | # Run all tests matching a pattern 104 | go test ./tests/... -v -run "TestDevice.*" 105 | ``` 106 | 107 | ## Test Features 108 | 109 | ### Adaptive Testing 110 | 111 | - Tests automatically skip when no GPUs are available 112 | - Different behavior for single vs. 
multi-GPU systems 113 | - Graceful handling of permission-restricted operations 114 | 115 | ### Time-Limited Execution 116 | 117 | - Long-running samples (like monitoring) are time-limited in tests 118 | - Configurable test durations for CI/CD environments 119 | - Background operations are properly cancelled 120 | 121 | ### Comprehensive Coverage 122 | 123 | - Each test covers the core functionality of its corresponding sample 124 | - Additional test scenarios for error conditions and edge cases 125 | - Validation of return values and data consistency 126 | 127 | ### CI/CD Friendly 128 | 129 | - Tests use the Go testing framework's standard patterns 130 | - Proper test isolation and cleanup 131 | - Structured logging for debugging 132 | 133 | ## Prerequisites 134 | 135 | ### System Requirements 136 | 137 | - NVIDIA GPU(s) with DCGM support 138 | - NVIDIA drivers installed 139 | - DCGM libraries available 140 | - Go 1.19+ for testing framework features 141 | 142 | ### Dependencies 143 | 144 | The tests require the same dependencies as the original samples: 145 | 146 | - `github.com/NVIDIA/go-dcgm/pkg/dcgm` 147 | - `github.com/gorilla/mux` (for REST API tests only) 148 | 149 | ### Permissions 150 | 151 | Some tests may require elevated privileges: 152 | 153 | - Process monitoring tests work best when run as root 154 | - Certain policy violation tests require administrative access 155 | - Diagnostic tests may need elevated permissions for hardware access 156 | 157 | ## Test Structure 158 | 159 | Each test file follows a consistent pattern: 160 | 161 | 1. **Basic Functionality Test** - Core sample functionality 162 | 2. **Extended Tests** - Additional scenarios and edge cases 163 | 3. **Error Handling Tests** - Validation of error conditions 164 | 4. 
**Performance/Consistency Tests** - Multi-sample validation 165 | 166 | ### Example Test Pattern 167 | 168 | ```go 169 | func TestSampleFunctionality(t *testing.T) { 170 | // Initialize DCGM 171 | cleanup, err := dcgm.Init(dcgm.Embedded) 172 | if err != nil { 173 | t.Fatalf("Failed to initialize DCGM: %v", err) 174 | } 175 | defer cleanup() 176 | 177 | // Test core functionality 178 | // ... test implementation 179 | 180 | // Validate results 181 | // ... assertions and checks 182 | } 183 | ``` 184 | 185 | ## Integration with CI/CD 186 | 187 | These tests are designed to integrate well with continuous integration systems: 188 | 189 | - Use standard Go testing patterns 190 | - Provide detailed logging for troubleshooting 191 | - Support timeout and cancellation 192 | - Can run with or without actual GPU hardware (with appropriate skipping) 193 | 194 | ### Example GitHub Actions Integration 195 | 196 | ```yaml 197 | - name: Run DCGM Tests 198 | run: | 199 | go test ./tests/... -v -timeout 10m 200 | continue-on-error: true # Optional: allow failure if no GPU available 201 | ``` 202 | 203 | ## Troubleshooting 204 | 205 | ### Common Issues 206 | 207 | 1. **No GPUs Found** - Tests will skip automatically 208 | 2. **Permission Denied** - Some tests require root privileges 209 | 3. **DCGM Not Available** - Ensure DCGM libraries are installed 210 | 4. **Timeout Issues** - Increase test timeout for slow systems 211 | 212 | ### Debug Information 213 | 214 | All tests provide verbose logging when run with the `-v` flag: 215 | 216 | ```bash 217 | go test ./tests/deviceinfo_test.go -v 218 | ``` 219 | 220 | ### Environment Variables 221 | 222 | The tests may honor the following environment variables (`go test` does not interpret them itself; a test must read them explicitly): 223 | 224 | - `GO_TEST_TIMEOUT_SCALE` - Scale test timeouts 225 | - `DCGM_TESTING_MODE` - Custom testing configurations (if implemented) 226 | 227 | ## Contributing 228 | 229 | When adding new tests: 230 | 231 | 1. Follow the existing naming pattern (`*_test.go`) 232 | 2. 
Include comprehensive documentation 233 | 3. Add appropriate test skipping for missing hardware 234 | 4. Include both positive and negative test cases 235 | 5. Update this README with new test descriptions 236 | --------------------------------------------------------------------------------