├── .gitmodules
├── samples
    ├── diag
    │   ├── .gitignore
    │   └── main.go
    ├── dmon
    │   ├── .gitignore
    │   └── main.go
    ├── health
    │   ├── .gitignore
    │   └── main.go
    ├── policy
    │   ├── .gitignore
    │   └── main.go
    ├── restApi
    │   ├── .gitignore
    │   ├── main.go
    │   ├── handlers
    │   │   ├── byUuids.go
    │   │   ├── byIds.go
    │   │   └── dcgm.go
    │   ├── server.go
    │   └── README.md
    ├── topology
    │   ├── .gitignore
    │   └── main.go
    ├── deviceInfo
    │   ├── .gitignore
    │   └── main.go
    ├── processInfo
    │   ├── .gitignore
    │   └── main.go
    ├── hostengineStatus
    │   ├── .gitignore
    │   └── main.go
    └── README.md
├── .gitignore
├── .hadolint.yaml
├── pkg
    └── dcgm
    │   ├── dcgm_nvml.h
    │   ├── callback.c
    │   ├── error.go
    │   ├── field_values_cb.h
    │   ├── dcgm_api_export.h
    │   ├── field_values_cb.c
    │   ├── api_test.go
    │   ├── profile.go
    │   ├── hostengine_status.go
    │   ├── testdata
    │       └── one_switch.yaml
    │   ├── cpu.go
    │   ├── gpu_group_test.go
    │   ├── utils.go
    │   ├── instances_test.go
    │   ├── diag_test_helpers.go
    │   ├── structs.go
    │   ├── field_values_test.go
    │   ├── field_values.go
    │   ├── internal.go
    │   ├── gpu_group.go
    │   ├── fields_test.go
    │   ├── mig.go
    │   ├── device_status.go
    │   ├── api.go
    │   ├── topology.go
    │   ├── test_utils.go
    │   ├── admin.go
    │   ├── health_test.go
    │   ├── health.go
    │   └── diag.go
├── .markdownlint.yaml
├── scripts
    └── lint
    │   └── go-mod-tidy.sh
├── go.mod
├── .github
    └── workflows
    │   └── go.yml
├── docker-bake.hcl
├── README.md
├── .yamllint.yaml
├── go.sum
├── tests
    ├── nvsmi.go
    ├── processinfo_test.go
    ├── deviceinfo_test.go
    ├── health_test.go
    ├── hostengine_test.go
    ├── diag_test.go
    ├── dmon_test.go
    ├── dcgm_test.go
    ├── policy_test.go
    └── README.md
├── .golangci.yml
├── Makefile
├── .pre-commit-config.yaml
├── Dockerfile
├── CONTRIBUTING.md
└── .gitlab-ci.yml


/.gitmodules:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/samples/diag/.gitignore:
--------------------------------------------------------------------------------
1 | diag
2 | 


--------------------------------------------------------------------------------
/samples/dmon/.gitignore:
--------------------------------------------------------------------------------
1 | dmon
2 | 


--------------------------------------------------------------------------------
/samples/health/.gitignore:
--------------------------------------------------------------------------------
1 | health
2 | 


--------------------------------------------------------------------------------
/samples/policy/.gitignore:
--------------------------------------------------------------------------------
1 | policy
2 | 


--------------------------------------------------------------------------------
/samples/restApi/.gitignore:
--------------------------------------------------------------------------------
1 | restApi
2 | 


--------------------------------------------------------------------------------
/samples/topology/.gitignore:
--------------------------------------------------------------------------------
1 | topology
2 | 


--------------------------------------------------------------------------------
/samples/deviceInfo/.gitignore:
--------------------------------------------------------------------------------
1 | deviceInfo
2 | 


--------------------------------------------------------------------------------
/samples/processInfo/.gitignore:
--------------------------------------------------------------------------------
1 | processInfo
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.swo
3 | tags
4 | .idea/
5 | 


--------------------------------------------------------------------------------
/samples/hostengineStatus/.gitignore:
--------------------------------------------------------------------------------
1 | hostengineStatus
2 | 


--------------------------------------------------------------------------------
/.hadolint.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | ignored: []
3 | trustedRegistries: []
4 | 


--------------------------------------------------------------------------------
/pkg/dcgm/dcgm_nvml.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #define NVML_NO_UNVERSIONED_FUNC_DEFS
4 | #include "nvml.h"
5 | 


--------------------------------------------------------------------------------
/pkg/dcgm/callback.c:
--------------------------------------------------------------------------------
/*
 * violationNotify hands the opaque pointer it receives to
 * ViolationRegistration and returns that call's result unchanged.
 *
 * ViolationRegistration is only declared in this file; its definition is
 * provided elsewhere.
 */
int violationNotify(void *p)
{
    /* Block-scope prototype so no header is needed for the callee. */
    extern int ViolationRegistration(void *);

    return ViolationRegistration(p);
}
5 | 


--------------------------------------------------------------------------------
/.markdownlint.yaml:
--------------------------------------------------------------------------------
1 | default: true
2 | 
3 | # MD013/line-length - Line length
4 | MD013:
5 |   # eventually set line_length to 80
6 |   line_length: 500
7 |   tables: false
8 |   code_blocks: false
9 | 


--------------------------------------------------------------------------------
/pkg/dcgm/error.go:
--------------------------------------------------------------------------------
1 | package dcgm
2 | 
3 | import "errors"
4 | 
// ErrInvalidMode represents an error indicating that an invalid mode was used.
// It is a sentinel value created with errors.New (message "invalid mode"), so
// callers can test for it with errors.Is(err, ErrInvalidMode).
var ErrInvalidMode = errors.New("invalid mode")
7 | 


--------------------------------------------------------------------------------
/scripts/lint/go-mod-tidy.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | go mod tidy -v $@
 4 | if [ $? -ne 0 ]; then
 5 |   exit 2
 6 | fi
 7 | 
 8 | git diff --exit-code go.* &> /dev/null
 9 | if [ $? -ne 0 ]; then
10 |     echo "go.mod or go.sum differs, please re-add it to your commit"
11 |     exit 3
12 | fi
13 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/NVIDIA/go-dcgm
 2 | 
 3 | go 1.23
 4 | 
 5 | require (
 6 | 	github.com/bits-and-blooms/bitset v1.22.0
 7 | 	github.com/gorilla/mux v1.8.1
 8 | 	github.com/stretchr/testify v1.10.0
 9 | )
10 | 
11 | require (
12 | 	github.com/davecgh/go-spew v1.1.1 // indirect
13 | 	github.com/pmezard/go-difflib v1.0.0 // indirect
14 | 	gopkg.in/yaml.v3 v3.0.1 // indirect
15 | )
16 | 


--------------------------------------------------------------------------------
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
 1 | name: Go
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     branches: [ main ]
 8 | 
 9 | jobs:
10 | 
11 |   build:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v2
15 |       - name: Set up Go
16 |         uses: actions/setup-go@v2
17 |         with:
18 |           go-version: 1.21
19 |       - name: Build
20 |         run: make binary
21 |       - name: Lint
22 |         run: make check-format
23 | 


--------------------------------------------------------------------------------
/samples/hostengineStatus/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 
 7 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
 8 | )
 9 | 
10 | // dcgmi introspect --enable
11 | // dcgmi introspect -s -H
12 | func main() {
13 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
14 | 	if err != nil {
15 | 		log.Panicln(err)
16 | 	}
17 | 	defer cleanup()
18 | 
19 | 	st, err := dcgm.Introspect()
20 | 	if err != nil {
21 | 		log.Panicln(err)
22 | 	}
23 | 
24 | 	fmt.Printf("Memory %2s %v KB\nCPU %5s %.2f %s\n", ":", st.Memory, ":", st.CPU, "%")
25 | }
26 | 


--------------------------------------------------------------------------------
/docker-bake.hcl:
--------------------------------------------------------------------------------
 1 | target "default" {
 2 |   name = "go-dcgm-${replace(distro, ".", "-")}-${replace(go, ".", "-")}-${replace(cuda, ".", "-")}"
 3 |   tags = ["go-dcgm:${distro}-go${go}-cuda${cuda}-dcgm${dcgm}"]
 4 |   platforms = ["linux/amd64"]
 5 |   matrix = {
 6 |     go = ["1.24.4"]
 7 |     distro = ["ubuntu24.04", "ubuntu22.04", "ubuntu20.04"]
 8 |     cuda = ["12.9.1", "12.5.1"]
 9 |     dcgm = ["4.2.3-2"]
10 |   }
11 |   args = {
12 |     GO_VERSION = go
13 |     DISTRO_FLAVOR = distro
14 |     CUDA_VERSION = cuda
15 |     DCGM_VERSION = dcgm
16 |   }
17 | }
18 | 


--------------------------------------------------------------------------------
/samples/restApi/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"os"
 6 | 	"os/signal"
 7 | 	"syscall"
 8 | 
 9 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
10 | )
11 | 
12 | // res: curl localhost:8070/dcgm/device/info/id/0
13 | 
14 | func main() {
15 | 	stopSig := make(chan os.Signal, 1)
16 | 	signal.Notify(stopSig, syscall.SIGINT, syscall.SIGTERM)
17 | 
18 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
19 | 	if err != nil {
20 | 		log.Panicln(err)
21 | 	}
22 | 	defer cleanup()
23 | 
24 | 	addr := ":8070"
25 | 	server := newHttpServer(addr)
26 | 
27 | 	go func() {
28 | 		log.Printf("Running http server on localhost%s", addr)
29 | 		server.serve()
30 | 	}()
31 | 
32 | 	defer server.stop()
33 | 
34 | 	<-stopSig
35 | }
36 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Overview
 2 | 
 3 | Golang bindings are provided for [NVIDIA Data Center GPU Manager (DCGM)](https://developer.nvidia.com/dcgm). DCGM is a set of tools for managing and monitoring NVIDIA GPUs in cluster environments. It's a low overhead tool suite that performs a variety of functions on each host system including active health monitoring, diagnostics, system validation, policies, power and clock management, group configuration and accounting.
 4 | 
 5 | You will also find samples for these bindings in this repository.
 6 | 
 7 | ## Issues and Contributing
 8 | 
 9 | [Check out the Contributing document!](CONTRIBUTING.md)
10 | 
11 | * Please let us know by [filing a new issue](https://github.com/NVIDIA/go-dcgm/issues/new)
12 | * You can contribute by opening a [pull request](https://github.com/NVIDIA/go-dcgm)
13 | 


--------------------------------------------------------------------------------
/samples/diag/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"html/template"
 5 | 	"log"
 6 | 	"os"
 7 | 
 8 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
 9 | )
10 | 
// diagOutput is the template text that main parses and executes against the
// value returned by dcgm.RunDiag. It prints one line per entry of .Software
// (test name, status, output) and then, for every entry of .PerGpu, the GPU
// followed by one line per entry of its DiagResults.
const diagOutput = `Software:
{{range $t := .Software}}
  {{printf "%-50s" $t.TestName}} {{$t.Status}}	{{$t.TestOutput}}
{{- end}}
{{range $g := .PerGpu}}
GPU	: {{$g.GPU}}
  {{range $t := $g.DiagResults}}
  {{printf "%-20s" $t.TestName}} {{$t.Status}}	{{$t.TestOutput}}
  {{- end}}
{{- end}}
`
22 | 
23 | func main() {
24 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
25 | 	if err != nil {
26 | 		log.Panicln(err)
27 | 	}
28 | 	defer cleanup()
29 | 
30 | 	dr, err := dcgm.RunDiag(dcgm.DiagQuick, dcgm.GroupAllGPUs())
31 | 	if err != nil {
32 | 		log.Panicln(err)
33 | 	}
34 | 
35 | 	t := template.Must(template.New("Diag").Parse(diagOutput))
36 | 	if err = t.Execute(os.Stdout, dr); err != nil {
37 | 		log.Panicln("Template error:", err)
38 | 	}
39 | }
40 | 


--------------------------------------------------------------------------------
/.yamllint.yaml:
--------------------------------------------------------------------------------
 1 | yaml-files:
 2 |   - "*.yaml"
 3 |   - "*.yml"
 4 | 
 5 | rules:
 6 |   anchors:
 7 |     forbid-undeclared-aliases: true
 8 |     forbid-duplicated-anchors: true
 9 |     forbid-unused-anchors: true
10 |   braces:
11 |     min-spaces-inside: 0
12 |     max-spaces-inside: 1
13 |     min-spaces-inside-empty: 0
14 |     max-spaces-inside-empty: 0
15 |   brackets:
16 |     min-spaces-inside: 0
17 |     max-spaces-inside: 1
18 |     min-spaces-inside-empty: 0
19 |     max-spaces-inside-empty: 0
20 |   colons: enable
21 |   commas: enable
22 |   comments: enable
23 |   comments-indentation: enable
24 |   document-end: disable
25 |   document-start: disable
26 |   empty-lines:
27 |     max: 1
28 |   empty-values: enable
29 |   float-values: disable
30 |   hyphens: enable
31 |   indentation: enable
32 |   key-duplicates: enable
33 |   key-ordering: disable
34 |   line-length: disable
35 |   new-line-at-end-of-file: enable
36 |   new-lines: disable
37 |   octal-values: disable
38 |   quoted-strings: disable
39 |   trailing-spaces: enable
40 |   truthy:
41 |     check-keys: false
42 | 


--------------------------------------------------------------------------------
/pkg/dcgm/field_values_cb.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #ifndef FIELD_VALUES
18 | #define FIELD_VALUES
19 | 
20 | #include "dcgm_agent.h"
21 | #include "dcgm_structs.h"
22 | 
23 | int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId,
24 |                                       dcgm_field_eid_t entityId,
25 |                                       dcgmFieldValue_v1 *values,
26 |                                       int numValues,
27 |                                       void *userData);
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/pkg/dcgm/dcgm_api_export.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | #ifndef DCGM_DCGM_API_EXPORT_H
17 | #define DCGM_DCGM_API_EXPORT_H
18 | 
19 | #undef DCGM_PUBLIC_API
20 | #undef DCGM_PRIVATE_API
21 | 
22 | #if defined(DCGM_API_EXPORT)
23 | #define DCGM_PUBLIC_API __attribute((visibility("default")))
24 | #else
25 | #define DCGM_PUBLIC_API
26 | #if defined(ERROR_IF_NOT_PUBLIC)
27 | #error(Should be public)
28 | #endif
29 | #endif
30 | 
31 | #define DCGM_PRIVATE_API __attribute((visibility("hidden")))
32 | 
33 | 
34 | #endif // DCGM_DCGM_API_EXPORT_H
35 | 


--------------------------------------------------------------------------------
/pkg/dcgm/field_values_cb.c:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include "dcgm_agent.h"
18 | #include "dcgm_structs.h"
19 | #include "_cgo_export.h"
20 | 
21 | int fieldValueEntityCallback(dcgm_field_entity_group_t entityGroupId,
22 |                                     dcgm_field_eid_t entityId,
23 |                                     dcgmFieldValue_v1 *values,
24 |                                     int numValues,
25 |                                     void *userData) {
26 |  return go_dcgmFieldValueEntityEnumeration(entityGroupId, entityId, values, numValues, userData);
27 | }
28 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
 2 | github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
 3 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 4 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 5 | github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
 6 | github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
 7 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 8 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 9 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
10 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
11 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
12 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
13 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
14 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
15 | 


--------------------------------------------------------------------------------
/pkg/dcgm/api_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package dcgm
18 | 
19 | import (
20 | 	"testing"
21 | 
22 | 	"github.com/stretchr/testify/require"
23 | )
24 | 
25 | func TestGetEntityGroupEntities(t *testing.T) {
26 | 	withNvsdmMockConfig(t, "testdata/one_switch.yaml", func(t *testing.T) {
27 | 		teardownTest := setupTest(t)
28 | 		defer teardownTest(t)
29 | 
30 | 		runOnlyWithLiveGPUs(t)
31 | 
32 | 		// Get switch entities
33 | 		entities, err := GetEntityGroupEntities(FE_SWITCH)
34 | 		require.NoError(t, err)
35 | 		require.NotEmpty(t, entities)
36 | 
37 | 		// Get nvlink entities
38 | 		nvlinkEntities, err := GetEntityGroupEntities(FE_LINK)
39 | 		require.NoError(t, err)
40 | 		require.NotEmpty(t, nvlinkEntities)
41 | 	})
42 | }
43 | 


--------------------------------------------------------------------------------
/samples/dmon/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 	"os"
 7 | 	"os/signal"
 8 | 	"syscall"
 9 | 	"time"
10 | 
11 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
12 | )
13 | 
const (
	// header is the two-line column heading (names, then units) that main
	// prints once before the per-GPU rows.
	header = `# gpu   pwr  temp    sm   mem   enc   dec  mclk  pclk
# Idx     W     C     %     %     %     %   MHz   MHz`
)
18 | 
19 | // modelled on nvidia-smi dmon
20 | // dcgmi dmon -e 155,150,203,204,206,207,100,101
21 | func main() {
22 | 	sigs := make(chan os.Signal, 1)
23 | 	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
24 | 
25 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
26 | 	if err != nil {
27 | 		log.Panicln(err)
28 | 	}
29 | 	defer cleanup()
30 | 
31 | 	gpus, err := dcgm.GetSupportedDevices()
32 | 	if err != nil {
33 | 		log.Panicln(err)
34 | 	}
35 | 
36 | 	ticker := time.NewTicker(time.Second * 1)
37 | 	defer ticker.Stop()
38 | 
39 | 	fmt.Println(header)
40 | 
41 | 	for {
42 | 		select {
43 | 		case <-ticker.C:
44 | 			for _, gpu := range gpus {
45 | 				st, err := dcgm.GetDeviceStatus(gpu)
46 | 				if err != nil {
47 | 					log.Panicln(err)
48 | 				}
49 | 
50 | 				fmt.Printf("%5d %5d %5d %5d %5d %5d %5d %5d %5d\n",
51 | 					gpu, int64(st.Power), st.Temperature, st.Utilization.GPU, st.Utilization.Memory,
52 | 					st.Utilization.Encoder, st.Utilization.Decoder, st.Clocks.Memory, st.Clocks.Cores)
53 | 			}
54 | 
55 | 		case <-sigs:
56 | 			return
57 | 		}
58 | 	}
59 | }
60 | 


--------------------------------------------------------------------------------
/pkg/dcgm/profile.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "dcgm_agent.h"
 5 | #include "dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | 
 9 | import (
10 | 	"unsafe"
11 | )
12 | 
// MetricGroup represents a group of metrics for a specific GPU.
//
// getSupportedMetricGroups fills it from one entry of the DCGM reply:
// Major and Minor are copied from that entry's majorId and minorId, and
// FieldIds holds the entry's first numFieldIds values of fieldIds.
type MetricGroup struct {
	Major    uint
	Minor    uint
	FieldIds []uint
}
19 | 
20 | func getSupportedMetricGroups(gpuID uint) ([]MetricGroup, error) {
21 | 	var (
22 | 		groupInfo C.dcgmProfGetMetricGroups_t
23 | 		err       error
24 | 		groups    []MetricGroup
25 | 	)
26 | 
27 | 	groupInfo.version = makeVersion3(unsafe.Sizeof(groupInfo))
28 | 
29 | 	groupInfo.gpuId = C.uint(gpuID)
30 | 
31 | 	result := C.dcgmProfGetSupportedMetricGroups(handle.handle, &groupInfo)
32 | 
33 | 	if err = errorString(result); err != nil {
34 | 		return nil, &Error{msg: C.GoString(C.errorString(result)), Code: result}
35 | 	}
36 | 
37 | 	count := uint(groupInfo.numMetricGroups)
38 | 
39 | 	groups = make([]MetricGroup, count)
40 | 	for i := uint(0); i < count; i++ {
41 | 		groups[i].Major = uint(groupInfo.metricGroups[i].majorId)
42 | 		groups[i].Minor = uint(groupInfo.metricGroups[i].minorId)
43 | 
44 | 		fieldCount := uint(groupInfo.metricGroups[i].numFieldIds)
45 | 
46 | 		groups[i].FieldIds = make([]uint, fieldCount)
47 | 		for j := uint(0); j < fieldCount; j++ {
48 | 			groups[i].FieldIds[j] = uint(groupInfo.metricGroups[i].fieldIds[j])
49 | 		}
50 | 	}
51 | 
52 | 	return groups, nil
53 | }
54 | 


--------------------------------------------------------------------------------
/samples/health/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"os"
 6 | 	"os/signal"
 7 | 	"syscall"
 8 | 	"text/template"
 9 | 	"time"
10 | 
11 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
12 | )
13 | 
const (
	// healthStatus is the template text main executes for each value
	// returned by dcgm.HealthCheckByGpuId: it prints .GPU and .Status,
	// followed by the Type, Status and Error of every entry in .Watches.
	healthStatus = `GPU                : {{.GPU}}
Status             : {{.Status}}
{{range .Watches}}
Type               : {{.Type}}
Status             : {{.Status}}
Error              : {{.Error}}
{{end}}
`
)
24 | 
25 | // create group: dcgmi group -c "name" --default
26 | // enable watches: dcgmi health -s a
27 | // check: dcgmi health -g 1 -c
28 | func main() {
29 | 	sigs := make(chan os.Signal, 1)
30 | 	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
31 | 
32 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
33 | 	if err != nil {
34 | 		log.Panicln(err)
35 | 	}
36 | 	defer cleanup()
37 | 
38 | 	gpus, err := dcgm.GetSupportedDevices()
39 | 	if err != nil {
40 | 		log.Panicln(err)
41 | 	}
42 | 
43 | 	ticker := time.NewTicker(time.Second * 1)
44 | 	defer ticker.Stop()
45 | 
46 | 	t := template.Must(template.New("Health").Parse(healthStatus))
47 | 
48 | 	for {
49 | 		select {
50 | 		case <-ticker.C:
51 | 			for _, gpu := range gpus {
52 | 				h, err := dcgm.HealthCheckByGpuId(gpu)
53 | 				if err != nil {
54 | 					log.Panicln(err)
55 | 				}
56 | 
57 | 				if err = t.Execute(os.Stdout, h); err != nil {
58 | 					log.Panicln("Template error:", err)
59 | 				}
60 | 			}
61 | 		case <-sigs:
62 | 			return
63 | 		}
64 | 	}
65 | }
66 | 


--------------------------------------------------------------------------------
/pkg/dcgm/hostengine_status.go:
--------------------------------------------------------------------------------
 1 | // Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)
 2 | package dcgm
 3 | 
 4 | /*
 5 | #include "dcgm_agent.h"
 6 | #include "dcgm_structs.h"
 7 | */
 8 | import "C"
 9 | 
10 | import (
11 | 	"unsafe"
12 | )
13 | 
// Status represents the current resource utilization of the DCGM hostengine process.
// Values are produced by introspect.
type Status struct {
	// Memory represents the current memory usage of the DCGM hostengine in kilobytes
	// (introspect divides the reported bytesUsed by 1024).
	Memory int64
	// CPU represents the current CPU utilization of the DCGM hostengine as a percentage (0-100)
	// (introspect multiplies the reported total by 100).
	CPU float64
}
21 | 
22 | func introspect() (engine Status, err error) {
23 | 	var memory C.dcgmIntrospectMemory_t
24 | 	memory.version = makeVersion1(unsafe.Sizeof(memory))
25 | 	waitIfNoData := 1
26 | 	result := C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData))
27 | 
28 | 	if err = errorString(result); err != nil {
29 | 		return engine, &Error{msg: C.GoString(C.errorString(result)), Code: result}
30 | 	}
31 | 
32 | 	var cpu C.dcgmIntrospectCpuUtil_t
33 | 
34 | 	cpu.version = makeVersion1(unsafe.Sizeof(cpu))
35 | 	result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData))
36 | 
37 | 	if err = errorString(result); err != nil {
38 | 		return engine, &Error{msg: C.GoString(C.errorString(result)), Code: result}
39 | 	}
40 | 
41 | 	engine = Status{
42 | 		Memory: toInt64(memory.bytesUsed) / 1024,
43 | 		CPU:    *dblToFloat(cpu.total) * 100,
44 | 	}
45 | 	return
46 | }
47 | 


--------------------------------------------------------------------------------
/samples/policy/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"log"
 6 | 	"os"
 7 | 	"os/signal"
 8 | 	"syscall"
 9 | 
10 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
11 | )
12 | 
13 | // dcgmi group -c "name" --default
14 | // dcgmi policy -g GROUPID --set 0,0 -x -n -p -e -P 250 -T 100 -M 10
15 | // dcgmi policy -g GROUPID --reg
16 | func main() {
17 | 	ctx, done := context.WithCancel(context.Background())
18 | 	// Handle SIGINT (Ctrl+C) and SIGTERM (termination signal)
19 | 	sigs := make(chan os.Signal, 1)
20 | 	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
21 | 
22 | 	go func() {
23 | 		<-sigs
24 | 		log.Println("Received termination signal, exiting...")
25 | 		done()
26 | 	}()
27 | 
28 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
29 | 	if err != nil {
30 | 		log.Panicln(err)
31 | 	}
32 | 	defer cleanup()
33 | 
34 | 	// Choose policy conditions to register violation callback.
35 | 	// Note: Need to be root for some options
36 | 	// Available options are:
37 | 	// 1. dcgm.DbePolicy
38 | 	// 2. dcgm.PCIePolicy
39 | 	// 3. dcgm.MaxRtPgPolicy
40 | 	// 4. dcgm.ThermalPolicy
41 | 	// 5. dcgm.PowerPolicy
42 | 	// 6. dcgm.NvlinkPolicy
43 | 	// 7. dcgm.XidPolicy
44 | 	c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy, dcgm.XidPolicy)
45 | 	if err != nil {
46 | 		log.Panicln(err)
47 | 	}
48 | 
49 | 	for {
50 | 		select {
51 | 		case pe := <-c:
52 | 			log.Printf("PolicyViolation %6s %v\nTimestamp %2s %v\nData %7s %v",
53 | 				":", pe.Condition, ":", pe.Timestamp, ":", pe.Data)
54 | 		case <-ctx.Done():
55 | 			return
56 | 		}
57 | 	}
58 | }
59 | 


--------------------------------------------------------------------------------
/tests/nvsmi.go:
--------------------------------------------------------------------------------
 1 | package tests
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"fmt"
 6 | 	"os/exec"
 7 | 	"strings"
 8 | )
 9 | 
// Pieces of the nvidia-smi command line assembled by Query and DeviceCount:
// bin is the executable name, gpuArg is the flag prefix Query appends the GPU
// ID to, queryArg is the flag prefix both functions append the query to, and
// formatArg is the output-format flag passed on every invocation.
const (
	bin       = "nvidia-smi"
	gpuArg    = "--id="
	queryArg  = "--query-gpu="
	formatArg = "--format=csv,noheader,nounits"
)
16 | 
17 | // Query executes nvidia-smi with the specified GPU ID and query parameters.
18 | // It returns the query result as a trimmed string.
19 | //
20 | // Parameters:
21 | //   - id: The GPU ID to query (e.g., "0" for the first GPU)
22 | //   - query: The nvidia-smi query parameter (e.g., "temperature.gpu")
23 | //
24 | // Returns:
25 | //
26 | //	A string containing the query result with whitespace trimmed
27 | func Query(id, query string) string {
28 | 	var out bytes.Buffer
29 | 
30 | 	gpu_args := gpuArg + id
31 | 	query_args := queryArg + query
32 | 
33 | 	cmd := exec.Command(bin, gpu_args, query_args, formatArg)
34 | 	cmd.Stdout = &out
35 | 
36 | 	err := cmd.Run()
37 | 	if err != nil {
38 | 		fmt.Printf("nvsmi exec error: %v\n", err)
39 | 	}
40 | 
41 | 	return strings.TrimSpace(out.String())
42 | }
43 | 
44 | // DeviceCount returns the number of NVIDIA GPU devices available in the system
45 | // by executing nvidia-smi with the specified query parameter.
46 | //
47 | // Parameters:
48 | //   - query: The nvidia-smi query parameter to execute
49 | //
50 | // Returns:
51 | //
52 | //	The number of GPU devices as an unsigned integer
53 | func DeviceCount(query string) uint {
54 | 	var out bytes.Buffer
55 | 
56 | 	query_arg := queryArg + query
57 | 	cmd := exec.Command(bin, query_arg, formatArg)
58 | 	cmd.Stdout = &out
59 | 
60 | 	err := cmd.Run()
61 | 	if err != nil {
62 | 		fmt.Printf("nvsmi exec error: %v\n", err)
63 | 	}
64 | 
65 | 	nvSmi := strings.Split(strings.TrimSuffix(out.String(), "\n"), "\n")
66 | 
67 | 	return uint(len(nvSmi))
68 | }
69 | 


--------------------------------------------------------------------------------
/.golangci.yml:
--------------------------------------------------------------------------------
 1 | version: "2"
 2 | linters:
 3 |   default: none
 4 |   enable:
 5 |     - bodyclose
 6 |     - durationcheck
 7 |     - errcheck
 8 |     - gocritic
 9 |     - gosec
10 |     - govet
11 |     - ineffassign
12 |     - mirror
13 |     - misspell
14 |     - nolintlint
15 |     - perfsprint
16 |     - prealloc
17 |     - predeclared
18 |     - revive
19 |     - rowserrcheck
20 |     - staticcheck
21 |     - testifylint
22 |     - unconvert
23 |     - unused
24 |     - wastedassign
25 |   settings:
26 |     gocritic:
27 |       disabled-checks:
28 |         - hugeParam
29 |         - ifElseChain
30 |         - ptrToRefParam
31 |         - dupImport
32 |         - uncheckedInlineErr
33 |       enabled-tags:
34 |         - diagnostic
35 |         - experimental
36 |         - opinionated
37 |         - performance
38 |         - style
39 |     govet:
40 |       disable:
41 |         - fieldalignment
42 |         - deepequalerrors
43 |       enable-all: true
44 |     revive:
45 |       enable-all-rules: false
46 |       rules:
47 |         - name: superfluous-else
48 |         - name: exported
49 |     testifylint:
50 |       disable-all: true
51 |       enable:
52 |         - nil-compare
53 |         - compares
54 |         - error-is-as
55 |         - bool-compare
56 |         - empty
57 |         - len
58 |         - expected-actual
59 |         - error-nil
60 |   exclusions:
61 |     generated: lax
62 |     presets:
63 |       - common-false-positives
64 |       - legacy
65 |       - std-error-handling
66 |     rules:
67 |       - linters:
68 |           - bodyclose
69 |         path: _test.go
70 |     paths:
71 |       - third_party$
72 |       - builtin$
73 |       - examples$
74 | issues:
75 |   max-issues-per-linter: 0
76 |   max-same-issues: 0
77 | formatters:
78 |   enable:
79 |     - gofmt
80 |     - goimports
81 |   exclusions:
82 |     generated: lax
83 |     paths:
84 |       - third_party$
85 |       - builtin$
86 |       - examples$
87 | 


--------------------------------------------------------------------------------
/samples/topology/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"log"
 6 | 
 7 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
 8 | )
 9 | 
const (
	// legend is the key printed by main after the topology matrix; it
	// describes the connection-type abbreviations used in the matrix cells.
	legend = `
Legend:
 X    = Self
 SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
 NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
 PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
 PXB  = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge)
 PIX  = Connection traversing a single PCIe switch
 PSB  = Connection traversing a single on-board PCIe switch
 NV#  = Connection traversing a bonded set of # NVLinks`
)
22 | 
23 | // based on nvidia-smi topo -m
24 | // dcgmi topo
25 | func main() {
26 | 	// choose dcgm hostengine running mode
27 | 	// 1. dcgm.Embedded
28 | 	// 2. dcgm.Standalone
29 | 	// 3. dcgm.StartHostengine
30 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
31 | 	if err != nil {
32 | 		log.Panicln(err)
33 | 	}
34 | 	defer cleanup()
35 | 
36 | 	gpus, err := dcgm.GetSupportedDevices()
37 | 	if err != nil {
38 | 		log.Panicln(err)
39 | 	}
40 | 
41 | 	for _, gpu := range gpus {
42 | 		fmt.Printf("%9s%d", "GPU", gpu)
43 | 	}
44 | 
45 | 	fmt.Printf("%5s\n", "CPUAffinity")
46 | 
47 | 	numGpus := len(gpus)
48 | 	gpuTopo := make([]string, numGpus)
49 | 
50 | 	for i := 0; i < numGpus; i++ {
51 | 		topo, err := dcgm.GetDeviceTopology(gpus[i])
52 | 		if err != nil {
53 | 			log.Panicln(err)
54 | 		}
55 | 
56 | 		fmt.Printf("GPU%d", gpus[i])
57 | 
58 | 		for j := 0; j < len(topo); j++ {
59 | 			// skip current GPU
60 | 			gpuTopo[topo[j].GPU] = topo[j].Link.PCIPaths()
61 | 		}
62 | 
63 | 		gpuTopo[i] = "X"
64 | 		for j := 0; j < numGpus; j++ {
65 | 			fmt.Printf("%5s", gpuTopo[j])
66 | 		}
67 | 
68 | 		deviceInfo, err := dcgm.GetDeviceInfo(gpus[i])
69 | 		if err != nil {
70 | 			log.Panicln(err)
71 | 		}
72 | 
73 | 		fmt.Printf("%5s\n", deviceInfo.CPUAffinity)
74 | 	}
75 | 
76 | 	fmt.Println(legend)
77 | }
78 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | GOLANGCILINT_TIMEOUT ?= 10m
16 | 
# Every target below is a command, not a file; declare them all phony so a
# same-named file or directory can never make a target look up to date.
.PHONY: all binary check-format clean docker install install-pre-commit lint lint-full test-main
all: binary test-main check-format
19 | 
20 | install-pre-commit:
21 | 	@echo "Installing pre-commit hooks..."
22 | 	pre-commit install --config .pre-commit-config.yaml
23 | 	@echo "Pre-commit hooks installed."
24 | 
25 | binary:
26 | 	go build ./pkg/dcgm
27 | 	cd samples/deviceInfo; go build
28 | 	cd samples/dmon; go build
29 | 	cd samples/health; go build
30 | 	cd samples/hostengineStatus; go build
31 | 	cd samples/policy; go build
32 | 	cd samples/processInfo; go build
33 | 	cd samples/restApi; go build
34 | 	cd samples/topology; go build
35 | 	cd samples/diag; go build
36 | 
37 | docker:
38 | 	docker buildx bake default --load
39 | 
40 | test-main:
41 | 	go test -race -v ./tests
42 | 	go test -v ./tests
43 | 
44 | check-format:
45 | 	test $$(gofumpt -l -w . | tee /dev/stderr | wc -l) -eq 0
46 | 
# Remove every sample binary produced by the `binary` target.
clean:
	rm -f samples/deviceInfo/deviceInfo
	rm -f samples/dmon/dmon
	rm -f samples/health/health
	rm -f samples/hostengineStatus/hostengineStatus
	rm -f samples/policy/policy
	rm -f samples/processInfo/processInfo
	rm -f samples/restApi/restApi
	rm -f samples/topology/topology
	rm -f samples/diag/diag
56 | 
57 | lint:
58 | 	golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT)  --new-from-rev=HEAD~1 --fix
59 | 
60 | lint-full:
61 | 	golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --fix
62 | 


--------------------------------------------------------------------------------
/pkg/dcgm/testdata/one_switch.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | devices:
 3 |   - type: 2  # nvsdmDevType
 4 |     dev_id: 0
 5 |     vendor_id: 0xbaca
 6 |     health_state: 0
 7 |     fields: []
 8 |     ports:
 9 |       - lid: 1
10 |         fields:
11 |           - type: 1  # nvsdmTelemType_t
12 |             field: 2
13 |             value: 51984
14 |             value_type: 2  # nvsdmValType_t
15 |             status: 0  # nvsdmRet_t
16 |           - type: 1  # nvsdmTelemType_t
17 |             field: 10
18 |             value: 56952
19 |             value_type: 2  # nvsdmValType_t
20 |             status: 0  # nvsdmRet_t
21 |           - type: 2  # nvsdmTelemType_t
22 |             field: 1
23 |             value: 0
24 |             value_type: 5  # nvsdmValType_t
25 |             status: 0  # nvsdmRet_t
26 |           - type: 2  # nvsdmTelemType_t
27 |             field: 3
28 |             value: 0
29 |             value_type: 5  # nvsdmValType_t
30 |             status: 0  # nvsdmRet_t
31 |           - type: 2  # nvsdmTelemType_t
32 |             field: 4
33 |             value: 65
34 |             value_type: 1  # nvsdmValType_t
35 |             status: 0  # nvsdmRet_t
36 |       - lid: 1
37 |         fields:
38 |           - type: 1  # nvsdmTelemType_t
39 |             field: 2
40 |             value: 51984
41 |             value_type: 2  # nvsdmValType_t
42 |             status: 0  # nvsdmRet_t
43 |           - type: 1  # nvsdmTelemType_t
44 |             field: 10
45 |             value: 56952
46 |             value_type: 2  # nvsdmValType_t
47 |             status: 0  # nvsdmRet_t
48 |           - type: 2  # nvsdmTelemType_t
49 |             field: 1
50 |             value: 0
51 |             value_type: 5  # nvsdmValType_t
52 |             status: 0  # nvsdmRet_t
53 |           - type: 2  # nvsdmTelemType_t
54 |             field: 3
55 |             value: 0
56 |             value_type: 5  # nvsdmValType_t
57 |             status: 0  # nvsdmRet_t
58 |           - type: 2  # nvsdmTelemType_t
59 |             field: 4
60 |             value: 65
61 |             value_type: 1  # nvsdmValType_t
62 |             status: 0  # nvsdmRet_t
63 | 


--------------------------------------------------------------------------------
/samples/restApi/handlers/byUuids.go:
--------------------------------------------------------------------------------
 1 | package handlers
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"net/http"
 6 | 
 7 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
 8 | )
 9 | 
// uuids maps a GPU UUID to its device id. It is nil until DevicesUuids runs,
// which replaces it with a freshly populated map.
var uuids map[string]uint
12 | 
13 | // DevicesUuids initializes a global map of GPU UUIDs to device IDs
14 | // This must be called before using UUID-based endpoints
15 | func DevicesUuids() {
16 | 	uuids = make(map[string]uint)
17 | 
18 | 	count, err := dcgm.GetAllDeviceCount()
19 | 	if err != nil {
20 | 		log.Printf("(DCGM) Error getting devices: %s", err)
21 | 		return
22 | 	}
23 | 
24 | 	for i := uint(0); i < count; i++ {
25 | 		deviceInfo, err := dcgm.GetDeviceInfo(i)
26 | 		if err != nil {
27 | 			log.Printf("(DCGM) Error getting device information: %s", err)
28 | 			return
29 | 		}
30 | 
31 | 		uuids[deviceInfo.UUID] = i
32 | 	}
33 | }
34 | 
35 | // DeviceInfoByUuid handles HTTP requests for device information by GPU UUID
36 | // It returns either JSON or formatted text output based on the request URL
37 | func DeviceInfoByUuid(resp http.ResponseWriter, req *http.Request) {
38 | 	device := getDeviceInfo(resp, req)
39 | 	if device == nil {
40 | 		return
41 | 	}
42 | 
43 | 	if isJson(req) {
44 | 		encode(resp, req, device)
45 | 		return
46 | 	}
47 | 
48 | 	printer(resp, req, device, deviceInfo)
49 | }
50 | 
51 | // DeviceStatusByUuid handles HTTP requests for device status by GPU UUID
52 | // It returns either JSON or formatted text output based on the request URL
53 | func DeviceStatusByUuid(resp http.ResponseWriter, req *http.Request) {
54 | 	st := getDeviceStatus(resp, req)
55 | 	if st == nil {
56 | 		return
57 | 	}
58 | 
59 | 	if isJson(req) {
60 | 		encode(resp, req, st)
61 | 		return
62 | 	}
63 | 
64 | 	printer(resp, req, st, deviceStatus)
65 | }
66 | 
67 | // HealthByUuid handles HTTP requests for device health status by GPU UUID
68 | // It returns either JSON or formatted text output based on the request URL
69 | func HealthByUuid(resp http.ResponseWriter, req *http.Request) {
70 | 	h := getHealth(resp, req)
71 | 	if h == nil {
72 | 		return
73 | 	}
74 | 
75 | 	if isJson(req) {
76 | 		encode(resp, req, h)
77 | 		return
78 | 	}
79 | 
80 | 	printer(resp, req, h, healthStatus)
81 | }
82 | 


--------------------------------------------------------------------------------
/samples/restApi/handlers/byIds.go:
--------------------------------------------------------------------------------
 1 | package handlers
 2 | 
 3 | import (
 4 | 	"net/http"
 5 | )
 6 | 
 7 | // DeviceInfo handles HTTP requests for device information by device ID
 8 | // It returns either JSON or formatted text output based on the request URL
 9 | func DeviceInfo(resp http.ResponseWriter, req *http.Request) {
10 | 	device := getDeviceInfo(resp, req)
11 | 	if device == nil {
12 | 		return
13 | 	}
14 | 
15 | 	if isJson(req) {
16 | 		encode(resp, req, device)
17 | 		return
18 | 	}
19 | 
20 | 	printer(resp, req, device, deviceInfo)
21 | }
22 | 
23 | // DeviceStatus handles HTTP requests for device status by device ID
24 | // It returns either JSON or formatted text output based on the request URL
25 | func DeviceStatus(resp http.ResponseWriter, req *http.Request) {
26 | 	st := getDeviceStatus(resp, req)
27 | 	if st == nil {
28 | 		return
29 | 	}
30 | 
31 | 	if isJson(req) {
32 | 		encode(resp, req, st)
33 | 		return
34 | 	}
35 | 
36 | 	printer(resp, req, st, deviceStatus)
37 | }
38 | 
39 | // ProcessInfo handles HTTP requests for process information by PID
40 | // It returns either JSON or formatted text output based on the request URL
41 | func ProcessInfo(resp http.ResponseWriter, req *http.Request) {
42 | 	pInfo := getProcessInfo(resp, req)
43 | 	if len(pInfo) == 0 {
44 | 		return
45 | 	}
46 | 
47 | 	if isJson(req) {
48 | 		encode(resp, req, pInfo)
49 | 		return
50 | 	}
51 | 
52 | 	processPrint(resp, req, pInfo)
53 | }
54 | 
55 | // Health handles HTTP requests for device health status by device ID
56 | // It returns either JSON or formatted text output based on the request URL
57 | func Health(resp http.ResponseWriter, req *http.Request) {
58 | 	h := getHealth(resp, req)
59 | 	if h == nil {
60 | 		return
61 | 	}
62 | 
63 | 	if isJson(req) {
64 | 		encode(resp, req, h)
65 | 		return
66 | 	}
67 | 
68 | 	printer(resp, req, h, healthStatus)
69 | }
70 | 
71 | // Status handles HTTP requests for DCGM daemon status
72 | // It returns either JSON or formatted text output based on the request URL
73 | func Status(resp http.ResponseWriter, req *http.Request) {
74 | 	st := getStatus(resp, req)
75 | 	if st == nil {
76 | 		return
77 | 	}
78 | 
79 | 	if isJson(req) {
80 | 		encode(resp, req, st)
81 | 		return
82 | 	}
83 | 
84 | 	printer(resp, req, st, hostengine)
85 | }
86 | 


--------------------------------------------------------------------------------
/samples/deviceInfo/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"flag"
 5 | 	"log"
 6 | 	"os"
 7 | 	"text/template"
 8 | 
 9 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
10 | )
11 | 
const (
	// deviceInfo is the text/template source main uses to print one device
	// record; `or ... "N/A"` substitutes N/A for fields whose value is empty.
	deviceInfo = `Driver Version         : {{.Identifiers.DriverVersion}}
GPU		       : {{.GPU}}
DCGMSupported          : {{.DCGMSupported}}
UUID                   : {{.UUID}}
Brand                  : {{.Identifiers.Brand}}
Model                  : {{.Identifiers.Model}}
Serial Number          : {{.Identifiers.Serial}}
Vbios                  : {{or .Identifiers.Vbios "N/A"}}
InforomImage Version   : {{.Identifiers.InforomImageVersion}}
Bus ID                 : {{.PCI.BusID}}
BAR1 (MB)              : {{or .PCI.BAR1 "N/A"}}
FrameBuffer Memory (MB): {{or .PCI.FBTotal "N/A"}}
Bandwidth (MB/s)       : {{or .PCI.Bandwidth "N/A"}}
Power (W)              : {{or .Power "N/A"}}
CPUAffinity            : {{or .CPUAffinity "N/A"}}
P2P Available          : {{if not .Topology}}None{{else}}{{range .Topology}}
    GPU{{.GPU}} - (BusID){{.BusID}} - {{.Link.PCIPaths}}{{end}}{{end}}
---------------------------------------------------------------------
`
)
33 | 
// Command-line flags; main passes both values to dcgm.Init together with
// dcgm.Standalone.
var (
	// connectAddr is the nv-hostengine address to connect to.
	connectAddr = flag.String("connect", "localhost", "Provide nv-hostengine connection address.")
	// isSocket is a string flag (default "0") saying whether connectAddr is a
	// Unix socket; it is forwarded to dcgm.Init unparsed.
	isSocket    = flag.String("socket", "0", "Connecting to Unix socket?")
)
38 | 
39 | // mini version of nvidia-smi -q
40 | // dcgmi discovery -i apc
41 | func main() {
42 | 	// choose dcgm hostengine running mode
43 | 	// 1. dcgm.Embedded
44 | 	// 2. dcgm.Standalone -connect "addr", -socket "isSocket"
45 | 	// 3. dcgm.StartHostengine
46 | 	flag.Parse()
47 | 
48 | 	cleanup, err := dcgm.Init(dcgm.Standalone, *connectAddr, *isSocket)
49 | 	if err != nil {
50 | 		log.Panicln(err)
51 | 	}
52 | 
53 | 	defer cleanup()
54 | 
55 | 	count, err := dcgm.GetAllDeviceCount()
56 | 	if err != nil {
57 | 		log.Panicln(err)
58 | 	}
59 | 
60 | 	t := template.Must(template.New("Device").Parse(deviceInfo))
61 | 
62 | 	for i := uint(0); i < count; i++ {
63 | 		deviceInfo, err := dcgm.GetDeviceInfo(i)
64 | 		if err != nil {
65 | 			log.Panicln(err)
66 | 		}
67 | 
68 | 		if err = t.Execute(os.Stdout, deviceInfo); err != nil {
69 | 			log.Panicln("Template error:", err)
70 | 		}
71 | 	}
72 | }
73 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | repos:
 3 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 4 |     rev: v5.0.0
 5 |     hooks:
 6 |       - id: trailing-whitespace
 7 |       - id: end-of-file-fixer
 8 |       - id: check-added-large-files
 9 |   - repo: https://github.com/adrienverge/yamllint
10 |     rev: v1.37.1
11 |     hooks:
12 |       - id: yamllint
13 |         args: [--strict, -c=.yamllint.yaml]
14 |         entry: yamllint
15 |         files: \.ya?ml$
16 |   - repo: https://github.com/igorshubovych/markdownlint-cli
17 |     rev: v0.45.0
18 |     hooks:
19 |       - id: markdownlint
20 |         name: markdownlint
21 |         description: "Checks the style of Markdown/Commonmark files."
22 |         entry: ghcr.io/igorshubovych/markdownlint-cli
23 |         language: docker_image
24 |         types: [markdown]
25 |         minimum_pre_commit_version: 0.15.0
26 |   - repo: https://github.com/hadolint/hadolint
27 |     rev: v2.13.1-beta
28 |     hooks:
29 |       - id: hadolint
30 |         name: Lint Dockerfiles
31 |         args: [hadolint, --config, .hadolint.yaml]
32 |         description: Runs hadolint to lint Dockerfiles
33 |         language: docker_image
34 |         types: ["dockerfile"]
35 |         entry: hadolint/hadolint:v2.12.0-alpine
36 |   - repo: local
37 |     hooks:
38 |       - id: goimports-nvidia
39 |         name: goimports-nvidia
40 |         description: run goimports
41 |         entry: goimports -w -local nvidia.com/NVIDIA/go-dcgm
42 |         language: golang
43 |         types: [go]
44 |         exclude: '(\.pb|\.sql|mock_.*)\.go'
45 |       - id: go-mod-tidy
46 |         name: 'go-mod-tidy'
47 |         entry: scripts/lint/go-mod-tidy.sh
48 |         pass_filenames: false
49 |         language: 'script'
50 |         description: "Runs `go mod tidy -v`, requires golang"
51 |   - repo: https://github.com/golangci/golangci-lint
52 |     rev: v2.1.6
53 |     hooks:
54 |       - id: golangci-lint-config-verify
55 |         name: golangci-lint-config-verify
56 |         description: Verifies the configuration file
57 |         entry: golangci-lint config verify
58 |         files: .golangci.yml
59 |         language: golang
60 |         pass_filenames: false
61 |       - id: golangci-lint
62 |         name: golangci-lint
63 |         description: Fast linters runner for Go.
64 |         entry: golangci-lint run --new-from-rev origin/main --fix --allow-parallel-runners --timeout 5m
65 |         types: [go]
66 |         language: golang
67 |         require_serial: true
68 |         pass_filenames: false
69 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # CUDA_VERSION and DISTRO_FLAVOR are used to select a docker image tag from the upstream
 2 | # docker registry for nvidia/cuda.   The variation of DISTRO_FLAVOR and CUDA_VERSION must
 3 | # point to an image that exists, see here for list: https://hub.docker.com/r/nvidia/cuda/tags
 4 | 
 5 | # CUDA_VERSION
 6 | ARG CUDA_VERSION=12.5.1
 7 | # cuda image supports these images rockylinux9, rockylinux8, ubi9, ubi8, ubuntu24.04, ubuntu22.04, ubuntu20.04
 8 | # Note: Testing has only been done with the ubuntu variants.
 9 | ARG DISTRO_FLAVOR=ubuntu24.04
10 | 
11 | # Use build arguments to select our base image or just stick with the defaults above.
12 | FROM nvidia/cuda:$CUDA_VERSION-base-$DISTRO_FLAVOR AS base
13 | ARG DCGM_VERSION=4.2.3-2
14 | ARG GO_VERSION=1.24.4
15 | ENV DEBIAN_FRONTEND=noninteractive
16 | 
17 | SHELL ["/bin/bash", "-o", "pipefail", "-c"]
18 | 
19 | # Setup our apt environment and install the necessary keyrings and repositories to install dcgm.  Note that this strategy doesn't
20 | # support dcgm 3.x.
21 | # We want recommended packages for dcgm and we dont want to enforce version pinning...yet
22 | # hadolint ignore=DL3015,DL3008
23 | RUN apt-get update && apt-get install -y --no-install-recommends \
24 |     gnupg2 curl ca-certificates && \
25 |     curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb | apt-get install -y --no-install-recommends && \
26 |     curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/nvidia-machine-learning-repo-ubuntu2004_1.0.0-1_amd64.deb | apt-get install -y --no-install-recommends && \
27 |     curl -fsSL https://go.dev/dl/go${GO_VERSION}.linux-amd64.tar.gz | tar -C /usr/local -xz && \
28 |     apt-get purge --autoremove -y curl && \
29 |     apt-get install -y datacenter-gpu-manager-4-dev=1:${DCGM_VERSION} && \
30 |     rm -rf /var/lib/apt/lists/*
31 | 
32 | ENV PATH=$PATH:/usr/local/go/bin
33 | 
34 | # build go-dcgm and samples inside docker environment
35 | FROM base AS samples
36 | # hadolint ignore=DL3008,DL3015
37 | RUN apt-get update && apt-get install -y build-essential nvidia-utils-555 && rm -rf /var/lib/apt/lists/*
38 | COPY . /src
39 | WORKDIR /src
40 | RUN make binary && \
41 |     cp ./samples/restApi/restApi \
42 |       ./samples/processInfo/processInfo \
43 |       ./samples/diag/diag \
44 |       ./samples/hostengineStatus/hostengineStatus \
45 |       ./samples/dmon/dmon \
46 |       ./samples/health/health \
47 |       ./samples/topology/topology \
48 |       ./samples/deviceInfo/deviceInfo \
49 |       ./samples/policy/policy \
50 |     /usr/local/go/bin/
51 | WORKDIR /
52 | 


--------------------------------------------------------------------------------
/pkg/dcgm/cpu.go:
--------------------------------------------------------------------------------
 1 | package dcgm
 2 | 
 3 | /*
 4 | #include "dcgm_agent.h"
 5 | #include "dcgm_structs.h"
 6 | */
 7 | import "C"
 8 | 
 9 | import (
10 | 	"fmt"
11 | 	"unsafe"
12 | )
13 | 
/*
 * CPU limits taken from, or mirroring, the DCGM C headers.
 *
 * See dcgm_structs.h:
 *	DCGM_CPU_CORE_BITMASK_COUNT_V1 (DCGM_MAX_NUM_CPU_CORES / sizeof(uint64_t) / CHAR_BIT)
 *	or
 *	1024 / 8 / 8
 */

const (
	// MAX_NUM_CPU_CORES represents the maximum number of CPU cores supported
	MAX_NUM_CPU_CORES = uint(C.DCGM_MAX_NUM_CPU_CORES)

	// MAX_NUM_CPUS represents the maximum number of CPUs supported
	MAX_NUM_CPUS = uint(C.DCGM_MAX_NUM_CPUS)

	// CHAR_BIT represents the number of bits in a byte
	CHAR_BIT = uint(C.CHAR_BIT)

	// MAX_CPU_CORE_BITMASK_COUNT represents the maximum count of CPU core bitmasks.
	// It is hard-coded as 1024 / 8 / 8 (= 16) rather than derived from the C
	// constants above.
	MAX_CPU_CORE_BITMASK_COUNT = uint(1024 / 8 / 8)
)
34 | 
// CPUHierarchyCPU_v1 represents information about a single CPU and its owned cores
type CPUHierarchyCPU_v1 struct {
	// CPUID is the unique identifier for this CPU
	CPUID uint
	// OwnedCores is a bitmask array representing the cores owned by this CPU.
	// toCpuHierarchy fills it with MAX_CPU_CORE_BITMASK_COUNT 64-bit words.
	OwnedCores []uint64
}
42 | 
// CPUHierarchy_v1 represents version 1 of the CPU hierarchy information
type CPUHierarchy_v1 struct {
	// Version is the version number of the hierarchy structure
	Version uint
	// NumCPUs is the number of CPUs in the system
	NumCPUs uint
	// CPUs contains information about each CPU in the system; only the first
	// NumCPUs entries are populated by toCpuHierarchy, the rest stay zero.
	CPUs [MAX_NUM_CPUS]CPUHierarchyCPU_v1
}
52 | 
53 | // GetCPUHierarchy retrieves the CPU hierarchy information from DCGM
54 | func GetCPUHierarchy() (hierarchy CPUHierarchy_v1, err error) {
55 | 	var c_hierarchy C.dcgmCpuHierarchy_v1
56 | 	c_hierarchy.version = C.dcgmCpuHierarchy_version1
57 | 	ptr_hierarchy := (*C.dcgmCpuHierarchy_v1)(unsafe.Pointer(&c_hierarchy))
58 | 	result := C.dcgmGetCpuHierarchy(handle.handle, ptr_hierarchy)
59 | 
60 | 	if err = errorString(result); err != nil {
61 | 		return toCpuHierarchy(c_hierarchy), fmt.Errorf("error retrieving DCGM CPU hierarchy: %s", err)
62 | 	}
63 | 
64 | 	return toCpuHierarchy(c_hierarchy), nil
65 | }
66 | 
67 | func toCpuHierarchy(c_hierarchy C.dcgmCpuHierarchy_v1) CPUHierarchy_v1 {
68 | 	var hierarchy CPUHierarchy_v1
69 | 	hierarchy.Version = uint(c_hierarchy.version)
70 | 	hierarchy.NumCPUs = uint(c_hierarchy.numCpus)
71 | 	for i := uint(0); i < hierarchy.NumCPUs; i++ {
72 | 		bits := make([]uint64, MAX_CPU_CORE_BITMASK_COUNT)
73 | 
74 | 		for j := uint(0); j < MAX_CPU_CORE_BITMASK_COUNT; j++ {
75 | 			bits[j] = uint64(c_hierarchy.cpus[i].ownedCores.bitmask[j])
76 | 		}
77 | 
78 | 		hierarchy.CPUs[i] = CPUHierarchyCPU_v1{
79 | 			CPUID:      uint(c_hierarchy.cpus[i].cpuId),
80 | 			OwnedCores: bits,
81 | 		}
82 | 	}
83 | 
84 | 	return hierarchy
85 | }
86 | 


--------------------------------------------------------------------------------
/pkg/dcgm/gpu_group_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package dcgm
18 | 
19 | import (
20 | 	"context"
21 | 	"testing"
22 | 
23 | 	"github.com/stretchr/testify/assert"
24 | 	"github.com/stretchr/testify/require"
25 | )
26 | 
27 | func TestGroupHandle(t *testing.T) {
28 | 	gh := GroupHandle{}
29 | 	assert.Equal(t, uintptr(0), gh.GetHandle(), "value mismatch")
30 | 
31 | 	inputs := []uintptr{1000, 0, 1, 10, 11, 50, 100, 1939902, 9992932938239, 999999999999999999}
32 | 
33 | 	for _, input := range inputs {
34 | 		gh.SetHandle(input)
35 | 		assert.Equal(t, input, gh.GetHandle(), "values mismatch")
36 | 	}
37 | }
38 | 
39 | func TestGetGroupInfo(t *testing.T) {
40 | 	teardownTest := setupTest(t)
41 | 	defer teardownTest(t)
42 | 
43 | 	runOnlyWithLiveGPUs(t)
44 | 	gpus, err := withInjectionGPUs(t, 1)
45 | 	require.NoError(t, err)
46 | 
47 | 	gpuID := gpus[0]
48 | 
49 | 	groupID, err := CreateGroup("test1")
50 | 	require.NoError(t, err)
51 | 
52 | 	defer func() {
53 | 		_ = DestroyGroup(groupID)
54 | 	}()
55 | 
56 | 	err = AddEntityToGroup(groupID, FE_GPU, gpuID)
57 | 	require.NoError(t, err)
58 | 
59 | 	grInfo, err := GetGroupInfo(groupID)
60 | 	require.NoError(t, err)
61 | 
62 | 	assert.Equal(t, "test1", grInfo.GroupName)
63 | 	assert.Len(t, grInfo.EntityList, 1)
64 | 	assert.Equal(t, FE_GPU, grInfo.EntityList[0].EntityGroupId)
65 | 	assert.Equal(t, gpuID, grInfo.EntityList[0].EntityId)
66 | }
67 | 
68 | func TestCreateGroupWithContext(t *testing.T) {
69 | 	teardownTest := setupTest(t)
70 | 	defer teardownTest(t)
71 | 
72 | 	runOnlyWithLiveGPUs(t)
73 | 
74 | 	t.Run("successful creation", func(t *testing.T) {
75 | 		ctx := context.Background()
76 | 		groupName := "test_group"
77 | 
78 | 		group, err := CreateGroupWithContext(ctx, groupName)
79 | 		require.NoError(t, err)
80 | 		require.NotZero(t, group.GetHandle())
81 | 
82 | 		// Clean up
83 | 		err = DestroyGroup(group)
84 | 		require.NoError(t, err)
85 | 	})
86 | 
87 | 	t.Run("context cancellation", func(t *testing.T) {
88 | 		ctx, cancel := context.WithCancel(context.Background())
89 | 		cancel() // Cancel immediately
90 | 
91 | 		group, err := CreateGroupWithContext(ctx, "test_group")
92 | 		require.Error(t, err)
93 | 		require.Equal(t, context.Canceled, err)
94 | 		require.Zero(t, group.GetHandle())
95 | 	})
96 | }
97 | 


--------------------------------------------------------------------------------
/pkg/dcgm/utils.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include <stdlib.h>
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | 	"math"
 12 | 	"unsafe"
 13 | )
 14 | 
// Thresholds used by IsInt32Blank and IsInt64Blank: values at or above them
// are treated as blank.
const (
	dcgmInt32Blank = 0x7ffffff0         // 2147483632
	dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792
)
 19 | 
 20 | func uintPtr(c C.uint) *uint {
 21 | 	i := uint(c)
 22 | 	return &i
 23 | }
 24 | 
 25 | func uint64Ptr(c C.longlong) *uint64 {
 26 | 	i := uint64(c)
 27 | 	return &i
 28 | }
 29 | 
 30 | func int64Ptr(c C.longlong) *int64 {
 31 | 	i := int64(c)
 32 | 	return &i
 33 | }
 34 | 
 35 | func toInt64(c C.longlong) int64 {
 36 | 	i := int64(c)
 37 | 	return i
 38 | }
 39 | 
 40 | func dblToFloat(val C.double) *float64 {
 41 | 	i := float64(val)
 42 | 	return &i
 43 | }
 44 | 
 45 | func stringPtr(c *C.char) *string {
 46 | 	s := C.GoString(c)
 47 | 	return &s
 48 | }
 49 | 
 50 | // Error represents an error returned by the DCGM library
 51 | type Error struct {
 52 | 	msg  string         // description of error
 53 | 	Code C.dcgmReturn_t // dcgmReturn_t value of error
 54 | }
 55 | 
 56 | func (e *Error) Error() string { return e.msg }
 57 | 
 58 | func errorString(result C.dcgmReturn_t) error {
 59 | 	if result == C.DCGM_ST_OK {
 60 | 		return nil
 61 | 	}
 62 | 	err := C.GoString(C.errorString(result))
 63 | 	return fmt.Errorf("%v", err)
 64 | }
 65 | 
 66 | func freeCString(cStr *C.char) {
 67 | 	C.free(unsafe.Pointer(cStr))
 68 | }
 69 | 
 70 | // IsInt32Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffff0).
 71 | // These values indicate that no valid data is available for the field.
 72 | func IsInt32Blank(value int) bool {
 73 | 	return value >= dcgmInt32Blank
 74 | }
 75 | 
 76 | // IsInt64Blank checks if an integer value represents DCGM's "blank" or sentinel value (0x7ffffffffffffff0).
 77 | // These values indicate that no valid data is available for the field.
 78 | func IsInt64Blank(value int64) bool {
 79 | 	return value >= dcgmInt64Blank
 80 | }
 81 | 
 82 | func makeVersion1(struct_type uintptr) C.uint {
 83 | 	version := C.uint(struct_type | 1<<24)
 84 | 	return version
 85 | }
 86 | 
 87 | func makeVersion2(struct_type uintptr) C.uint {
 88 | 	version := C.uint(struct_type | 2<<24)
 89 | 	return version
 90 | }
 91 | 
 92 | func makeVersion3(struct_type uintptr) C.uint {
 93 | 	version := C.uint(struct_type | 3<<24)
 94 | 	return version
 95 | }
 96 | 
 97 | func makeVersion4(struct_type uintptr) C.uint {
 98 | 	version := C.uint(struct_type | 4<<24)
 99 | 	return version
100 | }
101 | 
102 | func makeVersion5(struct_type uintptr) C.uint {
103 | 	version := C.uint(struct_type | 5<<24)
104 | 	return version
105 | }
106 | 
107 | func makeVersion12(struct_type uintptr) C.uint {
108 | 	version := C.uint(struct_type | 12<<24)
109 | 	return version
110 | }
111 | 
// roundFloat returns a pointer to a new float64 holding *f rounded with
// math.Round. A nil f yields a pointer to 0; the input is never modified.
func roundFloat(f *float64) *float64 {
	rounded := new(float64)
	if f == nil {
		return rounded
	}

	*rounded = math.Round(*f)

	return rounded
}
119 | 


--------------------------------------------------------------------------------
/samples/restApi/server.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"log"
 6 | 	"net/http"
 7 | 	"time"
 8 | 
 9 | 	h "github.com/NVIDIA/go-dcgm/samples/restApi/handlers"
10 | 	"github.com/gorilla/mux"
11 | )
12 | 
// timeout is used as both the ReadTimeout and the WriteTimeout of the
// http.Server built by newHttpServer.
const timeout = 5 * time.Second
14 | 
// httpServer represents an HTTP server instance that handles DCGM REST API endpoints
type httpServer struct {
	// router holds the routes registered by handler; it is also the server's Handler.
	router *mux.Router
	// server is the underlying net/http server started by serve.
	server *http.Server
}
20 | 
21 | // newHttpServer creates and configures a new HTTP server instance
22 | // addr specifies the address:port to listen on
23 | func newHttpServer(addr string) *httpServer {
24 | 	r := mux.NewRouter()
25 | 
26 | 	s := &httpServer{
27 | 		router: r,
28 | 		server: &http.Server{
29 | 			Addr:         addr,
30 | 			Handler:      r,
31 | 			ReadTimeout:  timeout,
32 | 			WriteTimeout: timeout,
33 | 		},
34 | 	}
35 | 
36 | 	// make a global map of device uuids and ids
37 | 	h.DevicesUuids()
38 | 
39 | 	s.handler()
40 | 
41 | 	return s
42 | }
43 | 
44 | func (s *httpServer) handler() {
45 | 	deviceInfo := "/dcgm/device/info"
46 | 	subrouter := s.router.PathPrefix(deviceInfo).Subrouter()
47 | 	subrouter.HandleFunc("/id/{id}", h.DeviceInfo).Methods("GET")
48 | 	subrouter.HandleFunc("/id/{id}/json", h.DeviceInfo).Methods("GET")
49 | 	subrouter.HandleFunc("/uuid/{uuid}", h.DeviceInfoByUuid).Methods("GET")
50 | 	subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceInfoByUuid).Methods("GET")
51 | 
52 | 	deviceStatus := "/dcgm/device/status"
53 | 	subrouter = s.router.PathPrefix(deviceStatus).Subrouter()
54 | 	subrouter.HandleFunc("/id/{id}", h.DeviceStatus).Methods("GET")
55 | 	subrouter.HandleFunc("/id/{id}/json", h.DeviceStatus).Methods("GET")
56 | 	subrouter.HandleFunc("/uuid/{uuid}", h.DeviceStatusByUuid).Methods("GET")
57 | 	subrouter.HandleFunc("/uuid/{uuid}/json", h.DeviceStatusByUuid).Methods("GET")
58 | 
59 | 	processInfo := "/dcgm/process/info/pid/{pid}"
60 | 	subrouter = s.router.PathPrefix(processInfo).Subrouter()
61 | 	subrouter.HandleFunc("", h.ProcessInfo).Methods("GET")
62 | 	subrouter.HandleFunc("/json", h.ProcessInfo).Methods("GET")
63 | 
64 | 	health := "/dcgm/health"
65 | 	subrouter = s.router.PathPrefix(health).Subrouter()
66 | 	subrouter.HandleFunc("/id/{id}", h.Health).Methods("GET")
67 | 	subrouter.HandleFunc("/id/{id}/json", h.Health).Methods("GET")
68 | 	subrouter.HandleFunc("/uuid/{uuid}", h.HealthByUuid).Methods("GET")
69 | 	subrouter.HandleFunc("/uuid/{uuid}/json", h.HealthByUuid).Methods("GET")
70 | 
71 | 	dcgmStatus := "/dcgm/status"
72 | 	subrouter = s.router.PathPrefix(dcgmStatus).Subrouter()
73 | 	subrouter.HandleFunc("", h.Status).Methods("GET")
74 | 	subrouter.HandleFunc("/json", h.Status).Methods("GET")
75 | }
76 | 
77 | func (s *httpServer) serve() {
78 | 	if err := s.server.ListenAndServe(); err != http.ErrServerClosed {
79 | 		log.Printf("Error: %v", err)
80 | 	}
81 | }
82 | 
83 | func (s *httpServer) stop() {
84 | 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
85 | 	defer cancel()
86 | 
87 | 	if err := s.server.Shutdown(ctx); err != nil {
88 | 		log.Printf("Error: %v", err)
89 | 	} else {
90 | 		log.Println("http server stopped")
91 | 	}
92 | }
93 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contribute to the DCGM Golang Bindings
 2 | 
 3 | Want to hack on the NVIDIA DCGM Golang Bindings Project? Awesome!
 4 | We only require you to sign your work; the section below describes how.
 5 | 
 6 | ## Validate your work
 7 | 
 8 | All changes need to be able to pass all linting and pre-commit checks.  All tests
 9 | must pass, including `make lint-full`, `pre-commit run --all-files`, and `make test-main`.
10 | 
11 | Note: There is a race in `make test-main` and it will occasionally fail due to the race.
12 | 
13 | ### Setting up pre-commit
14 | 
15 | You can install pre-commit via brew, apt/dnf, or via pip:
16 | 
17 | ```bash
18 | pip install pre-commit
19 | ```
20 | 
21 | Once installed, you can run:
22 | 
23 | ```bash
24 | make install-pre-commit
25 | pre-commit autoupdate
26 | ```
27 | 
28 | Once you've completed this step, pre-commit is set up and ready to go.  The pre-commit hooks
29 | will be executed when you run `git commit`.
30 | 
31 | ## Sign your work
32 | 
33 | The sign-off is a simple line at the end of the explanation for the patch. Your
34 | signature certifies that you wrote the patch or otherwise have the right to pass
35 | it on as an open-source patch. The rules are pretty simple: if you can certify
36 | the below (from [developercertificate.org](http://developercertificate.org/)):
37 | 
38 | ```bash
39 | Developer Certificate of Origin
40 | Version 1.1
41 | 
42 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
43 | 1 Letterman Drive
44 | Suite D4700
45 | San Francisco, CA, 94129
46 | 
47 | Everyone is permitted to copy and distribute verbatim copies of this
48 | license document, but changing it is not allowed.
49 | 
50 | Developer's Certificate of Origin 1.1
51 | 
52 | By making a contribution to this project, I certify that:
53 | 
54 | (a) The contribution was created in whole or in part by me and I
55 |     have the right to submit it under the open source license
56 |     indicated in the file; or
57 | 
58 | (b) The contribution is based upon previous work that, to the best
59 |     of my knowledge, is covered under an appropriate open source
60 |     license and I have the right under that license to submit that
61 |     work with modifications, whether created in whole or in part
62 |     by me, under the same open source license (unless I am
63 |     permitted to submit under a different license), as indicated
64 |     in the file; or
65 | 
66 | (c) The contribution was provided directly to me by some other
67 |     person who certified (a), (b) or (c) and I have not modified
68 |     it.
69 | 
70 | (d) I understand and agree that this project and the contribution
71 |     are public and that a record of the contribution (including all
72 |     personal information I submit with it, including my sign-off) is
73 |     maintained indefinitely and may be redistributed consistent with
74 |     this project or the open source license(s) involved.
75 | ```
76 | 
77 | Then you just add a line to every git commit message:
78 | 
79 | ```bash
80 |     Signed-off-by: Joe Smith <joe.smith@email.com>
81 | ```
82 | 
83 | Use your real name (sorry, no pseudonyms or anonymous contributions.)
84 | 
85 | If you set your `user.name` and `user.email` git configs, you can sign your
86 | commit automatically with `git commit -s`.
87 | 


--------------------------------------------------------------------------------
/pkg/dcgm/instances_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package dcgm
18 | 
19 | import (
20 | 	"testing"
21 | 
22 | 	"github.com/stretchr/testify/assert"
23 | 	"github.com/stretchr/testify/require"
24 | )
25 | 
26 | func TestMigDeviceProfileNamesStandalone(t *testing.T) {
27 | 	// Setup test environment
28 | 	teardown := setupTest(t)
29 | 	defer teardown(t)
30 | 
31 | 	// Create one fake GPU
32 | 	gpuIDs, err := withInjectionGPUs(t, 1)
33 | 	require.NoError(t, err)
34 | 	require.Len(t, gpuIDs, 1, "Expected 1 fake GPU to be created")
35 | 
36 | 	// Create one GPU instance on the fake GPU
37 | 	gpuInstanceMap, err := withInjectionGPUInstances(t, gpuIDs[0], 1)
38 | 	require.NoError(t, err)
39 | 	require.Len(t, gpuInstanceMap, 1, "Expected 1 fake GPU instance to be created")
40 | 
41 | 	// Get the GPU instance IDs
42 | 	gpuInstanceIDs := make([]uint, 0, len(gpuInstanceMap))
43 | 	for instanceID := range gpuInstanceMap {
44 | 		gpuInstanceIDs = append(gpuInstanceIDs, instanceID)
45 | 	}
46 | 
47 | 	// Create one compute instance per GPU instance
48 | 	ciToGiMap, err := withInjectionComputeInstances(t, gpuInstanceIDs, len(gpuInstanceIDs))
49 | 	require.NoError(t, err)
50 | 	require.Len(t, ciToGiMap, len(gpuInstanceIDs), "Expected one compute instance per GPU instance")
51 | 
52 | 	// Get the compute instance IDs
53 | 	computeInstanceIds := make([]uint, 0, len(ciToGiMap))
54 | 	for ciId := range ciToGiMap {
55 | 		computeInstanceIds = append(computeInstanceIds, ciId)
56 | 	}
57 | 
58 | 	// Verify profile names for both GPU instances and compute instances
59 | 	verifyProfileNames(t, gpuInstanceIDs, true)      // verify GPU instances
60 | 	verifyProfileNames(t, computeInstanceIds, false) // verify compute instances
61 | }
62 | 
63 | // verifyProfileNames verifies that the MIG profile names exist for the given entities
64 | func verifyProfileNames(tb testing.TB, entityIds []uint, isGpuInstance bool) {
65 | 	tb.Helper()
66 | 
67 | 	// Create entity list for the query
68 | 	entities := make([]GroupEntityPair, 0, len(entityIds))
69 | 	for _, entityId := range entityIds {
70 | 		entity := GroupEntityPair{
71 | 			EntityId: entityId,
72 | 		}
73 | 		if isGpuInstance {
74 | 			entity.EntityGroupId = FE_GPU_I
75 | 		} else {
76 | 			entity.EntityGroupId = FE_GPU_CI
77 | 		}
78 | 		entities = append(entities, entity)
79 | 	}
80 | 
81 | 	// Get the latest values for DCGM_FI_DEV_NAME field
82 | 	values, err := EntitiesGetLatestValues(entities, []Short{DCGM_FI_DEV_NAME}, DCGM_FV_FLAG_LIVE_DATA)
83 | 	require.NoError(tb, err)
84 | 
85 | 	// Define expected profile names
86 | 	expectedFakeName := "1fc.1g.4gb"
87 | 	if isGpuInstance {
88 | 		expectedFakeName = "1fg.4gb"
89 | 	}
90 | 
91 | 	// Verify each entity has the correct profile name
92 | 	for i := range values {
93 | 		assert.Equal(tb, expectedFakeName, values[i].String(),
94 | 			"Fake profile name appears to be wrong for entity %d. Expected '%s', found '%s'",
95 | 			values[i].EntityID, expectedFakeName, values[i].String())
96 | 	}
97 | }
98 | 


--------------------------------------------------------------------------------
/pkg/dcgm/diag_test_helpers.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package dcgm
18 | 
19 | /*
20 | #include <stdlib.h>
21 | #include <string.h>
22 | #include "dcgm_agent.h"
23 | #include "dcgm_structs.h"
24 | */
25 | import "C"
26 | 
27 | import (
28 | 	"unsafe"
29 | )
30 | 
31 | // createTestDiagResponse creates a dcgmDiagResponse_v12 for testing
32 | func createTestDiagResponse() C.dcgmDiagResponse_v12 {
33 | 	var response C.dcgmDiagResponse_v12
34 | 	response.version = C.dcgmDiagResponse_version12
35 | 	return response
36 | }
37 | 
38 | // addInfoMessage adds an info message to a dcgmDiagResponse_v12 for testing
39 | func addInfoMessage(response *C.dcgmDiagResponse_v12, entityID uint, testID uint, message string) {
40 | 	idx := response.numInfo
41 | 	cStr := C.CString(message)
42 | 	defer C.free(unsafe.Pointer(cStr))
43 | 	C.strcpy(&response.info[idx].msg[0], cStr)
44 | 	response.info[idx].entity.entityId = C.uint(entityID)
45 | 	response.info[idx].entity.entityGroupId = C.DCGM_FE_GPU
46 | 	response.info[idx].testId = C.uint(testID)
47 | 	response.numInfo++
48 | }
49 | 
50 | // addDiagResult adds a diagnostic result to a dcgmDiagResponse_v12 for testing
51 | func addDiagResult(response *C.dcgmDiagResponse_v12, entityID uint, testID uint, result int) {
52 | 	idx := response.numResults
53 | 	response.results[idx].entity.entityId = C.uint(entityID)
54 | 	response.results[idx].entity.entityGroupId = C.DCGM_FE_GPU
55 | 	response.results[idx].testId = C.uint(testID)
56 | 	response.results[idx].result = C.dcgmDiagResult_t(result)
57 | 	response.numResults++
58 | }
59 | 
60 | // addEntityWithSerial adds an entity with serial number to a dcgmDiagResponse_v12 for testing
61 | func addEntityWithSerial(response *C.dcgmDiagResponse_v12, entityID uint, serialNumber string) {
62 | 	idx := response.numEntities
63 | 	cStr := C.CString(serialNumber)
64 | 	defer C.free(unsafe.Pointer(cStr))
65 | 	C.strcpy(&response.entities[idx].serialNum[0], cStr)
66 | 	response.entities[idx].entity.entityId = C.uint(entityID)
67 | 	response.entities[idx].entity.entityGroupId = C.DCGM_FE_GPU
68 | 	response.numEntities++
69 | }
70 | 
// Test constants exposed for testing. Each one is a direct alias of the
// same-named C constant reached through cgo.
// NOTE(review): presumably these exist because _test.go files cannot import
// "C" themselves — confirm.
const (
	// Diagnostic result codes (DCGM_DIAG_RESULT_*).
	testDiagResultPass   = C.DCGM_DIAG_RESULT_PASS
	testDiagResultSkip   = C.DCGM_DIAG_RESULT_SKIP
	testDiagResultWarn   = C.DCGM_DIAG_RESULT_WARN
	testDiagResultFail   = C.DCGM_DIAG_RESULT_FAIL
	testDiagResultNotRun = C.DCGM_DIAG_RESULT_NOT_RUN

	// Diagnostic test indices (DCGM_*_INDEX).
	testMemoryIndex          = C.DCGM_MEMORY_INDEX
	testDiagnosticIndex      = C.DCGM_DIAGNOSTIC_INDEX
	testPCIIndex             = C.DCGM_PCI_INDEX
	testSMStressIndex        = C.DCGM_SM_STRESS_INDEX
	testTargetedStressIndex  = C.DCGM_TARGETED_STRESS_INDEX
	testTargetedPowerIndex   = C.DCGM_TARGETED_POWER_INDEX
	testMemoryBandwidthIndex = C.DCGM_MEMORY_BANDWIDTH_INDEX
	testMemtestIndex         = C.DCGM_MEMTEST_INDEX
	testPulseTestIndex       = C.DCGM_PULSE_TEST_INDEX
	testEUDTestIndex         = C.DCGM_EUD_TEST_INDEX
	testSoftwareIndex        = C.DCGM_SOFTWARE_INDEX
	testContextCreateIndex   = C.DCGM_CONTEXT_CREATE_INDEX
)
92 | 


--------------------------------------------------------------------------------
/samples/processInfo/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"flag"
 5 | 	"log"
 6 | 	"os"
 7 | 	"text/template"
 8 | 	"time"
 9 | 
10 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
11 | )
12 | 
const (
	// processInfo is the text/template source used by main to print one
	// report per element returned by dcgm.GetProcessInfo. Most fields are
	// rendered through `or <field> "N/A"`, so an empty value prints as N/A;
	// GPU, PID, the start/end times and the XID error count are printed
	// directly.
	processInfo = `----------------------------------------------------------------------
GPU ID			     : {{.GPU}}
----------Execution Stats---------------------------------------------
PID                          : {{.PID}}
Name                         : {{or .Name "N/A"}}
Start Time                   : {{.ProcessUtilization.StartTime.String}}
End Time                     : {{.ProcessUtilization.EndTime.String}}
----------Performance Stats-------------------------------------------
Energy Consumed (Joules)     : {{or .ProcessUtilization.EnergyConsumed "N/A"}}
Max GPU Memory Used (bytes)  : {{or .Memory.GlobalUsed "N/A"}}
Avg SM Clock (MHz)           : {{or .Clocks.Cores "N/A"}}
Avg Memory Clock (MHz)       : {{or .Clocks.Memory "N/A"}}
Avg SM Utilization (%)       : {{or .GpuUtilization.GPU "N/A"}}
Avg Memory Utilization (%)   : {{or .GpuUtilization.Memory "N/A"}}
Avg PCIe Rx Bandwidth (MB)   : {{or .PCI.Throughput.Rx "N/A"}}
Avg PCIe Tx Bandwidth (MB)   : {{or .PCI.Throughput.Tx "N/A"}}
----------Event Stats-------------------------------------------------
Single Bit ECC Errors        : {{or .Memory.ECCErrors.SingleBit "N/A"}}
Double Bit ECC Errors        : {{or .Memory.ECCErrors.DoubleBit "N/A"}}
Critical XID Errors          : {{.XIDErrors.NumErrors}}
----------Slowdown Stats----------------------------------------------
Due to - Power (%)           : {{or .Violations.Power "N/A"}}
       - Thermal (%)         : {{or .Violations.Thermal "N/A"}}
       - Reliability (%)     : {{or .Violations.Reliability "N/A"}}
       - Board Limit (%)     : {{or .Violations.BoardLimit "N/A"}}
       - Low Utilization (%) : {{or .Violations.LowUtilization "N/A"}}
       - Sync Boost (%)      : {{or .Violations.SyncBoost "N/A"}}
----------Process Utilization-----------------------------------------
Avg SM Utilization (%)       : {{or .ProcessUtilization.SmUtil "N/A"}}
Avg Memory Utilization (%)   : {{or .ProcessUtilization.MemUtil "N/A"}}
----------------------------------------------------------------------
`
)
47 | 
48 | // NOTE: The "WatchPidFields()" function must be initially called (as root) BEFORE starting the process to be monitored:
49 | //  1. Run as root, for enabling health watches
50 | //     sudo dcgmi stats -e
51 | //  2. Start process to be monitored
52 | //  3. Run processInfo. This is equivalent to "dcgmi stats --pid ENTERPID -v"
53 | //     go build && ./processInfo -pid PID
54 | func main() {
55 | 	process := flag.Uint("pid", 0, "Provide pid to get this process information.")
56 | 
57 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
58 | 	if err != nil {
59 | 		log.Panicln(err)
60 | 	}
61 | 	defer cleanup()
62 | 
63 | 	// Request DCGM to start recording stats for GPU process fields
64 | 	group, err := dcgm.WatchPidFields()
65 | 	if err != nil {
66 | 		log.Panicln(err)
67 | 	}
68 | 
69 | 	// Before retrieving process stats, wait few seconds for watches to be enabled and collect data
70 | 	log.Println("Enabling DCGM watches to start collecting process stats. This may take a few seconds....")
71 | 	time.Sleep(3000 * time.Millisecond)
72 | 
73 | 	flag.Parse()
74 | 
75 | 	pidInfo, err := dcgm.GetProcessInfo(group, *process)
76 | 	if err != nil {
77 | 		log.Panicln(err)
78 | 	}
79 | 
80 | 	t := template.Must(template.New("Process").Parse(processInfo))
81 | 	for i := range pidInfo {
82 | 		if err = t.Execute(os.Stdout, pidInfo[i]); err != nil {
83 | 			log.Panicln("Template error:", err)
84 | 		}
85 | 	}
86 | }
87 | 


--------------------------------------------------------------------------------
/samples/restApi/handlers/dcgm.go:
--------------------------------------------------------------------------------
  1 | package handlers
  2 | 
  3 | import (
  4 | 	"log"
  5 | 	"math"
  6 | 	"net/http"
  7 | 	"time"
  8 | 
  9 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
 10 | 
 11 | 	"github.com/gorilla/mux"
 12 | )
 13 | 
 14 | func getStatus(resp http.ResponseWriter, req *http.Request) (status *dcgm.Status) {
 15 | 	st, err := dcgm.Introspect()
 16 | 	if err != nil {
 17 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
 18 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
 19 | 
 20 | 		return
 21 | 	}
 22 | 
 23 | 	return &st
 24 | }
 25 | 
 26 | func getDeviceInfo(resp http.ResponseWriter, req *http.Request) (device *dcgm.Device) {
 27 | 	var id uint
 28 | 
 29 | 	params := mux.Vars(req)
 30 | 	for k, v := range params {
 31 | 		switch k {
 32 | 		case "id":
 33 | 			id = getId(resp, req, v)
 34 | 		case "uuid":
 35 | 			id = getIdByUuid(resp, req, v)
 36 | 		}
 37 | 	}
 38 | 
 39 | 	if id == math.MaxUint32 {
 40 | 		return
 41 | 	}
 42 | 
 43 | 	if !isValidId(id, resp, req) {
 44 | 		return
 45 | 	}
 46 | 
 47 | 	d, err := dcgm.GetDeviceInfo(id)
 48 | 	if err != nil {
 49 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
 50 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
 51 | 
 52 | 		return
 53 | 	}
 54 | 
 55 | 	return &d
 56 | }
 57 | 
 58 | func getDeviceStatus(resp http.ResponseWriter, req *http.Request) (status *dcgm.DeviceStatus) {
 59 | 	var id uint
 60 | 
 61 | 	params := mux.Vars(req)
 62 | 	for k, v := range params {
 63 | 		switch k {
 64 | 		case "id":
 65 | 			id = getId(resp, req, v)
 66 | 		case "uuid":
 67 | 			id = getIdByUuid(resp, req, v)
 68 | 		}
 69 | 	}
 70 | 
 71 | 	if id == math.MaxUint32 {
 72 | 		return
 73 | 	}
 74 | 
 75 | 	if !isValidId(id, resp, req) {
 76 | 		return
 77 | 	}
 78 | 
 79 | 	if !isDcgmSupported(id, resp, req) {
 80 | 		return
 81 | 	}
 82 | 
 83 | 	st, err := dcgm.GetDeviceStatus(id)
 84 | 	if err != nil {
 85 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
 86 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
 87 | 
 88 | 		return
 89 | 	}
 90 | 
 91 | 	return &st
 92 | }
 93 | 
 94 | func getHealth(resp http.ResponseWriter, req *http.Request) (health *dcgm.DeviceHealth) {
 95 | 	var id uint
 96 | 
 97 | 	params := mux.Vars(req)
 98 | 	for k, v := range params {
 99 | 		switch k {
100 | 		case "id":
101 | 			id = getId(resp, req, v)
102 | 		case "uuid":
103 | 			id = getIdByUuid(resp, req, v)
104 | 		}
105 | 	}
106 | 
107 | 	if id == math.MaxUint32 {
108 | 		return
109 | 	}
110 | 
111 | 	if !isValidId(id, resp, req) {
112 | 		return
113 | 	}
114 | 
115 | 	h, err := dcgm.HealthCheckByGpuId(id)
116 | 	if err != nil {
117 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
118 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
119 | 
120 | 		return
121 | 	}
122 | 
123 | 	return &h
124 | }
125 | 
126 | func getProcessInfo(resp http.ResponseWriter, req *http.Request) (pInfo []dcgm.ProcessInfo) {
127 | 	params := mux.Vars(req)
128 | 
129 | 	pid := getId(resp, req, params["pid"])
130 | 	if pid == math.MaxUint32 {
131 | 		return
132 | 	}
133 | 
134 | 	group, err := dcgm.WatchPidFields()
135 | 	if err != nil {
136 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
137 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
138 | 
139 | 		return
140 | 	}
141 | 
142 | 	// wait for watches to be enabled
143 | 	log.Printf("Enabling DCGM watches to start collecting process stats. This may take a few seconds....")
144 | 	time.Sleep(3000 * time.Millisecond)
145 | 
146 | 	pInfo, err = dcgm.GetProcessInfo(group, pid)
147 | 	if err != nil {
148 | 		http.Error(resp, err.Error(), http.StatusInternalServerError)
149 | 		log.Printf("error: %v%v: %v", req.Host, req.URL, err.Error())
150 | 	}
151 | 
152 | 	return
153 | }
154 | 


--------------------------------------------------------------------------------
/pkg/dcgm/structs.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package dcgm
18 | 
// MigProfile represents the Multi-Instance GPU (MIG) profile type.
// GPU-instance profiles occupy values 1-10 and compute-instance profiles
// occupy values 30-37; 0 means no profile.
// NOTE(review): the numeric values presumably mirror the corresponding DCGM
// C enum — confirm before renumbering anything.
type MigProfile int

const (
	// MigProfileNone indicates no MIG profile is set (for GPUs)
	MigProfileNone MigProfile = 0 /*!< No profile (for GPUs) */
	// MigProfileGPUInstanceSlice1 represents GPU instance slice 1
	MigProfileGPUInstanceSlice1 MigProfile = 1 /*!< GPU instance slice 1 */
	// MigProfileGPUInstanceSlice2 represents GPU instance slice 2
	MigProfileGPUInstanceSlice2 MigProfile = 2 /*!< GPU instance slice 2 */
	// MigProfileGPUInstanceSlice3 represents GPU instance slice 3
	MigProfileGPUInstanceSlice3 MigProfile = 3 /*!< GPU instance slice 3 */
	// MigProfileGPUInstanceSlice4 represents GPU instance slice 4
	MigProfileGPUInstanceSlice4 MigProfile = 4 /*!< GPU instance slice 4 */
	// MigProfileGPUInstanceSlice7 represents GPU instance slice 7.
	// Note the values are not in slice order: slice 7 is 5, slice 8 is 6
	// and slice 6 is 7.
	MigProfileGPUInstanceSlice7 MigProfile = 5 /*!< GPU instance slice 7 */
	// MigProfileGPUInstanceSlice8 represents GPU instance slice 8
	MigProfileGPUInstanceSlice8 MigProfile = 6 /*!< GPU instance slice 8 */
	// MigProfileGPUInstanceSlice6 represents GPU instance slice 6
	MigProfileGPUInstanceSlice6 MigProfile = 7 /*!< GPU instance slice 6 */
	// MigProfileGPUInstanceSlice1Rev1 represents GPU instance slice 1 revision 1
	MigProfileGPUInstanceSlice1Rev1 MigProfile = 8 /*!< GPU instance slice 1 revision 1 */
	// MigProfileGPUInstanceSlice2Rev1 represents GPU instance slice 2 revision 1
	MigProfileGPUInstanceSlice2Rev1 MigProfile = 9 /*!< GPU instance slice 2 revision 1 */
	// MigProfileGPUInstanceSlice1Rev2 represents GPU instance slice 1 revision 2
	MigProfileGPUInstanceSlice1Rev2 MigProfile = 10 /*!< GPU instance slice 1 revision 2 */
	// MigProfileComputeInstanceSlice1 represents compute instance slice 1.
	// Compute-instance values start at 30, leaving 11-29 unused here.
	MigProfileComputeInstanceSlice1 MigProfile = 30 /*!< compute instance slice 1 */
	// MigProfileComputeInstanceSlice2 represents compute instance slice 2
	MigProfileComputeInstanceSlice2 MigProfile = 31 /*!< compute instance slice 2 */
	// MigProfileComputeInstanceSlice3 represents compute instance slice 3
	MigProfileComputeInstanceSlice3 MigProfile = 32 /*!< compute instance slice 3 */
	// MigProfileComputeInstanceSlice4 represents compute instance slice 4
	MigProfileComputeInstanceSlice4 MigProfile = 33 /*!< compute instance slice 4*/
	// MigProfileComputeInstanceSlice7 represents compute instance slice 7.
	// As with the GPU-instance values, slice 7 and 8 precede slice 6.
	MigProfileComputeInstanceSlice7 MigProfile = 34 /*!< compute instance slice 7 */
	// MigProfileComputeInstanceSlice8 represents compute instance slice 8
	MigProfileComputeInstanceSlice8 MigProfile = 35 /*!< compute instance slice 8 */
	// MigProfileComputeInstanceSlice6 represents compute instance slice 6
	MigProfileComputeInstanceSlice6 MigProfile = 36 /*!< compute instance slice 6 */
	// MigProfileComputeInstanceSlice1Rev1 represents compute instance slice 1 revision 1
	MigProfileComputeInstanceSlice1Rev1 MigProfile = 37 /*!< compute instance slice 1 revision 1 */
)
62 | 


--------------------------------------------------------------------------------
/tests/processinfo_test.go:
--------------------------------------------------------------------------------
  1 | package tests
  2 | 
  3 | import (
  4 | 	"os"
  5 | 	"testing"
  6 | 	"time"
  7 | 
  8 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
  9 | )
 10 | 
 11 | // TestProcessInfo demonstrates getting process information for GPU processes
 12 | // This is equivalent to the processInfo sample
 13 | func TestProcessInfo(t *testing.T) {
 14 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 15 | 	if err != nil {
 16 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 17 | 	}
 18 | 	defer cleanup()
 19 | 
 20 | 	// Request DCGM to start recording stats for GPU process fields
 21 | 	group, err := dcgm.WatchPidFields()
 22 | 	if err != nil {
 23 | 		t.Fatalf("Failed to watch PID fields: %v", err)
 24 | 	}
 25 | 
 26 | 	// Wait for watches to be enabled and collect data
 27 | 	t.Log("Enabling DCGM watches to start collecting process stats. This may take a few seconds...")
 28 | 	time.Sleep(3000 * time.Millisecond)
 29 | 
 30 | 	// Get current process ID as an example
 31 | 	//nolint:gosec // disable G115
 32 | 	currentPid := uint(os.Getpid())
 33 | 	t.Logf("Testing with current process PID: %d", currentPid)
 34 | 
 35 | 	pidInfo, err := dcgm.GetProcessInfo(group, currentPid)
 36 | 	if err != nil {
 37 | 		t.Logf("Failed to get process info for PID %d: %v", currentPid, err)
 38 | 		t.Log("This is expected if the current process is not using GPU")
 39 | 		return
 40 | 	}
 41 | 
 42 | 	if len(pidInfo) == 0 {
 43 | 		t.Logf("No process information found for PID %d", currentPid)
 44 | 		return
 45 | 	}
 46 | 
 47 | 	// Log basic process information
 48 | 	for i, info := range pidInfo {
 49 | 		t.Logf("Process Info %d:", i+1)
 50 | 		t.Logf("  GPU ID: %d", info.GPU)
 51 | 		t.Logf("  PID: %d", info.PID)
 52 | 		if info.Name != "" {
 53 | 			t.Logf("  Name: %s", info.Name)
 54 | 		}
 55 | 		t.Logf("  Start Time: %s", info.ProcessUtilization.StartTime.String())
 56 | 		t.Logf("  End Time: %s", info.ProcessUtilization.EndTime.String())
 57 | 		t.Logf("  Critical XID Errors: %d", info.XIDErrors.NumErrors)
 58 | 	}
 59 | }
 60 | 
 61 | // TestProcessInfoWithSpecificPID demonstrates getting process info for a specific PID
 62 | func TestProcessInfoWithSpecificPID(t *testing.T) {
 63 | 	if testing.Short() {
 64 | 		t.Skip("Skipping specific PID test in short mode")
 65 | 	}
 66 | 
 67 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 68 | 	if err != nil {
 69 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 70 | 	}
 71 | 	defer cleanup()
 72 | 
 73 | 	// Request DCGM to start recording stats for GPU process fields
 74 | 	group, err := dcgm.WatchPidFields()
 75 | 	if err != nil {
 76 | 		t.Fatalf("Failed to watch PID fields: %v", err)
 77 | 	}
 78 | 
 79 | 	// Wait for watches to be enabled and collect data
 80 | 	time.Sleep(3000 * time.Millisecond)
 81 | 
 82 | 	// Test with PID 1 (init process) - should not have GPU usage
 83 | 	testPid := uint(1)
 84 | 	pidInfo, err := dcgm.GetProcessInfo(group, testPid)
 85 | 	if err != nil {
 86 | 		t.Logf("Expected: No process info found for PID %d: %v", testPid, err)
 87 | 	} else if len(pidInfo) == 0 {
 88 | 		t.Logf("Expected: No GPU usage found for PID %d", testPid)
 89 | 	} else {
 90 | 		t.Logf("Unexpected: Found GPU usage for PID %d", testPid)
 91 | 	}
 92 | }
 93 | 
 94 | // TestWatchPidFields demonstrates the WatchPidFields functionality
 95 | func TestWatchPidFields(t *testing.T) {
 96 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 97 | 	if err != nil {
 98 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 99 | 	}
100 | 	defer cleanup()
101 | 
102 | 	// Test WatchPidFields function
103 | 	group, err := dcgm.WatchPidFields()
104 | 	if err != nil {
105 | 		t.Fatalf("Failed to watch PID fields: %v", err)
106 | 	}
107 | 
108 | 	t.Logf("Successfully created group for watching PID fields: %v", group)
109 | 
110 | 	// Wait a bit to ensure watches are properly set up
111 | 	time.Sleep(1000 * time.Millisecond)
112 | 	t.Log("PID field watches enabled successfully")
113 | }
114 | 


--------------------------------------------------------------------------------
/pkg/dcgm/field_values_test.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package dcgm
 18 | 
 19 | import (
 20 | 	"crypto/rand"
 21 | 	"encoding/binary"
 22 | 	"fmt"
 23 | 	"testing"
 24 | 	"time"
 25 | 
 26 | 	"github.com/stretchr/testify/assert"
 27 | 	"github.com/stretchr/testify/require"
 28 | )
 29 | 
 30 | // secureRandomUint64 returns a cryptographically secure random uint64
 31 | func secureRandomUint64() (uint64, error) {
 32 | 	var buf [8]byte
 33 | 	_, err := rand.Read(buf[:])
 34 | 	if err != nil {
 35 | 		return 0, err
 36 | 	}
 37 | 	return binary.BigEndian.Uint64(buf[:]), nil
 38 | }
 39 | 
 40 | func TestGetValuesSince(t *testing.T) {
 41 | 	teardownTest := setupTest(t)
 42 | 	defer teardownTest(t)
 43 | 	runOnlyWithLiveGPUs(t)
 44 | 
 45 | 	const gpu uint = 0
 46 | 
 47 | 	// Create a group of fields
 48 | 	const (
 49 | 		xid int = iota
 50 | 	)
 51 | 
 52 | 	deviceFields := make([]Short, 1)
 53 | 	deviceFields[xid] = DCGM_FI_DEV_XID_ERRORS
 54 | 
 55 | 	randID, err := secureRandomUint64()
 56 | 	require.NoError(t, err)
 57 | 	fieldGroupName := fmt.Sprintf("fieldGroupName%d", randID)
 58 | 
 59 | 	fieldsGroup, err := FieldGroupCreate(fieldGroupName, deviceFields)
 60 | 	require.NoError(t, err)
 61 | 
 62 | 	defer func() {
 63 | 		_ = FieldGroupDestroy(fieldsGroup)
 64 | 	}()
 65 | 
 66 | 	var values []FieldValue_v2
 67 | 	var nextTime time.Time
 68 | 
 69 | 	t.Run("When there is no data return error", func(t *testing.T) {
 70 | 		values, nextTime, err = GetValuesSince(GroupAllGPUs(),
 71 | 			fieldsGroup, time.Time{})
 72 | 		require.Error(t, err)
 73 | 		require.Empty(t, nextTime)
 74 | 		require.Empty(t, values)
 75 | 	})
 76 | 
 77 | 	t.Run("When there are a few entries", func(t *testing.T) {
 78 | 		expectedNumberOfErrors := int64(43)
 79 | 		expectedInjectedValuesCount := 0
 80 | 
 81 | 		t.Logf("injecting %s for gpuId %d", "DCGM_FI_DEV_XID_ERRORS", gpu)
 82 | 		err = InjectFieldValue(gpu,
 83 | 			DCGM_FI_DEV_XID_ERRORS,
 84 | 			DCGM_FT_INT64,
 85 | 			0,
 86 | 			time.Now().Add(-time.Duration(5)*time.Second).UnixMicro(),
 87 | 			expectedNumberOfErrors,
 88 | 		)
 89 | 		require.NoError(t, err)
 90 | 
 91 | 		expectedInjectedValuesCount++
 92 | 
 93 | 		for i := 4; i > 0; i-- {
 94 | 			err = InjectFieldValue(gpu,
 95 | 				DCGM_FI_DEV_XID_ERRORS,
 96 | 				DCGM_FT_INT64,
 97 | 				0,
 98 | 				time.Now().Add(-time.Duration(i)*time.Second).UnixMicro(),
 99 | 				int64(i),
100 | 			)
101 | 			require.NoError(t, err)
102 | 
103 | 			expectedInjectedValuesCount++
104 | 		}
105 | 		// Force an update of the fields so that we can fetch initial values.
106 | 		err = UpdateAllFields()
107 | 		require.NoError(t, err)
108 | 		values, nextTime, err = GetValuesSince(GroupAllGPUs(), fieldsGroup, time.Time{})
109 | 		require.NoError(t, err)
110 | 		assert.Greater(t, nextTime, time.Time{})
111 | 		assert.Len(t, values, expectedInjectedValuesCount)
112 | 		assert.Equal(t, FE_GPU, values[0].EntityGroupId)
113 | 		assert.Equal(t, gpu, values[0].EntityID)
114 | 		assert.Equal(t, DCGM_FI_DEV_XID_ERRORS, values[0].FieldID)
115 | 		assert.Equal(t, expectedNumberOfErrors, values[0].Int64())
116 | 
117 | 		for i := 1; i < 5; i++ {
118 | 			assert.Equal(t, FE_GPU, values[i].EntityGroupId)
119 | 			assert.Equal(t, gpu, values[i].EntityID)
120 | 			assert.Equal(t, DCGM_FI_DEV_XID_ERRORS, values[i].FieldID)
121 | 			assert.Equal(t, int64(5-i), values[i].Int64())
122 | 		}
123 | 	})
124 | }
125 | 


--------------------------------------------------------------------------------
/tests/deviceinfo_test.go:
--------------------------------------------------------------------------------
  1 | package tests
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 
  6 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
  7 | )
  8 | 
  9 | // TestDeviceInfo demonstrates getting device information from all GPUs
 10 | // This is equivalent to the deviceInfo sample
 11 | func TestDeviceInfoTest(t *testing.T) {
 12 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 13 | 	if err != nil {
 14 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 15 | 	}
 16 | 	defer cleanup()
 17 | 
 18 | 	count, err := dcgm.GetAllDeviceCount()
 19 | 	if err != nil {
 20 | 		t.Fatalf("Failed to get device count: %v", err)
 21 | 	}
 22 | 
 23 | 	t.Logf("Found %d devices", count)
 24 | 
 25 | 	for i := uint(0); i < count; i++ {
 26 | 		deviceInfo, err := dcgm.GetDeviceInfo(i)
 27 | 		if err != nil {
 28 | 			t.Errorf("Failed to get device info for GPU %d: %v", i, err)
 29 | 			continue
 30 | 		}
 31 | 
 32 | 		// Log device information
 33 | 		t.Logf("Device %d Information:", i)
 34 | 		t.Logf("  Driver Version: %s", deviceInfo.Identifiers.DriverVersion)
 35 | 		t.Logf("  GPU: %d", deviceInfo.GPU)
 36 | 		t.Logf("  DCGMSupported: %v", deviceInfo.DCGMSupported)
 37 | 		t.Logf("  UUID: %s", deviceInfo.UUID)
 38 | 		t.Logf("  Brand: %s", deviceInfo.Identifiers.Brand)
 39 | 		t.Logf("  Model: %s", deviceInfo.Identifiers.Model)
 40 | 		t.Logf("  Serial Number: %s", deviceInfo.Identifiers.Serial)
 41 | 
 42 | 		if deviceInfo.Identifiers.Vbios != "" {
 43 | 			t.Logf("  Vbios: %s", deviceInfo.Identifiers.Vbios)
 44 | 		}
 45 | 
 46 | 		t.Logf("  InforomImage Version: %s", deviceInfo.Identifiers.InforomImageVersion)
 47 | 		t.Logf("  Bus ID: %s", deviceInfo.PCI.BusID)
 48 | 
 49 | 		if deviceInfo.PCI.BAR1 != 0 {
 50 | 			t.Logf("  BAR1 (MB): %d", deviceInfo.PCI.BAR1)
 51 | 		}
 52 | 
 53 | 		if deviceInfo.PCI.FBTotal != 0 {
 54 | 			t.Logf("  FrameBuffer Memory (MB): %d", deviceInfo.PCI.FBTotal)
 55 | 		}
 56 | 
 57 | 		if deviceInfo.PCI.Bandwidth != 0 {
 58 | 			t.Logf("  Bandwidth (MB/s): %d", deviceInfo.PCI.Bandwidth)
 59 | 		}
 60 | 
 61 | 		if deviceInfo.Power != 0 {
 62 | 			t.Logf("  Power (W): %d", deviceInfo.Power)
 63 | 		}
 64 | 
 65 | 		if deviceInfo.CPUAffinity != "" {
 66 | 			t.Logf("  CPUAffinity: %s", deviceInfo.CPUAffinity)
 67 | 		}
 68 | 
 69 | 		// Log P2P topology if available
 70 | 		if len(deviceInfo.Topology) > 0 {
 71 | 			t.Logf("  P2P Available:")
 72 | 			for _, topo := range deviceInfo.Topology {
 73 | 				t.Logf("    GPU%d - (BusID)%s - %p", topo.GPU, topo.BusID, topo.Link.PCIPaths)
 74 | 			}
 75 | 		} else {
 76 | 			t.Logf("  P2P Available: None")
 77 | 		}
 78 | 
 79 | 		// Basic assertions to ensure we got valid data
 80 | 		if deviceInfo.UUID == "" {
 81 | 			t.Errorf("Device %d has empty UUID", i)
 82 | 		}
 83 | 		if deviceInfo.Identifiers.Brand == "" {
 84 | 			t.Errorf("Device %d has empty brand", i)
 85 | 		}
 86 | 		if deviceInfo.PCI.BusID == "" {
 87 | 			t.Errorf("Device %d has empty bus ID", i)
 88 | 		}
 89 | 	}
 90 | }
 91 | 
 92 | // TestDeviceInfoWithConnection demonstrates connecting to a standalone hostengine
 93 | func TestDeviceInfoWithConnection(t *testing.T) {
 94 | 	// Skip this test if we're not testing with a specific connection
 95 | 	if testing.Short() {
 96 | 		t.Skip("Skipping connection test in short mode")
 97 | 	}
 98 | 
 99 | 	connectAddr := "localhost"
100 | 	isSocket := "0"
101 | 
102 | 	cleanup, err := dcgm.Init(dcgm.Standalone, connectAddr, isSocket)
103 | 	if err != nil {
104 | 		t.Skipf("Failed to connect to standalone hostengine at %s: %v", connectAddr, err)
105 | 	}
106 | 	defer cleanup()
107 | 
108 | 	count, err := dcgm.GetAllDeviceCount()
109 | 	if err != nil {
110 | 		t.Fatalf("Failed to get device count: %v", err)
111 | 	}
112 | 
113 | 	t.Logf("Connected to standalone hostengine, found %d devices", count)
114 | 
115 | 	// Just test first device if available
116 | 	if count > 0 {
117 | 		deviceInfo, err := dcgm.GetDeviceInfo(0)
118 | 		if err != nil {
119 | 			t.Errorf("Failed to get device info for GPU 0: %v", err)
120 | 		} else {
121 | 			t.Logf("First device UUID: %s", deviceInfo.UUID)
122 | 		}
123 | 	}
124 | }
125 | 


--------------------------------------------------------------------------------
/pkg/dcgm/field_values.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package dcgm
 18 | 
 19 | /*
 20 | #include "dcgm_agent.h"
 21 | #include "dcgm_structs.h"
 22 | #include "field_values_cb.h"
 23 | extern int go_dcgmFieldValueEntityEnumeration(dcgm_field_entity_group_t entityGroupId,
 24 |             dcgm_field_eid_t entityId,
 25 |             dcgmFieldValue_v1 *values,
 26 |             int numValues,
 27 |             void *userData);
 28 | */
 29 | import "C"
 30 | 
 31 | import (
 32 | 	"fmt"
 33 | 	"sync"
 34 | 	"time"
 35 | 	"unsafe"
 36 | )
 37 | 
 38 | type callback struct {
 39 | 	mu     sync.Mutex
 40 | 	Values []FieldValue_v2
 41 | }
 42 | 
 43 | func (cb *callback) processValues(entityGroup Field_Entity_Group, entityID uint, cvalues []C.dcgmFieldValue_v1) {
 44 | 	values := dcgmFieldValue_v1ToFieldValue_v2(entityGroup, entityID, cvalues)
 45 | 
 46 | 	cb.mu.Lock()
 47 | 	cb.Values = append(cb.Values, values...)
 48 | 	cb.mu.Unlock()
 49 | }
 50 | 
 51 | //export go_dcgmFieldValueEntityEnumeration
 52 | func go_dcgmFieldValueEntityEnumeration(
 53 | 	entityGroup C.dcgm_field_entity_group_t,
 54 | 	entityID C.dcgm_field_eid_t,
 55 | 	values *C.dcgmFieldValue_v1,
 56 | 	numValues C.int,
 57 | 	userData unsafe.Pointer,
 58 | ) C.int {
 59 | 	ptrValues := unsafe.Pointer(values)
 60 | 	if ptrValues != nil {
 61 | 		valuesSlice := (*[1 << 30]C.dcgmFieldValue_v1)(ptrValues)[0:numValues]
 62 | 
 63 | 		if userData != nil {
 64 | 			processor := (*callback)(userData)
 65 | 			processor.processValues(Field_Entity_Group(entityGroup), uint(entityID), valuesSlice)
 66 | 		}
 67 | 	}
 68 | 	return 0
 69 | }
 70 | 
 71 | // GetValuesSince reads and returns field values for a specified group of entities, such as GPUs,
 72 | // that have been updated since a given timestamp. It allows for targeted data retrieval based on time criteria.
 73 | //
 74 | // GPUGroup is a GroupHandle that identifies the group of entities to operate on. It can be obtained from CreateGroup
 75 | // for a specific group of GPUs or use GroupAllGPUs() to target all GPUs.
 76 | //
 77 | // fieldGroup is a FieldHandle representing the group of fields for which data is requested.
 78 | //
 79 | // sinceTime is a time.Time value representing the timestamp from which to request updated values.
 80 | // A zero value (time.Time{}) requests all available data.
 81 | //
 82 | // Returns []FieldValue_v2 slice containing the requested field values, a time.Time indicating the time
 83 | // of the latest data retrieval, and an error if there is any issue during the operation.
 84 | func GetValuesSince(gpuGroup GroupHandle, fieldGroup FieldHandle, sinceTime time.Time) ([]FieldValue_v2, time.Time, error) {
 85 | 	var nextSinceTimestamp C.longlong
 86 | 	cbResult := &callback{}
 87 | 	result := C.dcgmGetValuesSince_v2(handle.handle,
 88 | 		gpuGroup.handle,
 89 | 		fieldGroup.handle,
 90 | 		C.longlong(sinceTime.UnixMicro()),
 91 | 		&nextSinceTimestamp,
 92 | 		C.dcgmFieldValueEnumeration_f(C.fieldValueEntityCallback),
 93 | 		unsafe.Pointer(cbResult))
 94 | 	if result != C.DCGM_ST_OK {
 95 | 		return nil, time.Time{}, fmt.Errorf("dcgmGetValuesSince_v2 failed with error code %d", int(result))
 96 | 	}
 97 | 
 98 | 	return cbResult.Values, timestampUSECToTime(int64(nextSinceTimestamp)), nil
 99 | }
100 | 
// timestampUSECToTime converts a timestamp expressed in microseconds since the
// Unix epoch into a time.Time.
func timestampUSECToTime(timestampUSEC int64) time.Time {
	// time.UnixMicro splits the value into seconds and the remaining
	// microseconds (scaled to nanoseconds) before building the time.
	return time.UnixMicro(timestampUSEC)
}
108 | 


--------------------------------------------------------------------------------
/samples/restApi/README.md:
--------------------------------------------------------------------------------
  1 | # DCGM REST API
  2 | 
  3 | A sample REST API is provided, demonstrating various endpoints for getting GPU metrics via DCGM.
  4 | 
  5 | ```bash
  6 | # Start the http server
  7 | # By default the http server is started at localhost:8070
  8 | 
  9 | $ go build && ./restApi
 10 | 
 11 | # Query GPU 0 info
 12 | $ GPUID=0
 13 | $ curl localhost:8070/dcgm/device/info/id/$GPUID
 14 | 
 15 | # sample output
 16 | 
 17 | Driver Version         : 384.130
 18 | GPU                    : 0
 19 | DCGMSupported          : Yes
 20 | UUID                   : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51
 21 | Brand                  : GeForce
 22 | Model                  : GeForce GTX 980
 23 | Serial Number          : 0324414056639
 24 | Vbios                  : 84.04.1F.00.02
 25 | InforomImage Version   : G001.0000.01.03
 26 | Bus ID                 : 00000000:01:00.0
 27 | BAR1 (MB)              : 256
 28 | FrameBuffer Memory (MB): 4036
 29 | Bandwidth (MB/s)       : 15760
 30 | Cores (MHz)            : 1392
 31 | Memory (MHz)           : 3505
 32 | Power (W)              : 180
 33 | CPUAffinity            : 0-11
 34 | P2P Available          : None
 35 | ---------------------------------------------------------------------
 36 | 
 37 | $ curl localhost:8070/dcgm/device/info/id/$GPUID/json
 38 | 
 39 | # Query GPU info using its UUID
 40 | 
 41 | $ UUID=$(curl -s localhost:8070/dcgm/device/info/id/$GPUID | grep -i uuid | cut -d ":" -f2 )
 42 | $ curl localhost:8070/dcgm/device/info/uuid/$UUID
 43 | $ curl localhost:8070/dcgm/device/info/uuid/$UUID/json
 44 | 
 45 | # sample output
 46 | 
 47 | {"GPU":0,"DCGMSupported":"Yes","UUID":"GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51","Power":180,"PCI":{"BusID":"00000000:01:00.0","BAR1":256,"FBTotal":4036,"Bandwidth":15760},"Clocks":{"Cores":1392,"Memory":3505},"Identifiers":{"Brand":"GeForce","Model":"GeForce GTX 980","Serial":"0324414056639","Vbios":"84.04.1F.00.02","InforomImageVersion":"G001.0000.01.03","DriverVersion":"384.130"},"Topology":null,"CPUAffinity":"0-11"}
 48 | 
 49 | # Query GPU status
 50 | 
 51 | $ curl localhost:8070/dcgm/device/status/id/$GPUID
 52 | $ curl localhost:8070/dcgm/device/status/id/$GPUID/json
 53 | 
 54 | # sample output
 55 | 
 56 | Power (W)               : 20.985
 57 | Temperature (°C)        : 47
 58 | Sm Utilization (%)      : 2
 59 | Memory Utilization (%)  : 8
 60 | Encoder Utilization (%) : 0
 61 | Decoder Utilization (%) : 0
  62 | Memory Clock (MHz)      : 324
 63 | SM Clock (MHz)          : 135
 64 | 
 65 | $ curl localhost:8070/dcgm/device/status/uuid/$UUID
 66 | 
 67 | # sample output
 68 | 
 69 | {"Power":20.793,"Temperature":43,"Utilization":{"GPU":0,"Memory":8,"Encoder":0,"Decoder":0},"Memory":{"GlobalUsed":null,"ECCErrors":{"SingleBit":9223372036854775794,"DoubleBit":9223372036854775794}},"Clocks":{"Cores":135,"Memory":324},"PCI":{"BAR1Used":9,"Throughput":{"Rx":129,"Tx":47,"Replays":0},"FBUsed":423},"Performance":8,"FanSpeed":29}
 70 | 
 71 | $ curl localhost:8070/dcgm/device/status/uuid/$UUID/json
 72 | 
 73 | # Query GPU process info
 74 | 
 75 | # Run CUDA nbody sample and get its PID
 76 | # NOTE: The "WatchPidFields()" function must be initially called (as root) BEFORE starting the process to be monitored:
 77 | # 1. Run as root, for enabling health watches
 78 | $ sudo dcgmi stats -e
 79 | # 2. Start process to be monitored
 80 | $ nbody -benchmark -numbodies=1000192
 81 | # 3. Start restApi
 82 | $ go build && ./restApi
 83 | $ PID=$(pgrep nbody)
 84 | 
 85 | $ curl localhost:8070/dcgm/process/info/pid/$PID
 86 | $ curl localhost:8070/dcgm/process/info/pid/$PID/json
 87 | 
 88 | # sample output
 89 | 
 90 | {"GPU":0,"PID":19132,"Name":"nbody","ProcessUtilization":{"StartTime":1529980640,"EndTime":0,"EnergyConsumed":1346,"SmUtil":0,"MemUtil":0},"PCI":{"BAR1Used":null,"Throughput":{"Rx":null,"Tx":null,"Replays":0},"FBUsed":null},"Memory":{"GlobalUsed":84279296,"ECCErrors":{"SingleBit":0,"DoubleBit":0}},"GpuUtilization":{"GPU":null,"Memory":null,"Encoder":null,"Decoder":null},"Clocks":{"Cores":null,"Memory":null},"Violations":{"Power":0,"Thermal":0,"Reliability":0,"BoardLimit":0,"LowUtilization":0,"SyncBoost":0},"XIDErrors":{"NumErrors":0,"TimeStamp":[]}}
 91 | 
 92 | # Query GPU health
 93 | 
 94 | $ curl localhost:8070/dcgm/health/id/$GPUID
 95 | $ curl localhost:8070/dcgm/health/id/$GPUID/json
 96 | $ curl localhost:8070/dcgm/health/uuid/$UUID
 97 | $ curl localhost:8070/dcgm/health/uuid/$UUID/json
 98 | 
 99 | # sample output
100 | 
101 | {"GPU":0,"Status":"Healthy","Watches":[]}
102 | 
103 | # Query DCGM hostengine memory and CPU usage
104 | 
105 | $ curl localhost:8070/dcgm/status
106 | $ curl localhost:8070/dcgm/status/json
107 | 
108 | # sample output
109 | 
110 | {"Memory":18380,"CPU":0.16482222745467387}
111 | ```
112 | 


--------------------------------------------------------------------------------
/tests/health_test.go:
--------------------------------------------------------------------------------
  1 | package tests
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 	"time"
  6 | 
  7 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
  8 | )
  9 | 
 10 | // TestHealthCheck demonstrates GPU health checking functionality
 11 | // This is equivalent to the health sample but runs for a limited time
 12 | func TestHealthCheck(t *testing.T) {
 13 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 14 | 	if err != nil {
 15 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 16 | 	}
 17 | 	defer cleanup()
 18 | 
 19 | 	gpus, err := dcgm.GetSupportedDevices()
 20 | 	if err != nil {
 21 | 		t.Fatalf("Failed to get supported devices: %v", err)
 22 | 	}
 23 | 
 24 | 	if len(gpus) == 0 {
 25 | 		t.Skip("No supported GPUs found for health checking")
 26 | 	}
 27 | 
 28 | 	// Monitor health for a few seconds instead of indefinitely
 29 | 	ticker := time.NewTicker(time.Second * 2)
 30 | 	defer ticker.Stop()
 31 | 
 32 | 	timeout := time.After(6 * time.Second)
 33 | 	checkCount := 0
 34 | 
 35 | 	for {
 36 | 		select {
 37 | 		case <-ticker.C:
 38 | 			for _, gpu := range gpus {
 39 | 				h, err := dcgm.HealthCheckByGpuId(gpu)
 40 | 				if err != nil {
 41 | 					t.Errorf("Failed to get health status for GPU %d: %v", gpu, err)
 42 | 					continue
 43 | 				}
 44 | 
 45 | 				t.Logf("GPU %d Health Check:", gpu)
 46 | 				t.Logf("  Status: %s", h.Status)
 47 | 
 48 | 				for _, watch := range h.Watches {
 49 | 					t.Logf("  Watch Type: %s", watch.Type)
 50 | 					t.Logf("  Watch Status: %s", watch.Status)
 51 | 					if watch.Error != "" {
 52 | 						t.Logf("  Watch Error: %s", watch.Error)
 53 | 					}
 54 | 				}
 55 | 
 56 | 				// Basic validation
 57 | 				if h.Status == "" {
 58 | 					t.Errorf("GPU %d has empty health status", gpu)
 59 | 				}
 60 | 			}
 61 | 			checkCount++
 62 | 
 63 | 		case <-timeout:
 64 | 			t.Logf("Health monitoring completed after %d checks", checkCount)
 65 | 			return
 66 | 		}
 67 | 	}
 68 | }
 69 | 
 70 | // TestHealthCheckSingle demonstrates a single health check
 71 | func TestHealthCheckSingle(t *testing.T) {
 72 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 73 | 	if err != nil {
 74 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 75 | 	}
 76 | 	defer cleanup()
 77 | 
 78 | 	gpus, err := dcgm.GetSupportedDevices()
 79 | 	if err != nil {
 80 | 		t.Fatalf("Failed to get supported devices: %v", err)
 81 | 	}
 82 | 
 83 | 	if len(gpus) == 0 {
 84 | 		t.Skip("No supported GPUs found")
 85 | 	}
 86 | 
 87 | 	// Test first GPU
 88 | 	gpu := gpus[0]
 89 | 	h, err := dcgm.HealthCheckByGpuId(gpu)
 90 | 	if err != nil {
 91 | 		t.Fatalf("Failed to get health status for GPU %d: %v", gpu, err)
 92 | 	}
 93 | 
 94 | 	t.Logf("GPU %d Health Status: %s", gpu, h.Status)
 95 | 
 96 | 	if len(h.Watches) == 0 {
 97 | 		t.Logf("No health watches configured for GPU %d", gpu)
 98 | 	} else {
 99 | 		t.Logf("Health watches for GPU %d:", gpu)
100 | 		for i, watch := range h.Watches {
101 | 			t.Logf("  Watch %d:", i+1)
102 | 			t.Logf("    Type: %s", watch.Type)
103 | 			t.Logf("    Status: %s", watch.Status)
104 | 			if watch.Error != "" {
105 | 				t.Logf("    Error: %s", watch.Error)
106 | 			}
107 | 		}
108 | 	}
109 | 
110 | 	// Basic assertions
111 | 	if h.Status == "" {
112 | 		t.Error("Health status is empty")
113 | 	}
114 | }
115 | 
116 | // TestHealthCheckAllGPUs demonstrates health checking for all GPUs
117 | func TestHealthCheckAllGPUs(t *testing.T) {
118 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
119 | 	if err != nil {
120 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
121 | 	}
122 | 	defer cleanup()
123 | 
124 | 	gpus, err := dcgm.GetSupportedDevices()
125 | 	if err != nil {
126 | 		t.Fatalf("Failed to get supported devices: %v", err)
127 | 	}
128 | 
129 | 	if len(gpus) == 0 {
130 | 		t.Skip("No supported GPUs found")
131 | 	}
132 | 
133 | 	healthyGPUs := 0
134 | 	unhealthyGPUs := 0
135 | 
136 | 	for _, gpu := range gpus {
137 | 		h, err := dcgm.HealthCheckByGpuId(gpu)
138 | 		if err != nil {
139 | 			t.Errorf("Failed to get health status for GPU %d: %v", gpu, err)
140 | 			continue
141 | 		}
142 | 
143 | 		t.Logf("GPU %d: %s", gpu, h.Status)
144 | 
145 | 		// Count healthy vs unhealthy
146 | 		if h.Status == "Healthy" || h.Status == "OK" {
147 | 			healthyGPUs++
148 | 		} else {
149 | 			unhealthyGPUs++
150 | 			t.Logf("GPU %d is not healthy: %s", gpu, h.Status)
151 | 
152 | 			// Log any watch errors
153 | 			for _, watch := range h.Watches {
154 | 				if watch.Error != "" {
155 | 					t.Logf("  Watch %s error: %s", watch.Type, watch.Error)
156 | 				}
157 | 			}
158 | 		}
159 | 	}
160 | 
161 | 	t.Logf("Health summary: %d healthy, %d unhealthy GPUs", healthyGPUs, unhealthyGPUs)
162 | 
163 | 	// We expect at least some GPUs to be available
164 | 	if healthyGPUs == 0 && unhealthyGPUs == 0 {
165 | 		t.Error("No GPU health status could be determined")
166 | 	}
167 | }
168 | 


--------------------------------------------------------------------------------
/pkg/dcgm/internal.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | // Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)
 18 | package dcgm
 19 | 
 20 | /*
 21 | #cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files
 22 | #cgo darwin LDFLAGS: -ldl -Wl,--export-dynamic -Wl,-undefined,dynamic_lookup
 23 | 
 24 | #include "dcgm_test_apis.h"
 25 | #include "dcgm_test_structs.h"
 26 | #include "dcgm_structs_internal.h"
 27 | */
 28 | import "C"
 29 | 
 30 | import (
 31 | 	"unsafe"
 32 | )
 33 | 
// MigHierarchyInfo represents the Multi-Instance GPU (MIG) hierarchy information
// for a GPU entity and its relationship to other entities.
// CreateFakeEntities copies each of the three fields into the corresponding
// field of the C dcgmMigHierarchyInfo_t structure.
type MigHierarchyInfo struct {
	// Entity represents the current GPU entity in the hierarchy
	Entity GroupEntityPair
	// Parent represents the parent GPU entity in the hierarchy
	Parent GroupEntityPair
	// SliceProfile defines the MIG profile configuration for this entity
	SliceProfile MigProfile
}
 44 | 
 45 | // CreateFakeEntities creates test entities with the specified MIG hierarchy information.
 46 | // This function is intended for testing purposes only.
 47 | // Returns a slice of Entity IDs for the created entities and any error encountered.
 48 | func CreateFakeEntities(entities []MigHierarchyInfo) ([]uint, error) {
 49 | 	ccfe := C.dcgmCreateFakeEntities_v2{
 50 | 		version:     C.dcgmCreateFakeEntities_version2,
 51 | 		numToCreate: C.uint(len(entities)),
 52 | 		entityList:  [C.DCGM_MAX_HIERARCHY_INFO]C.dcgmMigHierarchyInfo_t{},
 53 | 	}
 54 | 
 55 | 	for i := range entities {
 56 | 		if i >= C.DCGM_MAX_HIERARCHY_INFO {
 57 | 			break
 58 | 		}
 59 | 		entity := entities[i]
 60 | 		ccfe.entityList[i] = C.dcgmMigHierarchyInfo_t{
 61 | 			entity: C.dcgmGroupEntityPair_t{
 62 | 				entityGroupId: C.dcgm_field_entity_group_t(entity.Entity.EntityGroupId),
 63 | 				entityId:      C.uint(entity.Entity.EntityId),
 64 | 			},
 65 | 			parent: C.dcgmGroupEntityPair_t{
 66 | 				entityGroupId: C.dcgm_field_entity_group_t(entity.Parent.EntityGroupId),
 67 | 				entityId:      C.uint(entity.Parent.EntityId),
 68 | 			},
 69 | 			sliceProfile: C.dcgmMigProfile_t(entity.SliceProfile),
 70 | 		}
 71 | 	}
 72 | 	result := C.dcgmCreateFakeEntities(handle.handle, &ccfe)
 73 | 
 74 | 	if err := errorString(result); err != nil {
 75 | 		return nil, &Error{msg: C.GoString(C.errorString(result)), Code: result}
 76 | 	}
 77 | 	entityIDs := make([]uint, ccfe.numToCreate)
 78 | 	for i := 0; i < int(ccfe.numToCreate); i++ {
 79 | 		entityIDs[i] = uint(ccfe.entityList[i].entity.entityId)
 80 | 	}
 81 | 
 82 | 	return entityIDs, nil
 83 | }
 84 | 
 85 | // InjectFieldValue injects a test value for a specific field into DCGM's field manager.
 86 | // This function is intended for testing purposes only.
 87 | //
 88 | // Parameters:
 89 | //   - gpu: The GPU ID to inject the field value for
 90 | //   - fieldID: The DCGM field identifier
 91 | //   - fieldType: The type of the field (e.g., DCGM_FT_INT64, DCGM_FT_DOUBLE)
 92 | //   - status: The status code for the field
 93 | //   - ts: The timestamp for the field value
 94 | //   - value: The value to inject (must match fieldType)
 95 | //
 96 | // Returns an error if the injection fails
 97 | func InjectFieldValue(gpu uint, fieldID Short, fieldType uint, status int, ts int64, value any) error {
 98 | 	field := C.dcgmInjectFieldValue_t{
 99 | 		version:   C.dcgmInjectFieldValue_version1,
100 | 		fieldId:   C.ushort(fieldID),
101 | 		fieldType: C.ushort(fieldType),
102 | 		status:    C.int(status),
103 | 		ts:        C.long(ts),
104 | 	}
105 | 
106 | 	switch fieldType {
107 | 	case DCGM_FT_INT64:
108 | 		i64Val := value.(int64)
109 | 		ptr := (*C.int64_t)(unsafe.Pointer(&field.value[0]))
110 | 		*ptr = C.int64_t(i64Val)
111 | 	case DCGM_FT_DOUBLE:
112 | 		dbVal := value.(float64)
113 | 		ptr := (*C.double)(unsafe.Pointer(&field.value[0]))
114 | 		*ptr = C.double(dbVal)
115 | 	}
116 | 
117 | 	result := C.dcgmInjectFieldValue(handle.handle, C.uint(gpu), &field)
118 | 
119 | 	if err := errorString(result); err != nil {
120 | 		return &Error{msg: C.GoString(C.errorString(result)), Code: result}
121 | 	}
122 | 
123 | 	return nil
124 | }
125 | 


--------------------------------------------------------------------------------
/tests/hostengine_test.go:
--------------------------------------------------------------------------------
  1 | package tests
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 
  6 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
  7 | )
  8 | 
  9 | // TestHostEngineStatus demonstrates DCGM host engine introspection
 10 | // This is equivalent to the hostengineStatus sample
 11 | func TestHostEngineStatus(t *testing.T) {
 12 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 13 | 	if err != nil {
 14 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 15 | 	}
 16 | 	defer cleanup()
 17 | 
 18 | 	st, err := dcgm.Introspect()
 19 | 	if err != nil {
 20 | 		t.Fatalf("Failed to introspect host engine: %v", err)
 21 | 	}
 22 | 
 23 | 	t.Logf("Host Engine Status:")
 24 | 	t.Logf("  Memory: %v KB", st.Memory)
 25 | 	t.Logf("  CPU: %.2f%%", st.CPU)
 26 | 
 27 | 	// Basic validation
 28 | 	if st.Memory < 0 {
 29 | 		t.Error("Memory usage cannot be negative")
 30 | 	}
 31 | 	if st.CPU < 0 || st.CPU > 100 {
 32 | 		t.Errorf("CPU usage out of expected range: %.2f%%", st.CPU)
 33 | 	}
 34 | 
 35 | 	// Log some insights
 36 | 	if st.Memory > 100000 { // > 100MB
 37 | 		t.Logf("Host engine is using significant memory: %v KB", st.Memory)
 38 | 	}
 39 | 	if st.CPU > 50 {
 40 | 		t.Logf("Host engine is using significant CPU: %.2f%%", st.CPU)
 41 | 	}
 42 | }
 43 | 
 44 | // TestHostEngineStatusMultipleSamples demonstrates taking multiple introspection samples
 45 | func TestHostEngineStatusMultipleSamples(t *testing.T) {
 46 | 	if testing.Short() {
 47 | 		t.Skip("Skipping multiple samples test in short mode")
 48 | 	}
 49 | 
 50 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 51 | 	if err != nil {
 52 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 53 | 	}
 54 | 	defer cleanup()
 55 | 
 56 | 	samples := 3
 57 | 	memoryUsages := make([]int64, 0, samples)
 58 | 	cpuUsages := make([]float64, 0, samples)
 59 | 
 60 | 	for i := 0; i < samples; i++ {
 61 | 		st, err := dcgm.Introspect()
 62 | 		if err != nil {
 63 | 			t.Errorf("Failed to introspect host engine sample %d: %v", i+1, err)
 64 | 			continue
 65 | 		}
 66 | 
 67 | 		memoryUsages = append(memoryUsages, st.Memory)
 68 | 		cpuUsages = append(cpuUsages, st.CPU)
 69 | 
 70 | 		t.Logf("Sample %d - Memory: %v KB, CPU: %.2f%%", i+1, st.Memory, st.CPU)
 71 | 	}
 72 | 
 73 | 	if len(memoryUsages) > 1 {
 74 | 		// Check for significant memory changes
 75 | 		minMem := memoryUsages[0]
 76 | 		maxMem := memoryUsages[0]
 77 | 
 78 | 		for _, mem := range memoryUsages[1:] {
 79 | 			if mem < minMem {
 80 | 				minMem = mem
 81 | 			}
 82 | 			if mem > maxMem {
 83 | 				maxMem = mem
 84 | 			}
 85 | 		}
 86 | 
 87 | 		if maxMem-minMem > 1000 { // More than 1MB difference
 88 | 			t.Logf("Memory usage varied significantly: %v KB to %v KB", minMem, maxMem)
 89 | 		} else {
 90 | 			t.Logf("Memory usage remained stable around %v KB", memoryUsages[0])
 91 | 		}
 92 | 	}
 93 | 
 94 | 	if len(cpuUsages) > 1 {
 95 | 		// Check for significant CPU changes
 96 | 		minCPU := cpuUsages[0]
 97 | 		maxCPU := cpuUsages[0]
 98 | 
 99 | 		for _, cpu := range cpuUsages[1:] {
100 | 			if cpu < minCPU {
101 | 				minCPU = cpu
102 | 			}
103 | 			if cpu > maxCPU {
104 | 				maxCPU = cpu
105 | 			}
106 | 		}
107 | 
108 | 		if maxCPU-minCPU > 10 { // More than 10% difference
109 | 			t.Logf("CPU usage varied significantly: %.2f%% to %.2f%%", minCPU, maxCPU)
110 | 		} else {
111 | 			t.Logf("CPU usage remained stable around %.2f%%", cpuUsages[0])
112 | 		}
113 | 	}
114 | }
115 | 
116 | // TestHostEngineStatusWithLoad demonstrates introspection while performing operations
117 | func TestHostEngineStatusWithLoad(t *testing.T) {
118 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
119 | 	if err != nil {
120 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
121 | 	}
122 | 	defer cleanup()
123 | 
124 | 	// Get baseline status
125 | 	baselineSt, err := dcgm.Introspect()
126 | 	if err != nil {
127 | 		t.Fatalf("Failed to get baseline introspection: %v", err)
128 | 	}
129 | 
130 | 	t.Logf("Baseline - Memory: %v KB, CPU: %.2f%%", baselineSt.Memory, baselineSt.CPU)
131 | 
132 | 	// Perform some operations to potentially increase load
133 | 	gpus, err := dcgm.GetSupportedDevices()
134 | 	if err != nil {
135 | 		t.Logf("Failed to get supported devices: %v", err)
136 | 	} else {
137 | 		// Get device info for all GPUs
138 | 		for _, gpu := range gpus {
139 | 			_, err = dcgm.GetDeviceInfo(gpu)
140 | 			if err != nil {
141 | 				t.Logf("Failed to get device info for GPU %d: %v", gpu, err)
142 | 			}
143 | 		}
144 | 	}
145 | 
146 | 	// Get status after operations
147 | 	loadedSt, err := dcgm.Introspect()
148 | 	if err != nil {
149 | 		t.Fatalf("Failed to get loaded introspection: %v", err)
150 | 	}
151 | 
152 | 	t.Logf("After load - Memory: %v KB, CPU: %.2f%%", loadedSt.Memory, loadedSt.CPU)
153 | 
154 | 	// Compare baseline vs loaded
155 | 	memoryDiff := loadedSt.Memory - baselineSt.Memory
156 | 	cpuDiff := loadedSt.CPU - baselineSt.CPU
157 | 
158 | 	t.Logf("Differences - Memory: %+d KB, CPU: %+.2f%%", memoryDiff, cpuDiff)
159 | 
160 | 	// Basic checks
161 | 	if loadedSt.Memory == 0 {
162 | 		t.Error("Memory usage should not be zero")
163 | 	}
164 | 	if loadedSt.CPU < 0 {
165 | 		t.Error("CPU usage should not be negative")
166 | 	}
167 | }
168 | 


--------------------------------------------------------------------------------
/pkg/dcgm/gpu_group.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "dcgm_agent.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | 
  9 | import (
 10 | 	"context"
 11 | 	"encoding/binary"
 12 | 	"fmt"
 13 | )
 14 | 
// DCGM_GROUP_MAX_ENTITIES is the maximum number of entities allowed in a group.
// Its value is taken from the C constant DCGM_GROUP_MAX_ENTITIES_V2.
const (
	DCGM_GROUP_MAX_ENTITIES int = C.DCGM_GROUP_MAX_ENTITIES_V2
)
 19 | 
// GroupHandle represents a handle to a DCGM GPU group. It wraps the C
// dcgmGpuGrp_t value that is passed to the DCGM group API calls in this file.
type GroupHandle struct{ handle C.dcgmGpuGrp_t }
 22 | 
// SetHandle sets the internal group handle value by converting val to the
// underlying C dcgmGpuGrp_t type.
func (g *GroupHandle) SetHandle(val uintptr) {
	g.handle = C.dcgmGpuGrp_t(val)
}
 27 | 
// GetHandle returns the internal group handle value converted to a uintptr.
func (g *GroupHandle) GetHandle() uintptr {
	return uintptr(g.handle)
}
 32 | 
 33 | // GroupAllGPUs returns a GroupHandle representing all GPUs in the system
 34 | func GroupAllGPUs() GroupHandle {
 35 | 	return GroupHandle{C.DCGM_GROUP_ALL_GPUS}
 36 | }
 37 | 
 38 | // CreateGroup creates a new empty GPU group with the specified name
 39 | func CreateGroup(groupName string) (goGroupId GroupHandle, err error) {
 40 | 	var cGroupID C.dcgmGpuGrp_t
 41 | 	cname := C.CString(groupName)
 42 | 	defer freeCString(cname)
 43 | 
 44 | 	result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_EMPTY, cname, &cGroupID)
 45 | 	if err = errorString(result); err != nil {
 46 | 		return goGroupId, fmt.Errorf("error creating group: %s", err)
 47 | 	}
 48 | 
 49 | 	goGroupId = GroupHandle{cGroupID}
 50 | 	return
 51 | }
 52 | 
 53 | // NewDefaultGroup creates a new group with default GPUs and the specified name
 54 | func NewDefaultGroup(groupName string) (GroupHandle, error) {
 55 | 	var cGroupID C.dcgmGpuGrp_t
 56 | 
 57 | 	cname := C.CString(groupName)
 58 | 	defer freeCString(cname)
 59 | 
 60 | 	result := C.dcgmGroupCreate(handle.handle, C.DCGM_GROUP_DEFAULT, cname, &cGroupID)
 61 | 	if err := errorString(result); err != nil {
 62 | 		return GroupHandle{}, fmt.Errorf("error creating group: %s", err)
 63 | 	}
 64 | 
 65 | 	return GroupHandle{cGroupID}, nil
 66 | }
 67 | 
 68 | // AddToGroup adds a GPU to an existing group
 69 | func AddToGroup(groupID GroupHandle, gpuID uint) (err error) {
 70 | 	result := C.dcgmGroupAddDevice(handle.handle, groupID.handle, C.uint(gpuID))
 71 | 	if err = errorString(result); err != nil {
 72 | 		return fmt.Errorf("error adding GPU %v to group: %s", gpuID, err)
 73 | 	}
 74 | 
 75 | 	return
 76 | }
 77 | 
 78 | // AddLinkEntityToGroup adds a link entity to the group
 79 | func AddLinkEntityToGroup(groupID GroupHandle, index uint, entityGroupID Field_Entity_Group, parentID uint) (err error) {
 80 | 	/* Only supported on little-endian systems currently */
 81 | 	slice := make([]byte, 4)
 82 | 	slice[0] = uint8(entityGroupID)
 83 | 	binary.LittleEndian.PutUint16(slice[1:3], uint16(index))
 84 | 	slice[3] = uint8(parentID)
 85 | 
 86 | 	entityId := binary.LittleEndian.Uint32(slice)
 87 | 
 88 | 	return AddEntityToGroup(groupID, FE_LINK, uint(entityId))
 89 | }
 90 | 
 91 | // AddEntityToGroup adds an entity to an existing group
 92 | func AddEntityToGroup(groupID GroupHandle, entityGroupID Field_Entity_Group, entityID uint) (err error) {
 93 | 	result := C.dcgmGroupAddEntity(handle.handle, groupID.handle, C.dcgm_field_entity_group_t(entityGroupID),
 94 | 		C.uint(entityID))
 95 | 	if err = errorString(result); err != nil {
 96 | 		return fmt.Errorf("error adding entity group type %v, entity %v to group: %s", entityGroupID, entityID, err)
 97 | 	}
 98 | 
 99 | 	return
100 | }
101 | 
102 | // DestroyGroup destroys an existing GPU group
103 | func DestroyGroup(groupID GroupHandle) (err error) {
104 | 	result := C.dcgmGroupDestroy(handle.handle, groupID.handle)
105 | 	if err = errorString(result); err != nil {
106 | 		return fmt.Errorf("error destroying group: %s", err)
107 | 	}
108 | 
109 | 	return
110 | }
111 | 
// GroupInfo contains information about a DCGM group as returned by
// GetGroupInfo.
type GroupInfo struct {
	// Version is the version field of the C group-info structure.
	Version uint32
	// GroupName is the name of the group.
	GroupName string
	// EntityList holds one entry per entity that belongs to the group.
	EntityList []GroupEntityPair
}
118 | 
119 | // GetGroupInfo retrieves information about a DCGM group
120 | func GetGroupInfo(groupID GroupHandle) (*GroupInfo, error) {
121 | 	response := C.dcgmGroupInfo_v3{
122 | 		version: C.dcgmGroupInfo_version3,
123 | 	}
124 | 
125 | 	result := C.dcgmGroupGetInfo(handle.handle, groupID.handle, &response)
126 | 	if err := errorString(result); err != nil {
127 | 		return nil, err
128 | 	}
129 | 
130 | 	ret := GroupInfo{
131 | 		Version:    uint32(response.version),
132 | 		GroupName:  C.GoString(&response.groupName[0]),
133 | 		EntityList: make([]GroupEntityPair, response.count),
134 | 	}
135 | 
136 | 	for i := 0; i < int(response.count); i++ {
137 | 		ret.EntityList[i].EntityId = uint(response.entityList[i].entityId)
138 | 		ret.EntityList[i].EntityGroupId = Field_Entity_Group(response.entityList[i].entityGroupId)
139 | 	}
140 | 
141 | 	return &ret, nil
142 | }
143 | 
144 | // CreateGroupWithContext creates a new group with a context
145 | func CreateGroupWithContext(ctx context.Context, groupName string) (GroupHandle, error) {
146 | 	select {
147 | 	case <-ctx.Done():
148 | 		return GroupHandle{}, ctx.Err()
149 | 	default:
150 | 		return CreateGroup(groupName)
151 | 	}
152 | }
153 | 


--------------------------------------------------------------------------------
/tests/diag_test.go:
--------------------------------------------------------------------------------
  1 | package tests
  2 | 
  3 | import (
  4 | 	"strings"
  5 | 	"testing"
  6 | 
  7 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
  8 | 	"github.com/stretchr/testify/assert"
  9 | )
 10 | 
 11 | // TestDiagnostics demonstrates running DCGM diagnostics
 12 | // This is equivalent to the diag sample
 13 | func TestDiagnostics(t *testing.T) {
 14 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 15 | 	if err != nil {
 16 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 17 | 	}
 18 | 	defer cleanup()
 19 | 
 20 | 	// Run quick diagnostics on all GPUs
 21 | 	dr, err := dcgm.RunDiag(dcgm.DiagQuick, dcgm.GroupAllGPUs())
 22 | 	if err != nil {
 23 | 		t.Fatalf("Failed to run diagnostics: %v", err)
 24 | 	}
 25 | 
 26 | 	// Log software test results
 27 | 	t.Logf("Software Tests:")
 28 | 	for _, test := range dr.Software {
 29 | 		t.Logf("  %-50s %s\t%s", test.TestName, test.Status, test.TestOutput)
 30 | 	}
 31 | 
 32 | 	// Basic validation - we should have some results
 33 | 	if len(dr.Software) == 0 {
 34 | 		t.Error("No diagnostic results returned")
 35 | 	}
 36 | 
 37 | 	// Check for any failed tests
 38 | 	failedTests := 0
 39 | 	for _, test := range dr.Software {
 40 | 		if test.Status == "fail" {
 41 | 			failedTests++
 42 | 			t.Logf("Software test failed: %s - %s", test.TestName, test.TestOutput)
 43 | 		}
 44 | 	}
 45 | 
 46 | 	if failedTests > 0 {
 47 | 		t.Logf("Total failed tests: %d", failedTests)
 48 | 	} else {
 49 | 		t.Log("All diagnostic tests passed")
 50 | 	}
 51 | }
 52 | 
 53 | // TestDiagnosticsLong demonstrates running longer diagnostics
 54 | func TestDiagnosticsLong(t *testing.T) {
 55 | 	if testing.Short() {
 56 | 		t.Skip("Skipping long diagnostics test in short mode")
 57 | 	}
 58 | 
 59 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 60 | 	if err != nil {
 61 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 62 | 	}
 63 | 	defer cleanup()
 64 | 
 65 | 	// Get supported devices first
 66 | 	gpus, err := dcgm.GetSupportedDevices()
 67 | 	if err != nil {
 68 | 		t.Fatalf("Failed to get supported devices: %v", err)
 69 | 	}
 70 | 
 71 | 	if len(gpus) == 0 {
 72 | 		t.Skip("No supported GPUs found for diagnostics")
 73 | 	}
 74 | 
 75 | 	// Run diagnostics on first GPU only for time efficiency
 76 | 	group, err := dcgm.CreateGroup("test-group")
 77 | 	if err != nil {
 78 | 		t.Fatalf("Failed to create group: %v", err)
 79 | 	}
 80 | 	defer func() {
 81 | 		if err = dcgm.DestroyGroup(group); err != nil {
 82 | 			t.Logf("Failed to destroy group: %v", err)
 83 | 		}
 84 | 	}()
 85 | 
 86 | 	err = dcgm.AddToGroup(group, gpus[0])
 87 | 	if err != nil {
 88 | 		t.Fatalf("Failed to add GPU to group: %v", err)
 89 | 	}
 90 | 
 91 | 	// Run medium-level diagnostics
 92 | 	dr, err := dcgm.RunDiag(dcgm.DiagMedium, group)
 93 | 	if err != nil {
 94 | 		t.Fatalf("Failed to run medium diagnostics: %v", err)
 95 | 	}
 96 | 
 97 | 	t.Logf("Medium diagnostics completed for GPU %d", gpus[0])
 98 | 
 99 | 	// Log results
100 | 	for _, test := range dr.Software {
101 | 		t.Logf("  %s: %s", test.TestName, test.Status)
102 | 	}
103 | }
104 | 
105 | // TestDiagTestNameFormat validates that TestName field contains category names,
106 | // not detailed test descriptions (issue #97)
107 | func TestDiagTestNameFormat(t *testing.T) {
108 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
109 | 	if err != nil {
110 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
111 | 	}
112 | 	defer cleanup()
113 | 
114 | 	dr, err := dcgm.RunDiag(dcgm.DiagQuick, dcgm.GroupAllGPUs())
115 | 	if err != nil {
116 | 		t.Fatalf("Failed to run diagnostics: %v", err)
117 | 	}
118 | 
119 | 	assert.NotEmpty(t, dr.Software, "diagnostic results should not be empty")
120 | 
121 | 	// Valid test category names that should appear (lowercase)
122 | 	validTestNames := []string{
123 | 		"software",
124 | 		"memory",
125 | 		"pcie",
126 | 		"diagnostic",
127 | 		"sm stress",
128 | 		"targeted stress",
129 | 		"targeted power",
130 | 		"memory bandwidth",
131 | 		"memtest",
132 | 		"pulse",
133 | 		"eud",
134 | 		"context create",
135 | 	}
136 | 
137 | 	// Invalid strings that should NOT appear in TestName
138 | 	// These are detailed descriptions that were incorrectly returned before fix
139 | 	invalidPatterns := []string{
140 | 		"presence of drivers on the denylist",
141 | 		"(e.g. nouveau)",
142 | 		"Allocated",
143 | 		"bytes",
144 | 		"presence (and version)",
145 | 	}
146 | 
147 | 	for i, test := range dr.Software {
148 | 		t.Logf("Result %d: TestName=%q, Status=%s", i, test.TestName, test.Status)
149 | 
150 | 		// TestName should be one of the valid category names
151 | 		assert.Contains(
152 | 			t,
153 | 			validTestNames,
154 | 			test.TestName,
155 | 			"TestName should be a category name like 'software', 'memory', 'pcie', got: %q",
156 | 			test.TestName,
157 | 		)
158 | 
159 | 		// TestName should NOT contain detailed descriptions
160 | 		for _, invalid := range invalidPatterns {
161 | 			assert.NotContains(
162 | 				t,
163 | 				test.TestName,
164 | 				invalid,
165 | 				"TestName should not contain detailed descriptions, got: %q",
166 | 				test.TestName,
167 | 			)
168 | 		}
169 | 
170 | 		// TestName should be lowercase
171 | 		assert.Equal(
172 | 			t,
173 | 			strings.ToLower(test.TestName),
174 | 			test.TestName,
175 | 			"TestName should be lowercase, got: %q",
176 | 			test.TestName,
177 | 		)
178 | 	}
179 | }
180 | 


--------------------------------------------------------------------------------
/pkg/dcgm/fields_test.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | import (
  4 | 	crand "crypto/rand"
  5 | 	"fmt"
  6 | 	"math/big"
  7 | 	"runtime"
  8 | 	"testing"
  9 | 	"time"
 10 | 
 11 | 	"github.com/stretchr/testify/assert"
 12 | 	"github.com/stretchr/testify/require"
 13 | )
 14 | 
 15 | func TestFieldHandle(t *testing.T) {
 16 | 	fh := FieldHandle{}
 17 | 	assert.Equal(t, uintptr(0), fh.GetHandle(), "value mismatch")
 18 | 
 19 | 	inputs := []uintptr{1000, 0, 1, 10, 11, 50, 100, 1939902, 9992932938239, 999999999999999999}
 20 | 
 21 | 	for _, input := range inputs {
 22 | 		fh.SetHandle(input)
 23 | 		assert.Equal(t, input, fh.GetHandle(), "values mismatch")
 24 | 	}
 25 | }
 26 | 
 27 | func TestGetLatestValuesForFields(t *testing.T) {
 28 | 	teardownTest := setupTest(t)
 29 | 	defer teardownTest(t)
 30 | 
 31 | 	runOnlyWithLiveGPUs(t)
 32 | 
 33 | 	// Setup test GPU
 34 | 	gpus, err := withInjectionGPUs(t, 1)
 35 | 	require.NoError(t, err)
 36 | 	gpuId := gpus[0]
 37 | 
 38 | 	// Setup test group
 39 | 	groupId, err := NewDefaultGroup("mygroup")
 40 | 	require.NoError(t, err)
 41 | 	defer func() {
 42 | 		destroyGroupErr := DestroyGroup(groupId)
 43 | 		require.NoError(t, destroyGroupErr)
 44 | 	}()
 45 | 
 46 | 	// Setup field group
 47 | 	fieldId := DCGM_FI_DEV_XID_ERRORS
 48 | 	n, err := crand.Int(crand.Reader, big.NewInt(1000000))
 49 | 	require.NoError(t, err)
 50 | 	fieldGroupName := fmt.Sprintf("fieldGroupName%d", n.Int64())
 51 | 	fieldsGroup, err := FieldGroupCreate(fieldGroupName, []Short{fieldId})
 52 | 	require.NoError(t, err)
 53 | 	defer func() {
 54 | 		destroyFieldsGroupErr := FieldGroupDestroy(fieldsGroup)
 55 | 		require.NoError(t, destroyFieldsGroupErr)
 56 | 	}()
 57 | 
 58 | 	// Inject test value
 59 | 	err = InjectFieldValue(gpuId,
 60 | 		DCGM_FI_DEV_XID_ERRORS,
 61 | 		DCGM_FT_INT64,
 62 | 		0,
 63 | 		time.Now().Add(-time.Duration(5)*time.Second).UnixMicro(),
 64 | 		int64(10),
 65 | 	)
 66 | 	require.NoError(t, err)
 67 | 
 68 | 	// Setup field watching
 69 | 	err = WatchFieldsWithGroupEx(
 70 | 		fieldsGroup,
 71 | 		groupId,
 72 | 		defaultUpdateFreq,
 73 | 		defaultMaxKeepAge,
 74 | 		defaultMaxKeepSamples,
 75 | 	)
 76 | 	require.NoError(t, err)
 77 | 
 78 | 	err = UpdateAllFields()
 79 | 	require.NoError(t, err)
 80 | 
 81 | 	// Test
 82 | 	values, err := GetLatestValuesForFields(gpuId, []Short{fieldId})
 83 | 	require.NoError(t, err)
 84 | 
 85 | 	// Verify results
 86 | 	assert.Len(t, values, 1)
 87 | 	assert.NotEmpty(t, values[0].String())
 88 | 	assert.Equal(t, int64(10), values[0].Int64())
 89 | }
 90 | 
 91 | func BenchmarkGetLatestValuesForFieldsVariousSize(b *testing.B) {
 92 | 	teardownTest := setupTest(b)
 93 | 	defer teardownTest(b)
 94 | 
 95 | 	// Setup test GPU
 96 | 	gpus, err := withInjectionGPUs(b, 1)
 97 | 	require.NoError(b, err)
 98 | 	gpuId := gpus[0]
 99 | 
100 | 	// Setup test group
101 | 	groupId, err := NewDefaultGroup("mygroup")
102 | 	require.NoError(b, err)
103 | 	defer func() {
104 | 		err := DestroyGroup(groupId)
105 | 		require.NoError(b, err)
106 | 	}()
107 | 
108 | 	// Use the same fields as in the main benchmark
109 | 	allFieldIds := []Short{
110 | 		DCGM_FI_DEV_XID_ERRORS,
111 | 		DCGM_FI_DEV_DIAG_MEMORY_RESULT,
112 | 		DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION,
113 | 		DCGM_FI_DEV_GPU_TEMP,
114 | 		DCGM_FI_DEV_MEMORY_TEMP,
115 | 		DCGM_FI_DEV_GPU_UTIL,
116 | 		DCGM_FI_DEV_MEM_COPY_UTIL,
117 | 		DCGM_FI_DEV_ENC_UTIL,
118 | 		DCGM_FI_DEV_DEC_UTIL,
119 | 		DCGM_FI_DEV_FB_FREE,
120 | 		DCGM_FI_DEV_FB_USED,
121 | 		DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
122 | 		DCGM_FI_DEV_SM_CLOCK,
123 | 		DCGM_FI_DEV_RETIRED_PENDING,
124 | 		DCGM_FI_DEV_RETIRED_SBE,
125 | 		DCGM_FI_DEV_RETIRED_DBE,
126 | 		DCGM_FI_DEV_POWER_VIOLATION,
127 | 		DCGM_FI_DEV_THERMAL_VIOLATION,
128 | 	}
129 | 
130 | 	// Test different field counts
131 | 	fieldCounts := []int{1, 5, 10, len(allFieldIds)}
132 | 
133 | 	for _, count := range fieldCounts {
134 | 		b.Run(fmt.Sprintf("Fields-%d", count), func(b *testing.B) {
135 | 			fieldIds := allFieldIds[:count] // Take first 'count' fields
136 | 
137 | 			// Setup field group
138 | 			fieldGroupName := fmt.Sprintf("fieldGroup-%d", count)
139 | 			fieldsGroup, err := FieldGroupCreate(fieldGroupName, fieldIds)
140 | 			require.NoError(b, err)
141 | 			defer func() {
142 | 				destroyFieldsGroupErr := FieldGroupDestroy(fieldsGroup)
143 | 				require.NoError(b, destroyFieldsGroupErr)
144 | 			}()
145 | 
146 | 			// Setup field watching
147 | 			err = WatchFieldsWithGroupEx(
148 | 				fieldsGroup,
149 | 				groupId,
150 | 				defaultUpdateFreq,
151 | 				defaultMaxKeepAge,
152 | 				defaultMaxKeepSamples,
153 | 			)
154 | 			require.NoError(b, err)
155 | 
156 | 			// Inject values for all fields
157 | 			for _, fieldId := range fieldIds {
158 | 				err = InjectFieldValue(gpuId,
159 | 					fieldId,
160 | 					DCGM_FT_INT64,
161 | 					0,
162 | 					time.Now().Add(-time.Duration(5)*time.Second).UnixMicro(),
163 | 					int64(10),
164 | 				)
165 | 				require.NoError(b, err)
166 | 			}
167 | 
168 | 			err = UpdateAllFields()
169 | 			require.NoError(b, err)
170 | 
171 | 			b.ResetTimer()
172 | 			b.ReportAllocs()
173 | 
174 | 			for i := 0; i < b.N; i++ {
175 | 				values, err := GetLatestValuesForFields(gpuId, fieldIds)
176 | 				require.NoError(b, err)
177 | 				require.Len(b, values, len(fieldIds), "expected %d values, got %d", len(fieldIds), len(values))
178 | 				runtime.KeepAlive(values)
179 | 			}
180 | 		})
181 | 	}
182 | }
183 | 


--------------------------------------------------------------------------------
/pkg/dcgm/mig.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "dcgm_agent.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | 	"unsafe"
 12 | )
 13 | 
 14 | // Field_Entity_Group represents the type of DCGM entity
 15 | type Field_Entity_Group uint
 16 | 
 17 | const (
 18 | 	// FE_NONE represents no entity type
 19 | 	FE_NONE Field_Entity_Group = iota
 20 | 	// FE_GPU represents a GPU device entity
 21 | 	FE_GPU
 22 | 	// FE_VGPU represents a virtual GPU entity
 23 | 	FE_VGPU
 24 | 	// FE_SWITCH represents an NVSwitch entity
 25 | 	FE_SWITCH
 26 | 	// FE_GPU_I represents a GPU instance entity
 27 | 	FE_GPU_I
 28 | 	// FE_GPU_CI represents a GPU compute instance entity
 29 | 	FE_GPU_CI
 30 | 	// FE_LINK represents an NVLink entity
 31 | 	FE_LINK
 32 | 	// FE_CPU represents a CPU entity
 33 | 	FE_CPU
 34 | 	// FE_CPU_CORE represents a CPU core entity
 35 | 	FE_CPU_CORE
 36 | 	// FE_COUNT represents the total number of entity types
 37 | 	FE_COUNT
 38 | )
 39 | 
 40 | // String returns a string representation of the Field_Entity_Group
 41 | func (e Field_Entity_Group) String() string {
 42 | 	switch e {
 43 | 	case FE_GPU:
 44 | 		return "GPU"
 45 | 	case FE_VGPU:
 46 | 		return "vGPU"
 47 | 	case FE_SWITCH:
 48 | 		return "NvSwitch"
 49 | 	case FE_GPU_I:
 50 | 		return "GPU Instance"
 51 | 	case FE_GPU_CI:
 52 | 		return "GPU Compute Instance"
 53 | 	case FE_LINK:
 54 | 		return "NvLink"
 55 | 	case FE_CPU:
 56 | 		return "CPU"
 57 | 	case FE_CPU_CORE:
 58 | 		return "CPU Core"
 59 | 	}
 60 | 	return "unknown"
 61 | }
 62 | 
// GroupEntityPair identifies a single DCGM entity by its entity type and its
// ID.
type GroupEntityPair struct {
	// EntityGroupId specifies the type of the entity
	EntityGroupId Field_Entity_Group
	// EntityId is the identifier of the entity
	EntityId uint
}
 70 | 
// MigEntityInfo contains information about a MIG entity. toMigHierarchy fills
// it from the info member of each C hierarchy entry.
type MigEntityInfo struct {
	// GpuUuid is the UUID of the parent GPU
	GpuUuid string
	// NvmlGpuIndex is the NVML index of the parent GPU
	NvmlGpuIndex uint
	// NvmlInstanceId is the NVML GPU instance ID
	NvmlInstanceId uint
	// NvmlComputeInstanceId is the NVML compute instance ID
	NvmlComputeInstanceId uint
	// NvmlMigProfileId is the NVML MIG profile ID
	NvmlMigProfileId uint
	// NvmlProfileSlices is the number of slices in the MIG profile
	NvmlProfileSlices uint
}
 86 | 
// MigHierarchyInfo_v2 represents version 2 of MIG hierarchy information: one
// entity, its parent and the MIG details for that entity.
type MigHierarchyInfo_v2 struct {
	// Entity contains the entity information
	Entity GroupEntityPair
	// Parent contains the parent entity information
	Parent GroupEntityPair
	// Info contains detailed MIG entity information
	Info MigEntityInfo
}
 96 | 
// Size limits taken from the DCGM C headers.
const (
	// MAX_NUM_DEVICES represents the maximum number of GPU devices supported
	// (C constant DCGM_MAX_NUM_DEVICES).
	MAX_NUM_DEVICES = uint(C.DCGM_MAX_NUM_DEVICES)

	// MAX_HIERARCHY_INFO represents the maximum size of the MIG hierarchy information
	// (C constant DCGM_MAX_HIERARCHY_INFO).
	MAX_HIERARCHY_INFO = uint(C.DCGM_MAX_HIERARCHY_INFO)
)
104 | 
// MigHierarchy_v2 represents version 2 of the complete MIG hierarchy.
type MigHierarchy_v2 struct {
	// Version is the version number of the hierarchy structure
	Version uint
	// Count is the number of valid entries in EntityList
	Count uint
	// EntityList contains the MIG hierarchy information for each entity.
	// It is a fixed-size array of DCGM_MAX_HIERARCHY_INFO elements; only the
	// first Count entries are filled in by toMigHierarchy.
	EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2
}
114 | 
115 | // GetGPUInstanceHierarchy retrieves the complete MIG hierarchy information
116 | func GetGPUInstanceHierarchy() (hierarchy MigHierarchy_v2, err error) {
117 | 	var c_hierarchy C.dcgmMigHierarchy_v2
118 | 	c_hierarchy.version = C.dcgmMigHierarchy_version2
119 | 	ptr_hierarchy := (*C.dcgmMigHierarchy_v2)(unsafe.Pointer(&c_hierarchy))
120 | 	result := C.dcgmGetGpuInstanceHierarchy(handle.handle, ptr_hierarchy)
121 | 
122 | 	if err = errorString(result); err != nil {
123 | 		return toMigHierarchy(c_hierarchy), fmt.Errorf("error retrieving DCGM MIG hierarchy: %s", err)
124 | 	}
125 | 
126 | 	return toMigHierarchy(c_hierarchy), nil
127 | }
128 | 
129 | func toMigHierarchy(c_hierarchy C.dcgmMigHierarchy_v2) MigHierarchy_v2 {
130 | 	var hierarchy MigHierarchy_v2
131 | 	hierarchy.Version = uint(c_hierarchy.version)
132 | 	hierarchy.Count = uint(c_hierarchy.count)
133 | 	for i := uint(0); i < hierarchy.Count; i++ {
134 | 		hierarchy.EntityList[i] = MigHierarchyInfo_v2{
135 | 			Entity: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].entity.entityGroupId), uint(c_hierarchy.entityList[i].entity.entityId)},
136 | 			Parent: GroupEntityPair{Field_Entity_Group(c_hierarchy.entityList[i].parent.entityGroupId), uint(c_hierarchy.entityList[i].parent.entityId)},
137 | 			Info: MigEntityInfo{
138 | 				GpuUuid:               *stringPtr(&c_hierarchy.entityList[i].info.gpuUuid[0]),
139 | 				NvmlGpuIndex:          uint(c_hierarchy.entityList[i].info.nvmlGpuIndex),
140 | 				NvmlInstanceId:        uint(c_hierarchy.entityList[i].info.nvmlInstanceId),
141 | 				NvmlComputeInstanceId: uint(c_hierarchy.entityList[i].info.nvmlComputeInstanceId),
142 | 				NvmlMigProfileId:      uint(c_hierarchy.entityList[i].info.nvmlMigProfileId),
143 | 				NvmlProfileSlices:     uint(c_hierarchy.entityList[i].info.nvmlProfileSlices),
144 | 			},
145 | 		}
146 | 	}
147 | 
148 | 	return hierarchy
149 | }
150 | 


--------------------------------------------------------------------------------
/pkg/dcgm/device_status.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "dcgm_agent.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | 	"math/rand"
 12 | )
 13 | 
 14 | // PerfState represents the performance state (P-state) of a GPU
 15 | type PerfState uint
 16 | 
 17 | const (
 18 | 	// PerfStateMax represents the highest performance state (P0)
 19 | 	PerfStateMax = 0
 20 | 
 21 | 	// PerfStateMin represents the lowest performance state (P15)
 22 | 	PerfStateMin = 15
 23 | 
 24 | 	// PerfStateUnknown represents an unknown performance state
 25 | 	PerfStateUnknown = 32
 26 | )
 27 | 
 28 | // String returns a string representation of the performance state
 29 | func (p PerfState) String() string {
 30 | 	if p >= PerfStateMax && p <= PerfStateMin {
 31 | 		return fmt.Sprintf("P%d", p)
 32 | 	}
 33 | 	return "Unknown"
 34 | }
 35 | 
// UtilizationInfo contains GPU utilization metrics, each expressed as a
// percentage.
type UtilizationInfo struct {
	GPU     int64 // % (read from DCGM_FI_DEV_GPU_UTIL)
	Memory  int64 // % (read from DCGM_FI_DEV_MEM_COPY_UTIL)
	Encoder int64 // % (read from DCGM_FI_DEV_ENC_UTIL)
	Decoder int64 // % (read from DCGM_FI_DEV_DEC_UTIL)
}
 43 | 
// ECCErrorsInfo contains ECC memory error counts.
type ECCErrorsInfo struct {
	SingleBit int64 // read from DCGM_FI_DEV_ECC_SBE_AGG_TOTAL
	DoubleBit int64 // read from DCGM_FI_DEV_ECC_DBE_AGG_TOTAL
}
 49 | 
// MemoryInfo contains GPU memory usage and error information.
type MemoryInfo struct {
	// GlobalUsed is not set by latestValuesForDevice in this file, so it is
	// left at zero in the DeviceStatus that function builds.
	GlobalUsed int64
	// ECCErrors holds the single-bit and double-bit ECC error counts.
	ECCErrors ECCErrorsInfo
}
 55 | 
// ClockInfo contains GPU clock frequencies.
type ClockInfo struct {
	Cores  int64 // MHz (read from DCGM_FI_DEV_SM_CLOCK)
	Memory int64 // MHz (read from DCGM_FI_DEV_MEM_CLOCK)
}
 61 | 
// PCIThroughputInfo contains PCI bus transfer metrics.
type PCIThroughputInfo struct {
	Rx      int64 // MB (read from DCGM_FI_DEV_PCIE_RX_THROUGHPUT)
	Tx      int64 // MB (read from DCGM_FI_DEV_PCIE_TX_THROUGHPUT)
	Replays int64 // read from DCGM_FI_DEV_PCIE_REPLAY_COUNTER
}
 68 | 
// PCIStatusInfo contains PCI bus status information.
type PCIStatusInfo struct {
	BAR1Used   int64 // MB (read from DCGM_FI_DEV_BAR1_USED)
	Throughput PCIThroughputInfo
	FBUsed     int64 // read from DCGM_FI_DEV_FB_USED; unit not stated here — TODO confirm
}
 75 | 
// DeviceStatus contains comprehensive GPU device status information. It is
// the result type of latestValuesForDevice, which fills every field from the
// latest DCGM field values for one GPU.
type DeviceStatus struct {
	Power       float64 // W
	Temperature int64   // °C
	Utilization UtilizationInfo
	Memory      MemoryInfo
	Clocks      ClockInfo
	PCI         PCIStatusInfo
	Performance PerfState
	FanSpeed    int64 // %
}
 87 | 
 88 | func latestValuesForDevice(gpuId uint) (status DeviceStatus, err error) {
 89 | 	const (
 90 | 		pwr int = iota
 91 | 		temp
 92 | 		sm
 93 | 		mem
 94 | 		enc
 95 | 		dec
 96 | 		smClock
 97 | 		memClock
 98 | 		bar1Used
 99 | 		pcieRxThroughput
100 | 		pcieTxThroughput
101 | 		pcieReplay
102 | 		fbUsed
103 | 		sbe
104 | 		dbe
105 | 		pstate
106 | 		fanSpeed
107 | 		fieldsCount
108 | 	)
109 | 
110 | 	deviceFields := make([]Short, fieldsCount)
111 | 	deviceFields[pwr] = C.DCGM_FI_DEV_POWER_USAGE
112 | 	deviceFields[temp] = C.DCGM_FI_DEV_GPU_TEMP
113 | 	deviceFields[sm] = C.DCGM_FI_DEV_GPU_UTIL
114 | 	deviceFields[mem] = C.DCGM_FI_DEV_MEM_COPY_UTIL
115 | 	deviceFields[enc] = C.DCGM_FI_DEV_ENC_UTIL
116 | 	deviceFields[dec] = C.DCGM_FI_DEV_DEC_UTIL
117 | 	deviceFields[smClock] = C.DCGM_FI_DEV_SM_CLOCK
118 | 	deviceFields[memClock] = C.DCGM_FI_DEV_MEM_CLOCK
119 | 	deviceFields[bar1Used] = C.DCGM_FI_DEV_BAR1_USED
120 | 	deviceFields[pcieRxThroughput] = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT
121 | 	deviceFields[pcieTxThroughput] = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT
122 | 	deviceFields[pcieReplay] = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER
123 | 	deviceFields[fbUsed] = C.DCGM_FI_DEV_FB_USED
124 | 	deviceFields[sbe] = C.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL
125 | 	deviceFields[dbe] = C.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL
126 | 	deviceFields[pstate] = C.DCGM_FI_DEV_PSTATE
127 | 	deviceFields[fanSpeed] = C.DCGM_FI_DEV_FAN_SPEED
128 | 
129 | 	fieldsName := fmt.Sprintf("devStatusFields%d", rand.Uint64())
130 | 	fieldsId, err := FieldGroupCreate(fieldsName, deviceFields)
131 | 	if err != nil {
132 | 		return
133 | 	}
134 | 
135 | 	groupName := fmt.Sprintf("devStatus%d", rand.Uint64())
136 | 	groupId, err := WatchFields(gpuId, fieldsId, groupName)
137 | 	if err != nil {
138 | 		_ = FieldGroupDestroy(fieldsId)
139 | 		return
140 | 	}
141 | 
142 | 	values, err := GetLatestValuesForFields(gpuId, deviceFields)
143 | 	if err != nil {
144 | 		_ = FieldGroupDestroy(fieldsId)
145 | 		_ = DestroyGroup(groupId)
146 | 		return status, err
147 | 	}
148 | 
149 | 	power := values[pwr].Float64()
150 | 
151 | 	gpuUtil := UtilizationInfo{
152 | 		GPU:     values[sm].Int64(),
153 | 		Memory:  values[mem].Int64(),
154 | 		Encoder: values[enc].Int64(),
155 | 		Decoder: values[dec].Int64(),
156 | 	}
157 | 
158 | 	memory := MemoryInfo{
159 | 		ECCErrors: ECCErrorsInfo{
160 | 			SingleBit: values[sbe].Int64(),
161 | 			DoubleBit: values[dbe].Int64(),
162 | 		},
163 | 	}
164 | 
165 | 	clocks := ClockInfo{
166 | 		Cores:  values[smClock].Int64(),
167 | 		Memory: values[memClock].Int64(),
168 | 	}
169 | 
170 | 	pci := PCIStatusInfo{
171 | 		BAR1Used: values[bar1Used].Int64(),
172 | 		Throughput: PCIThroughputInfo{
173 | 			Rx:      values[pcieRxThroughput].Int64(),
174 | 			Tx:      values[pcieTxThroughput].Int64(),
175 | 			Replays: values[pcieReplay].Int64(),
176 | 		},
177 | 		FBUsed: values[fbUsed].Int64(),
178 | 	}
179 | 
180 | 	status = DeviceStatus{
181 | 		Power:       power,
182 | 		Temperature: values[temp].Int64(),
183 | 		Utilization: gpuUtil,
184 | 		Memory:      memory,
185 | 		Clocks:      clocks,
186 | 		PCI:         pci,
187 | 		Performance: PerfState(values[pstate].Int64()),
188 | 		FanSpeed:    values[fanSpeed].Int64(),
189 | 	}
190 | 
191 | 	_ = FieldGroupDestroy(fieldsId)
192 | 	_ = DestroyGroup(groupId)
193 | 	return
194 | }
195 | 


--------------------------------------------------------------------------------
/tests/dmon_test.go:
--------------------------------------------------------------------------------
  1 | package tests
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 	"time"
  6 | 
  7 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
  8 | )
  9 | 
 10 | // TestDeviceMonitoring demonstrates device monitoring functionality
 11 | // This is equivalent to the dmon sample but runs for a limited time
 12 | func TestDeviceMonitoring(t *testing.T) {
 13 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 14 | 	if err != nil {
 15 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 16 | 	}
 17 | 	defer cleanup()
 18 | 
 19 | 	gpus, err := dcgm.GetSupportedDevices()
 20 | 	if err != nil {
 21 | 		t.Fatalf("Failed to get supported devices: %v", err)
 22 | 	}
 23 | 
 24 | 	if len(gpus) == 0 {
 25 | 		t.Skip("No supported GPUs found for monitoring")
 26 | 	}
 27 | 
 28 | 	t.Log("# gpu   pwr  temp    sm   mem   enc   dec  mclk  pclk")
 29 | 	t.Log("# Idx     W     C     %     %     %     %   MHz   MHz")
 30 | 
 31 | 	// Monitor for a few seconds instead of indefinitely
 32 | 	ticker := time.NewTicker(time.Second * 1)
 33 | 	defer ticker.Stop()
 34 | 
 35 | 	timeout := time.After(5 * time.Second)
 36 | 	sampleCount := 0
 37 | 
 38 | 	for {
 39 | 		select {
 40 | 		case <-ticker.C:
 41 | 			for _, gpu := range gpus {
 42 | 				st, err := dcgm.GetDeviceStatus(gpu)
 43 | 				if err != nil {
 44 | 					t.Errorf("Failed to get device status for GPU %d: %v", gpu, err)
 45 | 					continue
 46 | 				}
 47 | 
 48 | 				t.Logf("%5d %5d %5d %5d %5d %5d %5d %5d %5d",
 49 | 					gpu, int64(st.Power), st.Temperature, st.Utilization.GPU, st.Utilization.Memory,
 50 | 					st.Utilization.Encoder, st.Utilization.Decoder, st.Clocks.Memory, st.Clocks.Cores)
 51 | 
 52 | 				// Basic validation
 53 | 				if st.Temperature < 0 || st.Temperature > 150 {
 54 | 					t.Errorf("GPU %d temperature out of expected range: %d°C", gpu, st.Temperature)
 55 | 				}
 56 | 				if st.Utilization.GPU < 0 || st.Utilization.GPU > 100 {
 57 | 					t.Errorf("GPU %d utilization out of range: %d%%", gpu, st.Utilization.GPU)
 58 | 				}
 59 | 			}
 60 | 			sampleCount++
 61 | 
 62 | 		case <-timeout:
 63 | 			t.Logf("Monitoring completed after %d samples", sampleCount)
 64 | 			return
 65 | 		}
 66 | 	}
 67 | }
 68 | 
 69 | // TestDeviceStatusSingle demonstrates getting device status for a single GPU
 70 | func TestDeviceStatusSingle(t *testing.T) {
 71 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 72 | 	if err != nil {
 73 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 74 | 	}
 75 | 	defer cleanup()
 76 | 
 77 | 	gpus, err := dcgm.GetSupportedDevices()
 78 | 	if err != nil {
 79 | 		t.Fatalf("Failed to get supported devices: %v", err)
 80 | 	}
 81 | 
 82 | 	if len(gpus) == 0 {
 83 | 		t.Skip("No supported GPUs found")
 84 | 	}
 85 | 
 86 | 	// Test first GPU
 87 | 	gpu := gpus[0]
 88 | 	st, err := dcgm.GetDeviceStatus(gpu)
 89 | 	if err != nil {
 90 | 		t.Fatalf("Failed to get device status for GPU %d: %v", gpu, err)
 91 | 	}
 92 | 
 93 | 	t.Logf("GPU %d Status:", gpu)
 94 | 	t.Logf("  Power: %d W", int64(st.Power))
 95 | 	t.Logf("  Temperature: %d°C", st.Temperature)
 96 | 	t.Logf("  GPU Utilization: %d%%", st.Utilization.GPU)
 97 | 	t.Logf("  Memory Utilization: %d%%", st.Utilization.Memory)
 98 | 	t.Logf("  Encoder Utilization: %d%%", st.Utilization.Encoder)
 99 | 	t.Logf("  Decoder Utilization: %d%%", st.Utilization.Decoder)
100 | 	t.Logf("  Memory Clock: %d MHz", st.Clocks.Memory)
101 | 	t.Logf("  Core Clock: %d MHz", st.Clocks.Cores)
102 | 
103 | 	// Validate ranges
104 | 	if st.Temperature < 0 || st.Temperature > 150 {
105 | 		t.Errorf("Temperature out of expected range: %d°C", st.Temperature)
106 | 	}
107 | 	if st.Utilization.GPU < 0 || st.Utilization.GPU > 100 {
108 | 		t.Errorf("GPU utilization out of range: %d%%", st.Utilization.GPU)
109 | 	}
110 | 	if st.Utilization.Memory < 0 || st.Utilization.Memory > 100 {
111 | 		t.Errorf("Memory utilization out of range: %d%%", st.Utilization.Memory)
112 | 	}
113 | }
114 | 
115 | // TestDeviceStatusMultipleSamples demonstrates taking multiple samples over time
116 | func TestDeviceStatusMultipleSamples(t *testing.T) {
117 | 	if testing.Short() {
118 | 		t.Skip("Skipping multiple samples test in short mode")
119 | 	}
120 | 
121 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
122 | 	if err != nil {
123 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
124 | 	}
125 | 	defer cleanup()
126 | 
127 | 	gpus, err := dcgm.GetSupportedDevices()
128 | 	if err != nil {
129 | 		t.Fatalf("Failed to get supported devices: %v", err)
130 | 	}
131 | 
132 | 	if len(gpus) == 0 {
133 | 		t.Skip("No supported GPUs found")
134 | 	}
135 | 
136 | 	// Take samples every 500ms for 3 seconds
137 | 	gpu := gpus[0]
138 | 	samples := make([]dcgm.DeviceStatus, 0, 6)
139 | 
140 | 	for i := 0; i < 6; i++ {
141 | 		st, err := dcgm.GetDeviceStatus(gpu)
142 | 		if err != nil {
143 | 			t.Errorf("Failed to get device status sample %d: %v", i, err)
144 | 			continue
145 | 		}
146 | 		samples = append(samples, st)
147 | 		time.Sleep(500 * time.Millisecond)
148 | 	}
149 | 
150 | 	t.Logf("Collected %d samples for GPU %d", len(samples), gpu)
151 | 
152 | 	// Analyze samples for consistency
153 | 	if len(samples) > 1 {
154 | 		firstTemp := samples[0].Temperature
155 | 		tempVariation := false
156 | 		for _, sample := range samples[1:] {
157 | 			if abs64(sample.Temperature-firstTemp) > 5 { // Allow 5°C variation
158 | 				tempVariation = true
159 | 				break
160 | 			}
161 | 		}
162 | 
163 | 		if !tempVariation {
164 | 			t.Logf("Temperature remained stable around %d°C", firstTemp)
165 | 		} else {
166 | 			t.Logf("Temperature variation detected across samples")
167 | 		}
168 | 	}
169 | }
170 | 
// abs64 returns the absolute value of x.
func abs64(x int64) int64 {
	if x >= 0 {
		return x
	}
	return -x
}
177 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
  1 | # GitLab CI configuration for go-dcgm
  2 | # Uses Docker for building and testing based on the existing Dockerfile
  3 | 
  4 | # Define the stages of the pipeline
  5 | stages:
  6 |   - build
  7 |   - test
  8 | 
# Define global variables
variables:
  # Docker settings; presumably consumed by the docker:dind service declared
  # below — confirm against the runner configuration.
  DOCKER_DRIVER: overlay2
  DOCKER_TLS_CERTDIR: "/certs"
  # Build arguments for the Dockerfile (forwarded with --build-arg in the build job)
  CUDA_VERSION: "12.5.1"
  DISTRO_FLAVOR: "ubuntu24.04"
  GO_VERSION: "1.24.4"
  DCGM_VERSION: "4.2.3-2"
  # Image names, tagged with the commit SHA. BUILD_IMAGE is pushed by the build
  # job and used as the image of the test jobs; TEST_IMAGE is not referenced by
  # any job in this file.
  BUILD_IMAGE: "$CI_REGISTRY_IMAGE/build:$CI_COMMIT_SHA"
  TEST_IMAGE: "$CI_REGISTRY_IMAGE/test:$CI_COMMIT_SHA"
 21 | 
 22 | # Use Docker-in-Docker service
 23 | services:
 24 |   - docker:dind
 25 | 
 26 | # Use Docker image
 27 | image: docker:latest
 28 | 
 29 | before_script:
 30 |   - docker info
 31 |   - echo $CI_REGISTRY_PASSWORD | docker login -u $CI_REGISTRY_USER --password-stdin $CI_REGISTRY
 32 | 
 33 | # Build Docker image and compile Go code
 34 | build:
 35 |   stage: build
 36 |   script:
 37 |     - echo "Building Docker image with go-dcgm..."
 38 |     # Build the samples stage which includes the compiled binaries
 39 |     - echo $CI_REGISTRY_PASSWORD | docker login -u $CI_REGISTRY_USER --password-stdin $CI_REGISTRY
 40 |     - docker build
 41 |         --target samples
 42 |         --build-arg CUDA_VERSION=$CUDA_VERSION
 43 |         --build-arg DISTRO_FLAVOR=$DISTRO_FLAVOR
 44 |         --build-arg GO_VERSION=$GO_VERSION
 45 |         --build-arg DCGM_VERSION=$DCGM_VERSION
 46 |         --tag $BUILD_IMAGE
 47 |         .
 48 |     # Push the built image for use in test stage
 49 |     - docker push $BUILD_IMAGE
 50 |   rules:
 51 |     - if: $CI_COMMIT_BRANCH == "main"
 52 |     - if: $CI_PIPELINE_SOURCE == "merge_request_event"
 53 | 
 54 | # Run tests inside the built Docker container
 55 | test:
 56 |   stage: test
 57 |   tags:
 58 |     - docker
 59 |     - gpu-enabled
 60 |   image: $BUILD_IMAGE
 61 |   needs: ["build"]
 62 |   before_script: []
 63 |   script:
 64 |     # Create test reports directory
 65 |     - mkdir -p test-reports
 66 |     - echo '=== Installing test dependencies ==='
 67 |     - go install github.com/jstemmer/go-junit-report/v2@latest
 68 |     - echo '=== Running Go tests ==='
 69 |     - mkdir -p test-reports
 70 |     - >
 71 |       go test -v ./tests 2>&1
 72 |       | /root/go/bin/go-junit-report -set-exit-code
 73 |       > test-reports/go-tests.xml &&
 74 |     - echo '=== Tests completed successfully! ==='
 75 |   artifacts:
 76 |     reports:
 77 |       junit:
 78 |         - test-reports/go-tests.xml
 79 |     paths:
 80 |       - test-reports/
 81 |     expire_in: 1 week
 82 |     when: always
 83 |   rules:
 84 |     - if: $CI_COMMIT_BRANCH == "main"
 85 |     - if: $CI_PIPELINE_SOURCE == "merge_request_event"
 86 | 
 87 | # Run race tests inside the built Docker container
 88 | test-race:
 89 |   stage: test
 90 |   tags:
 91 |     - docker
 92 |     - gpu-enabled
 93 |   image: $BUILD_IMAGE
 94 |   needs: ["build"]
 95 |   before_script: []
 96 |   script:
 97 |     # Create test reports directory
 98 |     - mkdir -p test-reports
 99 |     - echo '=== Installing test dependencies ==='
100 |     - go install github.com/jstemmer/go-junit-report/v2@latest
101 |     - echo '=== Running Go race tests ==='
102 |     - mkdir -p test-reports
103 |     - >
104 |       go test -race -v ./tests 2>&1
105 |       | /root/go/bin/go-junit-report -set-exit-code
106 |       > test-reports/go-race-tests.xml
107 |     - echo '=== Race Tests completed successfully! ==='
108 |   artifacts:
109 |     reports:
110 |       junit:
111 |         - test-reports/go-race-tests.xml
112 |     paths:
113 |       - test-reports/
114 |     expire_in: 1 week
115 |     when: always
116 |   rules:
117 |     - if: $CI_COMMIT_BRANCH == "main"
118 |     - if: $CI_PIPELINE_SOURCE == "merge_request_event"
119 | 
# Code format check inside Docker
# Runs in the image produced by the build job, on runners tagged gpu-disabled.
format-check:
  stage: test
  needs: ["build"]
  # Override the global before_script (docker info / docker login).
  before_script: []
  tags:
    - docker
    - gpu-disabled
  image: $BUILD_IMAGE
  script:
    - echo "Checking code format in Docker container..."
    # Install gofumpt
    - echo 'Installing gofumpt...'
    - go install mvdan.cc/gofumpt@latest
    # Run format check
    - echo 'Checking code format...'
    - make check-format
  # Run on pushes to main and on merge request pipelines.
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
140 | 
# Hidden template (the leading dot keeps it from running as a job of its own):
# shared runner tags and image for jobs that extend it, such as lint-full.
.go:
  tags:
    - docker
    - gpu-disabled
  image: golang
146 | 
# Full linting inside Docker (optional)
# Failures are reported but do not fail the pipeline (allow_failure).
lint-full:
  extends:
    - .go
  stage: test
  # Override the global before_script (docker info / docker login).
  before_script: []
  needs: ["build"]
  script:
    - echo "Running full linting..."
    # Install golangci-lint and run full linting
    - echo 'Installing golangci-lint...'
    - wget -O- -nv https://raw.githubusercontent.com/golangci/golangci-lint/HEAD/install.sh | sh -s v2.1.6
    - echo 'Running full linting...'
    # No --fix here: in CI it would only rewrite the throw-away checkout instead
    # of reporting the fixable issues to the author.
    - ./bin/golangci-lint run ./... --timeout 10m
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
  allow_failure: true
165 | 
# Build matrix for different configurations (optional)
# Builds the targets defined for `docker buildx bake` (presumably those in
# docker-bake.hcl — confirm) using a dedicated buildx builder.
build-matrix:
  stage: build
  script:
    - echo "Building Docker images with docker-bake for matrix configurations..."
    # Create a named Docker context and a buildx builder bound to it.
    - docker context create go-dcgm
    - docker buildx create --use go-dcgm
    # --load places the built images in the local Docker image store.
    - docker buildx bake --load
  # Run on pushes to main and on merge request pipelines.
  rules:
    - if: $CI_COMMIT_BRANCH == "main"
    - if: $CI_PIPELINE_SOURCE == "merge_request_event"
177 | 


--------------------------------------------------------------------------------
/pkg/dcgm/api.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"errors"
  6 | 	"fmt"
  7 | 	"os"
  8 | 	"strconv"
  9 | 	"sync"
 10 | 	"time"
 11 | )
 12 | 
 13 | var (
 14 | 	dcgmInitCounter int
 15 | 	mux             sync.Mutex
 16 | )
 17 | 
 18 | // Init starts DCGM in the specified mode
 19 | // Mode can be:
 20 | // - Embedded: Start hostengine within this process
 21 | // - Standalone: Connect to an already running nv-hostengine
 22 | // - StartHostengine: Start and connect to nv-hostengine, terminate before exiting
 23 | // Returns a cleanup function and any error encountered
 24 | func Init(m mode, args ...string) (cleanup func(), err error) {
 25 | 	mux.Lock()
 26 | 	defer mux.Unlock()
 27 | 
 28 | 	if dcgmInitCounter < 0 {
 29 | 		count := strconv.Itoa(dcgmInitCounter)
 30 | 		err = fmt.Errorf("shutdown() is called %s times, before init()", count[1:])
 31 | 	}
 32 | 
 33 | 	if dcgmInitCounter == 0 {
 34 | 		err = initDCGM(m, args...)
 35 | 		if err != nil {
 36 | 			return nil, err
 37 | 		}
 38 | 	}
 39 | 
 40 | 	dcgmInitCounter += 1
 41 | 
 42 | 	return func() {
 43 | 		if shutdownErr := Shutdown(); shutdownErr != nil {
 44 | 			fmt.Fprintf(os.Stderr, "Failed to shutdown DCGM with error: `%v`", shutdownErr)
 45 | 		}
 46 | 	}, err
 47 | }
 48 | 
 49 | // Shutdown stops DCGM and destroys all connections
 50 | // Returns an error if DCGM is not initialized
 51 | func Shutdown() (err error) {
 52 | 	mux.Lock()
 53 | 	defer mux.Unlock()
 54 | 
 55 | 	if dcgmInitCounter <= 0 {
 56 | 		err = errors.New("init() needs to be called before shutdown()")
 57 | 	}
 58 | 
 59 | 	if dcgmInitCounter == 1 {
 60 | 		err = shutdown()
 61 | 	}
 62 | 
 63 | 	dcgmInitCounter -= 1
 64 | 
 65 | 	return
 66 | }
 67 | 
 68 | // GetAllDeviceCount returns the count of all GPUs in the system
 69 | func GetAllDeviceCount() (uint, error) {
 70 | 	return getAllDeviceCount()
 71 | }
 72 | 
 73 | // GetEntityGroupEntities returns all entities of the specified group type
 74 | func GetEntityGroupEntities(entityGroup Field_Entity_Group) ([]uint, error) {
 75 | 	return getEntityGroupEntities(entityGroup)
 76 | }
 77 | 
 78 | // GetSupportedDevices returns a list of DCGM-supported GPU IDs
 79 | func GetSupportedDevices() ([]uint, error) {
 80 | 	return getSupportedDevices()
 81 | }
 82 | 
 83 | // GetDeviceInfo returns detailed information about the specified GPU
 84 | func GetDeviceInfo(gpuID uint) (Device, error) {
 85 | 	return getDeviceInfo(gpuID)
 86 | }
 87 | 
 88 | // GetDeviceStatus returns current status information about the specified GPU
 89 | func GetDeviceStatus(gpuID uint) (DeviceStatus, error) {
 90 | 	return latestValuesForDevice(gpuID)
 91 | }
 92 | 
 93 | // GetDeviceTopology returns the topology (connectivity) information for the specified GPU
 94 | func GetDeviceTopology(gpuID uint) ([]P2PLink, error) {
 95 | 	return getDeviceTopology(gpuID)
 96 | }
 97 | 
 98 | // WatchPidFields configures DCGM to start recording stats for GPU processes
 99 | // Must be called before GetProcessInfo
100 | func WatchPidFields() (GroupHandle, error) {
101 | 	return watchPidFields(time.Microsecond*time.Duration(defaultUpdateFreq), time.Second*time.Duration(defaultMaxKeepAge), defaultMaxKeepSamples)
102 | }
103 | 
104 | // GetProcessInfo returns detailed per-GPU statistics for the specified process
105 | func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error) {
106 | 	return getProcessInfo(group, pid)
107 | }
108 | 
109 | // HealthCheckByGpuId performs a health check on the specified GPU
110 | func HealthCheckByGpuId(gpuID uint) (DeviceHealth, error) {
111 | 	return healthCheckByGpuId(gpuID)
112 | }
113 | 
114 | // ListenForPolicyViolations sets up monitoring for the specified policy conditions on all GPUs
115 | // Returns a channel that receives policy violations and any error encountered
116 | func ListenForPolicyViolations(ctx context.Context, typ ...PolicyCondition) (<-chan PolicyViolation, error) {
117 | 	groupID := GroupAllGPUs()
118 | 	return ListenForPolicyViolationsForGroup(ctx, groupID, typ...)
119 | }
120 | 
121 | // ListenForPolicyViolationsForGroup sets up policy monitoring for the specified GPU group
122 | // Returns a channel that receives policy violations and any error encountered
123 | func ListenForPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...PolicyCondition) (<-chan PolicyViolation, error) {
124 | 	return registerPolicy(ctx, group, typ...)
125 | }
126 | 
127 | // Introspect returns memory and CPU usage statistics for the DCGM hostengine
128 | func Introspect() (Status, error) {
129 | 	return introspect()
130 | }
131 | 
132 | // GetSupportedMetricGroups returns all supported metric groups for the specified GPU
133 | func GetSupportedMetricGroups(gpuID uint) ([]MetricGroup, error) {
134 | 	return getSupportedMetricGroups(gpuID)
135 | }
136 | 
137 | // GetNvLinkLinkStatus returns the status of all NVLink connections
138 | func GetNvLinkLinkStatus() ([]NvLinkStatus, error) {
139 | 	return getNvLinkLinkStatus()
140 | }
141 | 
142 | // GetNvLinkP2PStatus returns the status of NvLinks between GPU pairs
143 | func GetNvLinkP2PStatus() (NvLinkP2PStatus, error) {
144 | 	return getNvLinkP2PStatus()
145 | }
146 | 
147 | // SetPolicyForGroup configures policies with optional custom thresholds and actions for a GPU group
148 | func SetPolicyForGroup(group GroupHandle, configs ...PolicyConfig) error {
149 | 	return setPolicyForGroupWithConfig(group, configs...)
150 | }
151 | 
152 | // GetPolicyForGroup retrieves the current policy configuration for a GPU group
153 | func GetPolicyForGroup(group GroupHandle) (*PolicyStatus, error) {
154 | 	return getPolicyForGroup(group)
155 | }
156 | 
157 | // ClearPolicyForGroup clears all policy conditions for a GPU group
158 | func ClearPolicyForGroup(group GroupHandle) error {
159 | 	return clearPolicyForGroup(group)
160 | }
161 | 
162 | // WatchPolicyViolationsForGroup registers to receive violation notifications for a specific GPU group
163 | func WatchPolicyViolationsForGroup(ctx context.Context, group GroupHandle, typ ...PolicyCondition) (<-chan PolicyViolation, error) {
164 | 	return registerPolicyOnly(ctx, group, typ...)
165 | }
166 | 


--------------------------------------------------------------------------------
/tests/dcgm_test.go:
--------------------------------------------------------------------------------
  1 | package tests
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"strconv"
  6 | 	"strings"
  7 | 	"testing"
  8 | 
  9 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
 10 | )
 11 | 
 12 | func check(t *testing.T, err error) {
 13 | 	if err != nil {
 14 | 		t.Errorf("%v\n", err)
 15 | 	}
 16 | }
 17 | 
 18 | func TestDeviceCount(t *testing.T) {
 19 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 20 | 	check(t, err)
 21 | 	defer cleanup()
 22 | 
 23 | 	count, err := dcgm.GetAllDeviceCount()
 24 | 	check(t, err)
 25 | 
 26 | 	query := "count"
 27 | 	c := DeviceCount(query)
 28 | 
 29 | 	if c != count {
 30 | 		t.Errorf("Device Count from dcgm is wrong, got %d, want: %d", count, c)
 31 | 	}
 32 | }
 33 | 
 34 | func BenchmarkDeviceCount1(b *testing.B) {
 35 | 	_, _ = dcgm.Init(dcgm.Embedded)
 36 | 
 37 | 	b.StartTimer()
 38 | 
 39 | 	for n := 0; n < b.N; n++ {
 40 | 		_, _ = dcgm.GetAllDeviceCount()
 41 | 	}
 42 | 
 43 | 	b.StopTimer()
 44 | 
 45 | 	_ = dcgm.Shutdown()
 46 | }
 47 | 
 48 | func TestCpuQuery(t *testing.T) {
 49 | 	t.Setenv("DCGM_SKIP_SYSMON_HARDWARE_CHECK", "1")
 50 | 
 51 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 52 | 	check(t, err)
 53 | 
 54 | 	defer cleanup()
 55 | 
 56 | 	hierarchy, err := dcgm.GetCPUHierarchy()
 57 | 	check(t, err)
 58 | 
 59 | 	if hierarchy.NumCPUs == 0 {
 60 | 		t.Errorf("Found no CPUs")
 61 | 	}
 62 | 
 63 | 	for i := uint(0); i < hierarchy.NumCPUs; i++ {
 64 | 		coresFound := false
 65 | 
 66 | 		for j := uint(0); j < dcgm.MAX_CPU_CORE_BITMASK_COUNT; j++ {
 67 | 			if hierarchy.CPUs[i].OwnedCores[j] != 0 {
 68 | 				coresFound = true
 69 | 			}
 70 | 		}
 71 | 
 72 | 		if coresFound == false {
 73 | 			t.Errorf("Cpu %d has no cores", i)
 74 | 		}
 75 | 	}
 76 | }
 77 | 
 78 | func TestDeviceInfo(t *testing.T) {
 79 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 80 | 	check(t, err)
 81 | 	defer cleanup()
 82 | 
 83 | 	fields := []string{
 84 | 		"driver_version",
 85 | 		"name",
 86 | 		"serial",
 87 | 		"uuid",
 88 | 		"pci.bus_id",
 89 | 		"vbios_version",
 90 | 		"inforom.img",
 91 | 		"power.limit",
 92 | 	}
 93 | 
 94 | 	gpus, err := dcgm.GetSupportedDevices()
 95 | 	check(t, err)
 96 | 
 97 | 	for _, gpu := range gpus {
 98 | 		info, err := dcgm.GetDeviceInfo(gpu)
 99 | 		check(t, err)
100 | 
101 | 		id := strconv.FormatUint(uint64(gpu), 10)
102 | 
103 | 		for _, val := range fields {
104 | 			var msg, output string
105 | 
106 | 			res := Query(id, val)
107 | 			if res == "[N/A]" {
108 | 				continue
109 | 			}
110 | 
111 | 			switch val {
112 | 			case "driver_version":
113 | 				msg = "Driver version"
114 | 				output = info.Identifiers.DriverVersion
115 | 			case "name":
116 | 				msg = "Device name"
117 | 				output = info.Identifiers.Model
118 | 			case "serial":
119 | 				msg = "Device Serial number"
120 | 				output = info.Identifiers.Serial
121 | 			case "uuid":
122 | 				msg = "Device UUID"
123 | 				output = info.UUID
124 | 			case "pci.bus_id":
125 | 				msg = "Device PCI busId"
126 | 				output = info.PCI.BusID
127 | 			case "vbios_version":
128 | 				msg = "Device vbios version"
129 | 				output = info.Identifiers.Vbios
130 | 			case "inforom.img":
131 | 				msg = "Device inforom image"
132 | 				output = info.Identifiers.InforomImageVersion
133 | 			case "power.limit":
134 | 				msg = "Device power limit"
135 | 				output = strconv.FormatUint(uint64(info.Power), 10)
136 | 				power, err := strconv.ParseFloat(res, 64)
137 | 				check(t, err)
138 | 
139 | 				res = strconv.FormatUint(uint64(math.Round(power)), 10)
140 | 			}
141 | 
142 | 			if strings.Compare(res, output) != 0 {
143 | 				if strings.Contains(output, "NOT_SUPPORTED") {
144 | 					continue
145 | 				}
146 | 
147 | 				t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res)
148 | 			}
149 | 		}
150 | 	}
151 | }
152 | 
153 | func BenchmarkDeviceInfo1(b *testing.B) {
154 | 	_, _ = dcgm.Init(dcgm.Embedded)
155 | 
156 | 	b.StartTimer()
157 | 
158 | 	for n := 0; n < b.N; n++ {
159 | 		// assuming there will be atleast 1 GPU attached
160 | 		_, _ = dcgm.GetDeviceInfo(uint(0))
161 | 	}
162 | 
163 | 	b.StopTimer()
164 | 
165 | 	_ = dcgm.Shutdown()
166 | }
167 | 
168 | func TestDeviceStatus(t *testing.T) {
169 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
170 | 	check(t, err)
171 | 	defer cleanup()
172 | 
173 | 	gpus, err := dcgm.GetSupportedDevices()
174 | 	check(t, err)
175 | 
176 | 	fields := []string{
177 | 		"power.draw",
178 | 		"temperature.gpu",
179 | 		"utilization.gpu",
180 | 		"utilization.memory",
181 | 		"encoder.stats.averageFps",
182 | 		"clocks.current.sm",
183 | 		"clocks.current.memory",
184 | 	}
185 | 
186 | 	for _, gpu := range gpus {
187 | 		status, err := dcgm.GetDeviceStatus(gpu)
188 | 		check(t, err)
189 | 
190 | 		id := strconv.FormatUint(uint64(gpu), 10)
191 | 
192 | 		for _, val := range fields {
193 | 			var msg, output string
194 | 
195 | 			res := Query(id, val)
196 | 			if res == "[N/A]" {
197 | 				continue
198 | 			}
199 | 
200 | 			switch val {
201 | 			case "power.draw":
202 | 				msg = "Device power utilization"
203 | 				output = strconv.FormatFloat(math.Round(status.Power), 'f', -1, 64)
204 | 				power, err := strconv.ParseFloat(res, 64)
205 | 				check(t, err)
206 | 
207 | 				res = strconv.FormatFloat(math.Round(power), 'f', -1, 64)
208 | 			case "temperature.gpu":
209 | 				msg = "Device temperature"
210 | 				output = strconv.FormatInt(status.Temperature, 10)
211 | 			case "utilization.gpu":
212 | 				msg = "Device gpu utilization"
213 | 				output = strconv.FormatInt(status.Utilization.GPU, 10)
214 | 			case "utilization.memory":
215 | 				msg = "Device memory utilization"
216 | 				output = strconv.FormatInt(status.Utilization.Memory, 10)
217 | 			case "encoder.stats.averageFps":
218 | 				msg = "Device encoder utilization"
219 | 				output = strconv.FormatInt(status.Utilization.Encoder, 10)
220 | 			case "clocks.current.sm":
221 | 				msg = "Device sm clock"
222 | 				output = strconv.FormatInt(status.Clocks.Cores, 10)
223 | 			case "clocks.current.memory":
224 | 				msg = "Device mem clock"
225 | 				output = strconv.FormatInt(status.Clocks.Memory, 10)
226 | 			}
227 | 
228 | 			if strings.Compare(res, output) != 0 {
229 | 				t.Errorf("%v from dcgm is wrong, got: %v, want: %v", msg, output, res)
230 | 			}
231 | 		}
232 | 	}
233 | }
234 | 


--------------------------------------------------------------------------------
/pkg/dcgm/topology.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "dcgm_agent.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | 
  9 | import (
 10 | 	"fmt"
 11 | 	"unsafe"
 12 | )
 13 | 
 14 | // P2PLinkType represents the type of peer-to-peer connection between GPUs
 15 | type P2PLinkType uint
 16 | 
 17 | const (
 18 | 	// P2PLinkUnknown represents an unknown link type
 19 | 	P2PLinkUnknown P2PLinkType = iota
 20 | 	// P2PLinkCrossCPU represents a connection across different CPUs
 21 | 	P2PLinkCrossCPU
 22 | 	// P2PLinkSameCPU represents a connection within the same CPU
 23 | 	P2PLinkSameCPU
 24 | 	// P2PLinkHostBridge represents a connection through the host bridge
 25 | 	P2PLinkHostBridge
 26 | 	// P2PLinkMultiSwitch represents a connection through multiple PCIe switches
 27 | 	P2PLinkMultiSwitch
 28 | 	// P2PLinkSingleSwitch represents a connection through a single PCIe switch
 29 | 	P2PLinkSingleSwitch
 30 | 	// P2PLinkSameBoard represents a connection on the same board
 31 | 	P2PLinkSameBoard
 32 | 	// SingleNVLINKLink represents a single NVLINK connection
 33 | 	SingleNVLINKLink
 34 | 	// TwoNVLINKLinks represents two NVLINK connections
 35 | 	TwoNVLINKLinks
 36 | 	// ThreeNVLINKLinks represents three NVLINK connections
 37 | 	ThreeNVLINKLinks
 38 | 	// FourNVLINKLinks represents four NVLINK connections
 39 | 	FourNVLINKLinks
 40 | )
 41 | 
 42 | // PCIPaths returns a string representation of the P2P link type
 43 | func (l P2PLinkType) PCIPaths() string {
 44 | 	switch l {
 45 | 	case P2PLinkSameBoard:
 46 | 		return "PSB"
 47 | 	case P2PLinkSingleSwitch:
 48 | 		return "PIX"
 49 | 	case P2PLinkMultiSwitch:
 50 | 		return "PXB"
 51 | 	case P2PLinkHostBridge:
 52 | 		return "PHB"
 53 | 	case P2PLinkSameCPU:
 54 | 		return "NODE"
 55 | 	case P2PLinkCrossCPU:
 56 | 		return "SYS"
 57 | 	case SingleNVLINKLink:
 58 | 		return "NV1"
 59 | 	case TwoNVLINKLinks:
 60 | 		return "NV2"
 61 | 	case ThreeNVLINKLinks:
 62 | 		return "NV3"
 63 | 	case FourNVLINKLinks:
 64 | 		return "NV4"
 65 | 	case P2PLinkUnknown:
 66 | 	}
 67 | 	return "N/A"
 68 | }
 69 | 
// P2PLink contains information about a peer-to-peer connection.
// One value is produced per entry of the topology returned for a queried GPU.
type P2PLink struct {
	// GPU is the ID of the GPU at the other end of the link, as listed in the
	// topology of the queried GPU.
	GPU uint
	// BusID is the PCIe bus ID of the GPU.
	// NOTE(review): getDeviceTopology fills this with the bus ID of the GPU
	// that was queried, not of the peer in GPU — confirm this is intended.
	BusID string
	// Link is the type of P2P connection
	Link P2PLinkType
}
 79 | 
 80 | func getP2PLink(path uint) P2PLinkType {
 81 | 	switch path {
 82 | 	case C.DCGM_TOPOLOGY_BOARD:
 83 | 		return P2PLinkSameBoard
 84 | 	case C.DCGM_TOPOLOGY_SINGLE:
 85 | 		return P2PLinkSingleSwitch
 86 | 	case C.DCGM_TOPOLOGY_MULTIPLE:
 87 | 		return P2PLinkMultiSwitch
 88 | 	case C.DCGM_TOPOLOGY_HOSTBRIDGE:
 89 | 		return P2PLinkHostBridge
 90 | 	case C.DCGM_TOPOLOGY_CPU:
 91 | 		return P2PLinkSameCPU
 92 | 	case C.DCGM_TOPOLOGY_SYSTEM:
 93 | 		return P2PLinkCrossCPU
 94 | 	case C.DCGM_TOPOLOGY_NVLINK1:
 95 | 		return SingleNVLINKLink
 96 | 	case C.DCGM_TOPOLOGY_NVLINK2:
 97 | 		return TwoNVLINKLinks
 98 | 	case C.DCGM_TOPOLOGY_NVLINK3:
 99 | 		return ThreeNVLINKLinks
100 | 	case C.DCGM_TOPOLOGY_NVLINK4:
101 | 		return FourNVLINKLinks
102 | 	}
103 | 	return P2PLinkUnknown
104 | }
105 | 
106 | func getBusID(gpuID uint) (string, error) {
107 | 	var device C.dcgmDeviceAttributes_v3
108 | 	device.version = makeVersion3(unsafe.Sizeof(device))
109 | 
110 | 	result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuID), &device)
111 | 	if err := errorString(result); err != nil {
112 | 		return "", fmt.Errorf("error getting device busid: %s", err)
113 | 	}
114 | 	return *stringPtr(&device.identifiers.pciBusId[0]), nil
115 | }
116 | 
117 | func getDeviceTopology(gpuID uint) (links []P2PLink, err error) {
118 | 	var topology C.dcgmDeviceTopology_v1
119 | 	topology.version = makeVersion1(unsafe.Sizeof(topology))
120 | 
121 | 	result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuID), &topology)
122 | 	if result == C.DCGM_ST_NOT_SUPPORTED {
123 | 		return links, nil
124 | 	}
125 | 	if result != C.DCGM_ST_OK {
126 | 		return links, &Error{msg: C.GoString(C.errorString(result)), Code: result}
127 | 	}
128 | 
129 | 	busid, err := getBusID(gpuID)
130 | 	if err != nil {
131 | 		return
132 | 	}
133 | 	links = make([]P2PLink, topology.numGpus)
134 | 	for i := uint(0); i < uint(topology.numGpus); i++ {
135 | 		links[i].GPU = uint(topology.gpuPaths[i].gpuId)
136 | 		links[i].BusID = busid
137 | 		links[i].Link = getP2PLink(uint(topology.gpuPaths[i].path))
138 | 	}
139 | 	return
140 | }
141 | 
// Link_State represents the state of an NVLINK connection.
// getNvLinkLinkStatus converts the link state values reported by DCGM directly
// into this type, so the constant values below follow their declaration order
// (0 through 3).
type Link_State uint

const (
	// LS_NOT_SUPPORTED indicates the link is unsupported (Default for GPUs)
	LS_NOT_SUPPORTED Link_State = iota
	// LS_DISABLED indicates the link is supported but disabled (Default for NvSwitches)
	LS_DISABLED
	// LS_DOWN indicates the link is down (inactive)
	LS_DOWN
	// LS_UP indicates the link is up (active)
	LS_UP
)
155 | 
// NvLinkStatus contains information about an NVLINK connection status.
// One value describes a single link slot of a single parent entity.
type NvLinkStatus struct {
	// ParentId is the ID of the parent entity (GPU or NVSwitch)
	ParentId uint
	// ParentType is the type of the parent entity
	// (FE_GPU or FE_SWITCH in the values built by getNvLinkLinkStatus)
	ParentType Field_Entity_Group
	// State is the current state of the NVLINK
	State Link_State
	// Index is the link index number within the parent entity
	Index uint
}
167 | 
168 | func getNvLinkLinkStatus() ([]NvLinkStatus, error) {
169 | 	var linkStatus C.dcgmNvLinkStatus_v4
170 | 	linkStatus.version = makeVersion4(unsafe.Sizeof(linkStatus))
171 | 
172 | 	result := C.dcgmGetNvLinkLinkStatus(handle.handle, &linkStatus)
173 | 	if result == C.DCGM_ST_NOT_SUPPORTED {
174 | 		return nil, nil
175 | 	}
176 | 
177 | 	if result != C.DCGM_ST_OK {
178 | 		return nil, &Error{msg: C.GoString(C.errorString(result)), Code: result}
179 | 	}
180 | 
181 | 	links := make([]NvLinkStatus, linkStatus.numGpus*C.DCGM_NVLINK_MAX_LINKS_PER_GPU+linkStatus.numNvSwitches*C.DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH)
182 | 
183 | 	idx := 0
184 | 	for i := uint(0); i < uint(linkStatus.numGpus); i++ {
185 | 		for j := 0; j < int(C.DCGM_NVLINK_MAX_LINKS_PER_GPU); j++ {
186 | 			link := NvLinkStatus{
187 | 				uint(linkStatus.gpus[i].entityId),
188 | 				FE_GPU,
189 | 				Link_State(linkStatus.gpus[i].linkState[j]),
190 | 				uint(j),
191 | 			}
192 | 
193 | 			links[idx] = link
194 | 			idx++
195 | 		}
196 | 	}
197 | 
198 | 	for i := uint(0); i < uint(linkStatus.numNvSwitches); i++ {
199 | 		for j := 0; j < C.DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH; j++ {
200 | 			link := NvLinkStatus{
201 | 				uint(linkStatus.nvSwitches[i].entityId),
202 | 				FE_SWITCH,
203 | 				Link_State(linkStatus.nvSwitches[i].linkState[j]),
204 | 				uint(j),
205 | 			}
206 | 
207 | 			links[idx] = link
208 | 			idx++
209 | 		}
210 | 	}
211 | 
212 | 	return links, nil
213 | }
214 | 


--------------------------------------------------------------------------------
/pkg/dcgm/test_utils.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package dcgm
 18 | 
 19 | import (
 20 | 	"os"
 21 | 	"path/filepath"
 22 | 	"testing"
 23 | 
 24 | 	"github.com/stretchr/testify/assert"
 25 | 	"github.com/stretchr/testify/require"
 26 | )
 27 | 
const (
	// DCGM_NVSDM_MOCK_YAML is the name of the environment variable for enabling NVSDM mock configuration.
	DCGM_NVSDM_MOCK_YAML = "DCGM_NVSDM_MOCK_YAML"
	// DCGM_DBG_FILE is the name of the environment variable that makes DCGM write debug logs to a specific file.
	DCGM_DBG_FILE = "__DCGM_DBG_FILE"
	// DCGM_DBG_LVL is the name of the environment variable that sets the DCGM logging level.
	DCGM_DBG_LVL = "__DCGM_DBG_LVL"
)
 36 | 
 37 | func setupTest(tb testing.TB) func(testing.TB) {
 38 | 	// Store original debug settings
 39 | 	originalDebugLevel, hasDebugLevel := os.LookupEnv(DCGM_DBG_LVL)
 40 | 	originalDebugFile, hasDebugFile := os.LookupEnv(DCGM_DBG_FILE)
 41 | 
 42 | 	// Enable debug output to stdout
 43 | 	err := os.Setenv(DCGM_DBG_LVL, "6")
 44 | 	require.NoError(tb, err)
 45 | 	err = os.Setenv(DCGM_DBG_FILE, "/dev/stdout")
 46 | 	require.NoError(tb, err)
 47 | 
 48 | 	// Initialize DCGM
 49 | 	cleanup, err := Init(Embedded)
 50 | 	assert.NoError(tb, err)
 51 | 
 52 | 	return func(tb testing.TB) {
 53 | 		defer cleanup()
 54 | 
 55 | 		// Restore original debug settings
 56 | 		if hasDebugLevel {
 57 | 			_ = os.Setenv(DCGM_DBG_LVL, originalDebugLevel)
 58 | 		} else {
 59 | 			_ = os.Unsetenv(DCGM_DBG_LVL)
 60 | 		}
 61 | 
 62 | 		if hasDebugFile {
 63 | 			_ = os.Setenv(DCGM_DBG_FILE, originalDebugFile)
 64 | 		} else {
 65 | 			_ = os.Unsetenv(DCGM_DBG_FILE)
 66 | 		}
 67 | 	}
 68 | }
 69 | 
 70 | func runOnlyWithLiveGPUs(t *testing.T) {
 71 | 	t.Helper()
 72 | 
 73 | 	gpus, err := getSupportedDevices()
 74 | 	require.NoError(t, err)
 75 | 
 76 | 	if len(gpus) < 1 {
 77 | 		t.Skip("Skipping test that requires live GPUs. None were found")
 78 | 	}
 79 | }
 80 | 
 81 | func withInjectionGPUs(tb testing.TB, count int) ([]uint, error) {
 82 | 	tb.Helper()
 83 | 	numGPUs, err := GetAllDeviceCount()
 84 | 	require.NoError(tb, err)
 85 | 
 86 | 	if numGPUs+1 > MAX_NUM_DEVICES {
 87 | 		tb.Skipf("Unable to add fake GPU with more than %d gpus", MAX_NUM_DEVICES)
 88 | 	}
 89 | 
 90 | 	entityList := make([]MigHierarchyInfo, count)
 91 | 	for i := range entityList {
 92 | 		entityList[i] = MigHierarchyInfo{
 93 | 			Entity: GroupEntityPair{EntityGroupId: FE_GPU},
 94 | 		}
 95 | 	}
 96 | 
 97 | 	return CreateFakeEntities(entityList)
 98 | }
 99 | 
100 | // withInjectionGPUInstances creates fake GPU instances on the specified GPU.
101 | // It returns a map of fake GPU instance IDs to their parent GPU ID.
102 | func withInjectionGPUInstances(tb testing.TB, gpuId uint, instanceCount int) (map[uint]uint, error) {
103 | 	tb.Helper()
104 | 
105 | 	if instanceCount <= 0 {
106 | 		return nil, nil
107 | 	}
108 | 
109 | 	entities := make([]MigHierarchyInfo, 0, instanceCount)
110 | 	for i := 0; i < instanceCount; i++ {
111 | 		entities = append(entities, MigHierarchyInfo{
112 | 			Parent: GroupEntityPair{
113 | 				EntityGroupId: FE_GPU,
114 | 				EntityId:      gpuId,
115 | 			},
116 | 			Entity: GroupEntityPair{
117 | 				EntityGroupId: FE_GPU_I,
118 | 			},
119 | 		})
120 | 	}
121 | 
122 | 	createdIDs, err := CreateFakeEntities(entities)
123 | 	if err != nil {
124 | 		return nil, err
125 | 	}
126 | 
127 | 	result := make(map[uint]uint, len(createdIDs))
128 | 	for _, id := range createdIDs {
129 | 		result[id] = gpuId
130 | 	}
131 | 
132 | 	return result, nil
133 | }
134 | 
135 | // withInjectionComputeInstances creates fake compute instances on the specified GPU instances.
136 | // It returns a mapping of compute instance IDs to their parent GPU instance IDs.
137 | // If count is 0 or parentIDs is empty, it returns an empty map.
138 | func withInjectionComputeInstances(tb testing.TB, parentIDs []uint, count int) (map[uint]uint, error) {
139 | 	tb.Helper()
140 | 
141 | 	if count <= 0 {
142 | 		return nil, nil
143 | 	}
144 | 
145 | 	if len(parentIDs) == 0 {
146 | 		return nil, nil
147 | 	}
148 | 
149 | 	entities := make([]MigHierarchyInfo, 0, count)
150 | 	instanceIndex := 0
151 | 	for i := 0; i < count; i++ {
152 | 		if instanceIndex >= len(parentIDs) {
153 | 			instanceIndex = 0
154 | 		}
155 | 		entities = append(entities, MigHierarchyInfo{
156 | 			Parent: GroupEntityPair{
157 | 				EntityGroupId: FE_GPU_I,
158 | 				EntityId:      parentIDs[instanceIndex],
159 | 			},
160 | 			Entity: GroupEntityPair{
161 | 				EntityGroupId: FE_GPU_CI,
162 | 			},
163 | 		})
164 | 		instanceIndex++
165 | 	}
166 | 
167 | 	createdIDs, err := CreateFakeEntities(entities)
168 | 	if err != nil {
169 | 		return nil, err
170 | 	}
171 | 
172 | 	result := make(map[uint]uint, len(createdIDs))
173 | 	instanceIndex = 0
174 | 	for _, id := range createdIDs {
175 | 		if instanceIndex >= len(parentIDs) {
176 | 			instanceIndex = 0
177 | 		}
178 | 		result[id] = parentIDs[instanceIndex]
179 | 		instanceIndex++
180 | 	}
181 | 
182 | 	return result, nil
183 | }
184 | 
185 | // withNvsdmMockConfig runs a test with a specified NVSDM mock configuration
186 | // It handles setting up and tearing down the environment variable for the mock config
187 | func withNvsdmMockConfig(t *testing.T, configYamlPath string, testFunc func(t *testing.T)) {
188 | 	t.Helper()
189 | 
190 | 	// Get absolute path for the config file
191 | 
192 | 	absPath, err := filepath.Abs(configYamlPath)
193 | 	require.NoError(t, err, "Failed to get absolute path for config file")
194 | 
195 | 	// Check if config file exists
196 | 	if _, err = os.Stat(absPath); os.IsNotExist(err) {
197 | 		t.Skipf("Skip test due to missing config YAML file [%s]", absPath)
198 | 		return
199 | 	}
200 | 
201 | 	// Store original env var value if it exists
202 | 	originalValue, hasOriginal := os.LookupEnv(DCGM_NVSDM_MOCK_YAML)
203 | 
204 | 	// Set the environment variable
205 | 	err = os.Setenv(DCGM_NVSDM_MOCK_YAML, absPath)
206 | 	require.NoError(t, err, "Failed to set mock config environment variable")
207 | 
208 | 	// Cleanup function to restore original state
209 | 	defer func() {
210 | 		if hasOriginal {
211 | 			_ = os.Setenv(DCGM_NVSDM_MOCK_YAML, originalValue)
212 | 		} else {
213 | 			_ = os.Unsetenv(DCGM_NVSDM_MOCK_YAML)
214 | 		}
215 | 	}()
216 | 
217 | 	// Run the test
218 | 	testFunc(t)
219 | }
220 | 


--------------------------------------------------------------------------------
/tests/policy_test.go:
--------------------------------------------------------------------------------
  1 | package tests
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"testing"
  6 | 	"time"
  7 | 
  8 | 	"github.com/NVIDIA/go-dcgm/pkg/dcgm"
  9 | )
 10 | 
 11 | // TestPolicyViolations demonstrates listening for policy violations
 12 | // This is equivalent to the policy sample but runs for a limited time
 13 | func TestPolicyViolations(t *testing.T) {
 14 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 15 | 	if err != nil {
 16 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 17 | 	}
 18 | 	defer cleanup()
 19 | 
 20 | 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 21 | 	defer cancel()
 22 | 
 23 | 	// Listen for policy violations (DBE and XID errors)
 24 | 	c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy, dcgm.XidPolicy)
 25 | 	if err != nil {
 26 | 		t.Fatalf("Failed to start listening for policy violations: %v", err)
 27 | 	}
 28 | 
 29 | 	t.Log("Listening for policy violations (DBE and XID errors) for 10 seconds...")
 30 | 
 31 | 	violationCount := 0
 32 | 	timeout := time.After(10 * time.Second)
 33 | 
 34 | 	for {
 35 | 		select {
 36 | 		case pe := <-c:
 37 | 			violationCount++
 38 | 			t.Logf("Policy Violation %d:", violationCount)
 39 | 			t.Logf("  Condition: %v", pe.Condition)
 40 | 			t.Logf("  Timestamp: %v", pe.Timestamp)
 41 | 			t.Logf("  Data: %v", pe.Data)
 42 | 
 43 | 		case <-ctx.Done():
 44 | 			t.Logf("Policy violation monitoring completed")
 45 | 			t.Logf("Total violations detected: %d", violationCount)
 46 | 			return
 47 | 
 48 | 		case <-timeout:
 49 | 			t.Logf("Policy violation monitoring timed out")
 50 | 			t.Logf("Total violations detected: %d", violationCount)
 51 | 			return
 52 | 		}
 53 | 	}
 54 | }
 55 | 
 56 | // TestPolicyViolationsSingleType demonstrates listening for a specific type of policy violation
 57 | func TestPolicyViolationsSingleType(t *testing.T) {
 58 | 	if testing.Short() {
 59 | 		t.Skip("Skipping single type policy test in short mode")
 60 | 	}
 61 | 
 62 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
 63 | 	if err != nil {
 64 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
 65 | 	}
 66 | 	defer cleanup()
 67 | 
 68 | 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 69 | 	defer cancel()
 70 | 
 71 | 	// Listen for only XID policy violations
 72 | 	c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.XidPolicy)
 73 | 	if err != nil {
 74 | 		t.Fatalf("Failed to start listening for XID policy violations: %v", err)
 75 | 	}
 76 | 
 77 | 	t.Log("Listening for XID policy violations for 5 seconds...")
 78 | 
 79 | 	xidCount := 0
 80 | 	timeout := time.After(5 * time.Second)
 81 | 
 82 | 	for {
 83 | 		select {
 84 | 		case pe := <-c:
 85 | 			xidCount++
 86 | 			t.Logf("XID Policy Violation %d:", xidCount)
 87 | 			t.Logf("  Condition: %v", pe.Condition)
 88 | 			t.Logf("  Timestamp: %v", pe.Timestamp)
 89 | 			t.Logf("  Data: %v", pe.Data)
 90 | 
 91 | 		case <-ctx.Done():
 92 | 			t.Logf("XID policy violation monitoring completed")
 93 | 			t.Logf("Total XID violations detected: %d", xidCount)
 94 | 			return
 95 | 
 96 | 		case <-timeout:
 97 | 			t.Logf("XID policy violation monitoring timed out")
 98 | 			t.Logf("Total XID violations detected: %d", xidCount)
 99 | 			return
100 | 		}
101 | 	}
102 | }
103 | 
104 | // TestPolicyViolationsMultipleTypes demonstrates listening for multiple types of policy violations
105 | func TestPolicyViolationsMultipleTypes(t *testing.T) {
106 | 	if testing.Short() {
107 | 		t.Skip("Skipping multiple types policy test in short mode")
108 | 	}
109 | 
110 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
111 | 	if err != nil {
112 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
113 | 	}
114 | 	defer cleanup()
115 | 
116 | 	ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second)
117 | 	defer cancel()
118 | 
119 | 	// Listen for multiple types of policy violations
120 | 	// Note: Some policies may require root privileges
121 | 
122 | 	c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy, dcgm.XidPolicy, dcgm.ThermalPolicy, dcgm.PowerPolicy)
123 | 	if err != nil {
124 | 		t.Logf("Failed to start listening for all policy violations (may need root): %v", err)
125 | 		// Try with just basic policies
126 | 		c, err = dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy, dcgm.XidPolicy)
127 | 		if err != nil {
128 | 			t.Fatalf("Failed to start listening for basic policy violations: %v", err)
129 | 		}
130 | 		t.Log("Listening for basic policy violations (DBE and XID) for 8 seconds...")
131 | 	} else {
132 | 		t.Log("Listening for multiple policy violations for 8 seconds...")
133 | 	}
134 | 
135 | 	violationsByType := make(map[string]int)
136 | 	timeout := time.After(8 * time.Second)
137 | 
138 | 	for {
139 | 		select {
140 | 		case pe := <-c:
141 | 			conditionStr := string(pe.Condition)
142 | 			violationsByType[conditionStr]++
143 | 
144 | 			t.Logf("Policy Violation:")
145 | 			t.Logf("  Type: %s", conditionStr)
146 | 			t.Logf("  Timestamp: %v", pe.Timestamp)
147 | 			t.Logf("  Data: %v", pe.Data)
148 | 
149 | 		case <-ctx.Done():
150 | 			t.Log("Multi-type policy violation monitoring completed")
151 | 			for policyType, count := range violationsByType {
152 | 				t.Logf("  %s violations: %d", policyType, count)
153 | 			}
154 | 			return
155 | 
156 | 		case <-timeout:
157 | 			t.Log("Multi-type policy violation monitoring timed out")
158 | 			for policyType, count := range violationsByType {
159 | 				t.Logf("  %s violations: %d", policyType, count)
160 | 			}
161 | 			return
162 | 		}
163 | 	}
164 | }
165 | 
166 | // TestPolicyViolationsContextCancellation demonstrates proper context cancellation
167 | func TestPolicyViolationsContextCancellation(t *testing.T) {
168 | 	cleanup, err := dcgm.Init(dcgm.Embedded)
169 | 	if err != nil {
170 | 		t.Fatalf("Failed to initialize DCGM: %v", err)
171 | 	}
172 | 	defer cleanup()
173 | 
174 | 	ctx, cancel := context.WithCancel(context.Background())
175 | 
176 | 	c, err := dcgm.ListenForPolicyViolations(ctx, dcgm.DbePolicy)
177 | 	if err != nil {
178 | 		t.Fatalf("Failed to start listening for policy violations: %v", err)
179 | 	}
180 | 
181 | 	t.Log("Starting policy violation monitoring, will cancel after 2 seconds...")
182 | 
183 | 	// Cancel after 2 seconds
184 | 	go func() {
185 | 		time.Sleep(2 * time.Second)
186 | 		t.Log("Cancelling policy violation monitoring...")
187 | 		cancel()
188 | 	}()
189 | 
190 | 	violationCount := 0
191 | 	startTime := time.Now()
192 | 
193 | 	for {
194 | 		select {
195 | 		case pe := <-c:
196 | 			violationCount++
197 | 			t.Logf("Policy violation %d: %v", violationCount, pe.Condition)
198 | 
199 | 		case <-ctx.Done():
200 | 			elapsed := time.Since(startTime)
201 | 			t.Logf("Policy violation monitoring stopped after %v", elapsed)
202 | 			t.Logf("Total violations detected: %d", violationCount)
203 | 
204 | 			// Should have stopped within reasonable time after cancellation
205 | 			if elapsed > 3*time.Second {
206 | 				t.Errorf("Context cancellation took too long: %v", elapsed)
207 | 			}
208 | 			return
209 | 		}
210 | 	}
211 | }
212 | 


--------------------------------------------------------------------------------
/pkg/dcgm/admin.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package dcgm
 18 | 
 19 | /*
 20 | #cgo linux LDFLAGS: -ldl -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files
 21 | #cgo darwin LDFLAGS: -ldl -Wl,-undefined,dynamic_lookup
 22 | 
 23 | #include <dlfcn.h>
 24 | #include "dcgm_agent.h"
 25 | #include "dcgm_structs.h"
 26 | 
 27 | */
 28 | import "C"
 29 | 
 30 | import (
 31 | 	"errors"
 32 | 	"fmt"
 33 | 	"log"
 34 | 	"os"
 35 | 	"os/exec"
 36 | 	"strconv"
 37 | 	"syscall"
 38 | 	"unsafe"
 39 | )
 40 | 
// mode selects how initDCGM brings up DCGM and, through stopMode, how
// shutdown tears it down again.
type mode int

// const for DCGM hostengine running modes: Embedded, Standalone or StartHostengine
const (
	// Embedded: initDCGM calls startEmbedded, which starts the hostengine
	// inside this process.
	Embedded mode = iota
	// Standalone: initDCGM calls connectStandalone, which connects to a
	// hostengine at the address passed in args.
	Standalone
	// StartHostengine: initDCGM calls startHostengine, which launches
	// nv-hostengine as a child process and connects to it.
	StartHostengine
)
 49 | 
// dcgmHandle wraps the C dcgmHandle_t produced by dcgmStartEmbedded or
// dcgmConnect_v2.
type dcgmHandle struct{ handle C.dcgmHandle_t }

// Package-level state shared by the init and shutdown paths in this file:
//   - dcgmLibHandle: dlopen handle for the DCGM library; set in initDCGM,
//     passed to dlclose in shutdown.
//   - stopMode: the mode given to initDCGM; shutdown dispatches on it.
//   - handle: the active DCGM handle; set by the start/connect functions.
//   - hostengineAsChildPid: pid returned by ForkExec in startHostengine;
//     signalled in stopHostengine.
var (
	dcgmLibHandle        unsafe.Pointer
	stopMode             mode
	handle               dcgmHandle
	hostengineAsChildPid int
)
 58 | 
 59 | func initDCGM(m mode, args ...string) (err error) {
 60 | 	const (
 61 | 		dcgmLib = "libdcgm.so.4"
 62 | 	)
 63 | 	lib := C.CString(dcgmLib)
 64 | 	defer freeCString(lib)
 65 | 
 66 | 	dcgmLibHandle = C.dlopen(lib, C.RTLD_LAZY|C.RTLD_GLOBAL)
 67 | 	if dcgmLibHandle == nil {
 68 | 		return fmt.Errorf("%s not found", dcgmLib)
 69 | 	}
 70 | 
 71 | 	// set the stopMode for shutdown()
 72 | 	stopMode = m
 73 | 
 74 | 	switch m {
 75 | 	case Embedded:
 76 | 		return startEmbedded()
 77 | 	case Standalone:
 78 | 		return connectStandalone(args...)
 79 | 	case StartHostengine:
 80 | 		return startHostengine()
 81 | 	default:
 82 | 		panic(ErrInvalidMode)
 83 | 	}
 84 | }
 85 | 
 86 | func shutdown() (err error) {
 87 | 	switch stopMode {
 88 | 	case Embedded:
 89 | 		err = stopEmbedded()
 90 | 	case Standalone:
 91 | 		err = disconnectStandalone()
 92 | 	case StartHostengine:
 93 | 		err = stopHostengine()
 94 | 	}
 95 | 
 96 | 	C.dlclose(dcgmLibHandle)
 97 | 	return
 98 | }
 99 | 
100 | func startEmbedded() (err error) {
101 | 	result := C.dcgmInit()
102 | 	if err = errorString(result); err != nil {
103 | 		return fmt.Errorf("error initializing DCGM: %s", err)
104 | 	}
105 | 
106 | 	var cHandle C.dcgmHandle_t
107 | 	result = C.dcgmStartEmbedded(C.DCGM_OPERATION_MODE_AUTO, &cHandle)
108 | 	if err = errorString(result); err != nil {
109 | 		return fmt.Errorf("error starting nv-hostengine: %s", err)
110 | 	}
111 | 	handle = dcgmHandle{cHandle}
112 | 	return
113 | }
114 | 
115 | func stopEmbedded() (err error) {
116 | 	result := C.dcgmStopEmbedded(handle.handle)
117 | 	if err = errorString(result); err != nil {
118 | 		return fmt.Errorf("error stopping nv-hostengine: %s", err)
119 | 	}
120 | 
121 | 	result = C.dcgmShutdown()
122 | 	if err = errorString(result); err != nil {
123 | 		return fmt.Errorf("error shutting down DCGM: %s", err)
124 | 	}
125 | 	return
126 | }
127 | 
128 | func connectStandalone(args ...string) (err error) {
129 | 	var (
130 | 		cHandle       C.dcgmHandle_t
131 | 		connectParams C.dcgmConnectV2Params_v2
132 | 	)
133 | 
134 | 	if len(args) < 2 {
135 | 		return errors.New("missing dcgm address and / or port")
136 | 	}
137 | 
138 | 	result := C.dcgmInit()
139 | 	if err = errorString(result); err != nil {
140 | 		return fmt.Errorf("error initializing DCGM: %s", err)
141 | 	}
142 | 
143 | 	addr := C.CString(args[0])
144 | 	defer freeCString(addr)
145 | 	connectParams.version = makeVersion2(unsafe.Sizeof(connectParams))
146 | 
147 | 	sck, err := strconv.ParseUint(args[1], 10, 32)
148 | 	if err != nil {
149 | 		return fmt.Errorf("error parsing %s: %v", args[1], err)
150 | 	}
151 | 	connectParams.addressIsUnixSocket = C.uint(sck)
152 | 
153 | 	result = C.dcgmConnect_v2(addr, &connectParams, &cHandle)
154 | 	if err = errorString(result); err != nil {
155 | 		return fmt.Errorf("error connecting to nv-hostengine: %s", err)
156 | 	}
157 | 
158 | 	handle = dcgmHandle{cHandle}
159 | 
160 | 	return
161 | }
162 | 
163 | func disconnectStandalone() (err error) {
164 | 	result := C.dcgmDisconnect(handle.handle)
165 | 	if err = errorString(result); err != nil {
166 | 		return fmt.Errorf("error disconnecting from nv-hostengine: %s", err)
167 | 	}
168 | 
169 | 	result = C.dcgmShutdown()
170 | 	if err = errorString(result); err != nil {
171 | 		return fmt.Errorf("error shutting down DCGM: %s", err)
172 | 	}
173 | 	return
174 | }
175 | 
176 | func startHostengine() (err error) {
177 | 	var (
178 | 		procAttr      syscall.ProcAttr
179 | 		cHandle       C.dcgmHandle_t
180 | 		connectParams C.dcgmConnectV2Params_v2
181 | 	)
182 | 
183 | 	bin, err := exec.LookPath("nv-hostengine")
184 | 	if err != nil {
185 | 		return fmt.Errorf("error finding nv-hostengine: %s", err)
186 | 	}
187 | 	procAttr.Files = []uintptr{
188 | 		uintptr(syscall.Stdin),
189 | 		uintptr(syscall.Stdout),
190 | 		uintptr(syscall.Stderr),
191 | 	}
192 | 	procAttr.Sys = &syscall.SysProcAttr{Setpgid: true}
193 | 
194 | 	dir := "/tmp"
195 | 	tmpfile, err := os.CreateTemp(dir, "dcgm")
196 | 	if err != nil {
197 | 		return fmt.Errorf("error creating temporary file in %s directory: %s", dir, err)
198 | 	}
199 | 	socketPath := tmpfile.Name()
200 | 	defer os.Remove(socketPath)
201 | 
202 | 	connectArg := "--domain-socket"
203 | 	hostengineAsChildPid, err = syscall.ForkExec(bin, []string{bin, connectArg, socketPath}, &procAttr)
204 | 	if err != nil {
205 | 		return fmt.Errorf("error fork-execing nv-hostengine: %s", err)
206 | 	}
207 | 
208 | 	result := C.dcgmInit()
209 | 	if err = errorString(result); err != nil {
210 | 		return fmt.Errorf("error initializing DCGM: %s", err)
211 | 	}
212 | 
213 | 	connectParams.version = makeVersion2(unsafe.Sizeof(connectParams))
214 | 	isSocket := C.uint(1)
215 | 	connectParams.addressIsUnixSocket = isSocket
216 | 	cSockPath := C.CString(socketPath)
217 | 	defer freeCString(cSockPath)
218 | 	result = C.dcgmConnect_v2(cSockPath, &connectParams, &cHandle)
219 | 	if err = errorString(result); err != nil {
220 | 		return fmt.Errorf("error connecting to nv-hostengine: %s", err)
221 | 	}
222 | 
223 | 	handle = dcgmHandle{cHandle}
224 | 	return
225 | }
226 | 
227 | func stopHostengine() (err error) {
228 | 	if err = disconnectStandalone(); err != nil {
229 | 		return
230 | 	}
231 | 
232 | 	// terminate nv-hostengine
233 | 	cmd := exec.Command("nv-hostengine", "--term")
234 | 	if err = cmd.Run(); err != nil {
235 | 		return fmt.Errorf("error terminating nv-hostengine: %s", err)
236 | 	}
237 | 
238 | 	log.Println("Successfully terminated nv-hostengine.")
239 | 
240 | 	return syscall.Kill(hostengineAsChildPid, syscall.SIGKILL)
241 | }
242 | 


--------------------------------------------------------------------------------
/samples/README.md:
--------------------------------------------------------------------------------
  1 | # DCGM Samples
  2 | 
  3 | Modeled on [dcgmi (Data Center GPU Manager Interface)](https://developer.nvidia.com/data-center-gpu-manager-dcgm) and [nvidia-smi (NVIDIA System Management Interface)](https://developer.nvidia.com/nvidia-system-management-interface), seven samples and a [REST API](https://github.com/NVIDIA/go-dcgm/samples/dcgm/restApi/README.md) have been provided to show how to use DCGM go bindings.
  4 | 
  5 | ## DCGM running modes
  6 | 
  7 | DCGM can be run in three different ways.
  8 | 
  9 | ### Embedded Mode
 10 | 
 11 | In embedded mode, hostengine is started as part of the running process and is loaded as a shared library. In this mode, metrics are also updated and collected automatically. This mode is recommended for users who want to avoid managing an autonomous hostengine.
 12 | 
 13 | ### Standalone Mode
 14 | 
 15 | This mode lets you connect to an already running hostengine at a specified TCP/IP or Unix socket address. This mode is recommended for remote connections to the hostengine. By default, DCGM will assume a TCP connection and attempt to connect to localhost, unless specified otherwise.
 16 | 
 17 | ```bash
 18 | # If hostengine is running at a different address, pass it as
 19 | 
 20 | IP - Valid IP address for the remote hostengine to connect to, at port 5555.
 21 | 
 22 | IP:PORT - Valid IP address and port
 23 | 
 24 | 0 - Given address is a TCP/IP address
 25 | 
 26 | 1 - Given address is a Unix socket filename
 27 | 
 28 | $ ./sample -connect "IP" -socket "0"
 29 | 
 30 | ```
 31 | 
 32 | ### StartHostengine
 33 | 
 34 | This is an add-on mode which opens a Unix socket for starting and connecting with hostengine. The hostengine is started as a child process of the running process and automatically terminated on exit. When operating in this mode, make sure to stop an already running hostengine to avoid any connection address conflicts. This mode is recommended for safely integrating DCGM in an already existing setup.
 35 | 
 36 | ## Samples
 37 | 
 38 | ### deviceInfo
 39 | 
 40 | Provides detailed information about each GPU on the system, along with whether the given GPU is DCGM supported or not.
 41 | 
 42 | ```bash
 43 | $ go build && ./deviceInfo
 44 | 
 45 | # sample output
 46 | 
 47 | Driver Version         : 384.130
 48 | GPU                    : 0
 49 | DCGMSupported          : Yes
 50 | UUID                   : GPU-34e8d7ba-0e4d-ac00-6852-695d5d404f51
 51 | Brand                  : GeForce
 52 | Model                  : GeForce GTX 980
 53 | Serial Number          : 0324414056639
 54 | Vbios                  : 84.04.1F.00.02
 55 | InforomImage Version   : G001.0000.01.03
 56 | Bus ID                 : 00000000:01:00.0
 57 | BAR1 (MB)              : 256
 58 | FrameBuffer Memory (MB): 4036
 59 | Bandwidth (MB/s)       : 15760
 60 | Cores (MHz)            : 1392
 61 | Memory (MHz)           : 3505
 62 | Power (W)              : 180
 63 | CPUAffinity            : 0-11
 64 | P2P Available          : None
 65 | ---------------------------------------------------------------------
 66 | ```
 67 | 
 68 | #### dmon
 69 | 
 70 | Monitors each device status including its power, memory and GPU utilization.
 71 | 
 72 | ```bash
 73 | $ go build && ./dmon
 74 | 
 75 | # sample output
 76 | 
 77 | Started host engine version 1.4.3 using socket path: /tmp/dcgmrxvqro.socket
 78 | # gpu   pwr  temp    sm   mem   enc   dec  mclk  pclk
 79 | # Idx     W     C     %     %     %     %   MHz   MHz
 80 |     0    43    48     0     1     0     0  3505   936
 81 |     0    43    48     0     1     0     0  3505   936
 82 | ```
 83 | 
 84 | #### health
 85 | 
 86 | Monitors the health of the given GPU every second, by checking the configured watches for any errors/failures/warnings.
 87 | 
 88 | ```bash
 89 | $ go build && ./health
 90 | 
 91 | # sample output
 92 | GPU                : 0
 93 | Status             : Healthy
 94 | ```
 95 | 
 96 | #### hostengineStatus
 97 | 
 98 | Reports about DCGM hostengine memory and CPU usage.
 99 | 
100 | ```bash
101 | $ go build && ./hostengineStatus
102 | 
103 | # sample output
104 | 
105 | Memory  : 11480 KB
106 | CPU     : 0.08 %
107 | ```
108 | 
109 | #### policy
110 | 
111 | Sets GPU usage and error policies and notifies in case of violations via callback functions.
112 | 
113 | ```bash
114 | $ go build && ./policy
115 | 
116 | # sample output
117 | 
118 | 2018/06/25 23:48:34 Policy successfully set.
119 | 2018/06/25 23:48:34 Listening for violations...
120 | GPU        : 0
121 | Error      : XID Error
122 | Timestamp  : 2018-06-25 18:55:30 +0000 UTC
123 | Data       : {31}
124 | ```
125 | 
126 | #### processInfo
127 | 
128 | Provides detailed per-GPU stats for the given process.
129 | 
130 | ```bash
131 | $ go build && ./processInfo -pid PID
132 | 
133 | # sample output
134 | 
135 | ----------------------------------------------------------------------
136 | GPU ID                       : 0
137 | ----------Execution Stats---------------------------------------------
138 | PID                          : 15074
139 | Name                         : nbody
140 | Start Time                   : 2018-06-25 16:50:28 -0700 PDT
141 | End Time                     : Still Running
142 | ----------Performance Stats-------------------------------------------
143 | Energy Consumed (Joules)     : 181
144 | Max GPU Memory Used (bytes)  : 84279296
145 | Avg SM Clock (MHz)           : N/A
146 | Avg Memory Clock (MHz)       : N/A
147 | Avg SM Utilization (%)       : N/A
148 | Avg Memory Utilization (%)   : N/A
149 | Avg PCIe Rx Bandwidth (MB)   : N/A
150 | Avg PCIe Tx Bandwidth (MB)   : N/A
151 | ----------Event Stats-------------------------------------------------
152 | Single Bit ECC Errors        : 0
153 | Double Bit ECC Errors        : 0
154 | Critical XID Errors          : 0
155 | ----------Slowdown Stats----------------------------------------------
156 | Due to - Power (%)           : 0
157 |        - Thermal (%)         : 0
158 |        - Reliability (%)     : 0
159 |        - Board Limit (%)     : 0
160 |        - Low Utilization (%) : 0
161 |        - Sync Boost (%)      : 0
162 | ----------Process Utilization-----------------------------------------
163 | Avg SM Utilization (%)       : 0
164 | Avg Memory Utilization (%)   : 0
165 | ----------------------------------------------------------------------
166 | ```
167 | 
168 | #### topology
169 | 
170 | Reports the GPU topology and each GPU's CPU affinity.
171 | 
172 | ```bash
173 | $ go build && ./topology
174 | 
175 | # sample output
176 | 
177 | Started host engine version 1.4.3 using socket path: /tmp/dcgmvjeqkh.socket
178 |       GPU0CPUAffinity
179 | GPU0    X 0-11
180 | 
181 | Legend:
182 |  X    = Self
183 |  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
184 |  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
185 |  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
186 |  PXB  = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge)
187 |  PIX  = Connection traversing a single PCIe switch
188 |  PSB  = Connection traversing a single on-board PCIe switch
189 |  NV#  = Connection traversing a bonded set of # NVLinks
190 |  2018/06/25 15:36:38 Successfully terminated nv-hostengine.
191 | ```
192 | 


--------------------------------------------------------------------------------
/pkg/dcgm/health_test.go:
--------------------------------------------------------------------------------
  1 | //go:build linux && cgo
  2 | 
  3 | /*
  4 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | package dcgm
 20 | 
 21 | import (
 22 | 	"crypto/rand"
 23 | 	"fmt"
 24 | 	"math"
 25 | 	"math/big"
 26 | 	"strings"
 27 | 	"testing"
 28 | 	"time"
 29 | 
 30 | 	"github.com/stretchr/testify/assert"
 31 | 	"github.com/stretchr/testify/require"
 32 | )
 33 | 
 34 | func TestHealthWhenInvalidGroupID(t *testing.T) {
 35 | 	teardownTest := setupTest(t)
 36 | 	defer teardownTest(t)
 37 | 	runOnlyWithLiveGPUs(t)
 38 | 
 39 | 	var invalidGroupID uintptr = 99
 40 | 	gh := GroupHandle{}
 41 | 	gh.SetHandle(invalidGroupID)
 42 | 	err := HealthSet(gh, DCGM_HEALTH_WATCH_PCIE)
 43 | 	assert.Error(t, err)
 44 | 	assert.Contains(t, err.Error(), "Setting not configured")
 45 | 
 46 | 	_, err = HealthGet(gh)
 47 | 	assert.Error(t, err)
 48 | 	assert.Contains(t, err.Error(), "Setting not configured")
 49 | 
 50 | 	_, err = HealthGet(gh)
 51 | 	assert.Error(t, err)
 52 | 	assert.Contains(t, err.Error(), "Setting not configured")
 53 | }
 54 | 
 55 | func TestHealthCheckPCIE(t *testing.T) {
 56 | 	teardownTest := setupTest(t)
 57 | 	defer teardownTest(t)
 58 | 
 59 | 	runOnlyWithLiveGPUs(t)
 60 | 	gpus, err := withInjectionGPUs(t, 1)
 61 | 	require.NoError(t, err)
 62 | 
 63 | 	type testCase struct {
 64 | 		name              string
 65 | 		pcieGen           int
 66 | 		pcieGenSpeed      float64 // in Gbps
 67 | 		pcieLanes         int
 68 | 		pcieReplayCounter int
 69 | 		expectingIncident bool
 70 | 	}
 71 | 
 72 | 	pcieGenSpeeds := []float64{
 73 | 		2.5,  // Gen1 speed in Gbps
 74 | 		5.0,  // Gen2
 75 | 		8.0,  // Gen3
 76 | 		16.0, // Gen4
 77 | 		32.0, // Gen5
 78 | 		64.0, // Gen6
 79 | 	}
 80 | 
 81 | 	var tests []testCase
 82 | 	// Generate test cases
 83 | 	for i := 0; i < 1; i++ { // Run multiple iterations
 84 | 		for gen, speed := range pcieGenSpeeds {
 85 | 			pcieGen := gen + 1
 86 | 			// Generate random number between 1 and 16
 87 | 			n, err := rand.Int(rand.Reader, big.NewInt(16))
 88 | 			require.NoError(t, err)
 89 | 			pcieLanes := int(n.Int64()) + 1
 90 | 
 91 | 			ratePerLane := speed / 1000 * 60 // Convert to errors/min per lane
 92 | 			expectedLimit := math.Ceil(ratePerLane * float64(pcieLanes))
 93 | 
 94 | 			// Generate random number between 1 and 2*expectedLimit
 95 | 			n, err = rand.Int(rand.Reader, big.NewInt(2*int64(expectedLimit)))
 96 | 			require.NoError(t, err)
 97 | 			pcieReplayCounter := int(n.Int64()) + 1
 98 | 			expectingIncident := pcieReplayCounter > int(expectedLimit)
 99 | 
100 | 			tests = append(tests, testCase{
101 | 				name:              fmt.Sprintf("PCIe_Gen%d_%dLanes_Counter%d", pcieGen, pcieLanes, pcieReplayCounter),
102 | 				pcieGen:           pcieGen,
103 | 				pcieGenSpeed:      speed,
104 | 				pcieLanes:         pcieLanes,
105 | 				pcieReplayCounter: pcieReplayCounter,
106 | 				expectingIncident: expectingIncident,
107 | 			})
108 | 		}
109 | 	}
110 | 
111 | 	for _, tc := range tests {
112 | 		t.Run(tc.name, func(t *testing.T) {
113 | 			ratePerLane := tc.pcieGenSpeed / 1000 * 60
114 | 			expectedLimit := math.Ceil(ratePerLane * float64(tc.pcieLanes))
115 | 
116 | 			errMsg := fmt.Sprintf("pcieGen=%d pcieGenSpeed=%f pcieLanes=%d expectedLimit=%f pcieReplayCounter=%d expectingIncident=%v",
117 | 				tc.pcieGen, tc.pcieGenSpeed, tc.pcieLanes, expectedLimit, tc.pcieReplayCounter, tc.expectingIncident)
118 | 
119 | 			healthCheckPCIE(t, gpus, tc.pcieGen, tc.pcieLanes, tc.pcieReplayCounter, tc.expectingIncident, errMsg)
120 | 			defer resetPCICReplayCounter(t, gpus)
121 | 		})
122 | 	}
123 | }
124 | 
125 | func resetPCICReplayCounter(t *testing.T, gpuIDs []uint) {
126 | 	gpuID := gpuIDs[0]
127 | 	err := InjectFieldValue(gpuID,
128 | 		DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
129 | 		DCGM_FT_INT64,
130 | 		0,
131 | 		time.Now().Add(100*time.Second).UnixMicro(),
132 | 		int64(0),
133 | 	)
134 | 	require.NoError(t, err)
135 | }
136 | 
// healthCheckPCIE runs one PCIe health-watch scenario against the first GPU in
// gpuIDs.
//
// It creates a group named "test1" containing that GPU (destroyed on return),
// enables the PCIe health watch, and skips the test if the group is already
// unhealthy. It then injects the link generation, link width and a baseline
// replay counter of 0, and requires the health check to pass. Finally it
// injects pcieReplayCounter and checks the result: when expectingPCIEIncident
// is true exactly one incident is required, for this GPU, in the PCIe system,
// with code DCGM_FR_PCI_REPLAY_RATE; otherwise no incidents are allowed.
// errMessage is attached to the incident-count assertions.
func healthCheckPCIE(t *testing.T, gpuIDs []uint, pcieGen, pcieLanes, pcieReplayCounter int, expectingPCIEIncident bool, errMessage string) {
	gpuID := gpuIDs[0]

	groupID, err := CreateGroup("test1")
	require.NoError(t, err)
	// Best-effort cleanup; the DestroyGroup error is deliberately dropped.
	defer func() {
		_ = DestroyGroup(groupID)
	}()
	err = AddEntityToGroup(groupID, FE_GPU, gpuID)
	require.NoError(t, err)

	err = HealthSet(groupID, DCGM_HEALTH_WATCH_PCIE)
	require.NoError(t, err)

	// Read the setting back to confirm the watch was applied.
	system, err := HealthGet(groupID)
	require.NoError(t, err)
	require.Equal(t, DCGM_HEALTH_WATCH_PCIE, system)

	skipTestIfUnhealthy(t, groupID)

	// inject PCIe Gen and width/lanes
	err = InjectFieldValue(gpuID,
		DCGM_FI_DEV_PCIE_LINK_GEN,
		DCGM_FT_INT64,
		0,
		0,
		int64(pcieGen),
	)
	require.NoError(t, err)

	err = InjectFieldValue(gpuID,
		DCGM_FI_DEV_PCIE_LINK_WIDTH,
		DCGM_FT_INT64,
		0,
		0,
		int64(pcieLanes),
	)
	require.NoError(t, err)

	// Baseline: replay counter of 0, timestamped 50 seconds in the past.
	err = InjectFieldValue(gpuID,
		DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
		DCGM_FT_INT64,
		0,
		time.Now().Add(-50*time.Second).UnixMicro(),
		int64(0),
	)
	require.NoError(t, err)

	// we expect that there will be no data here
	response, err := HealthCheck(groupID)
	require.NoError(t, err)
	require.Equal(t, DCGM_HEALTH_RESULT_PASS, response.OverallHealth)

	// inject an error into PCIe
	err = InjectFieldValue(gpuID,
		DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
		DCGM_FT_INT64,
		0,
		time.Now().Add(100*time.Second).UnixMicro(),
		int64(pcieReplayCounter),
	) // set the injected data into the future
	require.NoError(t, err)

	response, err = HealthCheck(groupID)
	require.NoError(t, err)
	if expectingPCIEIncident {
		require.Len(t, response.Incidents, 1, errMessage)
		require.Equal(t, gpuID, response.Incidents[0].EntityInfo.EntityId)
		require.Equal(t, DCGM_HEALTH_WATCH_PCIE, response.Incidents[0].System)
		require.Equal(t, DCGM_FR_PCI_REPLAY_RATE, response.Incidents[0].Error.Code)
	} else {
		require.Empty(t, response.Incidents, errMessage)
	}
}
211 | 
212 | func skipTestIfUnhealthy(t *testing.T, groupId GroupHandle) {
213 | 	health, err := HealthCheck(groupId)
214 | 	require.NoError(t, err)
215 | 	if health.OverallHealth != DCGM_HEALTH_RESULT_PASS {
216 | 		msg := "Skipping health check test because we are already unhealthy: "
217 | 		incidents := []string{}
218 | 		for _, incident := range health.Incidents {
219 | 			incidents = append(incidents, incident.Error.Message)
220 | 		}
221 | 
222 | 		t.Skip(msg + strings.Join(incidents, ", "))
223 | 	}
224 | }
225 | 


--------------------------------------------------------------------------------
/pkg/dcgm/health.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package dcgm
 18 | 
 19 | /*
 20 | #include "dcgm_agent.h"
 21 | #include "dcgm_structs.h"
 22 | */
 23 | import "C"
 24 | 
 25 | import (
 26 | 	"fmt"
 27 | 	"math/rand"
 28 | 	"unsafe"
 29 | )
 30 | 
// SystemWatch represents a health watch system and its status.
// In this file it is populated by healthCheckByGpuId, one value per incident
// returned by HealthCheck.
type SystemWatch struct {
	// Type is the display name of the health watch system as produced by
	// systemWatch (for example "PCIe watches"), or "N/A" when unrecognized.
	Type string
	// Status is the incident's health as text, produced by healthStatus:
	// "Healthy", "Warning", "Failure" or "N/A".
	Status string
	// Error is the message attached to the incident.
	Error string
}
 40 | 
// DeviceHealth represents the health status of a GPU device, as assembled by
// healthCheckByGpuId.
type DeviceHealth struct {
	// GPU is the ID of the GPU device that was checked.
	GPU uint
	// Status is the overall health of the GPU as text, produced by
	// healthStatus: "Healthy", "Warning", "Failure" or "N/A".
	Status string
	// Watches holds one entry per incident reported by the health check; it
	// is empty when no incident was reported.
	Watches []SystemWatch
}
 50 | 
 51 | // HealthSet enables the DCGM health check system for the given systems.
 52 | // It configures which health watch systems should be monitored for the specified group.
 53 | func HealthSet(groupID GroupHandle, systems HealthSystem) (err error) {
 54 | 	result := C.dcgmHealthSet(handle.handle, groupID.handle, C.dcgmHealthSystems_t(systems))
 55 | 	if err := errorString(result); err != nil {
 56 | 		return fmt.Errorf("error setting health watches: %w", err)
 57 | 	}
 58 | 	return nil
 59 | }
 60 | 
 61 | // HealthGet retrieves the current state of the DCGM health check system.
 62 | // It returns which health watch systems are currently enabled for the specified group.
 63 | func HealthGet(groupID GroupHandle) (HealthSystem, error) {
 64 | 	var systems C.dcgmHealthSystems_t
 65 | 
 66 | 	result := C.dcgmHealthGet(handle.handle, groupID.handle, (*C.dcgmHealthSystems_t)(unsafe.Pointer(&systems)))
 67 | 	if err := errorString(result); err != nil {
 68 | 		return HealthSystem(0), err
 69 | 	}
 70 | 	return HealthSystem(systems), nil
 71 | }
 72 | 
// DiagErrorDetail contains detailed information about a health check error.
// HealthCheck fills it from the error attached to each reported incident.
type DiagErrorDetail struct {
	// Message contains a human-readable description of the error
	Message string
	// Code identifies the specific type of error
	Code HealthCheckErrorCode
}
 80 | 
// Incident represents a single health check incident reported by HealthCheck.
type Incident struct {
	// System identifies which health watch system detected the incident
	System HealthSystem
	// Health indicates the severity of the incident
	Health HealthResult
	// Error contains the message and error code reported for the incident
	Error DiagErrorDetail
	// EntityInfo identifies the entity (entity group and entity ID) where
	// the incident occurred
	EntityInfo GroupEntityPair
}
 92 | 
// HealthResponse contains the results of a health check operation, as
// returned by HealthCheck.
type HealthResponse struct {
	// OverallHealth indicates the aggregate health status across all watches
	OverallHealth HealthResult
	// Incidents contains one entry per incident reported by the check; it is
	// an empty (non-nil) slice when the check succeeds with no incidents.
	Incidents []Incident
}
100 | 
101 | // HealthCheck checks the configured watches for any errors/failures/warnings that have occurred
102 | // since the last time this check was invoked. On the first call, stateful information
103 | // about all of the enabled watches within a group is created but no error results are
104 | // provided. On subsequent calls, any error information will be returned.
105 | func HealthCheck(groupID GroupHandle) (HealthResponse, error) {
106 | 	var healthResults C.dcgmHealthResponse_v5
107 | 	healthResults.version = makeVersion5(unsafe.Sizeof(healthResults))
108 | 
109 | 	result := C.dcgmHealthCheck(handle.handle, groupID.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults)))
110 | 
111 | 	if err := errorString(result); err != nil {
112 | 		return HealthResponse{}, &Error{msg: C.GoString(C.errorString(result)), Code: result}
113 | 	}
114 | 
115 | 	response := HealthResponse{
116 | 		OverallHealth: HealthResult(healthResults.overallHealth),
117 | 	}
118 | 
119 | 	// number of watches that encountered error/warning
120 | 	incidents := uint(healthResults.incidentCount)
121 | 
122 | 	response.Incidents = make([]Incident, incidents)
123 | 
124 | 	for i := uint(0); i < incidents; i++ {
125 | 		response.Incidents[i] = Incident{
126 | 			System: HealthSystem(healthResults.incidents[i].system),
127 | 			Health: HealthResult(healthResults.incidents[i].health),
128 | 			Error: DiagErrorDetail{
129 | 				Message: *stringPtr(&healthResults.incidents[i].error.msg[0]),
130 | 				Code:    HealthCheckErrorCode(healthResults.incidents[i].error.code),
131 | 			},
132 | 			EntityInfo: GroupEntityPair{
133 | 				EntityGroupId: Field_Entity_Group(healthResults.incidents[i].entityInfo.entityGroupId),
134 | 				EntityId:      uint(healthResults.incidents[i].entityInfo.entityId),
135 | 			},
136 | 		}
137 | 	}
138 | 
139 | 	return response, nil
140 | }
141 | 
142 | func healthCheckByGpuId(gpuID uint) (deviceHealth DeviceHealth, err error) {
143 | 	name := fmt.Sprintf("health%d", rand.Uint64())
144 | 	groupID, err := CreateGroup(name)
145 | 	if err != nil {
146 | 		return
147 | 	}
148 | 
149 | 	err = AddToGroup(groupID, gpuID)
150 | 	if err != nil {
151 | 		return
152 | 	}
153 | 
154 | 	err = HealthSet(groupID, DCGM_HEALTH_WATCH_ALL)
155 | 	if err != nil {
156 | 		return
157 | 	}
158 | 
159 | 	result, err := HealthCheck(groupID)
160 | 	if err != nil {
161 | 		return
162 | 	}
163 | 
164 | 	status := healthStatus(result.OverallHealth)
165 | 
166 | 	// number of watches that encountered error/warning
167 | 	incidents := len(result.Incidents)
168 | 	watches := make([]SystemWatch, incidents)
169 | 
170 | 	for j := 0; j < incidents; j++ {
171 | 		watches[j] = SystemWatch{
172 | 			Type:   systemWatch(result.Incidents[j].System),
173 | 			Status: healthStatus(result.Incidents[j].Health),
174 | 			Error:  result.Incidents[j].Error.Message,
175 | 		}
176 | 	}
177 | 
178 | 	deviceHealth = DeviceHealth{
179 | 		GPU:     gpuID,
180 | 		Status:  status,
181 | 		Watches: watches,
182 | 	}
183 | 	_ = DestroyGroup(groupID)
184 | 	return
185 | }
186 | 
187 | func healthStatus(status HealthResult) string {
188 | 	switch status {
189 | 	case 0:
190 | 		return "Healthy"
191 | 	case 10:
192 | 		return "Warning"
193 | 	case 20:
194 | 		return "Failure"
195 | 	}
196 | 	return "N/A"
197 | }
198 | 
199 | func systemWatch(watch HealthSystem) string {
200 | 	switch watch {
201 | 	case 1:
202 | 		return "PCIe watches"
203 | 	case 2:
204 | 		return "NVLINK watches"
205 | 	case 4:
206 | 		return "Power Managemnt unit watches"
207 | 	case 8:
208 | 		return "Microcontroller unit watches"
209 | 	case 16:
210 | 		return "Memory watches"
211 | 	case 32:
212 | 		return "Streaming Multiprocessor watches"
213 | 	case 64:
214 | 		return "Inforom watches"
215 | 	case 128:
216 | 		return "Temperature watches"
217 | 	case 256:
218 | 		return "Power watches"
219 | 	case 512:
220 | 		return "Driver-related watches"
221 | 	}
222 | 	return "N/A"
223 | }
224 | 


--------------------------------------------------------------------------------
/pkg/dcgm/diag.go:
--------------------------------------------------------------------------------
  1 | package dcgm
  2 | 
  3 | /*
  4 | #include "dcgm_agent.h"
  5 | #include "dcgm_structs.h"
  6 | */
  7 | import "C"
  8 | 
  9 | import (
 10 | 	"strings"
 11 | 	"unsafe"
 12 | )
 13 | 
 14 | // Package dcgm provides bindings for NVIDIA's Data Center GPU Manager (DCGM)
 15 | 
// DIAG_RESULT_STRING_SIZE represents the maximum size of diagnostic result strings.
// NOTE(review): nothing else in diag.go references this constant; presumably
// it is kept for callers of the package — confirm before changing or removing.
const DIAG_RESULT_STRING_SIZE = 1024
 18 | 
// DiagType represents the type of diagnostic test to run.
// diagLevel translates each value into the matching DCGM diagnostic level;
// any value other than the constants below maps to DCGM_DIAG_LVL_INVALID.
type DiagType int

const (
	// DiagQuick represents a quick diagnostic test that performs basic health checks.
	// It maps to DCGM_DIAG_LVL_SHORT.
	DiagQuick DiagType = 1

	// DiagMedium represents a medium-length diagnostic test that performs more comprehensive checks.
	// It maps to DCGM_DIAG_LVL_MED.
	DiagMedium DiagType = 2

	// DiagLong represents a long diagnostic test that performs extensive health checks.
	// It maps to DCGM_DIAG_LVL_LONG.
	DiagLong DiagType = 3

	// DiagExtended represents an extended diagnostic test that performs the most thorough system checks.
	// It maps to DCGM_DIAG_LVL_XLONG.
	DiagExtended DiagType = 4
)
 35 | 
// DiagResult represents the result of a single diagnostic test.
// Values are built by newDiagResult from one entry of the DCGM diagnostic
// response.
type DiagResult struct {
	// Status is the test result as text, produced by diagResultString:
	// "pass", "skipped", "warn", "fail" or "notrun". It is empty for a
	// result code that diagResultString does not recognize.
	Status string
	// TestName is the test category derived from the result's test ID by
	// gpuTestName (for example "memory", "pcie" or "software"); it is empty
	// for an unrecognized test ID.
	TestName string
	// TestOutput contains the informational messages recorded for the same
	// entity ID and test ID, joined with " | "; empty when there are none.
	TestOutput string
	// ErrorCode is the code of the first error recorded for the same entity
	// ID and test ID, or 0 when no such error exists.
	ErrorCode uint
	// ErrorMessage is the message of that same first error, or empty when no
	// such error exists.
	ErrorMessage string
	// SerialNumber is the serial number of the tested entity, or empty when
	// the response has no matching entity entry.
	SerialNumber string
	// EntityID is the ID of the entity the result belongs to.
	EntityID uint
}
 53 | 
// DiagResults contains the results of all diagnostic tests
type DiagResults struct {
	// Software holds the diagnostic results. Despite the name, RunDiag stores
	// every result returned by the diagnostic run in this slice, not only
	// software-related ones.
	Software []DiagResult
}
 59 | 
 60 | // diagResultString converts a diagnostic result code to its string representation
 61 | func diagResultString(r int) string {
 62 | 	switch r {
 63 | 	case C.DCGM_DIAG_RESULT_PASS:
 64 | 		return "pass"
 65 | 	case C.DCGM_DIAG_RESULT_SKIP:
 66 | 		return "skipped"
 67 | 	case C.DCGM_DIAG_RESULT_WARN:
 68 | 		return "warn"
 69 | 	case C.DCGM_DIAG_RESULT_FAIL:
 70 | 		return "fail"
 71 | 	case C.DCGM_DIAG_RESULT_NOT_RUN:
 72 | 		return "notrun"
 73 | 	}
 74 | 	return ""
 75 | }
 76 | 
 77 | // gpuTestName returns the category name for a diagnostic test based on its test ID.
 78 | // This function handles all diagnostic test types including GPU tests and software tests.
 79 | // Software tests (DCGM_SWTEST_*) all report under DCGM_SOFTWARE_INDEX and return "software".
 80 | // Detailed test information is provided in TestOutput, not in the TestName.
 81 | func gpuTestName(t int) string {
 82 | 	switch t {
 83 | 	case C.DCGM_MEMORY_INDEX:
 84 | 		return "memory"
 85 | 	case C.DCGM_DIAGNOSTIC_INDEX:
 86 | 		return "diagnostic"
 87 | 	case C.DCGM_PCI_INDEX:
 88 | 		return "pcie"
 89 | 	case C.DCGM_SM_STRESS_INDEX:
 90 | 		return "sm stress"
 91 | 	case C.DCGM_TARGETED_STRESS_INDEX:
 92 | 		return "targeted stress"
 93 | 	case C.DCGM_TARGETED_POWER_INDEX:
 94 | 		return "targeted power"
 95 | 	case C.DCGM_MEMORY_BANDWIDTH_INDEX:
 96 | 		return "memory bandwidth"
 97 | 	case C.DCGM_MEMTEST_INDEX:
 98 | 		return "memtest"
 99 | 	case C.DCGM_PULSE_TEST_INDEX:
100 | 		return "pulse"
101 | 	case C.DCGM_EUD_TEST_INDEX:
102 | 		return "eud"
103 | 	case C.DCGM_SOFTWARE_INDEX:
104 | 		return "software"
105 | 	case C.DCGM_CONTEXT_CREATE_INDEX:
106 | 		return "context create"
107 | 	}
108 | 	return ""
109 | }
110 | 
111 | func getErrorMsg(entityId uint, testId uint, response C.dcgmDiagResponse_v12) (msg string, code uint) {
112 | 	for i := 0; i < int(response.numErrors); i++ {
113 | 		if uint(response.errors[i].entity.entityId) != entityId || uint(response.errors[i].testId) != testId {
114 | 			continue
115 | 		}
116 | 
117 | 		msg = C.GoString((*C.char)(unsafe.Pointer(&response.errors[i].msg)))
118 | 		code = uint(response.errors[i].code)
119 | 		return
120 | 	}
121 | 
122 | 	return
123 | }
124 | 
125 | func getInfoMsg(entityId uint, testId uint, response C.dcgmDiagResponse_v12) string {
126 | 	var msgs []string
127 | 	for i := 0; i < int(response.numInfo); i++ {
128 | 		if uint(response.info[i].entity.entityId) != entityId || uint(response.info[i].testId) != testId {
129 | 			continue
130 | 		}
131 | 		msgs = append(msgs, C.GoString((*C.char)(unsafe.Pointer(&response.info[i].msg))))
132 | 	}
133 | 	return strings.Join(msgs, " | ")
134 | }
135 | 
136 | func getTestName(resultIdx uint, response C.dcgmDiagResponse_v12) string {
137 | 	for i := uint(0); i < uint(response.numTests); i++ {
138 | 		t := response.tests[i]
139 | 		for j := uint16(0); j < uint16(t.numResults); j++ {
140 | 			if uint16(t.resultIndices[j]) == uint16(resultIdx) {
141 | 				plugin := C.GoString((*C.char)(unsafe.Pointer(&t.pluginName)))
142 | 				if plugin != "" {
143 | 					plugin = "/" + plugin
144 | 				}
145 | 				return C.GoString((*C.char)(unsafe.Pointer(&t.name))) + plugin
146 | 			}
147 | 		}
148 | 	}
149 | 	return ""
150 | }
151 | 
152 | func getSerial(resultIdx uint, response C.dcgmDiagResponse_v12) string {
153 | 	for i := 0; i < int(response.numEntities); i++ {
154 | 		if response.entities[i].entity.entityId == response.results[resultIdx].entity.entityId &&
155 | 			response.entities[i].entity.entityGroupId == response.results[resultIdx].entity.entityGroupId {
156 | 			return C.GoString((*C.char)(unsafe.Pointer(&response.entities[i].serialNum)))
157 | 		}
158 | 	}
159 | 	return ""
160 | }
161 | 
162 | func newDiagResult(resultIndex uint, response C.dcgmDiagResponse_v12) DiagResult {
163 | 	entityId := uint(response.results[resultIndex].entity.entityId)
164 | 	testId := uint(response.results[resultIndex].testId)
165 | 
166 | 	msg, code := getErrorMsg(entityId, testId, response)
167 | 	info := getInfoMsg(entityId, testId, response)
168 | 	testName := gpuTestName(int(testId))
169 | 	serial := getSerial(resultIndex, response)
170 | 
171 | 	return DiagResult{
172 | 		Status:       diagResultString(int(response.results[resultIndex].result)),
173 | 		TestName:     testName,
174 | 		TestOutput:   info,
175 | 		ErrorCode:    code,
176 | 		ErrorMessage: msg,
177 | 		SerialNumber: serial,
178 | 		EntityID:     entityId,
179 | 	}
180 | }
181 | 
182 | func diagLevel(diagType DiagType) C.dcgmDiagnosticLevel_t {
183 | 	switch diagType {
184 | 	case DiagQuick:
185 | 		return C.DCGM_DIAG_LVL_SHORT
186 | 	case DiagMedium:
187 | 		return C.DCGM_DIAG_LVL_MED
188 | 	case DiagLong:
189 | 		return C.DCGM_DIAG_LVL_LONG
190 | 	case DiagExtended:
191 | 		return C.DCGM_DIAG_LVL_XLONG
192 | 	}
193 | 	return C.DCGM_DIAG_LVL_INVALID
194 | }
195 | 
196 | // RunDiag runs diagnostic tests on a group of GPUs with the specified diagnostic level.
197 | // Parameters:
198 | //   - diagType: The type/level of diagnostic test to run (Quick, Medium, Long, or Extended)
199 | //   - groupId: The group of GPUs to run diagnostics on
200 | //
201 | // Returns:
202 | //   - DiagResults containing the results of all diagnostic tests
203 | //   - error if the diagnostics failed to run
204 | func RunDiag(diagType DiagType, groupID GroupHandle) (DiagResults, error) {
205 | 	var diagResults C.dcgmDiagResponse_v12
206 | 	diagResults.version = C.dcgmDiagResponse_version12
207 | 
208 | 	result := C.dcgmRunDiagnostic(handle.handle, groupID.handle, diagLevel(diagType), &diagResults)
209 | 	if err := errorString(result); err != nil {
210 | 		return DiagResults{}, &Error{msg: C.GoString(C.errorString(result)), Code: result}
211 | 	}
212 | 
213 | 	var diagRun DiagResults
214 | 	diagRun.Software = make([]DiagResult, diagResults.numResults)
215 | 	for i := 0; i < int(diagResults.numResults); i++ {
216 | 		diagRun.Software[i] = newDiagResult(uint(i), diagResults)
217 | 	}
218 | 
219 | 	return diagRun, nil
220 | }
221 | 


--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
  1 | # DCGM Go Testing Samples
  2 | 
  3 | This directory contains test versions of all the DCGM samples, reimplemented using the Go testing framework. These tests demonstrate the functionality of the NVIDIA Data Center GPU Manager (DCGM) Go bindings while being suitable for automated testing and CI/CD pipelines.
  4 | 
  5 | ## Test Files Overview
  6 | 
  7 | ### Core Device Management
  8 | 
  9 | - **`deviceinfo_test.go`** - Tests device information retrieval functionality
 10 |   - Equivalent to `samples/deviceInfo/main.go`
 11 |   - Tests GPU device properties, identification, and topology information
 12 |   - Includes tests for both embedded and standalone hostengine connections
 13 | 
 14 | - **`dmon_test.go`** - Tests device monitoring capabilities
 15 |   - Equivalent to `samples/dmon/main.go`
 16 |   - Monitors GPU utilization, temperature, power, and clock speeds
 17 |   - Includes time-limited monitoring tests and sample consistency checks
 18 | 
 19 | - **`device_status_test.go`** - Tests device status querying (part of dmon functionality)
 20 |   - Tests single and multiple GPU status queries
 21 |   - Validates utilization metrics and system health indicators
 22 | 
 23 | ### Diagnostics and Health
 24 | 
 25 | - **`diag_test.go`** - Tests DCGM diagnostic functionality
 26 |   - Equivalent to `samples/diag/main.go`
 27 |   - Runs quick and medium-level diagnostic tests
 28 |   - Validates software and hardware diagnostic results
 29 | 
 30 | - **`health_test.go`** - Tests GPU health monitoring
 31 |   - Equivalent to `samples/health/main.go`
 32 |   - Performs single and continuous health checks
 33 |   - Tests health watch configuration and error reporting
 34 | 
 35 | ### System Management
 36 | 
 37 | - **`hostengine_test.go`** - Tests DCGM hostengine introspection
 38 |   - Equivalent to `samples/hostengineStatus/main.go`
 39 |   - Monitors hostengine memory and CPU usage
 40 |   - Tests introspection under different load conditions
 41 | 
 42 | - **`policy_test.go`** - Tests policy violation monitoring
 43 |   - Equivalent to `samples/policy/main.go`
 44 |   - Tests various policy condition types (DBE, XID, thermal, power)
 45 |   - Includes context cancellation and timeout handling
 46 | 
 47 | ### Process and Topology
 48 | 
 49 | - **`processinfo_test.go`** - Tests GPU process monitoring
 50 |   - Equivalent to `samples/processInfo/main.go`
 51 |   - Tests process field watching and information retrieval
 52 |   - Includes PID-specific testing capabilities
 53 | 
 54 | - **`topology_test.go`** - Tests GPU topology analysis
 55 |   - Equivalent to `samples/topology/main.go`
 56 |   - Tests inter-GPU connection discovery and analysis
 57 |   - Includes topology consistency validation
 58 | 
 59 | ### REST API
 60 | 
 61 | - **`restapi_test.go`** - Tests REST API endpoint functionality
 62 |   - Equivalent to `samples/restApi/` (complete implementation)
 63 |   - Uses `httptest` for testing HTTP endpoints without starting a real server
 64 |   - Tests JSON response formats and error handling
 65 | 
 66 | ## Running the Tests
 67 | 
 68 | ### Run All Tests
 69 | 
 70 | ```bash
 71 | go test ./tests/... -v
 72 | ```
 73 | 
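### Run Specific Test Files

Passing a single file to `go test` compiles only that file, so the commands below work only when the file does not depend on helpers defined in other files of the package (for example `nvsmi.go`). If a command fails with undefined-symbol errors, run the package and select tests with `-run` instead (see "Run Specific Test Functions").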
 75 | 
 76 | ```bash
 77 | # Run device information tests
 78 | go test ./tests/deviceinfo_test.go -v
 79 | 
 80 | # Run monitoring tests
 81 | go test ./tests/dmon_test.go -v
 82 | 
 83 | # Run diagnostic tests
 84 | go test ./tests/diag_test.go -v
 85 | ```
 86 | 
 87 | ### Run Tests with Different Modes
 88 | 
 89 | ```bash
 90 | # Run only quick tests (skip long-running tests)
 91 | go test ./tests/... -v -short
 92 | 
 93 | # Run tests with timeout
 94 | go test ./tests/... -v -timeout 5m
 95 | ```
 96 | 
 97 | ### Run Specific Test Functions
 98 | 
 99 | ```bash
100 | # Run specific test function
101 | go test ./tests/deviceinfo_test.go -v -run TestDeviceInfo
102 | 
103 | # Run all tests matching a pattern
104 | go test ./tests/... -v -run "TestDevice.*"
105 | ```
106 | 
107 | ## Test Features
108 | 
109 | ### Adaptive Testing
110 | 
111 | - Tests automatically skip when no GPUs are available
112 | - Different behavior for single vs. multi-GPU systems
113 | - Graceful handling of permission-restricted operations
114 | 
115 | ### Time-Limited Execution
116 | 
117 | - Long-running samples (like monitoring) are time-limited in tests
118 | - Configurable test durations for CI/CD environments
119 | - Background operations are properly cancelled
120 | 
121 | ### Comprehensive Coverage
122 | 
123 | - Each test covers the core functionality of its corresponding sample
124 | - Additional test scenarios for error conditions and edge cases
125 | - Validation of return values and data consistency
126 | 
127 | ### CI/CD Friendly
128 | 
129 | - Tests use the Go testing framework's standard patterns
130 | - Proper test isolation and cleanup
131 | - Structured logging for debugging
132 | 
133 | ## Prerequisites
134 | 
135 | ### System Requirements
136 | 
137 | - NVIDIA GPU(s) with DCGM support
138 | - NVIDIA drivers installed
139 | - DCGM libraries available
140 | - Go 1.19+ for testing framework features
141 | 
142 | ### Dependencies
143 | 
144 | The tests require the same dependencies as the original samples:
145 | 
146 | - `github.com/NVIDIA/go-dcgm/pkg/dcgm`
147 | - `github.com/gorilla/mux` (for REST API tests only)
148 | 
149 | ### Permissions
150 | 
151 | Some tests may require elevated privileges:
152 | 
153 | - Process monitoring tests work best when run as root
154 | - Certain policy violation tests require administrative access
155 | - Diagnostic tests may need elevated permissions for hardware access
156 | 
157 | ## Test Structure
158 | 
159 | Each test file follows a consistent pattern:
160 | 
161 | 1. **Basic Functionality Test** - Core sample functionality
162 | 2. **Extended Tests** - Additional scenarios and edge cases
163 | 3. **Error Handling Tests** - Validation of error conditions
164 | 4. **Performance/Consistency Tests** - Multi-sample validation
165 | 
166 | ### Example Test Pattern
167 | 
168 | ```go
169 | func TestSampleFunctionality(t *testing.T) {
170 |     // Initialize DCGM
171 |     cleanup, err := dcgm.Init(dcgm.Embedded)
172 |     if err != nil {
173 |         t.Fatalf("Failed to initialize DCGM: %v", err)
174 |     }
175 |     defer cleanup()
176 | 
177 |     // Test core functionality
178 |     // ... test implementation
179 | 
180 |     // Validate results
181 |     // ... assertions and checks
182 | }
183 | ```
184 | 
185 | ## Integration with CI/CD
186 | 
187 | These tests are designed to integrate well with continuous integration systems:
188 | 
189 | - Use standard Go testing patterns
190 | - Provide detailed logging for troubleshooting
191 | - Support timeout and cancellation
192 | - Can run with or without actual GPU hardware (with appropriate skipping)
193 | 
194 | ### Example GitHub Actions Integration
195 | 
196 | ```yaml
197 | - name: Run DCGM Tests
198 |   run: |
199 |     go test ./tests/... -v -timeout 10m
200 |   continue-on-error: true  # Optional: allow failure if no GPU available
201 | ```
202 | 
203 | ## Troubleshooting
204 | 
205 | ### Common Issues
206 | 
207 | 1. **No GPUs Found** - Tests will skip automatically
208 | 2. **Permission Denied** - Some tests require root privileges
209 | 3. **DCGM Not Available** - Ensure DCGM libraries are installed
210 | 4. **Timeout Issues** - Increase test timeout for slow systems
211 | 
212 | ### Debug Information
213 | 
214 | All tests provide verbose logging when run with `-v` flag:
215 | 
216 | ```bash
217 | go test ./tests/deviceinfo_test.go -v
218 | ```
219 | 
220 | ### Environment Variables
221 | 
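The following environment variables may be used to tune test runs. Neither is interpreted by `go test` itself, so check the test sources to confirm they are honored: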
223 | 
224 | - `GO_TEST_TIMEOUT_SCALE` - Scale test timeouts
225 | - `DCGM_TESTING_MODE` - Custom testing configurations (if implemented)
226 | 
227 | ## Contributing
228 | 
229 | When adding new tests:
230 | 
231 | 1. Follow the existing naming pattern (`*_test.go`)
232 | 2. Include comprehensive documentation
233 | 3. Add appropriate test skipping for missing hardware
234 | 4. Include both positive and negative test cases
235 | 5. Update this README with new test descriptions
236 | 


--------------------------------------------------------------------------------