├── .gitignore ├── Gopkg.toml ├── LICENSE ├── README.md ├── bindings.go ├── bindings_test.go ├── device_commands.go ├── device_commands_test.go ├── device_linux_only.go ├── device_linux_only_test.go ├── device_queries.go ├── device_queries_test.go ├── errors.go ├── example_test.go ├── structs.go ├── system.go └── system_test.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.dll 4 | *.so 5 | *.dylib 6 | 7 | # Test binary, build with `go test -c` 8 | *.test 9 | 10 | # Output of the go coverage tool, specifically when used with LiteIDE 11 | *.out 12 | 13 | # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 14 | .glide/ 15 | 16 | .idea/ 17 | 18 | vendor/ 19 | Gopkg.lock -------------------------------------------------------------------------------- /Gopkg.toml: -------------------------------------------------------------------------------- 1 | 2 | [[constraint]] 3 | name = "github.com/pkg/errors" 4 | version = "0.8.0" 5 | 6 | [[constraint]] 7 | name = "github.com/stretchr/testify" 8 | version = "1.2.0" 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Maksym Pavlenko 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GoDoc](https://godoc.org/github.com/mxpv/nvml-go?status.svg)](https://godoc.org/github.com/mxpv/nvml-go/) 2 | [![Go Report Card](https://goreportcard.com/badge/github.com/mxpv/nvml-go)](https://goreportcard.com/report/github.com/mxpv/nvml-go) 3 | [![MIT license](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE) 4 | 5 | # nvml-go 6 | golang wrapper for NVIDIA Management Library (NVML) 7 | 8 | ## Basic example ## 9 | 10 | ```go 11 | func ExampleNew() { 12 | nvml, err := New("") 13 | if err != nil { 14 | panic(err) 15 | } 16 | 17 | defer nvml.Shutdown() 18 | 19 | err = nvml.Init() 20 | if err != nil { 21 | panic(err) 22 | } 23 | 24 | driverVersion, err := nvml.SystemGetDriverVersion() 25 | if err != nil { 26 | panic(err) 27 | } 28 | 29 | log.Printf("Driver version:\t%s", driverVersion) 30 | 31 | nvmlVersion, err := nvml.SystemGetNVMLVersion() 32 | if err != nil { 33 | panic(err) 34 | } 35 | 36 | log.Printf("NVML version:\t%s", nvmlVersion) 37 | 38 | deviceCount, err := nvml.DeviceGetCount() 39 | if err != nil { 40 | panic(err) 41 | } 42 | 43 | for i := uint32(0); i < deviceCount; i++ { 44 | handle, err := nvml.DeviceGetHandleByIndex(i) 45 | if err != nil { 46 | panic(err) 47 | } 48 | 49 | name, err := nvml.DeviceGetName(handle) 50 | log.Printf("Product name:\t%s", name) 51 | 52 | 
brand, err := nvml.DeviceGetBrand(handle) 53 | if err != nil { 54 | panic(err) 55 | } 56 | 57 | log.Printf("Product Brand:\t%s", brand) 58 | 59 | uuid, err := nvml.DeviceGetUUID(handle) 60 | if err != nil { 61 | panic(err) 62 | } 63 | 64 | log.Printf("GPU UUID:\t\t%s", uuid) 65 | 66 | fan, err := nvml.DeviceGetFanSpeed(handle) 67 | if err != nil { 68 | panic(err) 69 | } 70 | 71 | log.Printf("Fan Speed:\t\t%d", fan) 72 | } 73 | } 74 | ``` 75 | 76 | 77 | ## TODO ## 78 | - [Unit Queries](http://docs.nvidia.com/deploy/nvml-api/group__nvmlUnitQueries.html) 79 | - [Unit Commands](http://docs.nvidia.com/deploy/nvml-api/group__nvmlUnitCommands.html) 80 | - Linux support -------------------------------------------------------------------------------- /bindings.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | import ( 4 | "C" 5 | "os" 6 | "syscall" 7 | "unsafe" 8 | 9 | "github.com/pkg/errors" 10 | ) 11 | 12 | var ErrNotImplemented = errors.New("Not implemented") 13 | 14 | type API struct { 15 | dll *syscall.DLL 16 | // Initialization and cleanup 17 | nvmlInit, 18 | nvmlShutdown, 19 | // Error reporting 20 | nvmlErrorString, 21 | // System Queries 22 | nvmlSystemGetCudaDriverVersion, 23 | nvmlSystemGetDriverVersion, 24 | nvmlSystemGetNVMLVersion, 25 | nvmlSystemGetProcessName, 26 | // Device Queries 27 | nvmlDeviceClearCpuAffinity, 28 | nvmlDeviceGetAPIRestriction, 29 | nvmlDeviceGetApplicationsClock, 30 | nvmlDeviceGetAutoBoostedClocksEnabled, 31 | nvmlDeviceGetBAR1MemoryInfo, 32 | nvmlDeviceGetBoardId, 33 | nvmlDeviceGetBoardPartNumber, 34 | nvmlDeviceGetBrand, 35 | nvmlDeviceGetBridgeChipInfo, 36 | nvmlDeviceGetClock, 37 | nvmlDeviceGetClockInfo, 38 | nvmlDeviceGetComputeMode, 39 | nvmlDeviceGetComputeRunningProcesses, 40 | nvmlDeviceGetCount, 41 | nvmlDeviceGetCpuAffinity, 42 | nvmlDeviceGetCudaComputeCapability, 43 | nvmlDeviceGetCurrPcieLinkGeneration, 44 | nvmlDeviceGetCurrPcieLinkWidth, 45 | 
nvmlDeviceGetCurrentClocksThrottleReasons, 46 | nvmlDeviceGetDecoderUtilization, 47 | nvmlDeviceGetDefaultApplicationsClock, 48 | nvmlDeviceGetDetailedEccErrors, 49 | nvmlDeviceGetDisplayActive, 50 | nvmlDeviceGetDisplayMode, 51 | nvmlDeviceGetDriverModel, 52 | nvmlDeviceGetEccMode, 53 | nvmlDeviceGetEncoderCapacity, 54 | nvmlDeviceGetEncoderSessions, 55 | nvmlDeviceGetEncoderStats, 56 | nvmlDeviceGetEncoderUtilization, 57 | nvmlDeviceGetEnforcedPowerLimit, 58 | nvmlDeviceGetFanSpeed, 59 | nvmlDeviceGetGpuOperationMode, 60 | nvmlDeviceGetGraphicsRunningProcesses, 61 | nvmlDeviceGetHandleByIndex, 62 | nvmlDeviceGetHandleByPciBusId, 63 | nvmlDeviceGetHandleBySerial, 64 | nvmlDeviceGetHandleByUUID, 65 | nvmlDeviceGetIndex, 66 | nvmlDeviceGetInforomConfigurationChecksum, 67 | nvmlDeviceGetInforomImageVersion, 68 | nvmlDeviceGetInforomVersion, 69 | nvmlDeviceGetMaxClockInfo, 70 | nvmlDeviceGetMaxCustomerBoostClock, 71 | nvmlDeviceGetMaxPcieLinkGeneration, 72 | nvmlDeviceGetMaxPcieLinkWidth, 73 | nvmlDeviceGetMemoryErrorCounter, 74 | nvmlDeviceGetMemoryInfo, 75 | nvmlDeviceGetMinorNumber, 76 | nvmlDeviceGetMultiGpuBoard, 77 | nvmlDeviceGetName, 78 | nvmlDeviceGetP2PStatus, 79 | nvmlDeviceGetPciInfo, 80 | nvmlDeviceGetPcieReplayCounter, 81 | nvmlDeviceGetPcieThroughput, 82 | nvmlDeviceGetPerformanceState, 83 | nvmlDeviceGetPersistenceMode, 84 | nvmlDeviceGetPowerManagementDefaultLimit, 85 | nvmlDeviceGetPowerManagementLimit, 86 | nvmlDeviceGetPowerManagementLimitConstraints, 87 | nvmlDeviceGetPowerManagementMode, 88 | nvmlDeviceGetPowerState, 89 | nvmlDeviceGetPowerUsage, 90 | nvmlDeviceGetRetiredPages, 91 | nvmlDeviceGetRetiredPagesPendingStatus, 92 | nvmlDeviceGetSamples, 93 | nvmlDeviceGetSerial, 94 | nvmlDeviceGetSupportedClocksThrottleReasons, 95 | nvmlDeviceGetSupportedGraphicsClocks, 96 | nvmlDeviceGetSupportedMemoryClocks, 97 | nvmlDeviceGetTemperature, 98 | nvmlDeviceGetTemperatureThreshold, 99 | nvmlDeviceGetTopologyCommonAncestor, 100 | 
nvmlDeviceGetTopologyNearestGpus, 101 | nvmlDeviceGetTotalEccErrors, 102 | nvmlDeviceGetTotalEnergyConsumption, 103 | nvmlDeviceGetUUID, 104 | nvmlDeviceGetUtilizationRates, 105 | nvmlDeviceGetVbiosVersion, 106 | nvmlDeviceGetViolationStatus, 107 | nvmlDeviceOnSameBoard, 108 | nvmlDeviceResetApplicationsClocks, 109 | nvmlDeviceSetAutoBoostedClocksEnabled, 110 | nvmlDeviceSetCpuAffinity, 111 | nvmlDeviceSetDefaultAutoBoostedClocksEnabled, 112 | nvmlDeviceValidateInforom, 113 | nvmlSystemGetTopologyGpuSet, 114 | // Device commands 115 | nvmlDeviceClearEccErrorCounts, 116 | nvmlDeviceSetAPIRestriction, 117 | nvmlDeviceSetApplicationsClocks, 118 | nvmlDeviceSetComputeMode, 119 | nvmlDeviceSetDriverModel, 120 | nvmlDeviceSetEccMode, 121 | nvmlDeviceSetGpuOperationMode, 122 | nvmlDeviceSetPersistenceMode, 123 | nvmlDeviceSetPowerManagementLimit *syscall.Proc 124 | } 125 | 126 | func (a API) call(p *syscall.Proc, args ...uintptr) error { 127 | ret, _, _ := p.Call(args...) 128 | if ret != 0 { 129 | return returnValueToError(int(ret)) 130 | } 131 | 132 | return nil 133 | } 134 | 135 | // Init initializes NVML, but doesn't initialize any GPUs yet. 136 | func (a API) Init() error { 137 | return a.call(a.nvmlInit) 138 | } 139 | 140 | // Shutdown shuts down NVML by releasing all GPU resources previously allocated with Init() and 141 | // then unloads nvml.dll via ReleaseDLL; only the nvmlShutdown error is returned, the ReleaseDLL error is discarded. 142 | func (a API) Shutdown() error { 143 | err := a.call(a.nvmlShutdown) 144 | a.ReleaseDLL() 145 | return err 146 | } 147 | 148 | func (a API) ReleaseDLL() error { 149 | return a.dll.Release() 150 | } 151 | 152 | // ErrorString returns a string representation of the error.
153 | func (a API) ErrorString(result uintptr) string { 154 | ret, _, _ := a.nvmlErrorString.Call(uintptr(result)) 155 | buf := (*C.char)(unsafe.Pointer(ret)) 156 | return C.GoString(buf) 157 | } 158 | 159 | // New creates nvml.dll wrapper 160 | func New(path string) (*API, error) { 161 | if path == "" { 162 | path = os.ExpandEnv("$ProgramW6432\\NVIDIA Corporation\\NVSMI\\nvml.dll") 163 | } 164 | 165 | dll, err := syscall.LoadDLL(path) 166 | if err != nil { 167 | return nil, err 168 | } 169 | 170 | bindings := &API{ 171 | dll: dll, 172 | nvmlInit: dll.MustFindProc("nvmlInit"), 173 | nvmlShutdown: dll.MustFindProc("nvmlShutdown"), 174 | nvmlErrorString: dll.MustFindProc("nvmlErrorString"), 175 | nvmlSystemGetCudaDriverVersion: dll.MustFindProc("nvmlSystemGetCudaDriverVersion"), 176 | nvmlSystemGetDriverVersion: dll.MustFindProc("nvmlSystemGetDriverVersion"), 177 | nvmlSystemGetNVMLVersion: dll.MustFindProc("nvmlSystemGetNVMLVersion"), 178 | nvmlSystemGetProcessName: dll.MustFindProc("nvmlSystemGetProcessName"), 179 | nvmlDeviceClearCpuAffinity: dll.MustFindProc("nvmlDeviceClearCpuAffinity"), 180 | nvmlDeviceGetAPIRestriction: dll.MustFindProc("nvmlDeviceGetAPIRestriction"), 181 | nvmlDeviceGetApplicationsClock: dll.MustFindProc("nvmlDeviceGetApplicationsClock"), 182 | nvmlDeviceGetAutoBoostedClocksEnabled: dll.MustFindProc("nvmlDeviceGetAutoBoostedClocksEnabled"), 183 | nvmlDeviceGetBAR1MemoryInfo: dll.MustFindProc("nvmlDeviceGetBAR1MemoryInfo"), 184 | nvmlDeviceGetBoardId: dll.MustFindProc("nvmlDeviceGetBoardId"), 185 | nvmlDeviceGetBoardPartNumber: dll.MustFindProc("nvmlDeviceGetBoardPartNumber"), 186 | nvmlDeviceGetBrand: dll.MustFindProc("nvmlDeviceGetBrand"), 187 | nvmlDeviceGetBridgeChipInfo: dll.MustFindProc("nvmlDeviceGetBridgeChipInfo"), 188 | nvmlDeviceGetClock: dll.MustFindProc("nvmlDeviceGetClock"), 189 | nvmlDeviceGetClockInfo: dll.MustFindProc("nvmlDeviceGetClockInfo"), 190 | nvmlDeviceGetComputeMode: dll.MustFindProc("nvmlDeviceGetComputeMode"), 191 | 
nvmlDeviceGetComputeRunningProcesses: dll.MustFindProc("nvmlDeviceGetComputeRunningProcesses"), 192 | nvmlDeviceGetCount: dll.MustFindProc("nvmlDeviceGetCount"), 193 | nvmlDeviceGetCpuAffinity: dll.MustFindProc("nvmlDeviceGetCpuAffinity"), 194 | nvmlDeviceGetCudaComputeCapability: dll.MustFindProc("nvmlDeviceGetCudaComputeCapability"), 195 | nvmlDeviceGetCurrPcieLinkGeneration: dll.MustFindProc("nvmlDeviceGetCurrPcieLinkGeneration"), 196 | nvmlDeviceGetCurrPcieLinkWidth: dll.MustFindProc("nvmlDeviceGetCurrPcieLinkWidth"), 197 | nvmlDeviceGetCurrentClocksThrottleReasons: dll.MustFindProc("nvmlDeviceGetCurrentClocksThrottleReasons"), 198 | nvmlDeviceGetDecoderUtilization: dll.MustFindProc("nvmlDeviceGetDecoderUtilization"), 199 | nvmlDeviceGetDefaultApplicationsClock: dll.MustFindProc("nvmlDeviceGetDefaultApplicationsClock"), 200 | nvmlDeviceGetDetailedEccErrors: dll.MustFindProc("nvmlDeviceGetDetailedEccErrors"), 201 | nvmlDeviceGetDisplayActive: dll.MustFindProc("nvmlDeviceGetDisplayActive"), 202 | nvmlDeviceGetDisplayMode: dll.MustFindProc("nvmlDeviceGetDisplayMode"), 203 | nvmlDeviceGetDriverModel: dll.MustFindProc("nvmlDeviceGetDriverModel"), 204 | nvmlDeviceGetEccMode: dll.MustFindProc("nvmlDeviceGetEccMode"), 205 | nvmlDeviceGetEncoderCapacity: dll.MustFindProc("nvmlDeviceGetEncoderCapacity"), 206 | nvmlDeviceGetEncoderSessions: dll.MustFindProc("nvmlDeviceGetEncoderSessions"), 207 | nvmlDeviceGetEncoderStats: dll.MustFindProc("nvmlDeviceGetEncoderStats"), 208 | nvmlDeviceGetEncoderUtilization: dll.MustFindProc("nvmlDeviceGetEncoderUtilization"), 209 | nvmlDeviceGetEnforcedPowerLimit: dll.MustFindProc("nvmlDeviceGetEnforcedPowerLimit"), 210 | nvmlDeviceGetFanSpeed: dll.MustFindProc("nvmlDeviceGetFanSpeed"), 211 | nvmlDeviceGetGpuOperationMode: dll.MustFindProc("nvmlDeviceGetGpuOperationMode"), 212 | nvmlDeviceGetGraphicsRunningProcesses: dll.MustFindProc("nvmlDeviceGetGraphicsRunningProcesses"), 213 | nvmlDeviceGetHandleByIndex: 
dll.MustFindProc("nvmlDeviceGetHandleByIndex"), 214 | nvmlDeviceGetHandleByPciBusId: dll.MustFindProc("nvmlDeviceGetHandleByPciBusId"), 215 | nvmlDeviceGetHandleBySerial: dll.MustFindProc("nvmlDeviceGetHandleBySerial"), 216 | nvmlDeviceGetHandleByUUID: dll.MustFindProc("nvmlDeviceGetHandleByUUID"), 217 | nvmlDeviceGetIndex: dll.MustFindProc("nvmlDeviceGetIndex"), 218 | nvmlDeviceGetInforomConfigurationChecksum: dll.MustFindProc("nvmlDeviceGetInforomConfigurationChecksum"), 219 | nvmlDeviceGetInforomImageVersion: dll.MustFindProc("nvmlDeviceGetInforomImageVersion"), 220 | nvmlDeviceGetInforomVersion: dll.MustFindProc("nvmlDeviceGetInforomVersion"), 221 | nvmlDeviceGetMaxClockInfo: dll.MustFindProc("nvmlDeviceGetMaxClockInfo"), 222 | nvmlDeviceGetMaxCustomerBoostClock: dll.MustFindProc("nvmlDeviceGetMaxCustomerBoostClock"), 223 | nvmlDeviceGetMaxPcieLinkGeneration: dll.MustFindProc("nvmlDeviceGetMaxPcieLinkGeneration"), 224 | nvmlDeviceGetMaxPcieLinkWidth: dll.MustFindProc("nvmlDeviceGetMaxPcieLinkWidth"), 225 | nvmlDeviceGetMemoryErrorCounter: dll.MustFindProc("nvmlDeviceGetMemoryErrorCounter"), 226 | nvmlDeviceGetMemoryInfo: dll.MustFindProc("nvmlDeviceGetMemoryInfo"), 227 | nvmlDeviceGetMinorNumber: dll.MustFindProc("nvmlDeviceGetMinorNumber"), 228 | nvmlDeviceGetMultiGpuBoard: dll.MustFindProc("nvmlDeviceGetMultiGpuBoard"), 229 | nvmlDeviceGetName: dll.MustFindProc("nvmlDeviceGetName"), 230 | nvmlDeviceGetP2PStatus: dll.MustFindProc("nvmlDeviceGetP2PStatus"), 231 | nvmlDeviceGetPciInfo: dll.MustFindProc("nvmlDeviceGetPciInfo"), 232 | nvmlDeviceGetPcieReplayCounter: dll.MustFindProc("nvmlDeviceGetPcieReplayCounter"), 233 | nvmlDeviceGetPcieThroughput: dll.MustFindProc("nvmlDeviceGetPcieThroughput"), 234 | nvmlDeviceGetPerformanceState: dll.MustFindProc("nvmlDeviceGetPerformanceState"), 235 | nvmlDeviceGetPersistenceMode: dll.MustFindProc("nvmlDeviceGetPersistenceMode"), 236 | nvmlDeviceGetPowerManagementDefaultLimit: 
dll.MustFindProc("nvmlDeviceGetPowerManagementDefaultLimit"), 237 | nvmlDeviceGetPowerManagementLimit: dll.MustFindProc("nvmlDeviceGetPowerManagementLimit"), 238 | nvmlDeviceGetPowerManagementLimitConstraints: dll.MustFindProc("nvmlDeviceGetPowerManagementLimitConstraints"), 239 | nvmlDeviceGetPowerManagementMode: dll.MustFindProc("nvmlDeviceGetPowerManagementMode"), 240 | nvmlDeviceGetPowerState: dll.MustFindProc("nvmlDeviceGetPowerState"), 241 | nvmlDeviceGetPowerUsage: dll.MustFindProc("nvmlDeviceGetPowerUsage"), 242 | nvmlDeviceGetRetiredPages: dll.MustFindProc("nvmlDeviceGetRetiredPages"), 243 | nvmlDeviceGetRetiredPagesPendingStatus: dll.MustFindProc("nvmlDeviceGetRetiredPagesPendingStatus"), 244 | nvmlDeviceGetSamples: dll.MustFindProc("nvmlDeviceGetSamples"), 245 | nvmlDeviceGetSerial: dll.MustFindProc("nvmlDeviceGetSerial"), 246 | nvmlDeviceGetSupportedClocksThrottleReasons: dll.MustFindProc("nvmlDeviceGetSupportedClocksThrottleReasons"), 247 | nvmlDeviceGetSupportedGraphicsClocks: dll.MustFindProc("nvmlDeviceGetSupportedGraphicsClocks"), 248 | nvmlDeviceGetSupportedMemoryClocks: dll.MustFindProc("nvmlDeviceGetSupportedMemoryClocks"), 249 | nvmlDeviceGetTemperature: dll.MustFindProc("nvmlDeviceGetTemperature"), 250 | nvmlDeviceGetTemperatureThreshold: dll.MustFindProc("nvmlDeviceGetTemperatureThreshold"), 251 | nvmlDeviceGetTopologyCommonAncestor: dll.MustFindProc("nvmlDeviceGetTopologyCommonAncestor"), 252 | nvmlDeviceGetTopologyNearestGpus: dll.MustFindProc("nvmlDeviceGetTopologyNearestGpus"), 253 | nvmlDeviceGetTotalEccErrors: dll.MustFindProc("nvmlDeviceGetTotalEccErrors"), 254 | nvmlDeviceGetTotalEnergyConsumption: dll.MustFindProc("nvmlDeviceGetTotalEnergyConsumption"), 255 | nvmlDeviceGetUUID: dll.MustFindProc("nvmlDeviceGetUUID"), 256 | nvmlDeviceGetUtilizationRates: dll.MustFindProc("nvmlDeviceGetUtilizationRates"), 257 | nvmlDeviceGetVbiosVersion: dll.MustFindProc("nvmlDeviceGetVbiosVersion"), 258 | nvmlDeviceGetViolationStatus: 
dll.MustFindProc("nvmlDeviceGetViolationStatus"), 259 | nvmlDeviceOnSameBoard: dll.MustFindProc("nvmlDeviceOnSameBoard"), 260 | nvmlDeviceResetApplicationsClocks: dll.MustFindProc("nvmlDeviceResetApplicationsClocks"), 261 | nvmlDeviceSetAutoBoostedClocksEnabled: dll.MustFindProc("nvmlDeviceSetAutoBoostedClocksEnabled"), 262 | nvmlDeviceSetCpuAffinity: dll.MustFindProc("nvmlDeviceSetCpuAffinity"), 263 | nvmlDeviceSetDefaultAutoBoostedClocksEnabled: dll.MustFindProc("nvmlDeviceSetDefaultAutoBoostedClocksEnabled"), 264 | nvmlDeviceValidateInforom: dll.MustFindProc("nvmlDeviceValidateInforom"), 265 | nvmlSystemGetTopologyGpuSet: dll.MustFindProc("nvmlSystemGetTopologyGpuSet"), 266 | nvmlDeviceClearEccErrorCounts: dll.MustFindProc("nvmlDeviceClearEccErrorCounts"), 267 | nvmlDeviceSetAPIRestriction: dll.MustFindProc("nvmlDeviceSetAPIRestriction"), 268 | nvmlDeviceSetApplicationsClocks: dll.MustFindProc("nvmlDeviceSetApplicationsClocks"), 269 | nvmlDeviceSetComputeMode: dll.MustFindProc("nvmlDeviceSetComputeMode"), 270 | nvmlDeviceSetDriverModel: dll.MustFindProc("nvmlDeviceSetDriverModel"), 271 | nvmlDeviceSetEccMode: dll.MustFindProc("nvmlDeviceSetEccMode"), 272 | nvmlDeviceSetGpuOperationMode: dll.MustFindProc("nvmlDeviceSetGpuOperationMode"), 273 | nvmlDeviceSetPersistenceMode: dll.MustFindProc("nvmlDeviceSetPersistenceMode"), 274 | nvmlDeviceSetPowerManagementLimit: dll.MustFindProc("nvmlDeviceSetPowerManagementLimit"), 275 | } 276 | 277 | return bindings, nil 278 | } 279 | -------------------------------------------------------------------------------- /bindings_test.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/require" 7 | ) 8 | 9 | func TestNew(t *testing.T) { 10 | w, err := New("") 11 | require.NoError(t, err) 12 | 13 | err = w.Init() 14 | require.NoError(t, err) 15 | 16 | err = w.Shutdown() 17 | require.NoError(t, err) 18 | } 19 | 20 | 
func TestErrorString(t *testing.T) { 21 | w, err := New("") 22 | require.NoError(t, err) 23 | defer w.Shutdown() 24 | 25 | require.Equal(t, "Uninitialized", w.ErrorString(1)) 26 | require.Equal(t, "Invalid Argument", w.ErrorString(2)) 27 | require.Equal(t, "Not Supported", w.ErrorString(3)) 28 | } 29 | 30 | func create(t *testing.T) (*API, Device) { 31 | w, err := New("") 32 | require.NoError(t, err) 33 | 34 | err = w.Init() 35 | require.NoError(t, err) 36 | 37 | device, err := w.DeviceGetHandleByIndex(0) 38 | require.NoError(t, err) 39 | 40 | return w, device 41 | } 42 | -------------------------------------------------------------------------------- /device_commands.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | // DeviceClearECCErrorCounts clears the ECC error and other memory error counts for the device. 4 | // Only applicable to devices with ECC. Requires NVML_INFOROM_ECC version 2.0 or higher to clear aggregate 5 | // location-based ECC counts. Requires NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. 6 | // Requires root/admin permissions. Requires ECC Mode to be enabled. 7 | // Sets all of the specified ECC counters to 0, including both detailed and total counts. 8 | // This operation takes effect immediately. 9 | func (a API) DeviceClearECCErrorCounts(device Device, counterType ECCCounterType) error { 10 | return a.call(a.nvmlDeviceClearEccErrorCounts, uintptr(device), uintptr(counterType)) 11 | } 12 | 13 | // DeviceSetAPIRestriction changes the root/admin restrictions on certain APIs. 14 | // See nvmlRestrictedAPI_t for the list of supported APIs. 15 | // This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. 16 | // The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. 17 | // See DeviceGetAPIRestriction to query the current restriction settings.
18 | func (a API) DeviceSetAPIRestriction(device Device, apiType RestrictedAPI, isRestricted bool) error { 19 | var isRestrictedInt int32 = 0 20 | if isRestricted { 21 | isRestrictedInt = 1 22 | } 23 | 24 | return a.call(a.nvmlDeviceSetAPIRestriction, uintptr(device), uintptr(apiType), uintptr(isRestrictedInt)) 25 | } 26 | 27 | // DeviceSetApplicationsClocks set clocks that applications will lock to. 28 | // Sets the clocks that compute and graphics applications will be running at. e.g. CUDA driver requests these clocks 29 | // during context creation which means this property defines clocks at which CUDA applications will be running unless 30 | // some overspec event occurs (e.g. over power, over thermal or external HW brake). 31 | // Can be used as a setting to request constant performance. 32 | // On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. 33 | // On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call 34 | // DeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting above the clock value being set. 35 | // After system reboot or driver reload applications clocks go back to their default value. 36 | func (a API) DeviceSetApplicationsClocks(device Device, memClockMHz, graphicsClockMHz uint32) error { 37 | return a.call(a.nvmlDeviceSetApplicationsClocks, uintptr(device), uintptr(memClockMHz), uintptr(graphicsClockMHz)) 38 | } 39 | 40 | // DeviceSetComputeMode sets the compute mode for the device. 41 | // Requires root/admin permissions. 42 | // The compute mode determines whether a GPU can be used for compute operations and whether it can be shared across contexts. 43 | // This operation takes effect immediately. 44 | // Under Linux it is not persistent across reboots and always resets to "Default". Under windows it is persistent. 45 | // Under windows compute mode may only be set to DEFAULT when running in WDDM. 
46 | func (a API) DeviceSetComputeMode(device Device, mode ComputeMode) error { 47 | return a.call(a.nvmlDeviceSetComputeMode, uintptr(device), uintptr(mode)) 48 | } 49 | 50 | // DeviceSetDriverModel sets the driver model for the device. 51 | // For windows only. Requires root/admin permissions. 52 | // On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. 53 | // If a display is attached to the device it must run in WDDM mode. 54 | // It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). 55 | // This should only be done if the host is subsequently powered down and the display is detached from the device before 56 | // the next reboot. 57 | // This operation takes effect after the next reboot. 58 | // Windows driver model may only be set to WDDM when running in DEFAULT compute mode. Change driver model to WDDM is not 59 | // supported when GPU doesn't support graphics acceleration or will not support it after reboot. 60 | func (a API) DeviceSetDriverModel(device Device, model DriverModel, flags uint32) error { 61 | return a.call(a.nvmlDeviceSetDriverModel, uintptr(device), uintptr(model), uintptr(flags)) 62 | } 63 | 64 | // DeviceSetECCMode sets the ECC mode for the device. 65 | // Only applicable to devices with ECC. Requires NVML_INFOROM_ECC version 1.0 or higher. 66 | // Requires root/admin permissions. 67 | // The ECC mode determines whether the GPU enables its ECC support. 68 | // This operation takes effect after the next reboot. 69 | func (a API) DeviceSetECCMode(device Device, ecc bool) error { 70 | var eccInt int32 = 0 71 | if ecc { 72 | eccInt = 1 73 | } 74 | 75 | return a.call(a.nvmlDeviceSetEccMode, uintptr(device), uintptr(eccInt)) 76 | } 77 | 78 | // DeviceSetGPUOperationMode sets new GOM. See nvmlGpuOperationMode_t for details. 79 | // For GK110 M-class and X-class Tesla products from the Kepler family. 
80 | // Modes NVML_GOM_LOW_DP and NVML_GOM_ALL_ON are supported on fully supported GeForce products. 81 | // Not supported on Quadro and Tesla C-class products. 82 | // Requires root/admin permissions. 83 | // Changing GOMs requires a reboot. The reboot requirement might be removed in the future. 84 | // Compute only GOMs don't support graphics acceleration. 85 | // Under windows switching to these GOMs when pending driver model is WDDM is not supported. 86 | func (a API) DeviceSetGPUOperationMode(device Device, mode GPUOperationMode) error { 87 | return a.call(a.nvmlDeviceSetGpuOperationMode, uintptr(device), uintptr(mode)) 88 | } 89 | 90 | // DeviceSetPowerManagementLimit set new power limit of this device. 91 | // Requires root/admin permissions. 92 | // Note: Limit is not persistent across reboots or driver unloads. 93 | // Enable persistent mode to prevent driver from unloading when no application is using the device. 94 | func (a API) DeviceSetPowerManagementLimit(device Device, limit uint32) error { 95 | return a.call(a.nvmlDeviceSetPowerManagementLimit, uintptr(device), uintptr(limit)) 96 | } 97 | -------------------------------------------------------------------------------- /device_commands_test.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/require" 7 | ) 8 | 9 | func TestDeviceClearECCErrorCounts(t *testing.T) { 10 | w, device := create(t) 11 | defer w.Shutdown() 12 | 13 | err := w.DeviceClearECCErrorCounts(device, VolatileECC) 14 | require.NoError(t, err) 15 | } 16 | 17 | func TestDeviceSetAPIRestriction(t *testing.T) { 18 | w, device := create(t) 19 | defer w.Shutdown() 20 | 21 | err := w.DeviceSetAPIRestriction(device, RestrictedAPISetAutoBoostedClocks, true) 22 | require.NoError(t, err) 23 | } 24 | 25 | func TestDeviceSetApplicationsClocks(t *testing.T) { 26 | w, device := create(t) 27 | defer w.Shutdown() 28 | 29 | mem, err := 
w.DeviceGetApplicationsClock(device, ClockMem) 30 | require.NoError(t, err) 31 | 32 | graphics, err := w.DeviceGetApplicationsClock(device, ClockGraphics) 33 | require.NoError(t, err) 34 | 35 | err = w.DeviceSetApplicationsClocks(device, mem, graphics) 36 | require.NoError(t, err) 37 | } 38 | 39 | func TestDeviceSetComputeMode(t *testing.T) { 40 | w, device := create(t) 41 | defer w.Shutdown() 42 | 43 | mode, err := w.DeviceGetComputeMode(device) 44 | require.NoError(t, err) 45 | 46 | err = w.DeviceSetComputeMode(device, mode) 47 | require.NoError(t, err) 48 | } 49 | 50 | func TestDeviceSetDriverModel(t *testing.T) { 51 | w, device := create(t) 52 | defer w.Shutdown() 53 | 54 | _, pending, err := w.DeviceGetDriverModel(device) 55 | require.NoError(t, err) 56 | 57 | err = w.DeviceSetDriverModel(device, pending, 0) 58 | require.NoError(t, err) 59 | } 60 | 61 | func TestDeviceSetECCMode(t *testing.T) { 62 | w, device := create(t) 63 | defer w.Shutdown() 64 | 65 | err := w.DeviceSetECCMode(device, true) 66 | require.NoError(t, err) 67 | } 68 | 69 | func TestDeviceSetGPUOperationMode(t *testing.T) { 70 | w, device := create(t) 71 | defer w.Shutdown() 72 | 73 | _, pending, err := w.DeviceGetGPUOperationMode(device) 74 | require.NoError(t, err) 75 | 76 | err = w.DeviceSetGPUOperationMode(device, pending) 77 | require.NoError(t, err) 78 | } 79 | 80 | func TestDeviceSetPowerManagementLimit(t *testing.T) { 81 | w, device := create(t) 82 | defer w.Shutdown() 83 | 84 | limit, err := w.DeviceGetPowerManagementLimit(device) 85 | require.NoError(t, err) 86 | 87 | err = w.DeviceSetPowerManagementLimit(device, limit) 88 | require.NoError(t, err) 89 | } 90 | -------------------------------------------------------------------------------- /device_linux_only.go: -------------------------------------------------------------------------------- 1 | // +build linux,cgo 2 | 3 | package nvml 4 | 5 | import "unsafe" 6 | 7 | // DeviceGetCPUAffinity retrieves an array of unsigned ints (sized 
to cpuSetSize) of bitmasks with the ideal CPU 8 | // affinity for the device. For example, if processors 0, 1, 32, and 33 are ideal for the device and 9 | // cpuSetSize == 2, result[0] = 0x3, result[1] = 0x3. NOTE(review): only a single uint32 is passed as the output buffer, so a cpuSetSize above 1 presumably lets NVML write past it - confirm. 10 | func (a API) DeviceGetCPUAffinity(device Device, cpuSetSize uint32) (cpuSet uint32, err error) { 11 | err = a.call(a.nvmlDeviceGetCpuAffinity, uintptr(device), uintptr(cpuSetSize), uintptr(unsafe.Pointer(&cpuSet))) 12 | return 13 | } 14 | 15 | // DeviceSetCpuAffinity sets the ideal affinity for the calling thread and device using the guidelines given in 16 | // DeviceGetCPUAffinity(). Note, this is a change as of version 8.0. Older versions set the affinity for a calling 17 | // process and all children. Currently supports up to 64 processors. 18 | func (a API) DeviceSetCpuAffinity(device Device) error { 19 | return a.call(a.nvmlDeviceSetCpuAffinity, uintptr(device)) 20 | } 21 | 22 | // DeviceClearCpuAffinity clears all affinity bindings for the calling thread. Note, this is a change as of version 8.0 23 | // as older versions cleared the affinity for a calling process and all children. 24 | func (a API) DeviceClearCpuAffinity(device Device) (err error) { 25 | err = a.call(a.nvmlDeviceClearCpuAffinity, uintptr(device)) 26 | return 27 | } 28 | 29 | // DeviceGetPersistenceMode retrieves the persistence mode associated with this device. 30 | // When driver persistence mode is enabled the driver software state is not torn down when the last client disconnects. 31 | // By default this feature is disabled. 32 | func (a API) DeviceGetPersistenceMode(device Device) (enabled bool, err error) { 33 | var state int32 34 | err = a.call(a.nvmlDeviceGetPersistenceMode, uintptr(device), uintptr(unsafe.Pointer(&state))) 35 | if err != nil { 36 | return 37 | } 38 | 39 | if state > 0 { 40 | enabled = true 41 | } else { 42 | enabled = false 43 | } 44 | 45 | return 46 | } 47 | 48 | // DeviceSetPersistenceMode sets the persistence mode for the device. 49 | // For Linux only.
Requires root/admin permissions. 50 | // The persistence mode determines whether the GPU driver software is torn down after the last client exits. 51 | // This operation takes effect immediately. It is not persistent across reboots. 52 | // After each reboot the persistence mode is reset to "Disabled". 53 | func (a API) DeviceSetPersistenceMode(device Device, mode bool) error { 54 | var modeInt int32 = 0 55 | if mode { 56 | modeInt = 1 57 | } 58 | 59 | return a.call(a.nvmlDeviceSetPersistenceMode, uintptr(device), uintptr(modeInt)) 60 | } 61 | -------------------------------------------------------------------------------- /device_linux_only_test.go: -------------------------------------------------------------------------------- 1 | // +build linux,cgo 2 | 3 | package nvml 4 | 5 | import ( 6 | "testing" 7 | 8 | "github.com/stretchr/testify/require" 9 | ) 10 | 11 | func TestDeviceClearCpuAffinity(t *testing.T) { 12 | w, device := create(t) 13 | defer w.Shutdown() 14 | 15 | err := w.DeviceClearCpuAffinity(device) 16 | require.NoError(t, err) 17 | } 18 | 19 | func TestDeviceSetPersistenceMode(t *testing.T) { 20 | w, device := create(t) 21 | defer w.Shutdown() 22 | 23 | err := w.DeviceSetPersistenceMode(device, true) 24 | require.NoError(t, err) 25 | } 26 | 27 | func TestDeviceGetCPUAffinity(t *testing.T) { 28 | w, device := create(t) 29 | defer w.Shutdown() 30 | 31 | _, err := w.DeviceGetCPUAffinity(device, 1) 32 | require.NoError(t, err) 33 | } 34 | 35 | func TestDeviceGetPersistenceMode(t *testing.T) { 36 | w, device := create(t) 37 | defer w.Shutdown() 38 | 39 | _, err := w.DeviceGetPersistenceMode(device) 40 | require.NoError(t, err) 41 | } 42 | 43 | func TestDeviceSetCpuAffinity(t *testing.T) { 44 | w, device := create(t) 45 | defer w.Shutdown() 46 | 47 | err := w.DeviceSetCpuAffinity(device) 48 | require.NoError(t, err) 49 | } 50 | -------------------------------------------------------------------------------- /device_queries.go: 
-------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | // #define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 4 | // typedef struct nvmlPciInfo_st { 5 | // char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier 6 | // unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff 7 | // unsigned int bus; //!< The bus on which the device resides, 0 to 0xff 8 | // unsigned int device; //!< The device's id on the bus, 0 to 31 9 | // unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id 10 | // 11 | // // Added in NVML 2.285 API 12 | // unsigned int pciSubSystemId; 13 | // 14 | // // NVIDIA reserved for internal use only 15 | // unsigned int reserved0; 16 | // unsigned int reserved1; 17 | // unsigned int reserved2; 18 | // unsigned int reserved3; 19 | // } nvmlPciInfo_t; 20 | // #include 21 | import "C" 22 | 23 | import ( 24 | "unsafe" 25 | ) 26 | 27 | // DeviceGetAPIRestriction retrieves the root/admin permissions on the target API. 28 | // See nvmlRestrictedAPI_t for the list of supported APIs. 29 | // If an API is restricted only root users can call that API. 30 | // See nvmlDeviceSetAPIRestriction to change current permissions. 31 | func (a API) DeviceGetAPIRestriction(device Device, apiType RestrictedAPI) (bool, error) { 32 | var state int32 33 | if err := a.call(a.nvmlDeviceGetAPIRestriction, uintptr(device), uintptr(apiType), uintptr(unsafe.Pointer(&state))); err != nil { 34 | return false, err 35 | } 36 | 37 | if state > 0 { 38 | return true, nil 39 | } 40 | 41 | return false, nil 42 | } 43 | 44 | // DeviceGetApplicationsClock retrieves the current setting of a clock that applications will use unless an overspec 45 | // situation occurs. Can be changed using DeviceSetApplicationsClocks. 
46 | func (a API) DeviceGetApplicationsClock(device Device, clockType ClockType) (clockMHz uint32, err error) { 47 | err = a.call(a.nvmlDeviceGetApplicationsClock, uintptr(device), uintptr(clockType), uintptr(unsafe.Pointer(&clockMHz))) 48 | return 49 | } 50 | 51 | // DeviceGetAutoBoostedClocksEnabled retrieve the current state of Auto Boosted clocks on a device and store it in isEnabled. 52 | // Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates to 53 | // maximize performance as thermal limits allow. 54 | // On Pascal and newer hardware, Auto Aoosted clocks are controlled through application clocks. 55 | func (a API) DeviceGetAutoBoostedClocksEnabled(device Device) (isEnabled, defaultIsEnabled bool, err error) { 56 | var isEnabledInt int32 57 | var defaultIsEnabledInt int32 58 | 59 | err = a.call(a.nvmlDeviceGetAutoBoostedClocksEnabled, uintptr(device), uintptr(unsafe.Pointer(&isEnabledInt)), uintptr(unsafe.Pointer(&defaultIsEnabledInt))) 60 | if err != nil { 61 | return 62 | } 63 | 64 | if isEnabledInt > 0 { 65 | isEnabled = true 66 | } else { 67 | isEnabled = false 68 | } 69 | 70 | if defaultIsEnabledInt > 0 { 71 | defaultIsEnabled = true 72 | } else { 73 | defaultIsEnabled = false 74 | } 75 | 76 | return 77 | } 78 | 79 | // DeviceGetBAR1MemoryInfo gets Total, Available and Used size of BAR1 memory. 80 | // BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or 81 | // by 3rd party devices (peer-to-peer on the PCIE bus). 82 | func (a API) DeviceGetBAR1MemoryInfo(device Device) (mem BAR1Memory, err error) { 83 | err = a.call(a.nvmlDeviceGetBAR1MemoryInfo, uintptr(device), uintptr(unsafe.Pointer(&mem))) 84 | return 85 | } 86 | 87 | // DeviceGetBoardID retrieves the device boardId from 0-N. Devices with the same boardId indicate GPUs connected to 88 | // the same PLX. Use in conjunction with DeviceGetMultiGpuBoard() to decide if they are on the same board as well. 
89 | // The boardId returned is a unique ID for the current configuration. 90 | // Uniqueness and ordering across reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 91 | // 0x100 and the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will always return 92 | // those values but they will always be different from each other). 93 | func (a API) DeviceGetBoardID(device Device) (boardID uint32, err error) { 94 | err = a.call(a.nvmlDeviceGetBoardId, uintptr(device), uintptr(unsafe.Pointer(&boardID))) 95 | return 96 | } 97 | 98 | // DeviceGetBoardPartNumber retrieves the the device board part number which is programmed into the board's InfoROM 99 | func (a API) DeviceGetBoardPartNumber(device Device) (string, error) { 100 | const bufferSize = 128 101 | 102 | buffer := [bufferSize]C.char{} 103 | if err := a.call(a.nvmlDeviceGetBoardPartNumber, uintptr(device), uintptr(unsafe.Pointer(&buffer[0])), bufferSize); err != nil { 104 | return "", err 105 | } 106 | 107 | return C.GoString(&buffer[0]), nil 108 | } 109 | 110 | // DeviceGetBrand retrieves the brand of this device. 111 | func (a API) DeviceGetBrand(device Device) (brand BrandType, err error) { 112 | err = a.call(a.nvmlDeviceGetBrand, uintptr(device), uintptr(unsafe.Pointer(&brand))) 113 | return 114 | } 115 | 116 | func (a API) DeviceGetBridgeChipInfo() { 117 | 118 | } 119 | 120 | // DeviceGetClock retrieves the clock speed for the clock specified by the clock type and clock ID. 121 | func (a API) DeviceGetClock(device Device, clockType ClockType, clockID ClockID) (clockMHz uint32, err error) { 122 | err = a.call(a.nvmlDeviceGetClock, uintptr(device), uintptr(clockType), uintptr(clockID), uintptr(unsafe.Pointer(&clockMHz))) 123 | return 124 | } 125 | 126 | // DeviceGetClockInfo retrieves the current clock speeds for the device. 
127 | func (a API) DeviceGetClockInfo(device Device, clockType ClockType) (clock uint32, err error) { 128 | err = a.call(a.nvmlDeviceGetClockInfo, uintptr(device), uintptr(clockType), uintptr(unsafe.Pointer(&clock))) 129 | return 130 | } 131 | 132 | // DeviceGetComputeMode retrieves the current compute mode for the device. 133 | func (a API) DeviceGetComputeMode(device Device) (mode ComputeMode, err error) { 134 | err = a.call(a.nvmlDeviceGetComputeMode, uintptr(device), uintptr(unsafe.Pointer(&mode))) 135 | return 136 | } 137 | 138 | // DeviceGetComputeRunningProcesses gets information about processes with a compute context on a device. 139 | // This function returns information only about compute running processes (e.g. CUDA application which have 140 | // active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function. 141 | // Keep in mind that information returned by this call is dynamic and the number of elements might change in time. 142 | // Allocate more space for infos table in case new compute processes are spawned. 143 | func (a API) DeviceGetComputeRunningProcesses(device Device) ([]ProcessInfo, error) { 144 | var infoCount uint32 145 | 146 | // Query the current number of running compute processes 147 | err := a.call(a.nvmlDeviceGetComputeRunningProcesses, uintptr(device), uintptr(unsafe.Pointer(&infoCount)), 0) 148 | 149 | // None are running 150 | if err == nil || infoCount == 0 { 151 | return []ProcessInfo{}, nil 152 | } 153 | 154 | if err != ErrInsufficientSize { 155 | return nil, err 156 | } 157 | 158 | list := make([]ProcessInfo, infoCount) 159 | err = a.call(a.nvmlDeviceGetComputeRunningProcesses, uintptr(device), uintptr(unsafe.Pointer(&infoCount)), uintptr(unsafe.Pointer(&list[0]))) 160 | if err != nil { 161 | return nil, err 162 | } 163 | 164 | return list[:infoCount], nil 165 | } 166 | 167 | // DeviceGetCount retrieves the number of compute devices in the system. A compute device is a single GPU. 
168 | func (a API) DeviceGetCount() (count uint32, err error) { 169 | err = a.call(a.nvmlDeviceGetCount, uintptr(unsafe.Pointer(&count))) 170 | return 171 | } 172 | 173 | // DeviceGetCudaComputeCapability retrieves the CUDA compute capability of the device. 174 | // Returns the major and minor compute capability version numbers of the device. 175 | // The major and minor versions are equivalent to the CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and 176 | // CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be returned by CUDA's cuDeviceGetAttribute(). 177 | func (a API) DeviceGetCudaComputeCapability(device Device) (major, minor int32, err error) { 178 | err = a.call(a.nvmlDeviceGetCudaComputeCapability, uintptr(device), uintptr(unsafe.Pointer(&major)), uintptr(unsafe.Pointer(&minor))) 179 | return 180 | } 181 | 182 | // DeviceGetCurrPcieLinkGeneration retrieves the current PCIe link generation. 183 | func (a API) DeviceGetCurrPcieLinkGeneration(device Device) (currLinkGen uint32, err error) { 184 | err = a.call(a.nvmlDeviceGetCurrPcieLinkGeneration, uintptr(device), uintptr(unsafe.Pointer(&currLinkGen))) 185 | return 186 | } 187 | 188 | // DeviceGetCurrPcieLinkWidth retrieves the current PCIe link width. 189 | func (a API) DeviceGetCurrPcieLinkWidth(device Device) (currLinkWidth uint32, err error) { 190 | err = a.call(a.nvmlDeviceGetCurrPcieLinkWidth, uintptr(device), uintptr(unsafe.Pointer(&currLinkWidth))) 191 | return 192 | } 193 | 194 | // DeviceGetCurrentClocksThrottleReasons retrieves current clocks throttling reasons. 195 | // More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. 
196 | func (a API) DeviceGetCurrentClocksThrottleReasons(device Device) (clocksThrottleReasons ClocksThrottleReason, err error) { 197 | err = a.call(a.nvmlDeviceGetCurrentClocksThrottleReasons, uintptr(device), uintptr(unsafe.Pointer(&clocksThrottleReasons))) 198 | return 199 | } 200 | 201 | // DeviceGetDecoderUtilization retrieves the current utilization and sampling size in microseconds for the Decoder. 202 | func (a API) DeviceGetDecoderUtilization(device Device) (utilization, samplingPeriodUs uint32, err error) { 203 | err = a.call(a.nvmlDeviceGetDecoderUtilization, uintptr(device), uintptr(unsafe.Pointer(&utilization)), uintptr(unsafe.Pointer(&samplingPeriodUs))) 204 | return 205 | } 206 | 207 | // DeviceGetDefaultApplicationsClock retrieves the default applications clock that GPU boots with or 208 | // defaults to after DeviceResetApplicationsClocks call. 209 | func (a API) DeviceGetDefaultApplicationsClock(device Device, clockType ClockType) (clockMHz uint32, err error) { 210 | err = a.call(a.nvmlDeviceGetDefaultApplicationsClock, uintptr(device), uintptr(clockType), uintptr(unsafe.Pointer(&clockMHz))) 211 | return 212 | } 213 | 214 | // DeviceGetDetailedECCErrors retrieves the detailed ECC error counts for the device. 215 | // Only applicable to devices with ECC. Requires NVML_INFOROM_ECC version 2.0 or higher to report aggregate 216 | // location-based ECC counts. Requires NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. 217 | // Requires ECC Mode to be enabled. 218 | // Detailed errors provide separate ECC counts for specific parts of the memory system. 219 | // Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. 220 | // Deprecated: This API supports only a fixed set of ECC error locations. 
221 | // On different GPU architectures different locations are supported, see DeviceGetMemoryErrorCounter 222 | func (a API) DeviceGetDetailedECCErrors(device Device, errorType MemoryErrorType, counterType ECCCounterType) (*ECCErrorCounts, error) { 223 | counts := &ECCErrorCounts{} 224 | if err := a.call(a.nvmlDeviceGetDetailedEccErrors, uintptr(device), uintptr(errorType), uintptr(counterType), uintptr(unsafe.Pointer(counts))); err != nil { 225 | return nil, err 226 | } 227 | 228 | return counts, nil 229 | } 230 | 231 | // DeviceGetDisplayActive retrieves the display active state for the device. 232 | // This method indicates whether a display is initialized on the device. 233 | // For example whether X Server is attached to this device and has allocated memory for the screen. 234 | // Display can be active even when no monitor is physically attached. 235 | func (a API) DeviceGetDisplayActive(device Device) (bool, error) { 236 | var state int32 237 | if err := a.call(a.nvmlDeviceGetDisplayActive, uintptr(device), uintptr(unsafe.Pointer(&state))); err != nil { 238 | return false, err 239 | } 240 | 241 | if state > 0 { 242 | return true, nil 243 | } 244 | 245 | return false, nil 246 | } 247 | 248 | // DeviceGetDisplayMode retrieves the display mode for the device. This method indicates whether a physical display 249 | // (e.g. monitor) is currently connected to any of the device's connectors. 250 | func (a API) DeviceGetDisplayMode(device Device) (bool, error) { 251 | var state int32 252 | if err := a.call(a.nvmlDeviceGetDisplayMode, uintptr(device), uintptr(unsafe.Pointer(&state))); err != nil { 253 | return false, err 254 | } 255 | 256 | if state > 0 { 257 | return true, nil 258 | } 259 | 260 | return false, nil 261 | } 262 | 263 | // DeviceGetDriverModel retrieves the current and pending driver model for the device. 264 | // On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. 
265 | // If a display is attached to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. 266 | func (a API) DeviceGetDriverModel(device Device) (current, pending DriverModel, err error) { 267 | err = a.call(a.nvmlDeviceGetDriverModel, uintptr(device), uintptr(unsafe.Pointer(¤t)), uintptr(unsafe.Pointer(&pending))) 268 | return 269 | } 270 | 271 | // DeviceGetECCMode retrieves the current and pending ECC modes for the device. 272 | // Only applicable to devices with ECC. Requires NVML_INFOROM_ECC version 1.0 or higher. 273 | // Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following the next reboot. 274 | func (a API) DeviceGetECCMode(device Device) (current, pending bool, err error) { 275 | var currentInt int32 276 | var pendingInt int32 277 | 278 | err = a.call(a.nvmlDeviceGetEccMode, uintptr(device), uintptr(unsafe.Pointer(¤tInt)), uintptr(unsafe.Pointer(&pendingInt))) 279 | if err != nil { 280 | return 281 | } 282 | 283 | if currentInt > 0 { 284 | current = true 285 | } else { 286 | current = false 287 | } 288 | 289 | if pendingInt > 0 { 290 | pending = true 291 | } else { 292 | pending = false 293 | } 294 | 295 | return 296 | } 297 | 298 | // DeviceGetEncoderCapacity retrieves the current capacity of the device's encoder, in macroblocks per second. 299 | func (a API) DeviceGetEncoderCapacity(device Device, encoderQueryType EncoderType) (encoderCapacity uint32, err error) { 300 | err = a.call(a.nvmlDeviceGetEncoderCapacity, uintptr(device), uintptr(encoderQueryType), uintptr(unsafe.Pointer(&encoderCapacity))) 301 | return 302 | } 303 | 304 | func (a API) DeviceGetEncoderSessions() error { 305 | return ErrNotImplemented 306 | } 307 | 308 | // DeviceGetEncoderStats retrieves the current encoder statistics for a given device. 
309 | func (a API) DeviceGetEncoderStats(device Device) (sessionCount, averageFPS, averageLatency uint32, err error) { 310 | err = a.call( 311 | a.nvmlDeviceGetEncoderStats, 312 | uintptr(device), 313 | uintptr(unsafe.Pointer(&sessionCount)), 314 | uintptr(unsafe.Pointer(&averageFPS)), 315 | uintptr(unsafe.Pointer(&averageLatency))) 316 | return 317 | } 318 | 319 | // DeviceGetEncoderUtilization retrieves the current utilization and sampling size in microseconds for the Encoder 320 | func (a API) DeviceGetEncoderUtilization(device Device) (utilization, samplingPeriodUs uint32, err error) { 321 | err = a.call(a.nvmlDeviceGetEncoderUtilization, uintptr(device), uintptr(unsafe.Pointer(&utilization)), uintptr(unsafe.Pointer(&samplingPeriodUs))) 322 | return 323 | } 324 | 325 | // DeviceGetEnforcedPowerLimit gets the effective power limit that the driver enforces after taking into account all limiters. 326 | // Note: This can be different from the DeviceGetPowerManagementLimit if other limits are set elsewhere. 327 | // This includes the out of band power limit interface 328 | func (a API) DeviceGetEnforcedPowerLimit(device Device) (limit uint32, err error) { 329 | err = a.call(a.nvmlDeviceGetEnforcedPowerLimit, uintptr(device), uintptr(unsafe.Pointer(&limit))) 330 | return 331 | } 332 | 333 | // DeviceGetFanSpeed retrieves the intended operating speed of the device's fan. 334 | // Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, 335 | // the output will not match the actual fan speed. 336 | // The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%. 337 | func (a API) DeviceGetFanSpeed(device Device) (speed uint32, err error) { 338 | err = a.call(a.nvmlDeviceGetFanSpeed, uintptr(device), uintptr(unsafe.Pointer(&speed))) 339 | return 340 | } 341 | 342 | // DeviceGetGPUOperationMode retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). 
343 | // For GK110 M-class and X-class Tesla products from the Kepler family. 344 | // Modes NVML_GOM_LOW_DP and NVML_GOM_ALL_ON are supported on fully supported GeForce products. 345 | // Not supported on Quadro and Tesla C-class products. 346 | func (a API) DeviceGetGPUOperationMode(device Device) (current, pending GPUOperationMode, err error) { 347 | err = a.call(a.nvmlDeviceGetGpuOperationMode, uintptr(device), uintptr(unsafe.Pointer(¤t)), uintptr(unsafe.Pointer(&pending))) 348 | return 349 | } 350 | 351 | // DeviceGetGraphicsRunningProcesses get information about processes with a graphics context on a device. 352 | // This function returns information only about graphics based processes (eg. applications using OpenGL, DirectX). 353 | // Keep in mind that information returned by this call is dynamic and the number of elements might change in time. 354 | // Allocate more space for infos table in case new graphics processes are spawned. 355 | func (a API) DeviceGetGraphicsRunningProcesses(device Device) ([]ProcessInfo, error) { 356 | var infoCount uint32 357 | 358 | // Query the current number of running compute processes 359 | err := a.call(a.nvmlDeviceGetGraphicsRunningProcesses, uintptr(device), uintptr(unsafe.Pointer(&infoCount)), 0) 360 | 361 | // None are running 362 | if err == nil || infoCount == 0 { 363 | return []ProcessInfo{}, nil 364 | } 365 | 366 | if err != ErrInsufficientSize { 367 | return nil, err 368 | } 369 | 370 | list := make([]ProcessInfo, infoCount) 371 | err = a.call(a.nvmlDeviceGetGraphicsRunningProcesses, uintptr(device), uintptr(unsafe.Pointer(&infoCount)), uintptr(unsafe.Pointer(&list[0]))) 372 | if err != nil { 373 | return nil, err 374 | } 375 | 376 | return list[:infoCount], nil 377 | } 378 | 379 | // DeviceGetHandleByIndex acquires the handle for a particular device, based on its index. 
380 | func (a API) DeviceGetHandleByIndex(index uint32) (device Device, err error) { 381 | err = a.call(a.nvmlDeviceGetHandleByIndex, uintptr(index), uintptr(unsafe.Pointer(&device))) 382 | return 383 | } 384 | 385 | // DeviceGetHandleByPciBusId acquires the handle for a particular device, based on its PCI bus id. 386 | // This value corresponds to the nvmlPciInfo_t::busId returned by DeviceGetPciInfo(). 387 | // Starting from NVML 5, this API causes NVML to initialize the target GPU NVML may initialize additional GPUs if: 388 | // - The target GPU is an SLI slave 389 | // Note: NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND instead of NVML_ERROR_NO_PERMISSION. 390 | func (a API) DeviceGetHandleByPCIBusID(pciBusID string) (device Device, err error) { 391 | cstr := C.CString(pciBusID) 392 | defer C.free(unsafe.Pointer(cstr)) 393 | 394 | err = a.call(a.nvmlDeviceGetHandleByPciBusId, uintptr(unsafe.Pointer(cstr)), uintptr(unsafe.Pointer(&device))) 395 | return 396 | } 397 | 398 | // DeviceGetHandleBySerial acquires the handle for a particular device, based on its board serial number. 399 | // Starting from NVML 5, this API causes NVML to initialize the target GPU, NVML may initialize additional 400 | // GPUs as it searches for the target GPU 401 | func (a API) DeviceGetHandleBySerial(serial string) (device Device, err error) { 402 | cstr := C.CString(serial) 403 | defer C.free(unsafe.Pointer(cstr)) 404 | 405 | err = a.call(a.nvmlDeviceGetHandleBySerial, uintptr(unsafe.Pointer(cstr)), uintptr(unsafe.Pointer(&device))) 406 | return 407 | } 408 | 409 | // DeviceGetHandleByUUID acquires the handle for a particular device, 410 | // based on its globally unique immutable UUID associated with each device. 
411 | func (a API) DeviceGetHandleByUUID(uuid string) (device Device, err error) { 412 | cstr := C.CString(uuid) 413 | defer C.free(unsafe.Pointer(cstr)) 414 | 415 | err = a.call(a.nvmlDeviceGetHandleByUUID, uintptr(unsafe.Pointer(cstr)), uintptr(unsafe.Pointer(&device))) 416 | return 417 | } 418 | 419 | // DeviceGetIndex retrieves the NVML index of this device. 420 | func (a API) DeviceGetIndex(device Device) (index uint32, err error) { 421 | err = a.call(a.nvmlDeviceGetIndex, uintptr(device), uintptr(unsafe.Pointer(&index))) 422 | return 423 | } 424 | 425 | // DeviceGetInforomConfigurationChecksum retrieves the checksum of the configuration stored in the device's infoROM. 426 | // Can be used to make sure that two GPUs have the exact same configuration. 427 | // Current checksum takes into account configuration stored in PWR and ECC infoROM objects. 428 | // Checksum can change between driver releases or when user changes configuration (e.g. disable/enable ECC) 429 | func (a API) DeviceGetInforomConfigurationChecksum(device Device) (checksum uint32, err error) { 430 | err = a.call(a.nvmlDeviceGetInforomConfigurationChecksum, uintptr(device), uintptr(unsafe.Pointer(&checksum))) 431 | return 432 | } 433 | 434 | // DeviceGetInforomImageVersion retrieves the global infoROM image version. Image version just like VBIOS version 435 | // uniquely describes the exact version of the infoROM flashed on the board in contrast to infoROM object version 436 | // which is only an indicator of supported features. 
437 | func (a API) DeviceGetInfoROMImageVersion(device Device) (string, error) { 438 | buffer := [deviceInfoROMVersionBufferSize]C.char{} 439 | if err := a.call(a.nvmlDeviceGetInforomImageVersion, uintptr(device), uintptr(unsafe.Pointer(&buffer[0])), deviceInfoROMVersionBufferSize); err != nil { 440 | return "", err 441 | } 442 | 443 | return C.GoString(&buffer[0]), nil 444 | } 445 | 446 | // DeviceGetInfoROMVersion retrieves the version information for the device's infoROM object. 447 | func (a API) DeviceGetInfoROMVersion(device Device, object InfoROMObject) (string, error) { 448 | buffer := [deviceInfoROMVersionBufferSize]C.char{} 449 | if err := a.call(a.nvmlDeviceGetInforomVersion, uintptr(device), uintptr(object), uintptr(unsafe.Pointer(&buffer[0])), deviceInfoROMVersionBufferSize); err != nil { 450 | return "", err 451 | } 452 | 453 | return C.GoString(&buffer[0]), nil 454 | } 455 | 456 | // DeviceGetMaxClockInfo retrieves the maximum clock speeds for the device. 457 | func (a API) DeviceGetMaxClockInfo(device Device, clockType ClockType) (clock uint32, err error) { 458 | err = a.call(a.nvmlDeviceGetMaxClockInfo, uintptr(device), uintptr(clockType), uintptr(unsafe.Pointer(&clock))) 459 | return 460 | } 461 | 462 | // DeviceGetMaxCustomerBoostClock retrieves the customer defined maximum boost clock speed specified by the given clock type. 463 | func (a API) DeviceGetMaxCustomerBoostClock(device Device, clockType ClockType) (clockMHz uint32, err error) { 464 | err = a.call(a.nvmlDeviceGetMaxCustomerBoostClock, uintptr(device), uintptr(clockType), uintptr(unsafe.Pointer(&clockMHz))) 465 | return 466 | } 467 | 468 | // DeviceGetMaxPcieLinkGeneration retrieves the maximum PCIe link generation possible with this device and system. 469 | // I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will 470 | // report is generation 1. 
471 | func (a API) DeviceGetMaxPcieLinkGeneration(device Device) (maxLinkGen uint32, err error) { 472 | err = a.call(a.nvmlDeviceGetMaxPcieLinkGeneration, uintptr(device), uintptr(unsafe.Pointer(&maxLinkGen))) 473 | return 474 | } 475 | 476 | // DeviceGetMaxPcieLinkWidth retrieves the maximum PCIe link width possible with this device and system 477 | // I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report a max link width of 8. 478 | func (a API) DeviceGetMaxPcieLinkWidth(device Device) (maxLinkWidth uint32, err error) { 479 | err = a.call(a.nvmlDeviceGetMaxPcieLinkWidth, uintptr(device), uintptr(unsafe.Pointer(&maxLinkWidth))) 480 | return 481 | } 482 | 483 | // DeviceGetMemoryErrorCounter retrieves the requested memory error counter for the device. 484 | // Requires NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. 485 | // Requires NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. 486 | // Only applicable to devices with ECC. Requires ECC Mode to be enabled. 487 | func (a API) DeviceGetMemoryErrorCounter(device Device, errorType MemoryErrorType, counterType ECCCounterType, locationType MemoryLocation) (count uint64, err error) { 488 | err = a.call(a.nvmlDeviceGetMemoryErrorCounter, 489 | uintptr(device), 490 | uintptr(errorType), 491 | uintptr(counterType), 492 | uintptr(locationType), 493 | uintptr(unsafe.Pointer(&count))) 494 | return 495 | } 496 | 497 | // DeviceGetMemoryInfo retrieves the amount of used, free and total memory available on the device, in bytes. 498 | func (a API) DeviceGetMemoryInfo(device Device) (mem Memory, err error) { 499 | err = a.call(a.nvmlDeviceGetMemoryInfo, uintptr(device), uintptr(unsafe.Pointer(&mem))) 500 | return 501 | } 502 | 503 | // DeviceGetMinorNumber retrieves minor number for the device. 
The minor number for the device is such that 504 | // the Nvidia device node file for each GPU will have the form /dev/nvidia[minor number]. 505 | func (a API) DeviceGetMinorNumber(device Device) (minorNumber uint32, err error) { 506 | err = a.call(a.nvmlDeviceGetMinorNumber, uintptr(device), uintptr(unsafe.Pointer(&minorNumber))) 507 | return 508 | } 509 | 510 | // DeviceGetMultiGpuBoard retrieves whether the device is on a Multi-GPU Board. 511 | func (a API) DeviceGetMultiGpuBoard(device Device) (multiGpu bool, err error) { 512 | var multiGpuBool uint 513 | err = a.call(a.nvmlDeviceGetMultiGpuBoard, uintptr(device), uintptr(unsafe.Pointer(&multiGpuBool))) 514 | if err != nil { 515 | return 516 | } 517 | 518 | // Non-zero value indicates whether the device is on a multi GPU board 519 | multiGpu = false 520 | if multiGpuBool != 0 { 521 | multiGpu = true 522 | } 523 | 524 | return 525 | } 526 | 527 | // DeviceGetName retrieves the name of this device. 528 | func (a API) DeviceGetName(device Device) (string, error) { 529 | buffer := [deviceNameBufferSize]C.char{} 530 | if err := a.call(a.nvmlDeviceGetName, uintptr(device), uintptr(unsafe.Pointer(&buffer[0])), deviceNameBufferSize); err != nil { 531 | return "", err 532 | } 533 | 534 | return C.GoString(&buffer[0]), nil 535 | } 536 | 537 | func (a API) DeviceGetP2PStatus() error { 538 | return ErrNotImplemented 539 | } 540 | 541 | func (a API) DeviceGetPCIInfo(device Device) (*PCIInfo, error) { 542 | var pci C.nvmlPciInfo_t 543 | if err := a.call(a.nvmlDeviceGetPciInfo, uintptr(device), uintptr(unsafe.Pointer(&pci))); err != nil { 544 | return nil, err 545 | } 546 | 547 | return &PCIInfo{ 548 | BusID: C.GoString(&pci.busId[0]), 549 | Domain: uint32(pci.domain), 550 | Bus: uint32(pci.bus), 551 | Device: uint32(pci.device), 552 | PCIDeviceID: uint32(pci.pciDeviceId), 553 | PCISubsystemID: uint32(pci.pciSubSystemId), 554 | }, nil 555 | } 556 | 557 | // DeviceGetPcieReplayCounter retrieve the PCIe replay counter. 
558 | func (a API) DeviceGetPcieReplayCounter(device Device) (value uint32, err error) { 559 | err = a.call(a.nvmlDeviceGetPcieReplayCounter, uintptr(device), uintptr(unsafe.Pointer(&value))) 560 | return 561 | } 562 | 563 | // DeviceGetPCIeThroughput eetrieve PCIe utilization information. 564 | // This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. 565 | // This method is not supported in virtual machines running virtual GPU (vGPU). 566 | func (a API) DeviceGetPCIeThroughput(device Device, counter PCIeUtilCounter) (value uint32, err error) { 567 | err = a.call(a.nvmlDeviceGetPcieThroughput, uintptr(device), uintptr(counter), uintptr(unsafe.Pointer(&value))) 568 | return 569 | } 570 | 571 | // DeviceGetPerformanceState retrieves the current performance state for the device. 572 | func (a API) DeviceGetPerformanceState(device Device) (state PState, err error) { 573 | err = a.call(a.nvmlDeviceGetPerformanceState, uintptr(device), uintptr(unsafe.Pointer(&state))) 574 | return 575 | } 576 | 577 | // DeviceGetPowerManagementDefaultLimit retrieves default power management limit on this device, in milliwatts. 578 | // Default power management limit is a power management limit that the device boots with. 579 | func (a API) DeviceGetPowerManagementDefaultLimit(device Device) (defaultLimit uint32, err error) { 580 | err = a.call(a.nvmlDeviceGetPowerManagementDefaultLimit, uintptr(device), uintptr(unsafe.Pointer(&defaultLimit))) 581 | return 582 | } 583 | 584 | // DeviceGetPowerManagementLimit retrieves the power management limit associated with this device. 585 | // The power limit defines the upper boundary for the card's power draw. 586 | // If the card's total power draw reaches this limit the power management algorithm kicks in. 587 | // This reading is only available if power management mode is supported, see DeviceGetPowerManagementMode. 
588 | func (a API) DeviceGetPowerManagementLimit(device Device) (limit uint32, err error) { 589 | err = a.call(a.nvmlDeviceGetPowerManagementLimit, uintptr(device), uintptr(unsafe.Pointer(&limit))) 590 | return 591 | } 592 | 593 | // DeviceGetPowerManagementLimitConstraints retrieves information about possible values of power management limits on this device. 594 | func (a API) DeviceGetPowerManagementLimitConstraints(device Device) (minLimit, maxLimit uint32, err error) { 595 | err = a.call(a.nvmlDeviceGetPowerManagementLimitConstraints, uintptr(device), uintptr(unsafe.Pointer(&minLimit)), uintptr(unsafe.Pointer(&maxLimit))) 596 | return 597 | } 598 | 599 | // DeviceGetPowerManagementMode retrieves the power management mode associated with this device. 600 | // This API has been deprecated. 601 | // This flag indicates whether any power management algorithm is currently active on the device. 602 | // An enabled state does not necessarily mean the device is being actively throttled -- only that that the driver will 603 | // do so if the appropriate conditions are met. 604 | func (a API) DeviceGetPowerManagementMode(device Device) (bool, error) { 605 | var state int32 606 | if err := a.call(a.nvmlDeviceGetPowerManagementMode, uintptr(device), uintptr(unsafe.Pointer(&state))); err != nil { 607 | return false, nil 608 | } 609 | 610 | if state > 0 { 611 | return true, nil 612 | } 613 | 614 | return false, nil 615 | } 616 | 617 | // DeviceGetPowerState retrieve the current performance state for the device. 618 | // Deprecated: Use DeviceGetPerformanceState. 619 | // This function exposes an incorrect generalization. 620 | func (a API) DeviceGetPowerState(device Device) (state PState, err error) { 621 | err = a.call(a.nvmlDeviceGetPowerState, uintptr(device), uintptr(unsafe.Pointer(&state))) 622 | return 623 | } 624 | 625 | // DeviceGetPowerUsage retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. 
memory) 626 | func (a API) DeviceGetPowerUsage(device Device) (power uint32, err error) { 627 | err = a.call(a.nvmlDeviceGetPowerUsage, uintptr(device), uintptr(unsafe.Pointer(&power))) 628 | return 629 | } 630 | 631 | // DeviceGetRetiredPages returns the list of retired pages by source, including pages that are pending retirement. 632 | // The address information provided from this API is the hardware address of the page that was retired. 633 | // Note that this does not match the virtual address used in CUDA, but will match the address information in XID 63 634 | func (a API) DeviceGetRetiredPages(device Device, cause PageRetirementCause) ([]uint64, error) { 635 | // Get array size 636 | var count uint32 637 | err := a.call(a.nvmlDeviceGetRetiredPages, uintptr(device), uintptr(cause), uintptr(unsafe.Pointer(&count)), 0) 638 | if err == nil { 639 | return []uint64{}, nil 640 | } 641 | 642 | if err != ErrInsufficientSize { 643 | return nil, err 644 | } 645 | 646 | // Query data 647 | list := make([]uint64, count) 648 | err = a.call(a.nvmlDeviceGetRetiredPages, 649 | uintptr(device), 650 | uintptr(cause), 651 | uintptr(unsafe.Pointer(&count)), 652 | uintptr(unsafe.Pointer(&list[0]))) 653 | 654 | if err != nil { 655 | return nil, err 656 | } 657 | 658 | return list, nil 659 | } 660 | 661 | // DeviceGetRetiredPagesPendingStatus checks if any pages are pending retirement and need a reboot to fully retire. 
662 | func (a API) DeviceGetRetiredPagesPendingStatus(device Device) (isPending bool, err error) { 663 | var state int32 = 0 664 | err = a.call(a.nvmlDeviceGetRetiredPagesPendingStatus, uintptr(device), uintptr(unsafe.Pointer(&state))) 665 | if err != nil { 666 | return 667 | } 668 | 669 | if state > 0 { 670 | isPending = true 671 | } else { 672 | isPending = false 673 | } 674 | 675 | return 676 | } 677 | 678 | func (a API) DeviceGetSamples() error { 679 | return ErrNotImplemented 680 | } 681 | 682 | // DeviceGetSerial retrieves the globally unique board serial number associated with this device's board. 683 | func (a API) DeviceGetSerial(device Device) (serial string, err error) { 684 | buffer := [deviceSerialBufferSize]C.char{} 685 | err = a.call(a.nvmlDeviceGetSerial, uintptr(device), uintptr(unsafe.Pointer(&buffer[0])), deviceSerialBufferSize) 686 | return 687 | } 688 | 689 | // DeviceGetSupportedClocksThrottleReasons retrieves bitmask of supported clocks throttle reasons that can be 690 | // returned by DeviceGetCurrentClocksThrottleReasons. This method is not supported in virtual machines 691 | // running virtual GPU (vGPU). 692 | func (a API) DeviceGetSupportedClocksThrottleReasons(device Device) (supportedClocksThrottleReasons ClocksThrottleReason, err error) { 693 | err = a.call(a.nvmlDeviceGetSupportedClocksThrottleReasons, uintptr(device), uintptr(unsafe.Pointer(&supportedClocksThrottleReasons))) 694 | return 695 | } 696 | 697 | // DeviceGetSupportedGraphicsClocks retrieves the list of possible graphics clocks that can be used 698 | // as an argument for DeviceSetApplicationsClocks. 
699 | func (a API) DeviceGetSupportedGraphicsClocks(device Device, memoryClockMHz uint32) ([]uint32, error) { 700 | // Get array size 701 | var count uint32 702 | err := a.call(a.nvmlDeviceGetSupportedGraphicsClocks, uintptr(device), uintptr(memoryClockMHz), uintptr(unsafe.Pointer(&count)), 0) 703 | if err == nil { 704 | return []uint32{}, nil 705 | } 706 | 707 | if err != ErrInsufficientSize { 708 | return nil, err 709 | } 710 | 711 | // Query data 712 | list := make([]uint32, count) 713 | if err := a.call(a.nvmlDeviceGetSupportedGraphicsClocks, uintptr(device), uintptr(memoryClockMHz), uintptr(unsafe.Pointer(&count)), uintptr(unsafe.Pointer(&list[0]))); err != nil { 714 | return nil, err 715 | } 716 | 717 | return list, nil 718 | } 719 | 720 | // DeviceGetSupportedMemoryClocks retrieves the list of possible memory clocks that can be used 721 | // as an argument for DeviceSetApplicationsClocks. 722 | func (a API) DeviceGetSupportedMemoryClocks(device Device) ([]uint32, error) { 723 | // Get array size 724 | var count uint32 725 | 726 | err := a.call(a.nvmlDeviceGetSupportedMemoryClocks, uintptr(device), uintptr(unsafe.Pointer(&count)), 0) 727 | if err == nil { 728 | return []uint32{}, nil 729 | } 730 | 731 | if err != ErrInsufficientSize { 732 | return nil, err 733 | } 734 | 735 | // Query data 736 | list := make([]uint32, count) 737 | if err := a.call(a.nvmlDeviceGetSupportedMemoryClocks, uintptr(device), uintptr(unsafe.Pointer(&count)), uintptr(unsafe.Pointer(&list[0]))); err != nil { 738 | return nil, err 739 | } 740 | 741 | return list, nil 742 | } 743 | 744 | // DeviceGetTemperature retrieves the current temperature readings for the device, in degrees C. 
745 | func (a API) DeviceGetTemperature(device Device, sensorType TemperatureSensor) (temp uint32, err error) { 746 | err = a.call(a.nvmlDeviceGetTemperature, uintptr(device), uintptr(sensorType), uintptr(unsafe.Pointer(&temp))) 747 | return 748 | } 749 | 750 | // DeviceGetTemperatureThreshold retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. 751 | func (a API) DeviceGetTemperatureThreshold(device Device, thresholdType TemperatureThreshold) (temp uint32, err error) { 752 | err = a.call(a.nvmlDeviceGetTemperatureThreshold, uintptr(device), uintptr(thresholdType), uintptr(unsafe.Pointer(&temp))) 753 | return 754 | } 755 | 756 | // DeviceGetTopologyCommonAncestor retrieves the common ancestor for two devices. Supported on Linux only. 757 | func (a API) DeviceGetTopologyCommonAncestor(device1 Device, device2 Device) (pathInfo GPUTopologyLevel, err error) { 758 | err = a.call(a.nvmlDeviceGetTopologyCommonAncestor, uintptr(device1), uintptr(device2), uintptr(unsafe.Pointer(&pathInfo))) 759 | return 760 | } 761 | 762 | func (a API) DeviceGetTopologyNearestGpus() error { 763 | return ErrNotImplemented 764 | } 765 | 766 | // DeviceGetTotalECCErrors retrieves the total ECC error counts for the device. 767 | // Only applicable to devices with ECC. Requires NVML_INFOROM_ECC version 1.0 or higher. Requires ECC Mode to be enabled. 768 | // The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of errors across the entire device. 
769 | func (a API) DeviceGetTotalECCErrors(device Device, errorType MemoryErrorType, counterType ECCCounterType) (eccCount uint64, err error) { 770 | err = a.call(a.nvmlDeviceGetTotalEccErrors, uintptr(device), uintptr(errorType), uintptr(counterType), uintptr(unsafe.Pointer(&eccCount))) 771 | return 772 | } 773 | 774 | // DeviceGetTotalEnergyConsumption retrieves total energy consumption for this GPU in millijoules (mJ) 775 | // since the driver was last reloaded. 776 | func (a API) DeviceGetTotalEnergyConsumption(device Device) (energy uint64, err error) { 777 | err = a.call(a.nvmlDeviceGetTotalEnergyConsumption, uintptr(device), uintptr(unsafe.Pointer(&energy))) 778 | return 779 | } 780 | 781 | // DeviceGetUUID retrieves the globally unique immutable UUID associated with this device, 782 | // as a 5 part hexadecimal string, that augments the immutable, board serial identifier. 783 | func (a API) DeviceGetUUID(device Device) (string, error) { 784 | buffer := [deviceUUIDBufferSize]C.char{} 785 | if err := a.call(a.nvmlDeviceGetUUID, uintptr(device), uintptr(unsafe.Pointer(&buffer[0])), deviceUUIDBufferSize); err != nil { 786 | return "", err 787 | } 788 | 789 | return C.GoString(&buffer[0]), nil 790 | } 791 | 792 | // DeviceGetUtilizationRates retrieves the current utilization rates for the device's major subsystems. 793 | func (a API) DeviceGetUtilizationRates(device Device) (u Utilization, err error) { 794 | u.GPU = 0 795 | u.Memory = 0 796 | err = a.call(a.nvmlDeviceGetUtilizationRates, uintptr(device), uintptr(unsafe.Pointer(&u))) 797 | return 798 | } 799 | 800 | // DeviceGetVbiosVersion gets VBIOS version of the device. The VBIOS version may change from time to time. 
801 | func (a API) DeviceGetVbiosVersion(device Device) (string, error) { 802 | buffer := [deviceVBIOSVersionBufferSize]C.char{} 803 | if err := a.call(a.nvmlDeviceGetVbiosVersion, uintptr(device), uintptr(unsafe.Pointer(&buffer[0])), deviceVBIOSVersionBufferSize); err != nil { 804 | return "", err 805 | } 806 | 807 | return C.GoString(&buffer[0]), nil 808 | } 809 | 810 | // DeviceGetViolationStatus gets the duration of time during which the device was throttled (lower than requested 811 | // clocks) due to power or thermal constraints. 812 | // The method is important to users who are tying to understand if their GPUs throttle at any point during their 813 | // applications. The difference in violation times at two different reference times gives the indication of 814 | // GPU throttling event. 815 | func (a API) DeviceGetViolationStatus(device Device, policyType PerfPolicyType) (violTime ViolationTime, err error) { 816 | err = a.call(a.nvmlDeviceGetViolationStatus, uintptr(device), uintptr(policyType), uintptr(unsafe.Pointer(&violTime))) 817 | return 818 | } 819 | 820 | // DeviceOnSameBoard checks if the GPU devices are on the same physical board. 821 | func (a API) DeviceOnSameBoard(device1 Device, device2 Device) (bool, error) { 822 | var onSameBoard int32 = 0 823 | 824 | if err := a.call(a.nvmlDeviceOnSameBoard, uintptr(device1), uintptr(device2), uintptr(unsafe.Pointer(&onSameBoard))); err != nil { 825 | return false, err 826 | } 827 | 828 | if onSameBoard == 0 { 829 | return false, nil 830 | } 831 | 832 | return true, nil 833 | } 834 | 835 | // DeviceResetApplicationsClocks resets the application clock to the default value. 836 | func (a API) DeviceResetApplicationsClocks(device Device) error { 837 | return a.call(a.nvmlDeviceResetApplicationsClocks, uintptr(device)) 838 | } 839 | 840 | // DeviceSetAutoBoostedClocksEnabled tries to set the current state of Auto Boosted clocks on a device. 
841 | // Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates to 842 | // maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock rates are desired. 843 | // Non-root users may use this API by default but can be restricted by root from using this API by calling 844 | // DeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. 845 | // Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. 846 | func (a API) DeviceSetAutoBoostedClocksEnabled(device Device, enabled bool) error { 847 | var state int32 = 0 848 | if enabled { 849 | state = 1 850 | } 851 | 852 | return a.call(a.nvmlDeviceSetAutoBoostedClocksEnabled, uintptr(device), uintptr(state)) 853 | } 854 | 855 | // DeviceSetDefaultAutoBoostedClocksEnabled tries to set the default state of Auto Boosted clocks on a device. 856 | // This is the default state that Auto Boosted clocks will return to when no compute running processes (e.g. CUDA 857 | // application which have an active context) are running. 858 | func (a API) DeviceSetDefaultAutoBoostedClocksEnabled(device Device, enabled bool) error { 859 | var state int32 = 0 860 | if enabled { 861 | state = 1 862 | } 863 | 864 | return a.call(a.nvmlDeviceSetDefaultAutoBoostedClocksEnabled, uintptr(device), uintptr(state), 0) 865 | } 866 | 867 | // DeviceValidateInforom reads the infoROM from the flash and verifies the checksums. 
868 | func (a API) DeviceValidateInforom(device Device) (err error) { 869 | err = a.call(a.nvmlDeviceValidateInforom, uintptr(device)) 870 | return 871 | } 872 | -------------------------------------------------------------------------------- /device_queries_test.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | import ( 4 | "log" 5 | "testing" 6 | 7 | "github.com/stretchr/testify/require" 8 | ) 9 | 10 | func TestDeviceGetAPIRestriction(t *testing.T) { 11 | w, device := create(t) 12 | defer w.Shutdown() 13 | 14 | _, err := w.DeviceGetAPIRestriction(device, RestrictedAPISetApplicationClocks) 15 | require.NoError(t, err) 16 | } 17 | 18 | func TestDeviceGetApplicationsClock(t *testing.T) { 19 | w, device := create(t) 20 | defer w.Shutdown() 21 | 22 | clock, err := w.DeviceGetApplicationsClock(device, ClockGraphics) 23 | require.NoError(t, err) 24 | require.NotZero(t, clock) 25 | } 26 | 27 | func TestDeviceGetAutoBoostedClocksEnabled(t *testing.T) { 28 | w, device := create(t) 29 | defer w.Shutdown() 30 | 31 | _, _, err := w.DeviceGetAutoBoostedClocksEnabled(device) 32 | require.NoError(t, err) 33 | } 34 | 35 | func TestDeviceGetBAR1MemoryInfo(t *testing.T) { 36 | w, device := create(t) 37 | defer w.Shutdown() 38 | 39 | mem, err := w.DeviceGetBAR1MemoryInfo(device) 40 | require.NoError(t, err) 41 | require.NotZero(t, mem.Used) 42 | require.NotZero(t, mem.Total) 43 | require.NotZero(t, mem.Free) 44 | } 45 | 46 | func TestDeviceGetBoardID(t *testing.T) { 47 | w, device := create(t) 48 | defer w.Shutdown() 49 | 50 | id, err := w.DeviceGetBoardID(device) 51 | require.NoError(t, err) 52 | require.NotEqual(t, 0, id) 53 | } 54 | 55 | func TestDeviceGetBoardPartNumber(t *testing.T) { 56 | w, device := create(t) 57 | defer w.Shutdown() 58 | 59 | number, err := w.DeviceGetBoardPartNumber(device) 60 | require.NoError(t, err) 61 | require.NotEmpty(t, number) 62 | } 63 | 64 | func TestDeviceGetBrand(t *testing.T) { 65 | w, 
device := create(t) 66 | defer w.Shutdown() 67 | 68 | brand, err := w.DeviceGetBrand(device) 69 | require.NoError(t, err) 70 | require.NotEqual(t, BrandUnknown, brand) 71 | } 72 | 73 | func TestDeviceGetBridgeChipInfo(t *testing.T) { 74 | t.Skip() 75 | } 76 | 77 | func TestDeviceGetClock(t *testing.T) { 78 | w, device := create(t) 79 | defer w.Shutdown() 80 | 81 | clock, err := w.DeviceGetClock(device, ClockMem, ClockIDCurrent) 82 | require.NoError(t, err) 83 | require.NotZero(t, clock) 84 | } 85 | 86 | func TestDeviceGetClockInfo(t *testing.T) { 87 | w, device := create(t) 88 | defer w.Shutdown() 89 | 90 | // Core Clock 91 | graphics, err := w.DeviceGetClockInfo(device, ClockGraphics) 92 | require.NoError(t, err) 93 | require.True(t, graphics > 0) 94 | 95 | // Memory Clock 96 | mem, err := w.DeviceGetClockInfo(device, ClockMem) 97 | require.NoError(t, err) 98 | require.True(t, mem > 0) 99 | } 100 | 101 | func TestDeviceGetComputeMode(t *testing.T) { 102 | w, device := create(t) 103 | defer w.Shutdown() 104 | 105 | _, err := w.DeviceGetComputeMode(device) 106 | require.NoError(t, err) 107 | } 108 | 109 | func TestDeviceGetComputeRunningProcesses(t *testing.T) { 110 | w, device := create(t) 111 | defer w.Shutdown() 112 | 113 | list, err := w.DeviceGetComputeRunningProcesses(device) 114 | require.NoError(t, err) 115 | require.NotEmpty(t, list) 116 | } 117 | 118 | func TestDeviceGetCount(t *testing.T) { 119 | w, _ := create(t) 120 | defer w.Shutdown() 121 | 122 | count, err := w.DeviceGetCount() 123 | require.NoError(t, err) 124 | require.True(t, count > 0) 125 | } 126 | 127 | func TestDeviceGetCudaComputeCapability(t *testing.T) { 128 | w, device := create(t) 129 | defer w.Shutdown() 130 | 131 | _, _, err := w.DeviceGetCudaComputeCapability(device) 132 | require.NoError(t, err) 133 | } 134 | 135 | func TestDeviceGetCurrPcieLinkGeneration(t *testing.T) { 136 | w, device := create(t) 137 | defer w.Shutdown() 138 | 139 | gen, err := 
w.DeviceGetCurrPcieLinkGeneration(device) 140 | require.NoError(t, err) 141 | require.NotZero(t, gen) 142 | } 143 | 144 | func TestDeviceGetCurrPcieLinkWidth(t *testing.T) { 145 | w, device := create(t) 146 | defer w.Shutdown() 147 | 148 | gen, err := w.DeviceGetCurrPcieLinkWidth(device) 149 | require.NoError(t, err) 150 | require.NotZero(t, gen) 151 | } 152 | 153 | func TestDeviceGetCurrentClocksThrottleReasons(t *testing.T) { 154 | w, device := create(t) 155 | defer w.Shutdown() 156 | 157 | reasons, err := w.DeviceGetCurrentClocksThrottleReasons(device) 158 | require.NoError(t, err) 159 | require.NotZero(t, reasons) 160 | } 161 | 162 | func TestDeviceGetDecoderUtilization(t *testing.T) { 163 | w, device := create(t) 164 | defer w.Shutdown() 165 | 166 | _, _, err := w.DeviceGetDecoderUtilization(device) 167 | require.NoError(t, err) 168 | } 169 | 170 | func TestDeviceGetDefaultApplicationsClock(t *testing.T) { 171 | w, device := create(t) 172 | defer w.Shutdown() 173 | 174 | clock, err := w.DeviceGetDefaultApplicationsClock(device, ClockMem) 175 | require.NoError(t, err) 176 | require.NotZero(t, clock) 177 | } 178 | 179 | func TestDeviceGetDetailedECCErrors(t *testing.T) { 180 | w, device := create(t) 181 | defer w.Shutdown() 182 | 183 | _, err := w.DeviceGetDetailedECCErrors(device, MemoryErrorTypeCorrected, VolatileECC) 184 | require.NoError(t, err) 185 | } 186 | 187 | func TestDeviceGetDisplayActive(t *testing.T) { 188 | w, device := create(t) 189 | defer w.Shutdown() 190 | 191 | connected, err := w.DeviceGetDisplayActive(device) 192 | require.NoError(t, err) 193 | require.True(t, connected) 194 | } 195 | 196 | func TestDeviceGetDisplayMode(t *testing.T) { 197 | w, device := create(t) 198 | defer w.Shutdown() 199 | 200 | connected, err := w.DeviceGetDisplayMode(device) 201 | require.NoError(t, err) 202 | require.True(t, connected) 203 | } 204 | 205 | func TestDeviceGetDriverModel(t *testing.T) { 206 | w, device := create(t) 207 | defer w.Shutdown() 208 | 209 | 
_, _, err := w.DeviceGetDriverModel(device) 210 | require.NoError(t, err) 211 | } 212 | 213 | func TestDeviceGetECCMode(t *testing.T) { 214 | w, device := create(t) 215 | defer w.Shutdown() 216 | 217 | _, _, err := w.DeviceGetECCMode(device) 218 | require.NoError(t, err) 219 | } 220 | 221 | func TestDeviceGetEncoderCapacity(t *testing.T) { 222 | w, device := create(t) 223 | defer w.Shutdown() 224 | 225 | cap, err := w.DeviceGetEncoderCapacity(device, EncoderTypeQueryH264) 226 | require.NoError(t, err) 227 | require.NotZero(t, cap) 228 | } 229 | 230 | func TestDeviceGetEncoderSessions(t *testing.T) { 231 | t.Skip() 232 | } 233 | 234 | func TestDeviceGetEncoderStats(t *testing.T) { 235 | w, device := create(t) 236 | defer w.Shutdown() 237 | 238 | _, _, _, err := w.DeviceGetEncoderStats(device) 239 | require.NoError(t, err) 240 | } 241 | 242 | func TestDeviceGetEncoderUtilization(t *testing.T) { 243 | w, device := create(t) 244 | defer w.Shutdown() 245 | 246 | _, _, err := w.DeviceGetEncoderUtilization(device) 247 | require.NoError(t, err) 248 | } 249 | 250 | func TestDeviceGetEnforcedPowerLimit(t *testing.T) { 251 | w, device := create(t) 252 | defer w.Shutdown() 253 | 254 | limit, err := w.DeviceGetEnforcedPowerLimit(device) 255 | require.NoError(t, err) 256 | require.True(t, limit > 0) 257 | } 258 | 259 | func TestDeviceGetFanSpeed(t *testing.T) { 260 | w, device := create(t) 261 | defer w.Shutdown() 262 | 263 | speed, err := w.DeviceGetFanSpeed(device) 264 | require.NoError(t, err) 265 | require.True(t, speed > 0) 266 | } 267 | 268 | func TestDeviceGetGPUOperationMode(t *testing.T) { 269 | w, device := create(t) 270 | defer w.Shutdown() 271 | 272 | _, _, err := w.DeviceGetGPUOperationMode(device) 273 | require.NoError(t, err) 274 | } 275 | 276 | func TestDeviceGetGraphicsRunningProcesses(t *testing.T) { 277 | w, device := create(t) 278 | defer w.Shutdown() 279 | 280 | list, err := w.DeviceGetGraphicsRunningProcesses(device) 281 | require.NoError(t, err) 282 | 
require.NotEmpty(t, list) 283 | } 284 | 285 | func TestDeviceGetHandleByIndex(t *testing.T) { 286 | w, _ := create(t) 287 | defer w.Shutdown() 288 | 289 | device, err := w.DeviceGetHandleByIndex(0) 290 | require.NoError(t, err) 291 | require.NotNil(t, device) 292 | } 293 | 294 | func TestDeviceGetHandleByPciBusId(t *testing.T) { 295 | w, device := create(t) 296 | defer w.Shutdown() 297 | 298 | info, err := w.DeviceGetPCIInfo(device) 299 | require.NoError(t, err) 300 | 301 | handle, err := w.DeviceGetHandleByPCIBusID(info.BusID) 302 | require.NoError(t, err) 303 | require.Equal(t, device, handle) 304 | } 305 | 306 | func TestDeviceGetHandleBySerial(t *testing.T) { 307 | w, device := create(t) 308 | defer w.Shutdown() 309 | 310 | serial, err := w.DeviceGetSerial(device) 311 | require.NoError(t, err) 312 | 313 | result, err := w.DeviceGetHandleBySerial(serial) 314 | require.NoError(t, err) 315 | require.NotNil(t, err) 316 | require.Equal(t, device, result) 317 | } 318 | 319 | func TestDeviceGetHandleByUUID(t *testing.T) { 320 | w, device := create(t) 321 | defer w.Shutdown() 322 | 323 | uuid, err := w.DeviceGetUUID(device) 324 | require.NoError(t, err) 325 | 326 | result, err := w.DeviceGetHandleByUUID(uuid) 327 | require.NoError(t, err) 328 | require.NotNil(t, result) 329 | require.Equal(t, device, result) 330 | } 331 | 332 | func TestDeviceGetIndex(t *testing.T) { 333 | w, device := create(t) 334 | defer w.Shutdown() 335 | 336 | index, err := w.DeviceGetIndex(device) 337 | require.NoError(t, err) 338 | require.Equal(t, uint32(0), index) 339 | } 340 | 341 | func TestDeviceGetInforomConfigurationChecksum(t *testing.T) { 342 | w, device := create(t) 343 | defer w.Shutdown() 344 | 345 | checksum, err := w.DeviceGetInforomConfigurationChecksum(device) 346 | require.NoError(t, err) 347 | require.True(t, checksum != 0) 348 | } 349 | 350 | func TestDeviceGetInforomImageVersion(t *testing.T) { 351 | w, device := create(t) 352 | defer w.Shutdown() 353 | 354 | version, err := 
w.DeviceGetInfoROMImageVersion(device) 355 | require.NoError(t, err) 356 | require.NotEmpty(t, version) 357 | } 358 | 359 | func TestDeviceGetInfoROMVersion(t *testing.T) { 360 | w, device := create(t) 361 | defer w.Shutdown() 362 | 363 | version, err := w.DeviceGetInfoROMVersion(device, InfoROMObjectPower) 364 | require.NoError(t, err) 365 | require.NotEmpty(t, version) 366 | } 367 | 368 | func TestDeviceGetMaxClockInfo(t *testing.T) { 369 | w, device := create(t) 370 | defer w.Shutdown() 371 | 372 | clock, err := w.DeviceGetMaxClockInfo(device, ClockMem) 373 | require.NoError(t, err) 374 | require.NotZero(t, clock) 375 | } 376 | 377 | func TestDeviceGetMaxCustomerBoostClock(t *testing.T) { 378 | w, device := create(t) 379 | defer w.Shutdown() 380 | 381 | clock, err := w.DeviceGetMaxCustomerBoostClock(device, ClockGraphics) 382 | require.NoError(t, err) 383 | require.NotZero(t, clock) 384 | } 385 | 386 | func TestDeviceGetMaxPcieLinkGeneration(t *testing.T) { 387 | w, device := create(t) 388 | defer w.Shutdown() 389 | 390 | maxLinkGen, err := w.DeviceGetMaxPcieLinkGeneration(device) 391 | require.NoError(t, err) 392 | require.NotZero(t, maxLinkGen) 393 | } 394 | 395 | func TestDeviceGetMaxPcieLinkWidth(t *testing.T) { 396 | w, device := create(t) 397 | defer w.Shutdown() 398 | 399 | maxLinkWidth, err := w.DeviceGetMaxPcieLinkWidth(device) 400 | require.NoError(t, err) 401 | require.NotZero(t, maxLinkWidth) 402 | } 403 | 404 | func TestDeviceGetMemoryErrorCounter(t *testing.T) { 405 | w, device := create(t) 406 | defer w.Shutdown() 407 | 408 | counter, err := w.DeviceGetMemoryErrorCounter(device, MemoryErrorTypeCorrected, VolatileECC, MemoryLocationDeviceMemory) 409 | require.NoError(t, err) 410 | require.NotZero(t, counter) 411 | } 412 | 413 | func TestDeviceGetMemoryInfo(t *testing.T) { 414 | w, device := create(t) 415 | defer w.Shutdown() 416 | 417 | mem, err := w.DeviceGetMemoryInfo(device) 418 | require.NoError(t, err) 419 | 420 | require.True(t, mem.Free > 0) 
421 | require.True(t, mem.Total > 0) 422 | require.True(t, mem.Used > 0) 423 | } 424 | 425 | func TestDeviceGetMinorNumber(t *testing.T) { 426 | w, device := create(t) 427 | defer w.Shutdown() 428 | 429 | _, err := w.DeviceGetMinorNumber(device) 430 | require.NoError(t, err) 431 | } 432 | 433 | func TestDeviceGetName(t *testing.T) { 434 | w, device := create(t) 435 | defer w.Shutdown() 436 | 437 | name, err := w.DeviceGetName(device) 438 | require.NoError(t, err) 439 | require.NotEmpty(t, name) 440 | } 441 | 442 | func TestDeviceGetP2PStatus(t *testing.T) { 443 | t.Skip() 444 | } 445 | 446 | func TestDeviceGetPciInfo(t *testing.T) { 447 | w, device := create(t) 448 | defer w.Shutdown() 449 | 450 | info, err := w.DeviceGetPCIInfo(device) 451 | require.NoError(t, err) 452 | require.NotNil(t, info) 453 | } 454 | 455 | func TestDeviceGetPcieReplayCounter(t *testing.T) { 456 | w, device := create(t) 457 | defer w.Shutdown() 458 | 459 | _, err := w.DeviceGetPcieReplayCounter(device) 460 | require.NoError(t, err) 461 | } 462 | 463 | func TestDeviceGetPCIeThroughput(t *testing.T) { 464 | w, device := create(t) 465 | defer w.Shutdown() 466 | 467 | value, err := w.DeviceGetPCIeThroughput(device, PCIeUtilTXBytes) 468 | require.NoError(t, err) 469 | require.NotZero(t, value) 470 | } 471 | 472 | func TestDeviceGetPerformanceState(t *testing.T) { 473 | w, device := create(t) 474 | defer w.Shutdown() 475 | 476 | _, err := w.DeviceGetPerformanceState(device) 477 | require.NoError(t, err) 478 | } 479 | 480 | func TestDeviceGetPowerManagementDefaultLimit(t *testing.T) { 481 | w, device := create(t) 482 | defer w.Shutdown() 483 | 484 | limit, err := w.DeviceGetPowerManagementDefaultLimit(device) 485 | require.NoError(t, err) 486 | require.NotZero(t, limit) 487 | } 488 | 489 | func TestDeviceGetPowerManagementLimit(t *testing.T) { 490 | w, device := create(t) 491 | defer w.Shutdown() 492 | 493 | limit, err := w.DeviceGetPowerManagementLimit(device) 494 | require.NoError(t, err) 495 | 
require.True(t, limit > 0) 496 | } 497 | 498 | func TestDeviceGetPowerManagementLimitConstraints(t *testing.T) { 499 | w, device := create(t) 500 | defer w.Shutdown() 501 | 502 | min, max, err := w.DeviceGetPowerManagementLimitConstraints(device) 503 | require.NoError(t, err) 504 | require.True(t, min > 0) 505 | require.True(t, max > 0) 506 | require.True(t, max > min) 507 | } 508 | 509 | func TestDeviceGetPowerManagementMode(t *testing.T) { 510 | w, device := create(t) 511 | defer w.Shutdown() 512 | 513 | _, err := w.DeviceGetPowerManagementMode(device) 514 | require.NoError(t, err) 515 | } 516 | 517 | func TestDeviceGetPowerState(t *testing.T) { 518 | w, device := create(t) 519 | defer w.Shutdown() 520 | 521 | _, err := w.DeviceGetPowerState(device) 522 | require.NoError(t, err) 523 | } 524 | 525 | func TestDeviceGetPowerUsage(t *testing.T) { 526 | w, device := create(t) 527 | defer w.Shutdown() 528 | 529 | power, err := w.DeviceGetPowerUsage(device) 530 | require.NoError(t, err) 531 | require.True(t, power > 0) 532 | } 533 | 534 | func TestDeviceGetRetiredPages(t *testing.T) { 535 | w, device := create(t) 536 | defer w.Shutdown() 537 | 538 | _, err := w.DeviceGetRetiredPages(device, PageRetirementCauseDoubleBitECCError) 539 | require.NoError(t, err) 540 | } 541 | 542 | func TestDeviceGetRetiredPagesPendingStatus(t *testing.T) { 543 | w, device := create(t) 544 | defer w.Shutdown() 545 | 546 | _, err := w.DeviceGetRetiredPagesPendingStatus(device) 547 | require.NoError(t, err) 548 | } 549 | 550 | func TestDeviceGetSamples(t *testing.T) { 551 | t.Skip() 552 | } 553 | 554 | func TestDeviceGetSerial(t *testing.T) { 555 | w, device := create(t) 556 | defer w.Shutdown() 557 | 558 | serial, err := w.DeviceGetSerial(device) 559 | require.NoError(t, err) 560 | require.NotEmpty(t, serial) 561 | } 562 | 563 | func TestDeviceGetSupportedClocksThrottleReasons(t *testing.T) { 564 | w, device := create(t) 565 | defer w.Shutdown() 566 | 567 | reasons, err := 
w.DeviceGetSupportedClocksThrottleReasons(device) 568 | require.NoError(t, err) 569 | require.NotZero(t, reasons) 570 | } 571 | 572 | func TestDeviceGetSupportedGraphicsClocks(t *testing.T) { 573 | w, device := create(t) 574 | defer w.Shutdown() 575 | 576 | mem, err := w.DeviceGetSupportedMemoryClocks(device) 577 | require.NoError(t, err) 578 | 579 | graphics, err := w.DeviceGetSupportedGraphicsClocks(device, mem[0]) 580 | require.NoError(t, err) 581 | require.NotEmpty(t, graphics) 582 | } 583 | 584 | func TestDeviceGetSupportedMemoryClocks(t *testing.T) { 585 | w, device := create(t) 586 | defer w.Shutdown() 587 | 588 | clocks, err := w.DeviceGetSupportedMemoryClocks(device) 589 | require.NoError(t, err) 590 | require.NotEmpty(t, clocks) 591 | 592 | for i, x := range clocks { 593 | log.Printf("%d: %d", i, x) 594 | } 595 | } 596 | 597 | func TestDeviceGetTemperature(t *testing.T) { 598 | w, device := create(t) 599 | defer w.Shutdown() 600 | 601 | temp, err := w.DeviceGetTemperature(device, TemperatureGPU) 602 | require.NoError(t, err) 603 | require.True(t, temp > 0) 604 | } 605 | 606 | func TestDeviceGetTemperatureThreshold(t *testing.T) { 607 | w, device := create(t) 608 | defer w.Shutdown() 609 | 610 | temp, err := w.DeviceGetTemperatureThreshold(device, TemperatureThresholdShutdown) 611 | require.NoError(t, err) 612 | require.True(t, temp > 0) 613 | } 614 | 615 | func TestDeviceGetTopologyCommonAncestor(t *testing.T) { 616 | w, device := create(t) 617 | defer w.Shutdown() 618 | 619 | _, err := w.DeviceGetTopologyCommonAncestor(device, device) 620 | require.NoError(t, err) 621 | } 622 | 623 | func TestDeviceGetTopologyNearestGpus(t *testing.T) { 624 | t.Skip() 625 | } 626 | 627 | func TestDeviceGetTotalECCErrors(t *testing.T) { 628 | w, device := create(t) 629 | defer w.Shutdown() 630 | 631 | _, err := w.DeviceGetTotalECCErrors(device, MemoryErrorTypeCorrected, VolatileECC) 632 | require.NoError(t, err) 633 | } 634 | 635 | func 
TestDeviceGetTotalEnergyConsumption(t *testing.T) { 636 | w, device := create(t) 637 | defer w.Shutdown() 638 | 639 | energy, err := w.DeviceGetTotalEnergyConsumption(device) 640 | require.NoError(t, err) 641 | require.NotZero(t, energy) 642 | } 643 | 644 | func TestDeviceGetUUID(t *testing.T) { 645 | w, device := create(t) 646 | defer w.Shutdown() 647 | 648 | uuid, err := w.DeviceGetUUID(device) 649 | require.NoError(t, err) 650 | require.NotEmpty(t, uuid) 651 | } 652 | 653 | func TestDeviceGetUtilizationRates(t *testing.T) { 654 | w, device := create(t) 655 | defer w.Shutdown() 656 | 657 | rates, err := w.DeviceGetUtilizationRates(device) 658 | require.NoError(t, err) 659 | require.NotZero(t, rates.GPU) 660 | require.NotZero(t, rates.Memory) 661 | } 662 | 663 | func TestDeviceGetVbiosVersion(t *testing.T) { 664 | w, device := create(t) 665 | defer w.Shutdown() 666 | 667 | version, err := w.DeviceGetVbiosVersion(device) 668 | require.NoError(t, err) 669 | require.NotEmpty(t, version) 670 | } 671 | 672 | func TestDeviceGetViolationStatus(t *testing.T) { 673 | w, device := create(t) 674 | defer w.Shutdown() 675 | 676 | status, err := w.DeviceGetViolationStatus(device, PerfPolicyPower) 677 | require.NoError(t, err) 678 | require.NotZero(t, status.ViolationTime) 679 | require.NotZero(t, status.ReferenceTime) 680 | } 681 | 682 | func TestDeviceOnSameBoard(t *testing.T) { 683 | w, device := create(t) 684 | defer w.Shutdown() 685 | 686 | s, err := w.DeviceOnSameBoard(device, device) 687 | require.NoError(t, err) 688 | require.True(t, s) 689 | } 690 | 691 | func TestDeviceSetAutoBoostedClocksEnabled(t *testing.T) { 692 | w, device := create(t) 693 | defer w.Shutdown() 694 | 695 | err := w.DeviceSetAutoBoostedClocksEnabled(device, false) 696 | require.NoError(t, err) 697 | } 698 | -------------------------------------------------------------------------------- /errors.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 
3 | import ( 4 | "fmt" 5 | 6 | "github.com/pkg/errors" 7 | ) 8 | 9 | var ( 10 | ErrUninitialized = errors.New("NVML was not first initialized with Init") 11 | ErrInvalidArgument = errors.New("A supplied argument is invalid") 12 | ErrNotSupported = errors.New("The requested operation is not available on target device") 13 | ErrNoPermission = errors.New("The current user does not have permission for operation") 14 | ErrAlreadyInititlized = errors.New("Multiple initializations are now allowed through ref counting") 15 | ErrNotFound = errors.New("A query to find an object was unsuccessful") 16 | ErrInsufficientSize = errors.New("An input argument is not large enough") 17 | ErrInsufficientPower = errors.New("A device's external power cables are not properly attached") 18 | ErrDriverNotLoaded = errors.New("NVIDIA driver is not loaded") 19 | ErrTimeout = errors.New("User provided timeout passed") 20 | ErrIRQIssue = errors.New("NVIDIA Kernel detected an interrupt issue with a GPU") 21 | ErrLibraryNotFound = errors.New("NVML Shared Library couldn't be found or loaded") 22 | ErrFunctionNotFound = errors.New("Local version of NVML doesn't implement this function") 23 | ErrCorruptedInfoROM = errors.New("infoROM is corrupted") 24 | ErrGPULost = errors.New("The GPU has fallen off the bus or has otherwise become inaccessible") 25 | ErrResetRequired = errors.New("The GPU requires a reset before it can be used again") 26 | ErrOperatingSystem = errors.New("The GPU control device has been blocked by the operating system/cgroups") 27 | ErrLibRMVersionMismatch = errors.New("RM detects a driver/library version mismatch") 28 | ErrInUse = errors.New("An operation cannot be performed because the GPU is currently in use") 29 | ErrMemory = errors.New("Insufficient memory") 30 | ErrNoData = errors.New("No data") 31 | ErrVGPUECCNotSupported = errors.New("The requested vgpu operation is not available on target device, because ECC is enabled") 32 | ErrUnknown = errors.New("An internal driver 
error occurred") 33 | ) 34 | 35 | var errorCodeMappings = map[int]error{ 36 | 0: nil, 37 | 1: ErrUninitialized, 38 | 2: ErrInvalidArgument, 39 | 3: ErrNotSupported, 40 | 4: ErrNoPermission, 41 | 5: ErrAlreadyInititlized, 42 | 6: ErrNotFound, 43 | 7: ErrInsufficientSize, 44 | 8: ErrInsufficientPower, 45 | 9: ErrDriverNotLoaded, 46 | 10: ErrTimeout, 47 | 11: ErrIRQIssue, 48 | 12: ErrLibraryNotFound, 49 | 13: ErrFunctionNotFound, 50 | 14: ErrCorruptedInfoROM, 51 | 15: ErrGPULost, 52 | 16: ErrResetRequired, 53 | 17: ErrOperatingSystem, 54 | 18: ErrLibRMVersionMismatch, 55 | 19: ErrInUse, 56 | 20: ErrMemory, 57 | 21: ErrNoData, 58 | 22: ErrVGPUECCNotSupported, 59 | 999: ErrUnknown, 60 | } 61 | 62 | func returnValueToError(code int) error { 63 | if code == 0 { 64 | return nil 65 | } 66 | 67 | err, ok := errorCodeMappings[code] 68 | if ok { 69 | return err 70 | } 71 | 72 | return fmt.Errorf("NVML call failed with error code %d", code) 73 | } 74 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | import ( 4 | "log" 5 | ) 6 | 7 | func ExampleNew() { 8 | nvml, err := New("") 9 | if err != nil { 10 | panic(err) 11 | } 12 | 13 | defer nvml.Shutdown() 14 | 15 | err = nvml.Init() 16 | if err != nil { 17 | panic(err) 18 | } 19 | 20 | driverVersion, err := nvml.SystemGetDriverVersion() 21 | if err != nil { 22 | panic(err) 23 | } 24 | 25 | log.Printf("Driver version:\t%s", driverVersion) 26 | 27 | nvmlVersion, err := nvml.SystemGetNVMLVersion() 28 | if err != nil { 29 | panic(err) 30 | } 31 | 32 | log.Printf("NVML version:\t%s", nvmlVersion) 33 | 34 | deviceCount, err := nvml.DeviceGetCount() 35 | if err != nil { 36 | panic(err) 37 | } 38 | 39 | for i := uint32(0); i < deviceCount; i++ { 40 | handle, err := nvml.DeviceGetHandleByIndex(i) 41 | if err != nil { 42 | panic(err) 43 | } 44 | 45 | name, err := nvml.DeviceGetName(handle) 
46 | log.Printf("Product name:\t%s", name) 47 | 48 | brand, err := nvml.DeviceGetBrand(handle) 49 | if err != nil { 50 | panic(err) 51 | } 52 | 53 | log.Printf("Product Brand:\t%s", brand) 54 | 55 | uuid, err := nvml.DeviceGetUUID(handle) 56 | if err != nil { 57 | panic(err) 58 | } 59 | 60 | log.Printf("GPU UUID:\t\t%s", uuid) 61 | 62 | fan, err := nvml.DeviceGetFanSpeed(handle) 63 | if err != nil { 64 | panic(err) 65 | } 66 | 67 | log.Printf("Fan Speed:\t\t%d", fan) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /structs.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | import "math" 4 | 5 | // Device represents native NVML device handle. 6 | type Device uintptr 7 | 8 | const ( 9 | systemDriverVersionBufferSize = 80 10 | deviceNameBufferSize = 64 11 | deviceSerialBufferSize = 30 12 | deviceUUIDBufferSize = 80 13 | deviceVBIOSVersionBufferSize = 32 14 | deviceInfoROMVersionBufferSize = 16 15 | ) 16 | 17 | // Memory holds allocation information for a device. 18 | type Memory struct { 19 | Total uint64 // Total installed FB memory (in bytes). 20 | Free uint64 // Unallocated FB memory (in bytes). 21 | Used uint64 // Allocated FB memory (in bytes). 22 | } 23 | 24 | // BAR1Memory holds BAR1 memory allocation information for a device. 25 | type BAR1Memory struct { 26 | Total uint64 // Total BAR1 Memory (in bytes) 27 | Free uint64 // Unallocated BAR1 Memory (in bytes) 28 | Used uint64 // Allocated Used Memory (in bytes) 29 | } 30 | 31 | // Temperature sensors. 32 | type TemperatureSensor int32 33 | 34 | //noinspection GoUnusedConst 35 | const ( 36 | TemperatureGPU = TemperatureSensor(0) // Temperature sensor for the GPU die. 37 | ) 38 | 39 | // Temperature thresholds. 
// ProcessInfo holds information about running compute processes on the GPU.
type ProcessInfo struct {
	// PID is the process ID.
	PID uint32
	// UsedGPUMemory is the amount of used GPU memory in bytes. Under WDDM,
	// NVML_VALUE_NOT_AVAILABLE is always reported because Windows KMD manages
	// all the memory and not the NVIDIA driver.
	UsedGPUMemory uint64
}

// MemoryInfoAvailable reports whether UsedGPUMemory holds anything other than
// the all-ones value math.MaxUint64 (presumably the NVML_VALUE_NOT_AVAILABLE
// sentinel mentioned on the field).
func (p ProcessInfo) MemoryInfoAvailable() bool {
	const notAvailable = math.MaxUint64

	return p.UsedGPUMemory != notAvailable
}
// BrandType identifies the brand of the GPU.
type BrandType int32

//noinspection GoUnusedConst
const (
	BrandUnknown BrandType = 0
	BrandQuadro  BrandType = 1
	BrandTesla   BrandType = 2
	BrandNVS     BrandType = 3
	BrandGrid    BrandType = 4
	BrandGeforce BrandType = 5
)

// String returns the display name of the brand. Any value without a
// dedicated name, including BrandUnknown, is rendered as "Unknown".
func (b BrandType) String() string {
	names := [...]string{
		BrandUnknown: "Unknown",
		BrandQuadro:  "Quadro",
		BrandTesla:   "Tesla",
		BrandNVS:     "NVS",
		BrandGrid:    "Grid",
		BrandGeforce: "Geforce",
	}

	// Values outside the table fall back to the generic name.
	if b < 0 || int(b) >= len(names) {
		return "Unknown"
	}

	return names[b]
}
139 | ClocksThrottleReasonSWPowerCap = ClocksThrottleReason(0x0000000000000004) 140 | // HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged. 141 | // This is an indicator of: 142 | // - Temperature being too high 143 | // - External Power Brake Assertion is triggered (e.g. by the system power supply) 144 | // - Power draw is too high and Fast Trigger protection is reducing the clocks 145 | // - May be also reported during PState or clock change 146 | // - This behavior may be removed in a later release. 147 | ClocksThrottleReasonHWSlowdown = ClocksThrottleReason(0x0000000000000008) 148 | // This GPU has been added to a Sync boost group with nvidia-smi or DCGM in 149 | // order to maximize performance per watt. All GPUs in the sync boost group 150 | // will boost to the minimum possible clocks across the entire group. Look at 151 | // the throttle reasons for other GPUs in the system to see why those GPUs are 152 | // holding this one at lower clocks. 153 | ClocksThrottleReasonSyncBoost = ClocksThrottleReason(0x0000000000000010) 154 | // SW Thermal Slowdown 155 | // This is an indicator of one or more of the following: 156 | // - Current GPU temperature above the GPU Max Operating Temperature 157 | // - Current memory temperature above the Memory Max Operating Temperature 158 | ClocksThrottleReasonSWThermalSlowdown = ClocksThrottleReason(0x0000000000000020) 159 | // HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged. 160 | // This is an indicator of: 161 | // - Temperature being too high 162 | ClocksThrottleReasonHwThermalSlowdown = ClocksThrottleReason(0x0000000000000040) 163 | // HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged. 164 | // This is an indicator of: 165 | // - External Power Brake Assertion being triggered (e.g. 
// GPUOperationMode represents GPU Operation Mode (GOM).
// A GOM reduces power usage and optimizes GPU throughput by disabling GPU features;
// each mode is designed to meet specific user needs.
type GPUOperationMode int32

//noinspection GoUnusedConst
const (
	// GPUOperationModeAllOn: everything is enabled and running at full speed.
	GPUOperationModeAllOn GPUOperationMode = 0
	// GPUOperationModeCompute: designed for running only compute tasks.
	// Graphics operations are not allowed.
	GPUOperationModeCompute GPUOperationMode = 1
	// GPUOperationModeLowDoublePrecision: designed for running graphics applications
	// that don't require high bandwidth double precision.
	GPUOperationModeLowDoublePrecision GPUOperationMode = 2
)
// ComputeMode enumerates the compute modes of a device.
type ComputeMode int32

//noinspection GoUnusedConst
const (
	// ComputeModeDefault is the default compute mode - multiple contexts per device.
	ComputeModeDefault ComputeMode = 0
	// ComputeModeExclusiveThread: support removed.
	ComputeModeExclusiveThread ComputeMode = 1
	// ComputeModeProhibited: no contexts per device.
	ComputeModeProhibited ComputeMode = 2
	// ComputeModeExclusiveProcess: only one context per device, usable from
	// multiple threads at a time.
	ComputeModeExclusiveProcess ComputeMode = 3
)
// MemoryLocation enumerates memory locations.
type MemoryLocation int32

//noinspection GoUnusedConst
const (
	// MemoryLocationL1Cache is the GPU L1 cache.
	MemoryLocationL1Cache MemoryLocation = 0
	// MemoryLocationL2Cache is the GPU L2 cache.
	MemoryLocationL2Cache MemoryLocation = 1
	// MemoryLocationDeviceMemory is the GPU device memory.
	MemoryLocationDeviceMemory MemoryLocation = 2
	// MemoryLocationRegisterFile is the GPU register file.
	MemoryLocationRegisterFile MemoryLocation = 3
	// MemoryLocationTextureMemory is the GPU texture memory.
	MemoryLocationTextureMemory MemoryLocation = 4
	// MemoryLocationTextureSHM is shared memory.
	MemoryLocationTextureSHM MemoryLocation = 5
	// MemoryLocationCBU is the CBU.
	MemoryLocationCBU MemoryLocation = 6
)
// GPUTopologyLevel represents level relationships within a system between two GPUs.
// The values are spaced apart to leave room for future relationships.
type GPUTopologyLevel int32

//noinspection GoUnusedConst
const (
	TopologyInternal   GPUTopologyLevel = 0
	TopologySingle     GPUTopologyLevel = 10
	TopologyMultiple   GPUTopologyLevel = 20
	TopologyHostbridge GPUTopologyLevel = 30
	TopologyNode       GPUTopologyLevel = 40
	TopologySystem     GPUTopologyLevel = 50
)
// PerfPolicyType represents the type of perf policy for which violation times can be queried.
type PerfPolicyType int32

//noinspection GoUnusedConst
const (
	// PerfPolicyPower: how long did power violations cause the GPU to be below application clocks.
	PerfPolicyPower PerfPolicyType = 0
	// PerfPolicyThermal: how long did thermal violations cause the GPU to be below application clocks.
	PerfPolicyThermal PerfPolicyType = 1
	// PerfPolicySyncBoost: how long did sync boost cause the GPU to be below application clocks.
	PerfPolicySyncBoost PerfPolicyType = 2
	// PerfPolicyBoardLimit: how long did the board limit cause the GPU to be below application clocks.
	PerfPolicyBoardLimit PerfPolicyType = 3
	// PerfPolicyLowUtilization: how long did low utilization cause the GPU to be below application clocks.
	PerfPolicyLowUtilization PerfPolicyType = 4
	// PerfPolicyReliability: how long did the board reliability limit cause the GPU to be below application clocks.
	PerfPolicyReliability PerfPolicyType = 5
	// PerfPolicyTotalAppClocks: total time the GPU was held below application clocks by any limiter (0 - 5 above).
	PerfPolicyTotalAppClocks PerfPolicyType = 10
	// PerfPolicyTotalBaseClocks: total time the GPU was held below base clocks.
	PerfPolicyTotalBaseClocks PerfPolicyType = 11
)
10 | func (a API) SystemGetCudaDriverVersion() (cudaDriverVersion int32, err error) { 11 | err = a.call(a.nvmlSystemGetCudaDriverVersion, uintptr(unsafe.Pointer(&cudaDriverVersion))) 12 | return 13 | } 14 | 15 | // SystemGetDriverVersion retrieves the version of the system's graphics driver. 16 | func (a API) SystemGetDriverVersion() (string, error) { 17 | buffer := [systemDriverVersionBufferSize]C.char{} 18 | if err := a.call(a.nvmlSystemGetDriverVersion, uintptr(unsafe.Pointer(&buffer[0])), systemDriverVersionBufferSize); err != nil { 19 | return "", err 20 | } 21 | 22 | return C.GoString(&buffer[0]), nil 23 | } 24 | 25 | // SystemGetNVMLVersion retrieves the version of the NVML library. 26 | func (a API) SystemGetNVMLVersion() (string, error) { 27 | buffer := [systemDriverVersionBufferSize]C.char{} 28 | if err := a.call(a.nvmlSystemGetNVMLVersion, uintptr(unsafe.Pointer(&buffer[0])), systemDriverVersionBufferSize); err != nil { 29 | return "", err 30 | } 31 | 32 | return C.GoString(&buffer[0]), nil 33 | } 34 | 35 | // SystemGetProcessName gets name of the process with provided process id 36 | func (a API) SystemGetProcessName(pid uint) (string, error) { 37 | const maxLength = 256 38 | 39 | buffer := [maxLength]C.char{} 40 | if err := a.call(a.nvmlSystemGetProcessName, uintptr(pid), uintptr(unsafe.Pointer(&buffer[0])), maxLength); err != nil { 41 | return "", err 42 | } 43 | 44 | return C.GoString(&buffer[0]), nil 45 | } 46 | -------------------------------------------------------------------------------- /system_test.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/stretchr/testify/require" 7 | ) 8 | 9 | func TestSystemGetCudaDriverVersion(t *testing.T) { 10 | w, _ := create(t) 11 | defer w.Shutdown() 12 | 13 | version, err := w.SystemGetCudaDriverVersion() 14 | require.NoError(t, err) 15 | require.True(t, version > 0) 16 | } 17 | 18 | func 
TestSystemGetDriverVersion(t *testing.T) { 19 | w, _ := create(t) 20 | defer w.Shutdown() 21 | 22 | version, err := w.SystemGetDriverVersion() 23 | require.NoError(t, err) 24 | require.NotEmpty(t, version) 25 | } 26 | 27 | func TestSystemGetNVMLVersion(t *testing.T) { 28 | w, _ := create(t) 29 | defer w.Shutdown() 30 | 31 | version, err := w.SystemGetNVMLVersion() 32 | require.NoError(t, err) 33 | require.NotEmpty(t, version) 34 | } 35 | 36 | func TestSystemGetProcessName(t *testing.T) { 37 | w, _ := create(t) 38 | defer w.Shutdown() 39 | 40 | name, err := w.SystemGetProcessName(1336) 41 | require.NoError(t, err) 42 | require.NotEmpty(t, name) 43 | } 44 | --------------------------------------------------------------------------------