├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── cmd └── detect-gpu │ └── main.go ├── docs └── zh-cn.md ├── examples └── main.go ├── go.mod ├── go.sum ├── internal └── stat │ └── stat.go └── pkg └── detect ├── detect.go ├── detect_test.go └── model.go /.gitignore: -------------------------------------------------------------------------------- 1 | # If you prefer the allow list template instead of the deny list, see community template: 2 | # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore 3 | # 4 | # Binaries for programs and plugins 5 | *.exe 6 | *.exe~ 7 | *.dll 8 | *.so 9 | *.dylib 10 | 11 | # Test binary, built with `go test -c` 12 | *.test 13 | 14 | # Output of the go coverage tool, specifically when used with LiteIDE 15 | *.out 16 | 17 | # Dependency directories (remove the comment below to include it) 18 | # vendor/ 19 | 20 | # Go workspace file 21 | go.work 22 | 23 | bin 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BINARY = detect-gpu 2 | GOARCH = amd64 3 | 4 | GITHUB_USER = mayooot 5 | CURRENT_DIR =$(shell pwd) 6 | BUILD_DIR=${CURRENT_DIR}/cmd/${BINARY} 7 | BIN_DIR=${CURRENT_DIR}/bin 8 | 9 | all: fmt imports test clean linux 10 | 11 | fmt: 12 | gofmt -l -w . 13 | 14 | imports: 15 | goimports-reviser --rm-unused -local github.com/${GITHUB_USER}/${BINARY} -format ./... 16 | 17 | test: 18 | go test -v pkg/detect/* 19 | 20 | clean: 21 | - rm -f ${BIN_DIR}/* 22 | 23 | linux: 24 | @cd ${BUILD_DIR}; \ 25 | GOOS=linux GOARCH=${GOARCH} go build -o ${BIN_DIR}/${BINARY}-linux-${GOARCH} . 
; \ 26 | cd - >/dev/null 27 | 28 | .PHONY: all fmt imports test clean linux -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Detect-GPU 2 | 3 | ![license](https://img.shields.io/hexpm/l/plug.svg) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/mayooot/detect-gpu)](https://goreportcard.com/report/github.com/mayooot/detect-gpu) 5 | 6 | [简体中文](/docs/zh-cn.md) 7 | 8 | # Overview 9 | 10 | `Detect-GPU` is an HTTP server that calls [go-nvml](https://github.com/NVIDIA/go-nvml) and provides an API for retrieving 11 | information about the NVIDIA GPUs on a Linux server. 12 | 13 | Development often happens on macOS or Windows, but [go-nvml](https://github.com/NVIDIA/go-nvml) requires the Linux NVIDIA 14 | driver, so using it directly produces errors during development. 15 | 16 | For that reason it is split out as a standalone HTTP service, so that the main application can be compiled and run successfully even 17 | in an environment without NVIDIA drivers. 18 | 19 | - [Detect-GPU](#detect-gpu) 20 | - [Overview](#overview) 21 | - [Quick Start](#quick-start) 22 | - [Build from source](#build-from-source) 23 | - [Test](#test) 24 | - [Installation](#installation) 25 | - [Usage](#usage) 26 | - [Contribute](#contribute) 27 | 28 | # Quick Start 29 | 30 | Download the binary executable from the [releases page](https://github.com/mayooot/detect-gpu/releases) 31 | and run it. 
32 | 33 | ```shell 34 | $ ./detect-gpu-linux-amd64 35 | 2024/01/08 06:46:03 stat.go:60: [info] detect server start success, listen on 0.0.0.0:2376 36 | 2024/01/08 06:46:03 stat.go:61: [info] detect gpu timeout: 1000 ms 37 | 2024/01/08 06:46:03 stat.go:62: [info] ROUTES: 38 | 2024/01/08 06:46:03 stat.go:63: [info] GET --> /api/v1/detect/gpu 39 | 40 | $ ./detect-gpu-linux-amd64 -h 41 | Usage of ./detect-gpu-linux-amd64: 42 | -a, --addr string Address of detect server, format: ip:port, default: 0.0.0.0:2376 (default "0.0.0.0:2376") 43 | -p, --path string Path of detect server, default: /api/v1/detect/gpu (default "/api/v1/detect/gpu") 44 | -t, --timeout duration Timeout of detect gpu, default: 1s (default 1s) 45 | pflag: help requested 46 | ``` 47 | 48 | Send a GET request using cURL or any language. 49 | 50 | ```shell 51 | $ curl 127.0.0.1:2376/api/v1/detect/gpu 52 | [ 53 | { 54 | "index":0, 55 | "uuid":"uuid", 56 | "name":"NVIDIA A100 80GB PCIe", 57 | "memoryInfo":{ 58 | "Total":85899345920, 59 | "Free":63216877568, 60 | "Used":22682468352 61 | }, 62 | "powerUsage":74634, 63 | "powerState":0, 64 | "powerManagementDefaultLimit":300000, 65 | "informImageVersion":"1001.0230.00.03", 66 | "systemGetDriverVersion":"525.85.12", 67 | "systemGetCudaDriverVersion":12000, 68 | "tGraphicsRunningProcesses":[] 69 | }, 70 | { 71 | "index":1, 72 | "uuid":"uuid", 73 | "name":"NVIDIA A100 80GB PCIe", 74 | "memoryInfo":{ 75 | "Total":85899345920, 76 | "Free":30687952896, 77 | "Used":55211393024 78 | }, 79 | "powerUsage":65507, 80 | "powerState":0, 81 | "powerManagementDefaultLimit":300000, 82 | "informImageVersion":"1001.0230.00.03", 83 | "systemGetDriverVersion":"525.85.12", 84 | "systemGetCudaDriverVersion":12000, 85 | "tGraphicsRunningProcesses":[] 86 | } 87 | ] 88 | ``` 89 | 90 | # Build from source 91 | 92 | ```shell 93 | $ git clone https://github.com/mayooot/detect-gpu 94 | $ cd detect-gpu 95 | $ make linux 96 | ``` 97 | 98 | # Test 99 | 100 | ```shell 101 | $ make test 102 | 
``` 103 | 104 | # Installation 105 | 106 | `Detect-GPU` is available using the standard go get command. 107 | 108 | Install by running: 109 | 110 | ```shell 111 | $ go get github.com/mayooot/detect-gpu/pkg/detect 112 | ``` 113 | 114 | # Usage 115 | 116 | You can refer to the [example](./examples/main.go) for usage. 117 | 118 | Like this: 119 | 120 | ```go 121 | package main 122 | 123 | import ( 124 | "fmt" 125 | "time" 126 | 127 | "github.com/mayooot/detect-gpu/pkg/detect" 128 | ) 129 | 130 | func main() { 131 | timeOutDuration := 500 * time.Millisecond 132 | 133 | testClient := detect.NewClient(detect.WithTimeout(timeOutDuration)) 134 | if err := testClient.Init(); err != nil { 135 | panic(err) 136 | } 137 | defer testClient.Close() 138 | 139 | gpus, err := testClient.DetectGpu() 140 | if err != nil { 141 | panic(err) 142 | } 143 | for _, gpu := range gpus { 144 | fmt.Printf("%#+v\n", gpu) 145 | } 146 | } 147 | 148 | ``` 149 | 150 | # Contribute 151 | 152 | Feel free to open issues and pull requests. Any feedback is highly appreciated! 
-------------------------------------------------------------------------------- /cmd/detect-gpu/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | goflag "flag" 5 | "fmt" 6 | "syscall" 7 | "time" 8 | 9 | "github.com/judwhite/go-svc" 10 | "github.com/ngaut/log" 11 | flag "github.com/spf13/pflag" 12 | 13 | "github.com/mayooot/detect-gpu/internal/stat" 14 | "github.com/mayooot/detect-gpu/pkg/detect" 15 | ) 16 | 17 | var ( 18 | addr = flag.StringP("addr", "a", "0.0.0.0:2376", "Address of detect server, format: ip:port, default: 0.0.0.0:2376") 19 | path = flag.StringP("path", "p", "/api/v1/detect/gpu", "Path of detect server, default: /api/v1/detect/gpu") 20 | timeout = flag.DurationP("timeout", "t", 1*time.Second, "Timeout of detect gpu, default: 1s") 21 | ) 22 | 23 | type program struct { 24 | stat *stat.Stat 25 | client *detect.Client 26 | } 27 | 28 | func main() { 29 | prg := &program{} 30 | if err := svc.Run(prg, syscall.SIGINT, syscall.SIGTERM); err != nil { 31 | log.Fatal(err) 32 | } 33 | } 34 | 35 | func (p *program) Init(svc.Environment) error { 36 | flag.CommandLine.AddGoFlagSet(goflag.CommandLine) 37 | flag.Parse() 38 | 39 | if len(*addr) == 0 || len(*path) == 0 || *timeout == 0 { 40 | return fmt.Errorf("addr, path, timeout must be set, "+ 41 | "addr: %s, path: %s, timeout: %d", 42 | *addr, *path, *timeout) 43 | } 44 | 45 | p.stat = stat.NewStat(stat.WithAddr(*addr), 46 | stat.WithPath(*path), 47 | stat.WithClient( 48 | detect.NewClient(detect.WithTimeout(*timeout)), 49 | )) 50 | 51 | if err := p.client.Init(); err != nil { 52 | return err 53 | } 54 | return nil 55 | } 56 | 57 | func (p *program) Start() error { 58 | go func() { 59 | p.stat.Run() 60 | }() 61 | return nil 62 | } 63 | 64 | func (p *program) Stop() error { 65 | if err := p.client.Close(); err != nil { 66 | log.Error(err) 67 | return err 68 | } 69 | log.Info("shutdown nvml success") 70 | log.Info("detect gpu server 
close") 71 | return nil 72 | } 73 | -------------------------------------------------------------------------------- /docs/zh-cn.md: -------------------------------------------------------------------------------- 1 | # Detect-GPU 2 | 3 | ![license](https://img.shields.io/hexpm/l/plug.svg) 4 | [![Go Report Card](https://goreportcard.com/badge/github.com/mayooot/detect-gpu)](https://goreportcard.com/report/github.com/mayooot/detect-gpu) 5 | 6 | [English](..%2FREADME.md) 7 | 8 | # 概述 9 | 10 | 调用 [go-nvml](https://github.com/NVIDIA/go-nvml),获取 Linux GPU服务器上的 NVIDIA GPU 信息,并以 API 的形式暴露。 11 | 12 | 因为开发时可能使用 mac 或者 windows 系统,但是 go-nvml 需要 Linux NVIDIA 的驱动,所以开发时会报错。 13 | 14 | 所以把它拆分出来,作为一个独立的 HTTP 服务,这样即使在没有 NVIDIA 驱动的环境中,也可以成功编译和运行主要应用程序。 15 | 16 | - [Detect-GPU](#detect-gpu) 17 | - [概述](#概述) 18 | - [快速开始](#快速开始) 19 | - [从源码构建](#从源码构建) 20 | - [运行](#运行) 21 | - [测试](#测试) 22 | - [使用](#使用) 23 | - [在Go项目中引用](#在Go项目中引用) 24 | - [简单的例子](#简单的例子) 25 | - [贡献代码](#贡献代码) 26 | 27 | # 快速开始 28 | 29 | ## 从源码构建 30 | 31 | ```shell 32 | $ git clone https://github.com/mayooot/detect-gpu 33 | $ cd detect-gpu 34 | $ make linux 35 | ``` 36 | 37 | ## 运行 38 | 39 | 如果需要的话,可以指定参数 40 | 41 | ```shell 42 | $ ./detect-gpu-linux-amd64 -h 43 | Usage of ./detect-gpu-linux-amd64: 44 | -r, --pattern string Pattern of detect server (default "/api/v1/detect/gpu") 45 | -p, --port string Port of detect server, format :port (default ":2376") 46 | -t, --td duration Timeout duration for detect gpu (default 5s) 47 | pflag: help requested 48 | 49 | $ ./detect-gpu-linux-amd64 50 | ``` 51 | 52 | ## 测试 53 | 54 | ```shell 55 | go test -v pkg/detect/* 56 | ``` 57 | 58 | ## 使用 59 | 60 | ```shell 61 | $ curl 127.0.0.1:2376/api/v1/detect/gpu 62 | [ 63 | { 64 | "index":0, 65 | "uuid":"GPU-uuid", 66 | "name":"NVIDIA A100 80GB PCIe", 67 | "memoryInfo":{ 68 | "Total":85899345920, 69 | "Free":63216877568, 70 | "Used":22682468352 71 | }, 72 | "powerUsage":74634, 73 | "powerState":0, 74 | "powerManagementDefaultLimit":300000, 75 | 
"informImageVersion":"1001.0230.00.03", 76 | "systemGetDriverVersion":"525.85.12", 77 | "systemGetCudaDriverVersion":12000, 78 | "tGraphicsRunningProcesses":[] 79 | }, 80 | { 81 | "index":1, 82 | "uuid":"GPU-uuid", 83 | "name":"NVIDIA A100 80GB PCIe", 84 | "memoryInfo":{ 85 | "Total":85899345920, 86 | "Free":30687952896, 87 | "Used":55211393024 88 | }, 89 | "powerUsage":65507, 90 | "powerState":0, 91 | "powerManagementDefaultLimit":300000, 92 | "informImageVersion":"1001.0230.00.03", 93 | "systemGetDriverVersion":"525.85.12", 94 | "systemGetCudaDriverVersion":12000, 95 | "tGraphicsRunningProcesses":[] 96 | } 97 | ] 98 | ``` 99 | 100 | # 在Go项目中引用 101 | 102 | 使用标准的 go get 命令可以获得 `detect-gpu`。 103 | 104 | ```shell 105 | $ go get github.com/mayooot/detect-gpu/pkg/detect 106 | ``` 107 | 108 | ## 简单的例子 109 | 110 | ```go 111 | package main 112 | 113 | import ( 114 | "fmt" 115 | "time" 116 | 117 | "github.com/mayooot/detect-gpu/pkg/detect" 118 | ) 119 | 120 | func main() { 121 | timeOutDuration := 500 * time.Millisecond 122 | 123 | testClient := detect.NewClient(detect.WithTimeout(timeOutDuration)) 124 | if err := testClient.Init(); err != nil { 125 | panic(err) 126 | } 127 | defer testClient.Close() 128 | 129 | gpus, err := testClient.DetectGpu() 130 | if err != nil { 131 | panic(err) 132 | } 133 | for _, gpu := range gpus { 134 | fmt.Printf("%#+v\n", gpu) 135 | } 136 | } 137 | ``` 138 | 139 | # 贡献代码 140 | 141 | 欢迎贡献代码或 issue! 
135 | -------------------------------------------------------------------------------- /examples/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | 7 | "github.com/mayooot/detect-gpu/pkg/detect" 8 | ) 9 | 10 | func main() { 11 | timeOutDuration := 500 * time.Millisecond 12 | 13 | testClient := detect.NewClient(detect.WithTimeout(timeOutDuration)) 14 | if err := testClient.Init(); err != nil { 15 | panic(err) 16 | } 17 | defer testClient.Close() 18 | 19 | gpus, err := testClient.DetectGpu() 20 | if err != nil { 21 | panic(err) 22 | } 23 | for _, gpu := range gpus { 24 | fmt.Printf("%#+v\n", gpu) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/mayooot/detect-gpu 2 | 3 | go 1.20 4 | 5 | require ( 6 | github.com/NVIDIA/go-nvml v0.12.0-1 7 | github.com/judwhite/go-svc v1.2.1 8 | github.com/ngaut/log v0.0.0-20221012222132-f3329cba28a5 9 | github.com/spf13/pflag v1.0.5 10 | github.com/stretchr/testify v1.8.4 11 | ) 12 | 13 | require ( 14 | github.com/davecgh/go-spew v1.1.1 // indirect 15 | github.com/pmezard/go-difflib v1.0.0 // indirect 16 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c // indirect 17 | gopkg.in/yaml.v3 v3.0.1 // indirect 18 | ) 19 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM= 2 | github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= 3 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 4 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 5 | github.com/judwhite/go-svc 
v1.2.1 h1:a7fsJzYUa33sfDJRF2N/WXhA+LonCEEY8BJb1tuS5tA= 6 | github.com/judwhite/go-svc v1.2.1/go.mod h1:mo/P2JNX8C07ywpP9YtO2gnBgnUiFTHqtsZekJrUuTk= 7 | github.com/ngaut/log v0.0.0-20221012222132-f3329cba28a5 h1:xIBY4Eci2hOJJLZLbQ9g/Uuq+V6QLiOBi1mZzqZyqOY= 8 | github.com/ngaut/log v0.0.0-20221012222132-f3329cba28a5/go.mod h1:ueVCjKQllPmX7uEvCYnZD5b8qjidGf1TCH61arVe4SU= 9 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 10 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 11 | github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= 12 | github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= 13 | github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= 14 | github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= 15 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c h1:VwygUrnw9jn88c4u8GD3rZQbqrP/tgas88tPUbBxQrk= 16 | golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 17 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 18 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 19 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 20 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 21 | -------------------------------------------------------------------------------- /internal/stat/stat.go: -------------------------------------------------------------------------------- 1 | package stat 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "net" 7 | "net/http" 8 | 9 | "github.com/ngaut/log" 10 | 11 | "github.com/mayooot/detect-gpu/pkg/detect" 12 | ) 13 | 14 | type Option func(s *Stat) 15 | 16 | // Stat is used to process http requests. 
17 | type Stat struct { 18 | l net.Listener 19 | 20 | Addr string 21 | Path string 22 | 23 | Client *detect.Client 24 | } 25 | 26 | func NewStat(opts ...Option) *Stat { 27 | s := &Stat{} 28 | for _, apply := range opts { 29 | apply(s) 30 | } 31 | return s 32 | } 33 | 34 | func WithAddr(Addr string) Option { 35 | return func(s *Stat) { 36 | s.Addr = Addr 37 | } 38 | } 39 | 40 | func WithPath(path string) Option { 41 | return func(s *Stat) { 42 | s.Path = path 43 | } 44 | } 45 | 46 | func WithClient(client *detect.Client) Option { 47 | return func(s *Stat) { 48 | s.Client = client 49 | } 50 | } 51 | 52 | func (st *Stat) Run() { 53 | var err error 54 | 55 | st.l, err = net.Listen("tcp", st.Addr) 56 | if err != nil { 57 | log.Errorf("detect server start failed, err: %v", err) 58 | return 59 | } 60 | log.Infof("detect server start success, listen on %s", st.Addr) 61 | log.Infof("detect gpu timeout: %d ms", st.Client.Timeout.Milliseconds()) 62 | log.Info("ROUTES:") 63 | log.Infof("GET --> /api/v1/detect/gpu") 64 | 65 | srv := http.Server{} 66 | mux := http.NewServeMux() 67 | mux.HandleFunc(st.Path, func(w http.ResponseWriter, req *http.Request) { 68 | if req.Method != http.MethodGet { 69 | log.Errorf("method not allowed, method: %s", req.Method) 70 | http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed) 71 | return 72 | } 73 | 74 | gpuInfos, err := st.Client.DetectGpu() 75 | if err != nil { 76 | err = fmt.Errorf("detect.DetectGpu failed: %w", err) 77 | log.Errorf(err.Error()) 78 | http.Error(w, err.Error(), http.StatusInternalServerError) 79 | return 80 | } 81 | 82 | resp, _ := json.Marshal(gpuInfos) 83 | w.WriteHeader(http.StatusOK) 84 | w.Write(resp) 85 | log.Infof("detect gpu success, gpu num: %d", len(gpuInfos)) 86 | }) 87 | 88 | srv.Handler = mux 89 | srv.Serve(st.l) 90 | } 91 | -------------------------------------------------------------------------------- /pkg/detect/detect.go: 
-------------------------------------------------------------------------------- 1 | package detect 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "time" 7 | 8 | "github.com/NVIDIA/go-nvml/pkg/nvml" 9 | ) 10 | 11 | type Option func(c *Client) 12 | 13 | // Client calls nvml to query gpus 14 | type Client struct { 15 | Timeout time.Duration 16 | } 17 | 18 | func NewClient(opts ...Option) *Client { 19 | c := &Client{} 20 | for _, apply := range opts { 21 | apply(c) 22 | } 23 | return c 24 | } 25 | 26 | func WithTimeout(timeout time.Duration) Option { 27 | return func(c *Client) { 28 | c.Timeout = timeout 29 | } 30 | } 31 | 32 | func (c *Client) Init() error { 33 | if ret := nvml.Init(); ret != nvml.SUCCESS { 34 | return fmt.Errorf("unable to initialize NVML: %v", nvml.ErrorString(ret)) 35 | } 36 | return nil 37 | } 38 | 39 | func (c *Client) Close() error { 40 | if ret := nvml.Shutdown(); ret != nvml.SUCCESS { 41 | return fmt.Errorf("unable to shutdown NVML: %v", nvml.ErrorString(ret)) 42 | } 43 | return nil 44 | } 45 | 46 | // DetectGpu return error if the timeout is exceeded 47 | func (c *Client) DetectGpu() ([]*gpuInfo, error) { 48 | ctx, cancel := context.WithTimeout(context.Background(), c.Timeout) 49 | defer cancel() 50 | 51 | resultCh := make(chan []*gpuInfo, 1) 52 | errCh := make(chan error, 1) 53 | go func() { 54 | gpuInfos, err := invokeNvml() 55 | if err != nil { 56 | errCh <- err 57 | return 58 | } 59 | resultCh <- gpuInfos 60 | }() 61 | 62 | select { 63 | case <-ctx.Done(): 64 | return nil, ctx.Err() 65 | case err := <-errCh: 66 | return nil, err 67 | case gpuInfos := <-resultCh: 68 | return gpuInfos, nil 69 | } 70 | } 71 | 72 | func invokeNvml() ([]*gpuInfo, error) { 73 | count, ret := nvml.DeviceGetCount() 74 | if ret != nvml.SUCCESS { 75 | return nil, fmt.Errorf("unable to get gpuInfo count: %v", nvml.ErrorString(ret)) 76 | } 77 | gpuInfos := make([]*gpuInfo, 0, count) 78 | 79 | for i := 0; i < count; i++ { 80 | info := &gpuInfo{Index: i} 81 | device, 
ret := nvml.DeviceGetHandleByIndex(i) 82 | if ret != nvml.SUCCESS { 83 | return gpuInfos, fmt.Errorf("unable to get gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 84 | } 85 | 86 | uuid, ret := device.GetUUID() 87 | if ret != nvml.SUCCESS { 88 | return gpuInfos, fmt.Errorf("unable to get uuid of gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 89 | } 90 | info.UUID = uuid 91 | 92 | name, ret := device.GetName() 93 | if ret != nvml.SUCCESS { 94 | return gpuInfos, fmt.Errorf("unable to get name of gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 95 | } 96 | info.Name = name 97 | 98 | memoryInfo, ret := device.GetMemoryInfo() 99 | if ret != nvml.SUCCESS { 100 | return gpuInfos, fmt.Errorf("unable to get memory info of gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 101 | } 102 | info.MemoryInfo = memoryInfo 103 | 104 | powerUsage, ret := device.GetPowerUsage() 105 | if ret != nvml.SUCCESS { 106 | return gpuInfos, fmt.Errorf("unable to get power usage of gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 107 | } 108 | info.PowerUsage = powerUsage 109 | 110 | powerState, ret := device.GetPowerState() 111 | if ret != nvml.SUCCESS { 112 | return gpuInfos, fmt.Errorf("unable to get power state of gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 113 | } 114 | info.PowerState = powerState 115 | 116 | managementDefaultLimit, ret := device.GetPowerManagementDefaultLimit() 117 | if ret != nvml.SUCCESS { 118 | return gpuInfos, fmt.Errorf("unable to get power management default limit of gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 119 | } 120 | info.PowerManagementDefaultLimit = managementDefaultLimit 121 | 122 | version, ret := device.GetInforomImageVersion() 123 | if ret != nvml.SUCCESS { 124 | return gpuInfos, fmt.Errorf("unable to get info image version of gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 125 | } 126 | info.InformImageVersion = version 127 | 128 | driverVersion, ret := nvml.SystemGetDriverVersion() 129 | if ret != nvml.SUCCESS { 130 
| return gpuInfos, fmt.Errorf("unable to get system driver version: %v", nvml.ErrorString(ret)) 131 | } 132 | info.DriverVersion = driverVersion 133 | 134 | cudaDriverVersion, ret := nvml.SystemGetCudaDriverVersion() 135 | if ret != nvml.SUCCESS { 136 | return gpuInfos, fmt.Errorf("unable to get CUDA driver version: %v", nvml.ErrorString(ret)) 137 | } 138 | info.CUDADriverVersion = cudaDriverVersion 139 | 140 | computeRunningProcesses, ret := device.GetGraphicsRunningProcesses() 141 | if ret != nvml.SUCCESS { 142 | return gpuInfos, fmt.Errorf("unable to get graphics running processes of gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 143 | } 144 | info.GraphicsRunningProcesses = computeRunningProcesses 145 | 146 | 147 | utilization, ret := device.GetUtilizationRates() 148 | if ret != nvml.SUCCESS { 149 | return gpuInfos, fmt.Errorf("unable to get utilization rates of gpuInfo at index %d: %v", i, nvml.ErrorString(ret)) 150 | } 151 | info.Utilization = utilization 152 | 153 | gpuInfos = append(gpuInfos, info) 154 | } 155 | return gpuInfos, nil 156 | } 157 | -------------------------------------------------------------------------------- /pkg/detect/detect_test.go: -------------------------------------------------------------------------------- 1 | package detect 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/stretchr/testify/assert" 8 | "github.com/stretchr/testify/require" 9 | ) 10 | 11 | func TestDetectGpu(t *testing.T) { 12 | client := NewClient(WithTimeout(500 * time.Millisecond)) 13 | err := client.Init() 14 | require.Nil(t, err) 15 | defer client.Close() 16 | 17 | t.Run("Get All Gpus", func(t *testing.T) { 18 | gpus, err := client.DetectGpu() 19 | assert.Nil(t, err) 20 | assert.NotNil(t, gpus) 21 | assert.NotEqual(t, 0, len(gpus)) 22 | }) 23 | } 24 | 25 | func TestDetectGpu_Timeout(t *testing.T) { 26 | allowTimeDuration := 50 * time.Millisecond 27 | sleepTimeDuration := 100 * time.Millisecond 28 | 29 | client := 
NewClient(WithTimeout(allowTimeDuration)) 30 | err := client.Init() 31 | require.Nil(t, err) 32 | defer client.Close() 33 | 34 | t.Run("Test Timeout Control", func(t *testing.T) { 35 | timeOutDetectGpu := func(td time.Duration) ([]*gpuInfo, error) { 36 | time.Sleep(sleepTimeDuration) 37 | return client.DetectGpu() 38 | } 39 | 40 | gpus, err := timeOutDetectGpu(allowTimeDuration) 41 | assert.NotNil(t, err, "Timeout error should be returned") 42 | assert.Nil(t, gpus) 43 | }) 44 | } 45 | -------------------------------------------------------------------------------- /pkg/detect/model.go: -------------------------------------------------------------------------------- 1 | package detect 2 | 3 | import ( 4 | "github.com/NVIDIA/go-nvml/pkg/nvml" 5 | ) 6 | 7 | type gpuInfo struct { 8 | Index int `json:"index"` 9 | UUID string `json:"uuid"` 10 | Name string `json:"name"` 11 | MemoryInfo nvml.Memory `json:"memoryInfo"` 12 | PowerUsage uint32 `json:"powerUsage"` 13 | PowerState nvml.Pstates `json:"powerState"` 14 | PowerManagementDefaultLimit uint32 `json:"powerManagementDefaultLimit"` 15 | InformImageVersion string `json:"informImageVersion"` 16 | DriverVersion string `json:"systemGetDriverVersion"` 17 | CUDADriverVersion int `json:"systemGetCudaDriverVersion"` 18 | GraphicsRunningProcesses []nvml.ProcessInfo `json:"tGraphicsRunningProcesses"` 19 | Utilization nvml.Utilization `json:"utilization"` 20 | } 21 | --------------------------------------------------------------------------------