├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── MAINTAINERS
├── Makefile
├── README.md
├── VERSION
├── build
    ├── Dockerfile
    ├── copy-bin-lib.sh
    ├── extra-config.json
    ├── gpu-manager.conf
    ├── gpu-manager.service
    ├── gpu-manager.spec
    ├── start.sh
    └── volume.conf
├── cmd
    ├── client
    │   └── client.go
    └── manager
    │   ├── app
    │       └── app.go
    │   ├── nvidia-manager.go
    │   └── options
    │       └── options.go
├── docs
    └── faq.md
├── go.mod
├── go.sum
├── gpu-manager-svc.yaml
├── gpu-manager.yaml
├── hack
    ├── build.sh
    └── common.sh
├── pkg
    ├── algorithm
    │   └── nvidia
    │   │   ├── fragment.go
    │   │   ├── fragment_test.go
    │   │   ├── link.go
    │   │   ├── link_test.go
    │   │   ├── share.go
    │   │   ├── share_test.go
    │   │   └── util_test.go
    ├── api
    │   └── runtime
    │   │   ├── display
    │   │       ├── api.pb.go
    │   │       ├── api.pb.gw.go
    │   │       └── api.proto
    │   │   └── vcuda
    │   │       ├── api.pb.go
    │   │       └── api.proto
    ├── config
    │   └── config.go
    ├── device
    │   ├── dummy
    │   │   └── tree.go
    │   ├── nvidia
    │   │   ├── node.go
    │   │   ├── sort.go
    │   │   ├── sort_test.go
    │   │   ├── tree.go
    │   │   ├── tree_test.go
    │   │   └── tree_util.go
    │   ├── register
    │   │   └── register.go
    │   └── types.go
    ├── flags
    │   └── flags.go
    ├── logs
    │   └── logs.go
    ├── runtime
    │   ├── runtime.go
    │   └── runtime_stub.go
    ├── server
    │   ├── server.go
    │   ├── server_test.go
    │   ├── types.go
    │   ├── vcore.go
    │   └── vmemory.go
    ├── services
    │   ├── allocator
    │   │   ├── cache
    │   │   │   └── cache.go
    │   │   ├── checkpoint
    │   │   │   └── manager.go
    │   │   ├── dummy
    │   │   │   └── allocator.go
    │   │   ├── nvidia
    │   │   │   ├── allocator.go
    │   │   │   ├── allocator_test.go
    │   │   │   └── evaluator.go
    │   │   ├── register
    │   │   │   └── register.go
    │   │   └── types.go
    │   ├── display
    │   │   ├── display.go
    │   │   └── helper.go
    │   ├── response
    │   │   ├── fake.go
    │   │   └── manager.go
    │   ├── virtual-manager
    │   │   └── manager.go
    │   ├── volume
    │   │   ├── ldcache
    │   │   │   └── ldcache.go
    │   │   ├── util.go
    │   │   └── volume.go
    │   └── watchdog
    │   │   ├── label.go
    │   │   ├── label_test.go
    │   │   ├── watchdog.go
    │   │   └── watchdog_test.go
    ├── types
    │   └── types.go
    ├── utils
    │   ├── cgroup
    │   │   └── cgroup.go
    │   └── util.go
    └── version
    │   ├── .gitattributes
    │   ├── base.go
    │   ├── verflags.go
    │   └── version.go
├── revive.toml
└── staging
    └── src
        └── google
            └── protobuf
                ├── descriptor.proto
                └── empty.proto


/.gitignore:
--------------------------------------------------------------------------------
1 | go/
2 | .idea
3 | .chglog/
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 | 
3 | services:
4 |   - docker
5 | 
6 | script:
7 |   - make img
8 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | <a name="unreleased"></a>
  2 | ## [Unreleased]
  3 | 
  4 | ### Feat
  5 | - add timeout option waiting for all resource server ready
  6 | - upgrade vcuda
  7 | 
  8 | ### Fix
  9 | - virtual manager can't probe correct vm controller path
 10 | - wrong size of device memory when app has more than 1 card
 11 | - vcuda image repository url
 12 | - the mismatch between gpu-manager pick up and gpu-admission predicate ([#74](https://github.com/tkestack/gpu-manager/issues/74))
 13 | - read QoS class from pod status first
 14 | - kubelet 1.20 device checkpoint support ([#62](https://github.com/tkestack/gpu-manager/issues/62))
 15 | - report device memory when allocate more than one cards
 16 | - DeviceGetTopologyCommonAncestor get a zero value on multi-gpu board
 17 | - gpu-manager lost checkpoint data file
 18 | - preserve attribute
 19 | - read cgroup.procs files recursively
 20 | - wait server until it's ready
 21 | 
 22 | 
 23 | <a name="v1.1.5"></a>
 24 | ## [v1.1.5] - 2021-05-10
 25 | ### Docs
 26 | - Add FAQ link
 27 | - Update gpu manager yaml
 28 | 
 29 | ### Feat
 30 | - upgrade vcuda to 1.0.3
 31 | - Upgrade vcuda-controller to v1.0.1
 32 | - Use host network to build image
 33 | - Update go version to 1.14.3
 34 | - Support CRI interface
 35 | 
 36 | ### Fix
 37 | - kubelet 1.20 device checkpoint support ([#62](https://github.com/tkestack/gpu-manager/issues/62))
 38 | - the mismatch between gpu-manager pick up and gpu-admission predicate ([#74](https://github.com/tkestack/gpu-manager/issues/74))
 39 | - read QoS class from pod status first
 40 | - report device memory when allocate more than one cards
 41 | - gpu-manager lost checkpoint data file
 42 | - DeviceGetTopologyCommonAncestor get a zero value on multi-gpu board
 43 | - preserve attribute
 44 | - read cgroup.procs files recursively
 45 | - wait server until it's ready
 46 | - Revert using vendor directory
 47 | - Allow non-root user to communicate with gpu manager
 48 | - Change ius rpm broken link
 49 | - skip symlink when copy bin to |${NV_DIR}|. ([#15](https://github.com/tkestack/gpu-manager/issues/15))
 50 | 
 51 | ### Refact
 52 | - Use vendor directory
 53 | - Refact gpu-manager code
 54 | 
 55 | 
 56 | <a name="v1.0.9"></a>
 57 | ## [v1.0.9] - 2021-02-23
 58 | ### Feat
 59 | - use apiserver cache to list pod
 60 | 
 61 | ### Fix
 62 | - ignore not running container while recovering
 63 | 
 64 | 
 65 | <a name="v1.0.8"></a>
 66 | ## [v1.0.8] - 2021-02-22
 67 | ### Feat
 68 | - Upgrade vcuda-controller to v1.0.2
 69 | - Use host network to build image
 70 | - Upgrade vcuda-controller to v1.0.1
 71 | 
 72 | ### Fix
 73 | - missing recover tree data if information is retrieved from checkpoint file
 74 | - gpu-manager lost checkpoint data file
 75 | - DeviceGetTopologyCommonAncestor get a zero value on multi-gpu board
 76 | - preserve attribute
 77 | - upgrade go to 1.15
 78 | - wait server until it's ready
 79 | - Change ius rpm broken link
 80 | - Allow non-root user to communicate with gpu manager
 81 | 
 82 | ### Refact
 83 | - only watch pod belong this node
 84 | 
 85 | 
 86 | <a name="v1.1.4"></a>
 87 | ## [v1.1.4] - 2021-02-05
 88 | ### Fix
 89 | - read QoS class from pod status first
 90 | 
 91 | 
 92 | <a name="v1.1.3"></a>
 93 | ## [v1.1.3] - 2021-02-02
 94 | ### Feat
 95 | - upgrade vcuda to 1.0.3
 96 | 
 97 | ### Fix
 98 | - report device memory when allocate more than one cards
 99 | 
100 | 
101 | <a name="v1.1.2"></a>
102 | ## [v1.1.2] - 2020-12-09
103 | ### Docs
104 | - Add FAQ link
105 | - Update gpu manager yaml
106 | 
107 | ### Feat
108 | - Upgrade vcuda-controller to v1.0.1
109 | - Use host network to build image
110 | - Update go version to 1.14.3
111 | - Support CRI interface
112 | 
113 | ### Fix
114 | - gpu-manager lost checkpoint data file
115 | - DeviceGetTopologyCommonAncestor get a zero value on multi-gpu board
116 | - preserve attribute
117 | - read cgroup.procs files recursively
118 | - wait server until it's ready
119 | - Revert using vendor directory
120 | - Allow non-root user to communicate with gpu manager
121 | - Change ius rpm broken link
122 | - skip symlink when copy bin to |${NV_DIR}|. ([#15](https://github.com/tkestack/gpu-manager/issues/15))
123 | 
124 | ### Refact
125 | - Use vendor directory
126 | - Refact gpu-manager code
127 | 
128 | 
129 | <a name="v1.0.7"></a>
130 | ## [v1.0.7] - 2020-12-09
131 | ### Feat
132 | - Upgrade vcuda-controller to v1.0.2
133 | - Use host network to build image
134 | - Upgrade vcuda-controller to v1.0.1
135 | 
136 | ### Fix
137 | - gpu-manager lost checkpoint data file
138 | - DeviceGetTopologyCommonAncestor get a zero value on multi-gpu board
139 | - preserve attribute
140 | - upgrade go to 1.15
141 | - wait server until it's ready
142 | - Change ius rpm broken link
143 | - Allow non-root user to communicate with gpu manager
144 | 
145 | ### Refact
146 | - only watch pod belong this node
147 | 
148 | 
149 | <a name="v1.1.1"></a>
150 | ## [v1.1.1] - 2020-12-02
151 | ### Docs
152 | - Add FAQ link
153 | - Update gpu manager yaml
154 | 
155 | ### Feat
156 | - Upgrade vcuda-controller to v1.0.1
157 | - Use host network to build image
158 | - Update go version to 1.14.3
159 | - Support CRI interface
160 | 
161 | ### Fix
162 | - DeviceGetTopologyCommonAncestor get a zero value on multi-gpu board
163 | - preserve attribute
164 | - read cgroup.procs files recursively
165 | - wait server until it's ready
166 | - Revert using vendor directory
167 | - Allow non-root user to communicate with gpu manager
168 | - Change ius rpm broken link
169 | - skip symlink when copy bin to |${NV_DIR}|. ([#15](https://github.com/tkestack/gpu-manager/issues/15))
170 | 
171 | ### Refact
172 | - Use vendor directory
173 | - Refact gpu-manager code
174 | 
175 | 
176 | <a name="v1.0.6"></a>
177 | ## [v1.0.6] - 2020-12-02
178 | ### Fix
179 | - DeviceGetTopologyCommonAncestor get a zero value on multi-gpu board
180 | - preserve attribute
181 | 
182 | 
183 | <a name="v1.0.5"></a>
184 | ## [v1.0.5] - 2020-08-28
185 | ### Feat
186 | - Upgrade vcuda-controller to v1.0.2
187 | 
188 | ### Fix
189 | - upgrade go to 1.15
190 | - wait server until it's ready
191 | 
192 | ### Refact
193 | - only watch pod belong this node
194 | 
195 | 
196 | <a name="v1.0.4"></a>
197 | ## [v1.0.4] - 2020-05-21
198 | ### Feat
199 | - Use host network to build image
200 | - Upgrade vcuda-controller to v1.0.1
201 | 
202 | ### Fix
203 | - Change ius rpm broken link
204 | - Allow non-root user to communicate with gpu manager
205 | 
206 | 
207 | <a name="v1.1.0"></a>
208 | ## [v1.1.0] - 2020-05-21
209 | ### Docs
210 | - Add FAQ link
211 | - Update gpu manager yaml
212 | 
213 | ### Feat
214 | - Upgrade vcuda-controller to v1.0.1
215 | - Use host network to build image
216 | - Update go version to 1.14.3
217 | - Support CRI interface
218 | 
219 | ### Fix
220 | - Revert using vendor directory
221 | - Allow non-root user to communicate with gpu manager
222 | - Change ius rpm broken link
223 | - skip symlink when copy bin to |${NV_DIR}|. ([#15](https://github.com/tkestack/gpu-manager/issues/15))
224 | 
225 | ### Refact
226 | - Use vendor directory
227 | - Refact gpu-manager code
228 | 
229 | 
230 | <a name="v1.0.3"></a>
231 | ## v1.0.3 - 2019-12-17
232 | 
233 | [Unreleased]: https://github.com/tkestack/gpu-manager/compare/v1.1.5...HEAD
234 | [v1.1.5]: https://github.com/tkestack/gpu-manager/compare/v1.0.9...v1.1.5
235 | [v1.0.9]: https://github.com/tkestack/gpu-manager/compare/v1.0.8...v1.0.9
236 | [v1.0.8]: https://github.com/tkestack/gpu-manager/compare/v1.1.4...v1.0.8
237 | [v1.1.4]: https://github.com/tkestack/gpu-manager/compare/v1.1.3...v1.1.4
238 | [v1.1.3]: https://github.com/tkestack/gpu-manager/compare/v1.1.2...v1.1.3
239 | [v1.1.2]: https://github.com/tkestack/gpu-manager/compare/v1.0.7...v1.1.2
240 | [v1.0.7]: https://github.com/tkestack/gpu-manager/compare/v1.1.1...v1.0.7
241 | [v1.1.1]: https://github.com/tkestack/gpu-manager/compare/v1.0.6...v1.1.1
242 | [v1.0.6]: https://github.com/tkestack/gpu-manager/compare/v1.0.5...v1.0.6
243 | [v1.0.5]: https://github.com/tkestack/gpu-manager/compare/v1.0.4...v1.0.5
244 | [v1.0.4]: https://github.com/tkestack/gpu-manager/compare/v1.1.0...v1.0.4
245 | [v1.1.0]: https://github.com/tkestack/gpu-manager/compare/v1.0.3...v1.1.0
246 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # CONTRIBUTING
 2 | 
 3 | You are welcome to [report issues](https://github.com/tkestack/gpu-manager/issues) or submit [pull requests](https://github.com/tkestack/gpu-manager/pulls). We recommend reading this Contributing Guide before contributing.
 4 | 
 5 | This document provides a set of best practices for open source contributions - bug reports, code submissions / pull requests, etc.
 6 | 
 7 | ## Issues
 8 | 
 9 | We use Github Issues to track public bugs and feature requests.
10 | 
11 | ### Due diligence
12 | 
13 | Before submitting an issue, please do the following:
14 | 
15 | * Perform **basic troubleshooting** steps:
16 |     * Make sure you’re on the latest version. If you’re not on the most recent version, your problem may have been solved already! Upgrading is always the best first step.
17 |     * Try older versions. If you’re already on the latest release, try rolling back a few minor versions (e.g. if on 1.7, try 1.5 or 1.6) and see if the problem goes away. This will help the devs narrow down when the problem first arose in the commit log.
18 |     * Try switching up dependency versions. If the software in question has dependencies (other libraries, etc) try upgrading/downgrading those as well.
19 | * Search the project’s bug/issue tracker to make sure it’s not a known issue.
20 | * If you don’t find a pre-existing issue, consider checking with the mailing list and/or IRC channel in case the problem is non-bug-related.
21 | 
22 | ### What to put in your bug report
23 | 
24 | Make sure your report gets the attention it deserves: bug reports with missing information may be ignored or punted back to you, delaying a fix. The below constitutes a bare minimum; more info is almost always better:
25 | 
26 | * What version of the core programming language interpreter/compiler are you using? For example, if it’s a Golang project, are you using Golang 1.13? Golang 1.12?
27 | * What operating system are you on? Windows? (32-bit? 64-bit?) Mac OS X? (10.14? 10.10?) Linux? (Which distro? Which version of that distro? 32 or 64 bits?) Again, more detail is better.
28 | * Which version or versions of the software are you using? Ideally, you followed the advice above and have ruled out (or verified that the problem exists in) a few different versions.
29 | * How can the developers recreate the bug on their end? If possible, include a copy of your code, the command you used to invoke it, and the full output of your run (if applicable.) A common tactic is to pare down your code until a simple (but still bug-causing) “base case” remains. Not only can this help you identify problems which aren’t real bugs, but it means the developer can get to fixing the bug faster.
30 | 
31 | ## Pull Requests
32 | 
33 | We strongly welcome your pull request to make TKEStack project better.
34 | 
35 | ### Licensing of contributed material
36 | 
37 | Keep in mind as you contribute, that code, docs and other material submitted to open source projects are usually considered licensed under the same terms as the rest of the work.
38 | 
39 | Anything submitted to a project falls under the licensing terms in the repository’s top level LICENSE file. Per-file copyright/license headers are typically extraneous and undesirable. Please don’t add your own copyright headers to new files unless the project’s license actually requires them!
40 | 
41 | ### Branch Management
42 | 
43 | There are three main branches here:
44 | 
45 | 1. `master` branch.
46 | 	1. It is the latest (pre-)release branch. We use `master` for tags, with version number `1.1.0`, `1.2.0`, `1.3.0`...
47 | 	2. **Don't submit any PR on `master` branch.**
48 | 2. `dev` branch. 
49 | 	1. It is our stable developing branch. After full testing, `dev` will be merged to `master` branch for the next release.
50 | 	2. **You are recommended to submit bugfix or feature PR on `dev` branch.**
51 | 3. `hotfix` branch. 
52 | 	1. It is the latest tag version for hot fix. If we accept your pull request, we may just tag with version number `1.1.1`, `1.2.3`.
53 | 	2. **Only submit urgent PR on `hotfix` branch for next specific release.**
54 | 
55 | Normal bugfix or feature request should be submitted to `dev` branch. After full testing, we will merge them to `master` branch for the next release. 
56 | 
57 | If you have an urgent bugfix for a published version, but the `master` branch has already moved far ahead of the latest tagged version, you can submit a PR on `hotfix`. It will be cherry-picked to the `dev` branch if possible.
58 | 
59 | ```
60 | master
61 |  ↑
62 | dev        <--- hotfix PR
63 |  ↑ 
64 | feature/bugfix PR
65 | ```  
66 | 
67 | ### Make Pull Requests
68 | 
69 | The code team monitors all pull requests and runs code checks and tests on them. After all tests pass, we will accept the PR. However, it will not be merged to the `master` branch immediately; there will be some delay.
70 | 
71 | Before submitting a pull request, please make sure the following are done:
72 | 
73 | 1. Fork the repo and create your branch from `master` or `hotfix`.
74 | 2. Update code or documentation if you have changed APIs.
75 | 3. Add the copyright notice to the top of any new files you've added.
76 | 4. Check your code lints and checkstyles.
77 | 5. Test and test again your code.
78 | 6. Now, you can submit your pull request on `dev` or `hotfix` branch.
79 | 
80 | ## Code Conventions
81 | 
82 | Use [Kubernetes Code Conventions](https://github.com/kubernetes/community/blob/master/contributors/guide/coding-conventions.md) for all projects in the TKEStack organization.
83 | 
84 | ## Documentation isn’t optional
85 | 
86 | It’s not! Patches without documentation will be returned to sender. By “documentation” we mean:
87 | 
88 | * Docstrings must be created or updated for public API functions/methods/etc. (This step is optional for some bugfixes.)
89 | * New features should ideally include updates to prose documentation, including useful example code snippets.
90 | * All submissions should have a changelog entry crediting the contributor and/or any individuals instrumental in identifying the problem.
91 | 
92 | ## Tests aren’t optional
93 | 
94 | Any bugfix that doesn’t include a test proving the existence of the bug being fixed, may be suspect. Ditto for new features that can’t prove they actually work.
95 | 
96 | We’ve found that test-first development really helps make features better architected and identifies potential edge cases earlier instead of later. Writing tests before the implementation is strongly encouraged.
97 | 


--------------------------------------------------------------------------------
/MAINTAINERS:
--------------------------------------------------------------------------------
1 | Thomas Song <thomassong@tencent.com> @mYmNeo
2 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: all
 2 | all:
 3 | 	hack/build.sh manager client
 4 | 
 5 | .PHONY: clean
 6 | clean:
 7 | 	rm -rf ./go
 8 | 
 9 | .PHONY: test
10 | test:
11 | 	hack/build.sh "test"
12 | 
13 | .PHONY: proto
14 | proto:
15 | 	hack/build.sh "proto"
16 | 
17 | .PHONY: img
18 | img:
19 | 	hack/build.sh "img"
20 | 
21 | .PHONY: fmt
22 | fmt:
23 | 	hack/build.sh "fmt"
24 | 
25 | .PHONY: lint
26 | lint:
27 | 	@revive -config revive.toml -exclude vendor/... -exclude pkg/api/runtime/... ./...
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # GPU Manager
  2 | 
  3 | [![Build Status](https://travis-ci.org/tkestack/gpu-manager.svg?branch=master)](https://travis-ci.org/tkestack/gpu-manager)
  4 | 
  5 | GPU Manager is used for managing the nvidia GPU devices in Kubernetes cluster. It implements the `DevicePlugin` interface
  6 | of Kubernetes. So it's compatible with 1.9+ of Kubernetes release version. 
  7 | 
  8 | Compared with the combined solution of `nvidia-docker`
  9 | and `nvidia-k8s-plugin`, GPU Manager uses the native `runc` without modification, whereas the NVIDIA solution requires a modified one.
 10 | In addition, we support metrics reporting without deploying new components.
 11 | 
 12 | To schedule a GPU payload correctly, GPU manager should work with [gpu-admission](https://github.com/tkestack/gpu-admission) which is a
 13 |  kubernetes scheduler plugin.
 14 | 
 15 | GPU manager also supports the payload with fraction resource of GPU device such as 0.1 card or 100MiB gpu device memory.
 16 | If you want this kind of feature, please refer to the [vcuda-controller](https://github.com/tkestack/vcuda-controller) project.
 17 | 
 18 | ## Build
 19 | 
 20 | **1.** Build binary
 21 | 
 22 | - Prerequisite
 23 |    - CUDA toolkit
 24 |     
 25 | ```
 26 | make
 27 | ```
 28 | 
 29 | **2.** Build image
 30 | 
 31 | - Prerequisite
 32 |     - Docker
 33 | 
 34 | ```
 35 | make img
 36 | ```
 37 | 
 38 | ## Prebuilt image
 39 | 
 40 | Prebuilt image can be found at `thomassong/gpu-manager`
 41 | 
 42 | ## Deploy
 43 | 
 44 | GPU Manager runs as a daemonset, and because of RBAC restrictions and hybrid clusters,
 45 | you need to do the following steps to make this daemonset run correctly.
 46 | 
 47 | - service account and clusterrole
 48 | 
 49 | ```
 50 | kubectl create sa gpu-manager -n kube-system
 51 | kubectl create clusterrolebinding gpu-manager-role --clusterrole=cluster-admin --serviceaccount=kube-system:gpu-manager
 52 | ```
 53 | 
 54 | - label node with `nvidia-device-enable=enable`
 55 | 
 56 | ```
 57 | kubectl label node <node> nvidia-device-enable=enable
 58 | ```
 59 | 
 60 | - submit daemonset yaml
 61 | 
 62 | ```
 63 | kubectl create -f gpu-manager.yaml
 64 | ```
 65 | 
 66 | ## Pod template example
 67 | 
 68 | There is nothing special about submitting a Pod, except that the GPU resource description is no longer `1`.
 69 | The GPU
 70 | resources are described as 100 `tencent.com/vcuda-core` for 1 GPU and N `tencent.com/vcuda-memory` for GPU memory (1 `tencent.com/vcuda-memory` means 256Mi
 71 | GPU memory). Because of a limitation in Kubernetes' extended resource validation, to support
 72 | a GPU utilization limit you should add `tencent.com/vcuda-core-limit: XX` to the annotations
 73 | field of the Pod.
 74 |  
 75 | **Notice: the value of `tencent.com/vcuda-core` must be either a multiple of 100 or any value
 76 | smaller than 100. For example, 100, 200 or 20 are valid values, but 150 or 250 are invalid.**
 77 | 
 78 | - Submit a Pod with 0.3 GPU utilization and 7680MiB GPU memory with 0.5 GPU utilization limit
 79 | 
 80 | ```
 81 | apiVersion: v1
 82 | kind: Pod
 83 | metadata:
 84 |   name: vcuda
 85 |   annotations:
 86 |     tencent.com/vcuda-core-limit: "50"
 87 | spec:
 88 |   restartPolicy: Never
 89 |   containers:
 90 |   - image: <test-image>
 91 |     name: nvidia
 92 |     command:
 93 |     - /usr/local/nvidia/bin/nvidia-smi
 94 |     - pmon
 95 |     - -d
 96 |     - 10
 97 |     resources:
 98 |       requests:
 99 |         tencent.com/vcuda-core: 50
100 |         tencent.com/vcuda-memory: 30
101 |       limits:
102 |         tencent.com/vcuda-core: 50
103 |         tencent.com/vcuda-memory: 30
104 | ```
105 | 
106 | - Submit a Pod with 2 GPU cards
107 | 
108 | ```
109 | apiVersion: v1
110 | kind: Pod
111 | metadata:
112 |   name: vcuda
113 | spec:
114 |   restartPolicy: Never
115 |   containers:
116 |   - image: <test-image>
117 |     name: nvidia
118 |     command:
119 |     - /usr/local/nvidia/bin/nvidia-smi
120 |     - pmon
121 |     - -d
122 |     - 10
123 |     resources:
124 |       requests:
125 |         tencent.com/vcuda-core: 200
126 |         tencent.com/vcuda-memory: 60
127 |       limits:
128 |         tencent.com/vcuda-core: 200
129 |         tencent.com/vcuda-memory: 60
130 | ```
131 | 
132 | ## FAQ
133 | 
134 | If you have some questions about this project, you can first refer to [FAQ](./docs/faq.md) to find a solution.
135 | 


--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 1.1.4
2 | 


--------------------------------------------------------------------------------
/build/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG base_img
 2 | FROM nvidia/cuda:10.1-devel-centos7 as build
 3 | 
 4 | ARG version
 5 | ARG commit
 6 | 
 7 | RUN yum install -y rpm-build make
 8 | 
 9 | # default git has problems while cloning some repository
10 | RUN yum install -y https://repo.ius.io/ius-release-el7.rpm \
11 |   && yum install -y git222
12 | 
13 | ENV GOLANG_VERSION 1.14.3
14 | RUN curl -sSL https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz \
15 |     | tar -C /usr/local -xz
16 | ENV GOPATH /go
17 | ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH
18 | 
19 | RUN mkdir -p /root/rpmbuild/{SPECS,SOURCES}
20 | 
21 | COPY gpu-manager.spec /root/rpmbuild/SPECS
22 | COPY gpu-manager-source.tar.gz /root/rpmbuild/SOURCES
23 | 
24 | RUN echo '%_topdir /root/rpmbuild' > /root/.rpmmacros \
25 |   && echo '%__os_install_post %{nil}' >> /root/.rpmmacros \
26 |   && echo '%debug_package %{nil}' >> /root/.rpmmacros
27 | WORKDIR /root/rpmbuild/SPECS
28 | RUN rpmbuild -bb --quiet \
29 |   --define 'version '${version}'' \
30 |   --define 'commit '${commit}'' \
31 |   gpu-manager.spec
32 | 
33 | FROM $base_img
34 | 
35 | ARG version
36 | ARG commit
37 | 
38 | COPY --from=build /root/rpmbuild/RPMS/x86_64/gpu-manager-${version}-${commit}.el7.x86_64.rpm /tmp
39 | 
40 | RUN yum install epel-release -y && \
41 |   yum install -y which jq
42 | 
43 | # Install packages
44 | RUN rpm -ivh /tmp/gpu-manager-${version}-${commit}.el7.x86_64.rpm \
45 | 	&& rm -rf /tmp/gpu-manager-${version}-${commit}.el7.x86_64.rpm
46 | 
47 | # kubelet
48 | VOLUME ["/var/lib/kubelet/device-plugins"]
49 | 
50 | # gpu manager storage
51 | VOLUME ["/etc/gpu-manager/vm"]
52 | VOLUME ["/etc/gpu-manager/vdriver"]
53 | VOLUME ["/var/log/gpu-manager"]
54 | 
55 | # nvidia library search location
56 | VOLUME ["/usr/local/host"]
57 | 
58 | RUN echo "/usr/local/nvidia/lib" > /etc/ld.so.conf.d/nvidia.conf && \
59 |     echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
60 | 
61 | ENV PATH=$PATH:/usr/local/nvidia/bin
62 | 
63 | # cgroup
64 | VOLUME ["/sys/fs/cgroup"]
65 | 
66 | # display
67 | EXPOSE 5678
68 | 
69 | COPY start.sh /
70 | COPY copy-bin-lib.sh /
71 | 
72 | CMD ["/start.sh"]
73 | 


--------------------------------------------------------------------------------
/build/copy-bin-lib.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -o pipefail
 4 | set -o errexit
 5 | set -o nounset
 6 | 
 7 | FILE=${FILE:-"/etc/gpu-manager/volume.conf"}  # JSON volume config; overridable through the FILE environment variable
 8 | LIB_FILES=$(jq -r .volume[1].components.libraries[] ${FILE})  # library names from the second "volume" entry (index 1), one per line
 9 | BIN_FILES=$(jq -r .volume[1].components.binaries[] ${FILE})  # binary names from the second "volume" entry (index 1), one per line
10 | readonly NV_DIR="/usr/local/nvidia"  # destination root; removed and recreated further down in this script
11 | readonly FIND_BASE=${FIND_BASE:-"/usr/local/host"}  # root directory searched by find; overridable through FIND_BASE
12 | 
13 | function check_arch() {  # Print the lib-dir suffix for file $1: "64" when objdump -f reports elf64-x86-64, otherwise an empty line.
14 |   local -r lib=$1  # -r makes the local read-only; `local readonly lib=...` only declared an extra variable named "readonly"
15 |   if [[ $(objdump -f "${lib}" | grep -o "elf64-x86-64") == "elf64-x86-64" ]]; then
16 |     echo "64"
17 |   else
18 |     echo ""
19 |   fi
20 | }
21 | 
22 | function copy_lib() {  # Copy every file under FIND_BASE whose name starts with $1 and that exposes a SONAME into NV_DIR/lib or NV_DIR/lib64.
23 |   for target in $(find ${FIND_BASE} -name "${1}*" | grep -v "stubs"); do  # grep -v drops any path containing "stubs"
24 |     if [[ $(objdump -p ${target} 2>/dev/null | grep -o "SONAME") == "SONAME" ]]; then  # keep only files whose objdump -p output mentions SONAME
25 |       copy_directory ${target} "${NV_DIR}/lib$(check_arch ${target})"  # check_arch appends "64" for elf64-x86-64 files, nothing otherwise
26 |     fi
27 |   done
28 | }
29 | 
30 | function copy_bin() {  # Copy every non-symlink path under FIND_BASE named exactly $1 into NV_DIR/bin/.
31 |   for target in $(find ${FIND_BASE} -name "${1}"); do  # unquoted substitution: find output is word-split into paths
32 |     if [[ -L ${target} ]]; then  # symlinks are reported and skipped, never copied
33 |       echo "${target} is symlink"
34 |       continue
35 |     fi
36 |     copy_directory ${target} "${NV_DIR}/bin/"
37 |   done
38 | }
39 | 
40 | function copy_directory() {  # Copy file $1 to destination $2, logging the operation first.
41 |   local -r lib=$1  # -r makes the local read-only; `local readonly lib=...` only declared an extra variable named "readonly"
42 |   local -r path=$2
43 | 
44 |   echo "copy ${lib} to ${path}"
45 |   cp --preserve=mode,ownership -Pf "${lib}" "${path}"  # -P does not follow symlinks in the source; -f removes and retries a destination that cannot be opened
46 | }
47 | 
48 | rm -rf ${NV_DIR}  # start from an empty destination tree on every run
49 | mkdir -p ${NV_DIR}/{bin,lib,lib64}
50 | 
51 | for file in ${LIB_FILES[@]}; do  # LIB_FILES is a plain string; the unquoted expansion word-splits it into individual names
52 |   copy_lib ${file}
53 | done
54 | 
55 | for file in ${BIN_FILES[@]}; do  # BIN_FILES is word-split the same way
56 |   copy_bin ${file}
57 | done
58 | 
59 | # fix libvdpau_nvidia.so: in lib and lib64, recreate the unversioned name as a symlink to the basename that libvdpau_nvidia.so.1 resolves to
60 | (
61 |   cd ${NV_DIR}/lib  # each fix runs in a subshell, so the cd does not affect the rest of the script
62 |   rm -rf libvdpau_nvidia.so
63 |   rel_path=$(readlink -f libvdpau_nvidia.so.1)  # fully resolved path of the .so.1 name
64 |   ln -s $(basename ${rel_path}) libvdpau_nvidia.so  # link by basename only, i.e. relative to this directory
65 | )
66 | 
67 | (
68 |   cd ${NV_DIR}/lib64
69 |   rm -rf libvdpau_nvidia.so
70 |   rel_path=$(readlink -f libvdpau_nvidia.so.1)
71 |   ln -s $(basename ${rel_path}) libvdpau_nvidia.so
72 | )
73 | 
74 | # fix libnvidia-ml.so: same relinking as above, applied to libnvidia-ml.so / libnvidia-ml.so.1
75 | (
76 |   cd ${NV_DIR}/lib
77 |   rm -rf libnvidia-ml.so
78 |   rel_path=$(readlink -f libnvidia-ml.so.1)
79 |   ln -s $(basename ${rel_path}) libnvidia-ml.so
80 | )
81 | 
82 | (
83 |   cd ${NV_DIR}/lib64
84 |   rm -rf libnvidia-ml.so
85 |   rel_path=$(readlink -f libnvidia-ml.so.1)
86 |   ln -s $(basename ${rel_path}) libnvidia-ml.so
87 | )
88 | 


--------------------------------------------------------------------------------
/build/extra-config.json:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/build/gpu-manager.conf:
--------------------------------------------------------------------------------
1 | GPU_MANAGER_ARGS="--extra-config=/etc/gpu-manager/extra-config.json --addr=/var/run/gpu-manager.sock --v=2 --logtostderr"
2 | 


--------------------------------------------------------------------------------
/build/gpu-manager.service:
--------------------------------------------------------------------------------
 1 | [Unit]
 2 | Description=GPU Manager Runtime
 3 | After=network-online.target docker.socket kubelet.service
 4 | Wants=network-online.target kubelet.service
 5 | 
 6 | [Service]
 7 | Type=notify
 8 | # the default is not to use systemd for cgroups because the delegate issues still
 9 | # exists and systemd currently does not support the cgroup feature set required
10 | # for containers run by docker
11 | EnvironmentFile=-/etc/gpu-manager/gpu-manager.conf
12 | ExecStart=/usr/bin/gpu-manager $GPU_MANAGER_ARGS
13 | ExecReload=/bin/kill -s HUP $MAINPID
14 | LimitNOFILE=1048576
15 | # Having non-zero Limit*s causes performance problems due to accounting overhead
16 | # in the kernel. We recommend using cgroups to do container-local accounting.
17 | LimitNPROC=infinity
18 | LimitCORE=infinity
19 | # Uncomment TasksMax if your systemd version supports it.
20 | # Only systemd 226 and above support this option.
21 | #TasksMax=infinity
22 | TimeoutStartSec=0
23 | # set delegate yes so that systemd does not reset the cgroups of docker containers
24 | Delegate=yes
25 | # kill only the gpu-manager process, not all processes in the cgroup
26 | KillMode=process
27 | # restart the gpu-manager process if it exits prematurely
28 | Restart=on-failure
29 | StartLimitBurst=3
30 | StartLimitInterval=60s
31 | UMask=0000
32 | 
33 | [Install]
34 | WantedBy=multi-user.target
35 | 


--------------------------------------------------------------------------------
/build/gpu-manager.spec:
--------------------------------------------------------------------------------
 1 | Name: gpu-manager
 2 | Version: %{version}
 3 | Release: %{commit}%{?dist}
 4 | Summary: GPU Manager Plugin for Kubernetes
 5 | 
 6 | License: MIT
 7 | Source: gpu-manager-source.tar.gz
 8 | 
 9 | Requires: systemd-units
10 | 
11 | %define pkgname %{name}-%{version}-%{release}
12 | 
13 | %description
14 | GPU Manager Plugin for Kubernetes
15 | 
16 | %prep
17 | %setup -n gpu-manager-%{version}
18 | 
19 | 
20 | %build
21 | make all
22 | 
23 | %install
24 | install -d $RPM_BUILD_ROOT/%{_bindir}
25 | install -d $RPM_BUILD_ROOT/%{_unitdir}
26 | install -d $RPM_BUILD_ROOT/etc/gpu-manager
27 | 
28 | install -p -m 755 ./go/bin/gpu-manager $RPM_BUILD_ROOT/%{_bindir}/
29 | install -p -m 755 ./go/bin/gpu-client $RPM_BUILD_ROOT/%{_bindir}/
30 | 
31 | install -p -m 644 ./build/extra-config.json $RPM_BUILD_ROOT/etc/gpu-manager/
32 | install -p -m 644 ./build/gpu-manager.conf $RPM_BUILD_ROOT/etc/gpu-manager/
33 | install -p -m 644 ./build/volume.conf $RPM_BUILD_ROOT/etc/gpu-manager/
34 | 
35 | install -p -m 644 ./build/gpu-manager.service $RPM_BUILD_ROOT/%{_unitdir}/
36 | 
37 | %clean
38 | rm -rf $RPM_BUILD_ROOT
39 | 
40 | %files
41 | %config(noreplace,missingok) /etc/gpu-manager/extra-config.json
42 | %config(noreplace,missingok) /etc/gpu-manager/gpu-manager.conf
43 | %config(noreplace,missingok) /etc/gpu-manager/volume.conf
44 | 
45 | /%{_bindir}/gpu-manager
46 | /%{_bindir}/gpu-client
47 | 
48 | /%{_unitdir}/gpu-manager.service
49 | 


--------------------------------------------------------------------------------
/build/start.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Sources copy-bin-lib.sh, rebuilds the linker cache, then launches gpu-manager.
 3 | set -o errexit
 4 | set -o pipefail
 5 | set -o nounset
 6 | 
 7 | source copy-bin-lib.sh
 8 | 
 9 | echo "rebuild ldcache"
10 | /usr/sbin/ldconfig
11 | # EXTRA_FLAGS is optional: unquoted so it splits into flags, and ":-" (without "") so unset adds no empty argument.
12 | echo "launch gpu manager"
13 | /usr/bin/gpu-manager --extra-config=/etc/gpu-manager/extra-config.json --v="${LOG_LEVEL}" --hostname-override="${NODE_NAME}" --share-mode=true --volume-config=/etc/gpu-manager/volume.conf --log-dir=/var/log/gpu-manager --query-addr=0.0.0.0 ${EXTRA_FLAGS:-}


--------------------------------------------------------------------------------
/build/volume.conf:
--------------------------------------------------------------------------------
 1 | {
 2 |   "volume": [
 3 |     {
 4 |       "name": "nvidia",
 5 |       "base": "/etc/gpu-manager/vdriver",
 6 |       "mode": "ro",
 7 |       "components": {
 8 |         "binaries": [
 9 |           "nvidia-cuda-mps-control",
10 |           "nvidia-cuda-mps-server",
11 |           "nvidia-debugdump",
12 |           "nvidia-persistenced",
13 |           "nvidia-smi",
14 |           "gpu-client"
15 |         ],
16 |         "libraries": [
17 |           "libnvidia-ml.so",
18 |           "libcuda.so",
19 |           "libcuda-control.so",
20 |           "libnvidia-ptxjitcompiler.so",
21 |           "libnvidia-fatbinaryloader.so",
22 |           "libnvidia-opencl.so",
23 |           "libnvidia-compiler.so",
24 |           "libvdpau_nvidia.so",
25 |           "libnvidia-encode.so",
26 |           "libnvcuvid.so",
27 |           "libnvidia-fbc.so",
28 |           "libnvidia-ifr.so",
29 |           "libGL.so",
30 |           "libGLX.so",
31 |           "libOpenGL.so",
32 |           "libGLESv1_CM.so",
33 |           "libGLESv2.so",
34 |           "libEGL.so",
35 |           "libGLdispatch.so",
36 |           "libGLX_nvidia.so",
37 |           "libEGL_nvidia.so",
38 |           "libGLESv2_nvidia.so",
39 |           "libGLESv1_CM_nvidia.so",
40 |           "libnvidia-eglcore.so",
41 |           "libnvidia-egl-wayland.so",
42 |           "libnvidia-glcore.so",
43 |           "libnvidia-tls.so",
44 |           "libnvidia-glsi.so",
45 |           "libnvidia-opticalflow.so",
46 |           "libnvidia-gpucomp.so"
47 |         ]
48 |       }
49 |     },
50 |     {
51 |       "name": "origin",
52 |       "base": "/etc/gpu-manager/vdriver",
53 |       "mode": "ro",
54 |       "components": {
55 |         "binaries": [
56 |           "nvidia-cuda-mps-control",
57 |           "nvidia-cuda-mps-server",
58 |           "nvidia-debugdump",
59 |           "nvidia-persistenced",
60 |           "nvidia-smi"
61 |         ],
62 |         "libraries": [
63 |           "libnvidia-ml.so",
64 |           "libcuda.so",
65 |           "libnvidia-ptxjitcompiler.so",
66 |           "libnvidia-fatbinaryloader.so",
67 |           "libnvidia-opencl.so",
68 |           "libnvidia-compiler.so",
69 |           "libvdpau_nvidia.so",
70 |           "libnvidia-encode.so",
71 |           "libnvcuvid.so",
72 |           "libnvidia-fbc.so",
73 |           "libnvidia-ifr.so",
74 |           "libGL.so",
75 |           "libGLX.so",
76 |           "libOpenGL.so",
77 |           "libGLESv1_CM.so",
78 |           "libGLESv2.so",
79 |           "libEGL.so",
80 |           "libGLdispatch.so",
81 |           "libGLX_nvidia.so",
82 |           "libEGL_nvidia.so",
83 |           "libGLESv2_nvidia.so",
84 |           "libGLESv1_CM_nvidia.so",
85 |           "libnvidia-eglcore.so",
86 |           "libnvidia-egl-wayland.so",
87 |           "libnvidia-glcore.so",
88 |           "libnvidia-tls.so",
89 |           "libnvidia-glsi.so",
90 |           "libnvidia-opticalflow.so"
91 |         ]
92 |       }
93 |     }
94 |   ]
95 | }
96 | 


--------------------------------------------------------------------------------
/cmd/client/client.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package main
19 | 
20 | import (
21 | 	"context"
22 | 	goflag "flag"
23 | 
24 | 	vcudaapi "tkestack.io/gpu-manager/pkg/api/runtime/vcuda"
25 | 	"tkestack.io/gpu-manager/pkg/flags"
26 | 	"tkestack.io/gpu-manager/pkg/logs"
27 | 	"tkestack.io/gpu-manager/pkg/utils"
28 | 
29 | 	"github.com/spf13/pflag"
30 | 	"google.golang.org/grpc"
31 | 	"k8s.io/klog"
32 | )
33 | 
34 | var (
35 | 	addr, busID, podUID, contName, contID string
36 | )
37 | 
38 | // main registers the calling container's virtual GPU device with the manager over gRPC.
39 | func main() {
40 | 	cmdFlags := pflag.CommandLine
41 | 	cmdFlags.StringVar(&addr, "addr", "", "RPC address location for dial")
42 | 	cmdFlags.StringVar(&busID, "bus-id", "", "GPU card bus id of caller")
43 | 	cmdFlags.StringVar(&podUID, "pod-uid", "", "Pod UID of caller")
44 | 	cmdFlags.StringVar(&contName, "cont-name", "", "Container name of caller")
45 | 	cmdFlags.StringVar(&contID, "cont-id", "", "Container id of caller")
46 | 
47 | 	flags.InitFlags()
48 | 	// Parse the stdlib flag set with no arguments so that it is marked as parsed.
49 | 	goflag.CommandLine.Parse([]string{})
50 | 	logs.InitLogs()
51 | 	defer logs.FlushLogs()
52 | 
53 | 	// addr and pod-uid are mandatory, and the container must be identified by name or by id.
54 | 	if len(addr) == 0 || len(podUID) == 0 || (len(contName) == 0 && len(contID) == 0) {
55 | 		klog.Fatalf("argument is empty, current: %s", cmdFlags.Args())
56 | 	}
57 | 
58 | 	conn, err := grpc.Dial(addr, utils.DefaultDialOptions...)
59 | 	if err != nil {
60 | 		klog.Fatalf("can't dial %s, error %v", addr, err)
61 | 	}
62 | 	defer conn.Close()
63 | 
64 | 	req := &vcudaapi.VDeviceRequest{
65 | 		BusId:         busID,
66 | 		PodUid:        podUID,
67 | 		ContainerName: contName,
68 | 	}
69 | 
70 | 	// A container id, when given, replaces the container name in the request.
71 | 	if len(contID) > 0 {
72 | 		req.ContainerName = ""
73 | 		req.ContainerId = contID
74 | 	}
75 | 
76 | 	client := vcudaapi.NewVCUDAServiceClient(conn)
77 | 	if _, err = client.RegisterVDevice(context.TODO(), req); err != nil {
78 | 		klog.Fatalf("fail to get response from manager, error %v", err)
79 | 	}
80 | }
81 | 


--------------------------------------------------------------------------------
/cmd/manager/app/app.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package app
 19 | 
 20 | import (
 21 | 	"log"
 22 | 	"os"
 23 | 	"path/filepath"
 24 | 	"strings"
 25 | 	"time"
 26 | 
 27 | 	"tkestack.io/gpu-manager/cmd/manager/options"
 28 | 	"tkestack.io/gpu-manager/pkg/config"
 29 | 	"tkestack.io/gpu-manager/pkg/server"
 30 | 	"tkestack.io/gpu-manager/pkg/types"
 31 | 	"tkestack.io/gpu-manager/pkg/utils"
 32 | 
 33 | 	"github.com/fsnotify/fsnotify"
 34 | 	"k8s.io/klog"
 35 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
 36 | )
 37 | 
 38 | // Run starts the manager server described by opt, registers it with the kubelet and then blocks,
 39 | // terminating the process when the kubelet socket is re-created. It returns only on setup errors.
 40 | // #lizard forgives
 41 | func Run(opt *options.Options) error {
 42 | 	cfg := &config.Config{
 43 | 		Driver:                   opt.Driver,
 44 | 		QueryPort:                opt.QueryPort,
 45 | 		QueryAddr:                opt.QueryAddr,
 46 | 		KubeConfig:               opt.KubeConfigFile,
 47 | 		SamplePeriod:             time.Duration(opt.SamplePeriod) * time.Second,
 48 | 		VCudaRequestsQueue:       make(chan *types.VCudaRequest, 10),
 49 | 		DevicePluginPath:         pluginapi.DevicePluginPath,
 50 | 		VirtualManagerPath:       opt.VirtualManagerPath,
 51 | 		VolumeConfigPath:         opt.VolumeConfigPath,
 52 | 		EnableShare:              opt.EnableShare,
 53 | 		AllocationCheckPeriod:    time.Duration(opt.AllocationCheckPeriod) * time.Second,
 54 | 		CheckpointPath:           opt.CheckpointPath,
 55 | 		ContainerRuntimeEndpoint: opt.ContainerRuntimeEndpoint,
 56 | 		CgroupDriver:             opt.CgroupDriver,
 57 | 		RequestTimeout:           opt.RequestTimeout,
 58 | 	}
 59 | 
 60 | 	if len(opt.HostnameOverride) > 0 {
 61 | 		cfg.Hostname = opt.HostnameOverride
 62 | 	}
 63 | 	if len(opt.ExtraPath) > 0 {
 64 | 		cfg.ExtraConfigPath = opt.ExtraPath
 65 | 	}
 66 | 	if len(opt.DevicePluginPath) > 0 {
 67 | 		cfg.DevicePluginPath = opt.DevicePluginPath
 68 | 	}
 69 | 
 70 | 	cfg.NodeLabels = make(map[string]string)
 71 | 	for _, item := range strings.Split(opt.NodeLabels, ",") {
 72 | 		if len(item) > 0 {
 73 | 			kvs := strings.SplitN(item, "=", 2)
 74 | 			if len(kvs) == 2 {
 75 | 				cfg.NodeLabels[kvs[0]] = kvs[1]
 76 | 			} else {
 77 | 				klog.Warningf("malformed node labels: %v", kvs)
 78 | 			}
 79 | 		}
 80 | 	}
 81 | 
 82 | 	srv := server.NewManager(cfg)
 83 | 	go srv.Run()
 84 | 
 85 | 	waitTimer := time.NewTimer(opt.WaitTimeout)
 86 | 	for !srv.Ready() {
 87 | 		select {
 88 | 		case <-waitTimer.C:
 89 | 			klog.Warningf("Wait too long for server ready, restarting")
 90 | 			os.Exit(1)
 91 | 		default:
 92 | 			klog.Infof("Wait for internal server ready")
 93 | 		}
 94 | 		time.Sleep(time.Second)
 95 | 	}
 96 | 	waitTimer.Stop()
 97 | 
 98 | 	if err := srv.RegisterToKubelet(); err != nil {
 99 | 		return err
100 | 	}
101 | 
102 | 	devicePluginSocket := filepath.Join(cfg.DevicePluginPath, types.KubeletSocket)
103 | 	watcher, err := utils.NewFSWatcher(cfg.DevicePluginPath)
104 | 	if err != nil {
105 | 		log.Printf("Failed to create FS watcher: %v", err)
106 | 		return err
107 | 	}
108 | 	defer watcher.Close()
109 | 
110 | 	// A newly created kubelet socket (presumably a kubelet restart) invalidates the registration: exit to restart.
111 | 	for {
112 | 		select {
113 | 		case event := <-watcher.Events:
114 | 			if event.Name == devicePluginSocket && event.Op&fsnotify.Create == fsnotify.Create {
115 | 				time.Sleep(time.Second)
116 | 				klog.Fatalf("inotify: %s created, restarting.", devicePluginSocket)
117 | 			}
118 | 		case err := <-watcher.Errors:
119 | 			klog.Fatalf("inotify: %s", err)
120 | 		}
121 | 	}
122 | }
123 | 


--------------------------------------------------------------------------------
/cmd/manager/nvidia-manager.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package main
19 | 
20 | import (
21 | 	goflag "flag"
22 | 	"fmt"
23 | 	"os"
24 | 
25 | 	"k8s.io/klog"
26 | 
27 | 	"tkestack.io/gpu-manager/cmd/manager/app"
28 | 	"tkestack.io/gpu-manager/cmd/manager/options"
29 | 	"tkestack.io/gpu-manager/pkg/flags"
30 | 	"tkestack.io/gpu-manager/pkg/logs"
31 | 	"tkestack.io/gpu-manager/pkg/version"
32 | 
33 | 	"github.com/spf13/pflag"
34 | )
35 | 
36 | // main wires up flags and logging, then runs the manager until it returns an error.
37 | func main() {
38 | 	klog.InitFlags(nil)
39 | 	opt := options.NewOptions()
40 | 	opt.AddFlags(pflag.CommandLine)
41 | 	flags.InitFlags()
42 | 	goflag.CommandLine.Parse([]string{})
43 | 	logs.InitLogs()
44 | 	defer logs.FlushLogs()
45 | 
46 | 	version.PrintAndExitIfRequested()
47 | 	if err := app.Run(opt); err != nil {
48 | 		fmt.Fprintf(os.Stderr, "%v\n", err)
49 | 		// os.Exit does not run deferred calls, so flush the logs explicitly first.
50 | 		logs.FlushLogs()
51 | 		os.Exit(1)
52 | 	}
53 | }
53 | 


--------------------------------------------------------------------------------
/cmd/manager/options/options.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package options
19 | 
20 | import (
21 | 	"time"
22 | 
23 | 	"github.com/spf13/pflag"
24 | )
25 | 
26 | const ( // default option values applied by NewOptions
27 | 	DefaultDriver                   = "nvidia"                      // driver name used when --driver is not given
28 | 	DefaultQueryPort                = 5678                          // port for querying statistics
29 | 	DefaultSamplePeriod             = 1                             // sample period for each card, in seconds
30 | 	DefaultVirtualManagerPath       = "/etc/gpu-manager/vm"         // where the virtual manager stores its files
31 | 	DefaultAllocationCheckPeriod    = 30                            // allocation check period, in seconds
32 | 	DefaultCheckpointPath           = "/etc/gpu-manager/checkpoint" // where the checkpoint file is stored
33 | 	DefaultContainerRuntimeEndpoint = "/var/run/dockershim.sock"    // container runtime endpoint
34 | 	DefaultCgroupDriver             = "cgroupfs"                    // cgroup driver assumed for the kubelet
35 | )
36 | 
37 | // Options holds every setting of the manager that can be changed from the command line.
38 | type Options struct {
39 | 	Driver                   string        // driver name for the manager (--driver)
40 | 	ExtraPath                string        // extra config file location (--extra-config)
41 | 	VolumeConfigPath         string        // volume config file location (--volume-config)
42 | 	QueryPort                int           // port for querying statistics (--query-port)
43 | 	QueryAddr                string        // address for querying statistics (--query-addr)
44 | 	KubeConfigFile           string        // path to the kubeconfig file (--kubeconfig)
45 | 	SamplePeriod             int           // sample period for each card, in seconds (--sample-period)
46 | 	NodeLabels               string        // labels to put on this node (--node-labels)
47 | 	HostnameOverride         string        // identification used instead of the actual hostname (--hostname-override)
48 | 	VirtualManagerPath       string        // where the virtual manager stores its files (--virtual-manager-path)
49 | 	DevicePluginPath         string        // kubelet device plugin registration path (--device-plugin-path)
50 | 	EnableShare              bool          // enables share mode allocation (--share-mode)
51 | 	AllocationCheckPeriod    int           // allocation check period, in seconds (--allocation-check-period)
52 | 	CheckpointPath           string        // where the checkpoint file is stored (--checkpoint-path)
53 | 	ContainerRuntimeEndpoint string        // container runtime endpoint (--container-runtime-endpoint)
54 | 	CgroupDriver             string        // kubelet cgroup driver: 'cgroupfs' or 'systemd' (--cgroup-driver)
55 | 	RequestTimeout           time.Duration // timeout for container runtime requests (--runtime-request-timeout)
56 | 	WaitTimeout              time.Duration // how long to wait for the resource server to be ready (--wait-timeout)
57 | }
58 | 
59 | // NewOptions returns an Options value pre-populated with the package defaults.
60 | func NewOptions() *Options {
61 | 	opt := new(Options)
62 | 	opt.Driver = DefaultDriver
63 | 	opt.QueryPort = DefaultQueryPort
64 | 	opt.QueryAddr = "localhost"
65 | 	opt.SamplePeriod = DefaultSamplePeriod
66 | 	opt.VirtualManagerPath = DefaultVirtualManagerPath
67 | 	opt.AllocationCheckPeriod = DefaultAllocationCheckPeriod
68 | 	opt.CheckpointPath = DefaultCheckpointPath
69 | 	opt.ContainerRuntimeEndpoint = DefaultContainerRuntimeEndpoint
70 | 	opt.CgroupDriver = DefaultCgroupDriver
71 | 	opt.RequestTimeout = 5 * time.Second
72 | 	opt.WaitTimeout = time.Minute
73 | 	return opt
74 | }
75 | 
76 | // AddFlags binds each option to its command-line flag on fs. Whatever value the
77 | // receiver currently holds becomes that flag's default.
78 | func (o *Options) AddFlags(fs *pflag.FlagSet) {
79 | 	fs.StringVar(&o.Driver, "driver", o.Driver, "The driver name for manager")
80 | 	fs.StringVar(&o.ExtraPath, "extra-config", o.ExtraPath, "The extra config file location")
81 | 	fs.StringVar(&o.VolumeConfigPath, "volume-config", o.VolumeConfigPath, "The volume config file location")
82 | 	fs.IntVar(&o.QueryPort, "query-port", o.QueryPort, "port for query statistics information")
83 | 	fs.StringVar(&o.QueryAddr, "query-addr", o.QueryAddr, "address for query statistics information")
84 | 	fs.StringVar(&o.KubeConfigFile, "kubeconfig", o.KubeConfigFile, "Path to kubeconfig file with authorization information (the master location is set by the master flag).")
85 | 	fs.IntVar(&o.SamplePeriod, "sample-period", o.SamplePeriod, "Sample period for each card, unit second")
86 | 	fs.StringVar(&o.NodeLabels, "node-labels", o.NodeLabels, "automated label for this node, if empty, node will be only labeled by gpu model")
87 | 	fs.StringVar(&o.HostnameOverride, "hostname-override", o.HostnameOverride, "If non-empty, will use this string as identification instead of the actual hostname.")
88 | 	fs.StringVar(&o.VirtualManagerPath, "virtual-manager-path", o.VirtualManagerPath, "configuration path for virtual manager store files")
89 | 	fs.StringVar(&o.DevicePluginPath, "device-plugin-path", o.DevicePluginPath, "the path for kubelet receive device plugin registration")
90 | 	fs.StringVar(&o.CheckpointPath, "checkpoint-path", o.CheckpointPath, "configuration path for checkpoint store file")
91 | 	fs.BoolVar(&o.EnableShare, "share-mode", o.EnableShare, "enable share mode allocation")
92 | 	fs.IntVar(&o.AllocationCheckPeriod, "allocation-check-period", o.AllocationCheckPeriod, "allocation check period, unit second")
93 | 	fs.StringVar(&o.ContainerRuntimeEndpoint, "container-runtime-endpoint", o.ContainerRuntimeEndpoint, "container runtime endpoint")
94 | 	fs.StringVar(&o.CgroupDriver, "cgroup-driver", o.CgroupDriver, "Driver that the kubelet uses to manipulate cgroups on the host.  Possible values: 'cgroupfs', 'systemd'")
95 | 	fs.DurationVar(&o.RequestTimeout, "runtime-request-timeout", o.RequestTimeout,
96 | 		"request timeout for communicating with container runtime endpoint")
97 | 	fs.DurationVar(&o.WaitTimeout, "wait-timeout", o.WaitTimeout, "wait timeout for resource server ready")
98 | }
99 | 


--------------------------------------------------------------------------------
/docs/faq.md:
--------------------------------------------------------------------------------
 1 | # FAQ
 2 | 
 3 | *1.* Q: If I use another container runtime, what should I do?
 4 | 
 5 | A: Edit `EXTRA_FLAGS` in `gpu-manager.yaml` and add the `--container-runtime-endpoint` option; its value is the
 6 | path of your container runtime's unix socket, for example `/var/run/crio.sock`.
 7 | 
 8 | *2.* Q: When I use a fractional GPU resource, my program hangs.
 9 | 
10 | A: Add the environment variable `LOGGER_LEVEL` with the value `5` to `gpu-manager.yaml`, and paste the resulting log in your issue.
11 | 
12 | *3.* Q: When I use a fractional GPU resource, my program reports an error like `rpc failed`.
13 | 
14 | A: After v1.0.3, we use the CRI interface to find the cgroup path, so if your cgroup driver is not `cgroupfs`, you
15 | need to edit `EXTRA_FLAGS` in `gpu-manager.yaml` and add the `--cgroup-driver` option; the possible values are `cgroupfs` and `systemd`.
16 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module tkestack.io/gpu-manager
 2 | 
 3 | go 1.14
 4 | 
 5 | replace tkestack.io/nvml => github.com/tkestack/go-nvml v0.0.0-20191217064248-7363e630a33e
 6 | 
 7 | require (
 8 | 	github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e
 9 | 	github.com/docker/go-units v0.4.0 // indirect
10 | 	github.com/fsnotify/fsnotify v1.4.7
11 | 	github.com/godbus/dbus v0.0.0-20181101234600-2ff6f7ffd60f // indirect
12 | 	github.com/golang/protobuf v1.3.2
13 | 	github.com/grpc-ecosystem/grpc-gateway v1.12.1
14 | 	github.com/opencontainers/runc v1.0.0-rc9
15 | 	github.com/opencontainers/runtime-spec v1.0.2 // indirect
16 | 	github.com/pkg/errors v0.8.1
17 | 	github.com/prometheus/client_golang v1.2.1
18 | 	github.com/spf13/pflag v1.0.5
19 | 	golang.org/x/net v0.0.0-20191109021931-daa7c04131f5
20 | 	google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a
21 | 	google.golang.org/grpc v1.24.0
22 | 	k8s.io/api v0.17.4
23 | 	k8s.io/apimachinery v0.17.4
24 | 	k8s.io/client-go v0.17.4
25 | 	k8s.io/cri-api v0.17.4
26 | 	k8s.io/klog v1.0.0
27 | 	k8s.io/kubectl v0.17.4
28 | 	k8s.io/kubelet v0.17.4
29 | 	tkestack.io/nvml v0.0.0-00010101000000-000000000000
30 | )
31 | 


--------------------------------------------------------------------------------
/gpu-manager-svc.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: v1
 2 | kind: Service
 3 | metadata:
 4 |   name: gpu-manager-metric
 5 |   namespace: kube-system
 6 |   annotations:
 7 |     prometheus.io/scrape: "true"
 8 |   labels:
 9 |     kubernetes.io/cluster-service: "true"
10 | spec:
11 |   clusterIP: None
12 |   ports:
13 |     - name: metrics
14 |       port: 5678
15 |       protocol: TCP
16 |       targetPort: 5678
17 |   selector:
18 |     name: gpu-manager-ds
19 | 


--------------------------------------------------------------------------------
/gpu-manager.yaml:
--------------------------------------------------------------------------------
  1 | apiVersion: apps/v1
  2 | kind: DaemonSet
  3 | metadata:
  4 |   name: gpu-manager-daemonset
  5 |   namespace: kube-system
  6 | spec:
  7 |   updateStrategy:
  8 |     type: RollingUpdate
  9 |   selector:
 10 |     matchLabels:
 11 |       name: gpu-manager-ds
 12 |   template:
 13 |     metadata:
 14 |       # This annotation is deprecated. Kept here for backward compatibility
 15 |       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
 16 |       annotations:
 17 |         scheduler.alpha.kubernetes.io/critical-pod: ""
 18 |       labels:
 19 |         name: gpu-manager-ds
 20 |     spec:
 21 |       serviceAccount: gpu-manager
 22 |       tolerations:
 23 |         # This toleration is deprecated. Kept here for backward compatibility
 24 |         # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
 25 |         - key: CriticalAddonsOnly
 26 |           operator: Exists
 27 |         - key: tencent.com/vcuda-core
 28 |           operator: Exists
 29 |           effect: NoSchedule
 30 |       # Mark this pod as a critical add-on; when enabled, the critical add-on
 31 |       # scheduler reserves resources for critical add-on pods so that they can
 32 |       # be rescheduled after a failure.
 33 |       # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
 34 |       priorityClassName: "system-node-critical"
 35 |       # only run on nodes that have a GPU device
 36 |       nodeSelector:
 37 |         nvidia-device-enable: enable
 38 |       hostPID: true
 39 |       containers:
 40 |         - image: tkestack/gpu-manager:1.0.3
 41 |           imagePullPolicy: Always
 42 |           name: gpu-manager
 43 |           securityContext:
 44 |             privileged: true
 45 |           ports:
 46 |             - containerPort: 5678
 47 |           volumeMounts:
 48 |             - name: device-plugin
 49 |               mountPath: /var/lib/kubelet/device-plugins
 50 |             - name: vdriver
 51 |               mountPath: /etc/gpu-manager/vdriver
 52 |             - name: vmdata
 53 |               mountPath: /etc/gpu-manager/vm
 54 |             - name: log
 55 |               mountPath: /var/log/gpu-manager
 56 |             - name: checkpoint
 57 |               mountPath: /etc/gpu-manager/checkpoint
 58 |             - name: run-dir
 59 |               mountPath: /var/run
 60 |             - name: cgroup
 61 |               mountPath: /sys/fs/cgroup
 62 |               readOnly: true
 63 |             - name: usr-directory
 64 |               mountPath: /usr/local/host
 65 |               readOnly: true
 66 |           env:
 67 |             - name: LOG_LEVEL
 68 |               value: "4"
 69 |             - name: EXTRA_FLAGS
 70 |               value: "--logtostderr=false"
 71 |             - name: NODE_NAME
 72 |               valueFrom:
 73 |                 fieldRef:
 74 |                   fieldPath: spec.nodeName
 75 |       volumes:
 76 |         - name: device-plugin
 77 |           hostPath:
 78 |             type: Directory
 79 |             path: /var/lib/kubelet/device-plugins
 80 |         - name: vmdata
 81 |           hostPath:
 82 |             type: DirectoryOrCreate
 83 |             path: /etc/gpu-manager/vm
 84 |         - name: vdriver
 85 |           hostPath:
 86 |             type: DirectoryOrCreate
 87 |             path: /etc/gpu-manager/vdriver
 88 |         - name: log
 89 |           hostPath:
 90 |             type: DirectoryOrCreate
 91 |             path: /etc/gpu-manager/log
 92 |         - name: checkpoint
 93 |           hostPath:
 94 |             type: DirectoryOrCreate
 95 |             path: /etc/gpu-manager/checkpoint
 96 |         # We have to mount the whole /var/run directory into the container: a bind-mounted docker.sock
 97 |         # would go stale, because its inode changes when the host docker daemon is restarted
 98 |         - name: run-dir
 99 |           hostPath:
100 |             type: Directory
101 |             path: /var/run
102 |         - name: cgroup
103 |           hostPath:
104 |             type: Directory
105 |             path: /sys/fs/cgroup
106 |         # We have to mount the whole /usr directory instead of a specific library path, because that
107 |         # path may not exist on every distro
108 |         - name: usr-directory
109 |           hostPath:
110 |             type: Directory
111 |             path: /usr
112 | 


--------------------------------------------------------------------------------
/hack/build.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -o errexit
 4 | set -o pipefail
 5 | set -o nounset
 6 | 
 7 | ROOT=$(cd $(dirname ${BASH_SOURCE[0]})/.. && pwd -P)
 8 | 
 9 | source "${ROOT}/hack/common.sh"
10 | 
11 | function plugin::build() {
12 |   (
13 |     for arg; do
14 |         case $arg in
15 |         test)
16 |             plugin::run_test
17 |             ;;
18 |         proto)
19 |             plugin::generate_proto
20 |             ;;
21 |         img)
22 |             plugin::generate_img
23 |             ;;
24 |         fmt)
25 |             plugin::fmt
26 |             ;;
27 |         *)
28 |             plugin::build_binary
29 |         esac
30 |     done
31 |   )
32 | }
33 | 
34 | function plugin::run_test() {  # runs every Go test and benchmark verbosely, with coverage and a 1 minute timeout
35 |   go test -timeout=1m -bench=. -cover -v ./...
36 | }
37 | 
38 | function plugin::build_binary() {  # builds cmd/<arg> into go/bin/gpu-<arg>; reads ${arg} set by the caller's loop
39 |   go build -o "${ROOT}/go/bin/gpu-$arg" -ldflags "$(plugin::version::ldflags) -s -w" ${PACKAGE}/cmd/$arg
40 | }
41 | 
42 | # Packs the source tree plus the build/ assets into go/build and builds the docker image there.
43 | function plugin::generate_img() {
44 |   # local -r, not "readonly local x=...": the latter declares read-only globals named "local" and "x".
45 |   local -r commit=$(git log --no-merges --oneline | wc -l | sed -e 's,^[ \t]*,,')
46 |   local -r version=$(<"${ROOT}/VERSION")
47 |   local -r base_img=${BASE_IMG:-"thomassong/vcuda:1.0.4"}
48 | 
49 |   mkdir -p "${ROOT}/go/build"
50 |   tar czf "${ROOT}/go/build/gpu-manager-source.tar.gz" --transform 's,^,/gpu-manager-'${version}'/,' $(plugin::source_targets)
51 |   cp -R "${ROOT}/build/"* "${ROOT}/go/build/"
52 |   (
53 |     cd "${ROOT}/go/build"
54 |     docker build \
55 |         --network=host \
56 |         --build-arg version="${version}" \
57 |         --build-arg commit="${commit}" \
58 |         --build-arg base_img="${base_img}" \
59 |         -t "${IMAGE_FILE}:${version}" .
60 |   )
61 | }
62 | 
63 | function plugin::fmt() {
64 |   local unfmt_files=()
65 |   for file in $(plugin::fmt_targets); do
66 |     if [[ -n $(gofmt -d -s $file 2>&1) ]]; then
67 |       unfmt_files+=($file)
68 |     fi
69 |   done
70 |   if [[ ${#unfmt_files[@]} -gt 0 ]]; then
71 |     echo "need fmt ${unfmt_files[@]}"
72 |     exit 1
73 |   fi
74 | }
75 | 
76 | plugin::build "$@"
77 | 


--------------------------------------------------------------------------------
/hack/common.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | readonly PACKAGE="tkestack.io/gpu-manager"
 4 | readonly BUILD_IMAGE_REPO=plugin-build
 5 | readonly LOCAL_OUTPUT_IMAGE_STAGING="${ROOT}/go/images"
 6 | readonly IMAGE_FILE=${IMAGE_FILE:-"thomassong/gpu-manager"}
 7 | readonly PROTO_IMAGE="proto-generater"
 8 | 
 9 | function plugin::cleanup() {  # removes the go/ directory under ROOT
10 |   rm -rf "${ROOT:?}/go"  # :? aborts when ROOT is unset or empty instead of running rm -rf /go
11 | }
12 | 
13 | function plugin::cleanup_image() {  # force-removes the container named ${PROTO_IMAGE} together with its volumes
14 |   docker rm -vf ${PROTO_IMAGE}
15 | }
16 | 
17 | # Regenerates the Go code for the display and vcuda api.proto files by running protoc inside the devsu/grpc-gateway image; the body is a subshell.
18 | function plugin::generate_proto() (
19 |   docker run --rm \
20 |     -v ${ROOT}/pkg/api:/tmp/pkg/api \
21 |     -v ${ROOT}/staging/src:/tmp/staging/src \
22 |     -u $(id -u) \
23 |     devsu/grpc-gateway \
24 |       bash -c "cd /tmp && protoc \\
25 |         --proto_path=staging/src:. \\
26 |         --proto_path=/go/src/github.com/grpc-ecosystem/grpc-gateway/third_party/googleapis:. \\
27 |         --go_out=plugins=grpc:. \\
28 |         --grpc-gateway_out=logtostderr=true:. \\
29 |         pkg/api/runtime/display/api.proto"
30 | 
31 |   docker run --rm \
32 |     -v ${ROOT}/pkg/api:/tmp/pkg/api \
33 |     -u $(id -u) \
34 |     devsu/grpc-gateway \
35 |       bash -c "cd /tmp && protoc \\
36 |         --go_out=plugins=grpc:. \\
37 |         pkg/api/runtime/vcuda/api.proto"
38 | )
40 | 
41 | # plugin::version::ldflag KEY VAL: prints the linker -X flag that sets ${PACKAGE}/pkg/version.KEY to VAL.
42 | function plugin::version::ldflag() {
43 |   local -r name=${1} value=${2}
44 |   printf '%s\n' "-X ${PACKAGE}/pkg/version.${name}=${value}"
45 | }
46 | 
47 | function plugin::version::ldflags() {  # prints the collected -X linker flags (only gitCommit here) joined by spaces
48 |   GIT_COMMIT=$(git log -1 --oneline 2>/dev/null | awk '{print $1}')  # first field of the newest commit line; not declared local
49 |   local -a ldflags=()
50 |   if [[ -n ${GIT_COMMIT} ]]; then
51 |     ldflags+=($(plugin::version::ldflag "gitCommit" "${GIT_COMMIT}"))
52 |   fi
53 | 
54 |   echo "${ldflags[*]-}"  # presumably the trailing "-" guards against nounset errors on an empty array
55 | }
56 | 
57 | function plugin::source_targets() {  # prints every top-level entry of the current directory except ./go
58 |   local targets=(
59 |     $(find . -mindepth 1 -maxdepth 1 -not \(        \
60 |         \( -path ./go \) -prune  \
61 |       \))
62 |   )
63 |   echo "${targets[@]}"
64 | }
65 | 
66 | function plugin::fmt_targets() {  # prints every *.go file below the current directory, skipping ./go and ./vendor
67 |   local targets=(
68 |     $(find . -not \(  \
69 |         \( -path ./go \
70 |         -o -path ./vendor \
71 |         \) -prune \
72 |         \) \
73 |         -name "*.go" \
74 |         -print \
75 |     )
76 |   )
77 |   echo "${targets[@]}"
78 | }
79 | 


--------------------------------------------------------------------------------
/pkg/algorithm/nvidia/fragment.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package nvidia
 19 | 
 20 | import (
 21 | 	"sort"
 22 | 
 23 | 	"k8s.io/klog"
 24 | 
 25 | 	"tkestack.io/gpu-manager/pkg/device/nvidia"
 26 | )
 27 | 
// fragmentMode is the allocation strategy created by NewFragmentMode. It
// holds the device tree that Evaluate walks.
type fragmentMode struct {
	tree *nvidia.NvidiaTree
}
 31 | 
 32 | //NewFragmentMode returns a new fragmentMode struct.
 33 | //
 34 | //Evaluate() of fragmentMode returns nodes with minimum available cores
 35 | //which fullfil the request.
 36 | //
 37 | //Fragment mode means to allocate cores on fragmented nodes first, which
 38 | //helps link mode work better.
 39 | func NewFragmentMode(t *nvidia.NvidiaTree) *fragmentMode {
 40 | 	return &fragmentMode{t}
 41 | }
 42 | 
 43 | func (al *fragmentMode) Evaluate(cores int64, _ int64) []*nvidia.NvidiaNode {
 44 | 	var (
 45 | 		candidate = al.tree.Root()
 46 | 		next      *nvidia.NvidiaNode
 47 | 		sorter    = fragmentSort(nvidia.ByAvailable, nvidia.ByAllocatableMemory, nvidia.ByPids, nvidia.ByMinorID)
 48 | 		nodes     = make([]*nvidia.NvidiaNode, 0)
 49 | 		num       = int(cores / nvidia.HundredCore)
 50 | 	)
 51 | 
 52 | 	for next != candidate {
 53 | 		next = candidate
 54 | 
 55 | 		sorter.Sort(candidate.Children)
 56 | 
 57 | 		for _, node := range candidate.Children {
 58 | 			if len(node.Children) == 0 || node.Available() < num {
 59 | 				continue
 60 | 			}
 61 | 
 62 | 			candidate = node
 63 | 			klog.V(2).Infof("Choose id %d, mask %b", candidate.Meta.ID, candidate.Mask)
 64 | 			break
 65 | 		}
 66 | 	}
 67 | 
 68 | 	for _, n := range candidate.GetAvailableLeaves() {
 69 | 		if num == 0 {
 70 | 			break
 71 | 		}
 72 | 
 73 | 		klog.V(2).Infof("Pick up %d mask %b", n.Meta.ID, n.Mask)
 74 | 		nodes = append(nodes, n)
 75 | 		num--
 76 | 	}
 77 | 
 78 | 	if num > 0 {
 79 | 		return nil
 80 | 	}
 81 | 
 82 | 	return nodes
 83 | }
 84 | 
// fragmentPriority implements sort.Interface over a slice of nodes using
// an ordered list of comparison functions.
type fragmentPriority struct {
	data []*nvidia.NvidiaNode // slice being sorted; set by Sort
	less []nvidia.LessFunc    // comparators, consulted in order by Less
}
 89 | 
 90 | func fragmentSort(less ...nvidia.LessFunc) *fragmentPriority {
 91 | 	return &fragmentPriority{
 92 | 		less: less,
 93 | 	}
 94 | }
 95 | 
 96 | func (fp *fragmentPriority) Sort(data []*nvidia.NvidiaNode) {
 97 | 	fp.data = data
 98 | 	sort.Sort(fp)
 99 | }
100 | 
101 | func (fp *fragmentPriority) Len() int {
102 | 	return len(fp.data)
103 | }
104 | 
105 | func (fp *fragmentPriority) Swap(i, j int) {
106 | 	fp.data[i], fp.data[j] = fp.data[j], fp.data[i]
107 | }
108 | 
109 | func (fp *fragmentPriority) Less(i, j int) bool {
110 | 	var k int
111 | 
112 | 	for k = 0; k < len(fp.less)-1; k++ {
113 | 		less := fp.less[k]
114 | 		switch {
115 | 		case less(fp.data[i], fp.data[j]):
116 | 			return true
117 | 		case less(fp.data[j], fp.data[i]):
118 | 			return false
119 | 		}
120 | 	}
121 | 
122 | 	return fp.less[k](fp.data[i], fp.data[j])
123 | }
124 | 


--------------------------------------------------------------------------------
/pkg/algorithm/nvidia/fragment_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package nvidia
19 | 
20 | import (
21 | 	"flag"
22 | 	"testing"
23 | 
24 | 	"tkestack.io/gpu-manager/pkg/device/nvidia"
25 | )
26 | 
// init requests verbose logging on stderr for the tests by setting the
// "v" and "logtostderr" command-line flags. flag.Set reports an error for
// a flag that is not registered; that error is deliberately discarded.
func init() {
	_ = flag.Set("v", "4")
	_ = flag.Set("logtostderr", "true")
}
31 | 
32 | func TestFragment(t *testing.T) {
33 | 	flag.Parse()
34 | 	obj := nvidia.NewNvidiaTree(nil)
35 | 	tree, _ := obj.(*nvidia.NvidiaTree)
36 | 
37 | 	testCase1 :=
38 | 		`    GPU0    GPU1    GPU2    GPU3    GPU4    GPU5
39 | GPU0      X      PIX     PHB     PHB     SOC     SOC
40 | GPU1     PIX      X      PHB     PHB     SOC     SOC
41 | GPU2     PHB     PHB      X      PIX     SOC     SOC
42 | GPU3     PHB     PHB     PIX      X      SOC     SOC
43 | GPU4     SOC     SOC     SOC     SOC      X      PIX
44 | GPU5     SOC     SOC     SOC     SOC     PIX      X
45 | `
46 | 	tree.Init(testCase1)
47 | 	algo := NewFragmentMode(tree)
48 | 
49 | 	expectCase1 := []string{
50 | 		"/dev/nvidia4", "/dev/nvidia5",
51 | 	}
52 | 
53 | 	cores := int64(2 * nvidia.HundredCore)
54 | 	pass, should, but := examining(expectCase1, algo.Evaluate(cores, 0))
55 | 	if !pass {
56 | 		t.Fatalf("Evaluate function got wrong, should be %s, but %s", should, but)
57 | 	}
58 | 
59 | 	tree.MarkOccupied(&nvidia.NvidiaNode{
60 | 		Meta: nvidia.DeviceMeta{
61 | 			MinorID: 4,
62 | 		},
63 | 	}, cores, 0)
64 | 
65 | 	expectCase2 := []string{
66 | 		"/dev/nvidia5",
67 | 	}
68 | 
69 | 	cores = int64(nvidia.HundredCore)
70 | 	pass, should, but = examining(expectCase2, algo.Evaluate(cores, 0))
71 | 	if !pass {
72 | 		t.Fatalf("Evaluate function got wrong, should be %s, but %s", should, but)
73 | 	}
74 | }
75 | 
76 | func TestFragmentOnlyOne(t *testing.T) {
77 | 	flag.Parse()
78 | 	obj := nvidia.NewNvidiaTree(nil)
79 | 	tree, _ := obj.(*nvidia.NvidiaTree)
80 | 
81 | 	testCase1 :=
82 | 		` GPU0
83 | GPU0   x`
84 | 
85 | 	tree.Init(testCase1)
86 | 	algo := NewFragmentMode(tree)
87 | 
88 | 	expectCase1 := []string{
89 | 		"/dev/nvidia0",
90 | 	}
91 | 
92 | 	cores := int64(nvidia.HundredCore)
93 | 	pass, should, but := examining(expectCase1, algo.Evaluate(cores, 0))
94 | 	if !pass {
95 | 		t.Fatalf("Evaluate function got wrong, should be %s, but %s", should, but)
96 | 	}
97 | }
98 | 


--------------------------------------------------------------------------------
/pkg/algorithm/nvidia/link.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package nvidia
 19 | 
 20 | import (
 21 | 	"sort"
 22 | 
 23 | 	"k8s.io/klog"
 24 | 
 25 | 	"tkestack.io/gpu-manager/pkg/device/nvidia"
 26 | )
 27 | 
// linkMode is the allocation strategy created by NewLinkMode. It holds the
// device tree that Evaluate walks.
type linkMode struct {
	tree *nvidia.NvidiaTree
}
 31 | 
 32 | //NewLinkMode returns a new linkMode struct.
 33 | //
 34 | //Evaluate() of linkMode returns nodes with minimum connection overhead
 35 | //of each other.
 36 | func NewLinkMode(t *nvidia.NvidiaTree) *linkMode {
 37 | 	return &linkMode{t}
 38 | }
 39 | 
 40 | func (al *linkMode) Evaluate(cores int64, memory int64) []*nvidia.NvidiaNode {
 41 | 	var (
 42 | 		sorter   = linkSort(nvidia.ByType, nvidia.ByAvailable, nvidia.ByAllocatableMemory, nvidia.ByPids, nvidia.ByMinorID)
 43 | 		tmpStore = make(map[int]*nvidia.NvidiaNode)
 44 | 		root     = al.tree.Root()
 45 | 		nodes    = make([]*nvidia.NvidiaNode, 0)
 46 | 		num      = int(cores / nvidia.HundredCore)
 47 | 	)
 48 | 
 49 | 	for _, node := range al.tree.Leaves() {
 50 | 		for node != root {
 51 | 			klog.V(2).Infof("Test %d mask %b", node.Meta.ID, node.Mask)
 52 | 			if node.Available() < num {
 53 | 				node = node.Parent
 54 | 				continue
 55 | 			}
 56 | 
 57 | 			tmpStore[node.Meta.ID] = node
 58 | 			klog.V(2).Infof("Choose %d mask %b", node.Meta.ID, node.Mask)
 59 | 			break
 60 | 		}
 61 | 	}
 62 | 
 63 | 	if len(tmpStore) == 0 {
 64 | 		tmpStore[-1] = root
 65 | 	}
 66 | 
 67 | 	candidates := make([]*nvidia.NvidiaNode, 0)
 68 | 	for _, n := range tmpStore {
 69 | 		candidates = append(candidates, n)
 70 | 	}
 71 | 
 72 | 	sorter.Sort(candidates)
 73 | 
 74 | 	for _, n := range candidates[0].GetAvailableLeaves() {
 75 | 		if num == 0 {
 76 | 			break
 77 | 		}
 78 | 
 79 | 		klog.V(2).Infof("Pick up %d mask %b", n.Meta.ID, n.Mask)
 80 | 		nodes = append(nodes, n)
 81 | 		num--
 82 | 	}
 83 | 
 84 | 	if num > 0 {
 85 | 		return nil
 86 | 	}
 87 | 
 88 | 	return nodes
 89 | }
 90 | 
// linkPriority implements sort.Interface over a slice of nodes using an
// ordered list of comparison functions.
type linkPriority struct {
	data []*nvidia.NvidiaNode // slice being sorted; set by Sort
	less []nvidia.LessFunc    // comparators, consulted in order by Less
}
 95 | 
 96 | func linkSort(less ...nvidia.LessFunc) *linkPriority {
 97 | 	return &linkPriority{
 98 | 		less: less,
 99 | 	}
100 | }
101 | 
102 | func (lp *linkPriority) Sort(data []*nvidia.NvidiaNode) {
103 | 	lp.data = data
104 | 	sort.Sort(lp)
105 | }
106 | 
107 | func (lp *linkPriority) Len() int {
108 | 	return len(lp.data)
109 | }
110 | 
111 | func (lp *linkPriority) Swap(i, j int) {
112 | 	lp.data[i], lp.data[j] = lp.data[j], lp.data[i]
113 | }
114 | 
115 | func (lp *linkPriority) Less(i, j int) bool {
116 | 	var k int
117 | 
118 | 	for k = 0; k < len(lp.less)-1; k++ {
119 | 		less := lp.less[k]
120 | 		switch {
121 | 		case less(lp.data[i], lp.data[j]):
122 | 			return true
123 | 		case less(lp.data[j], lp.data[i]):
124 | 			return false
125 | 		}
126 | 	}
127 | 
128 | 	return lp.less[k](lp.data[i], lp.data[j])
129 | }
130 | 


--------------------------------------------------------------------------------
/pkg/algorithm/nvidia/link_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package nvidia
19 | 
20 | import (
21 | 	"flag"
22 | 	"testing"
23 | 
24 | 	"tkestack.io/gpu-manager/pkg/device/nvidia"
25 | )
26 | 
// init requests verbose logging on stderr for the tests by setting the
// "v" and "logtostderr" command-line flags. flag.Set reports an error for
// a flag that is not registered; that error is deliberately discarded.
func init() {
	_ = flag.Set("v", "4")
	_ = flag.Set("logtostderr", "true")
}
31 | 
32 | func TestLink(t *testing.T) {
33 | 	flag.Parse()
34 | 	obj := nvidia.NewNvidiaTree(nil)
35 | 	tree, _ := obj.(*nvidia.NvidiaTree)
36 | 
37 | 	testCase1 :=
38 | 		`    GPU0    GPU1    GPU2    GPU3    GPU4    GPU5
39 | GPU0      X      PIX     PHB     PHB     SOC     SOC
40 | GPU1     PIX      X      PHB     PHB     SOC     SOC
41 | GPU2     PHB     PHB      X      PIX     SOC     SOC
42 | GPU3     PHB     PHB     PIX      X      SOC     SOC
43 | GPU4     SOC     SOC     SOC     SOC      X      PIX
44 | GPU5     SOC     SOC     SOC     SOC     PIX      X
45 | `
46 | 	tree.Init(testCase1)
47 | 	algo := NewLinkMode(tree)
48 | 
49 | 	expectCase1 := []string{
50 | 		"/dev/nvidia0",
51 | 		"/dev/nvidia1",
52 | 		"/dev/nvidia2",
53 | 	}
54 | 
55 | 	cores := int64(3 * nvidia.HundredCore)
56 | 	pass, should, but := examining(expectCase1, algo.Evaluate(cores, 0))
57 | 	if !pass {
58 | 		t.Fatalf("Evaluate function got wrong, should be %s, but %s", should, but)
59 | 	}
60 | 
61 | 	tree.MarkOccupied(&nvidia.NvidiaNode{
62 | 		Meta: nvidia.DeviceMeta{
63 | 			MinorID: 2,
64 | 		},
65 | 	}, cores, 0)
66 | 
67 | 	expectCase2 := []string{
68 | 		"/dev/nvidia0",
69 | 		"/dev/nvidia1",
70 | 	}
71 | 
72 | 	cores = int64(2 * nvidia.HundredCore)
73 | 	pass, should, but = examining(expectCase2, algo.Evaluate(cores, 0))
74 | 	if !pass {
75 | 		t.Fatalf("Evaluate function got wrong, should be %s, but %s", should, but)
76 | 	}
77 | }
78 | 


--------------------------------------------------------------------------------
/pkg/algorithm/nvidia/share.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package nvidia
 19 | 
 20 | import (
 21 | 	"sort"
 22 | 
 23 | 	"k8s.io/klog"
 24 | 
 25 | 	"tkestack.io/gpu-manager/pkg/device/nvidia"
 26 | )
 27 | 
// shareMode is the allocation strategy created by NewShareMode. It holds
// the device tree whose leaves Evaluate inspects.
type shareMode struct {
	tree *nvidia.NvidiaTree
}
 31 | 
 32 | //NewShareMode returns a new shareMode struct.
 33 | //
 34 | //Evaluate() of shareMode returns one node with minimum available cores
 35 | //which fullfil the request.
 36 | //
 37 | //Share mode means multiple application may share one GPU node which uses
 38 | //GPU more efficiently.
 39 | func NewShareMode(t *nvidia.NvidiaTree) *shareMode {
 40 | 	return &shareMode{t}
 41 | }
 42 | 
 43 | func (al *shareMode) Evaluate(cores int64, memory int64) []*nvidia.NvidiaNode {
 44 | 	var (
 45 | 		nodes    []*nvidia.NvidiaNode
 46 | 		tmpStore = make([]*nvidia.NvidiaNode, al.tree.Total())
 47 | 		sorter   = shareModeSort(nvidia.ByAllocatableCores, nvidia.ByAllocatableMemory, nvidia.ByPids, nvidia.ByMinorID)
 48 | 	)
 49 | 
 50 | 	for i := 0; i < al.tree.Total(); i++ {
 51 | 		tmpStore[i] = al.tree.Leaves()[i]
 52 | 	}
 53 | 
 54 | 	sorter.Sort(tmpStore)
 55 | 
 56 | 	for _, node := range tmpStore {
 57 | 		if node.AllocatableMeta.Cores >= cores && node.AllocatableMeta.Memory >= memory {
 58 | 			klog.V(2).Infof("Pick up %d mask %b, cores: %d, memory: %d", node.Meta.ID, node.Mask, node.AllocatableMeta.Cores, node.AllocatableMeta.Memory)
 59 | 			nodes = append(nodes, node)
 60 | 			break
 61 | 		}
 62 | 	}
 63 | 
 64 | 	return nodes
 65 | }
 66 | 
// shareModePriority implements sort.Interface over a slice of nodes using
// an ordered list of comparison functions.
type shareModePriority struct {
	data []*nvidia.NvidiaNode // slice being sorted; set by Sort
	less []nvidia.LessFunc    // comparators, consulted in order by Less
}
 71 | 
 72 | func shareModeSort(less ...nvidia.LessFunc) *shareModePriority {
 73 | 	return &shareModePriority{
 74 | 		less: less,
 75 | 	}
 76 | }
 77 | 
 78 | func (smp *shareModePriority) Sort(data []*nvidia.NvidiaNode) {
 79 | 	smp.data = data
 80 | 	sort.Sort(smp)
 81 | }
 82 | 
 83 | func (smp *shareModePriority) Len() int {
 84 | 	return len(smp.data)
 85 | }
 86 | 
 87 | func (smp *shareModePriority) Swap(i, j int) {
 88 | 	smp.data[i], smp.data[j] = smp.data[j], smp.data[i]
 89 | }
 90 | 
 91 | func (smp *shareModePriority) Less(i, j int) bool {
 92 | 	var k int
 93 | 
 94 | 	for k = 0; k < len(smp.less)-1; k++ {
 95 | 		less := smp.less[k]
 96 | 		switch {
 97 | 		case less(smp.data[i], smp.data[j]):
 98 | 			return true
 99 | 		case less(smp.data[j], smp.data[i]):
100 | 			return false
101 | 		}
102 | 	}
103 | 
104 | 	return smp.less[k](smp.data[i], smp.data[j])
105 | }
106 | 


--------------------------------------------------------------------------------
/pkg/algorithm/nvidia/share_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package nvidia
19 | 
20 | import (
21 | 	"flag"
22 | 	"testing"
23 | 
24 | 	"tkestack.io/gpu-manager/pkg/device/nvidia"
25 | )
26 | 
// init requests verbose logging on stderr for the tests by setting the
// "v" and "logtostderr" command-line flags. flag.Set reports an error for
// a flag that is not registered; that error is deliberately discarded.
func init() {
	_ = flag.Set("v", "4")
	_ = flag.Set("logtostderr", "true")
}
31 | 
32 | func TestShare(t *testing.T) {
33 | 	flag.Parse()
34 | 	obj := nvidia.NewNvidiaTree(nil)
35 | 	tree, _ := obj.(*nvidia.NvidiaTree)
36 | 
37 | 	testCase1 :=
38 | 		`    GPU0    GPU1    GPU2    GPU3    GPU4    GPU5
39 | GPU0      X      PIX     PHB     PHB     SOC     SOC
40 | GPU1     PIX      X      PHB     PHB     SOC     SOC
41 | GPU2     PHB     PHB      X      PIX     SOC     SOC
42 | GPU3     PHB     PHB     PIX      X      SOC     SOC
43 | GPU4     SOC     SOC     SOC     SOC      X      PIX
44 | GPU5     SOC     SOC     SOC     SOC     PIX      X
45 | `
46 | 	tree.Init(testCase1)
47 | 	for _, n := range tree.Leaves() {
48 | 		n.AllocatableMeta.Cores = nvidia.HundredCore
49 | 		n.AllocatableMeta.Memory = 1024
50 | 	}
51 | 	algo := NewShareMode(tree)
52 | 
53 | 	expectCase1 := []string{
54 | 		"/dev/nvidia0",
55 | 	}
56 | 
57 | 	cores := int64(0.5 * nvidia.HundredCore)
58 | 	pass, should, but := examining(expectCase1, algo.Evaluate(cores, 0))
59 | 	if !pass {
60 | 		t.Fatalf("Evaluate function got wrong, should be %s, but %s", should, but)
61 | 	}
62 | 
63 | 	tree.MarkOccupied(&nvidia.NvidiaNode{
64 | 		Meta: nvidia.DeviceMeta{
65 | 			MinorID: 0,
66 | 		},
67 | 	}, cores, 0)
68 | 
69 | 	expectCase2 := []string{
70 | 		"/dev/nvidia1",
71 | 	}
72 | 
73 | 	cores = int64(0.6 * nvidia.HundredCore)
74 | 	pass, should, but = examining(expectCase2, algo.Evaluate(cores, 0))
75 | 	if !pass {
76 | 		t.Fatalf("Evaluate function got wrong, should be %s, but %s", should, but)
77 | 	}
78 | }
79 | 


--------------------------------------------------------------------------------
/pkg/algorithm/nvidia/util_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package nvidia
19 | 
20 | import (
21 | 	"tkestack.io/gpu-manager/pkg/device/nvidia"
22 | )
23 | 
24 | func examining(expect []string, nodes []*nvidia.NvidiaNode) (pass bool, want string, actual string) {
25 | 	if len(expect) != len(nodes) {
26 | 		return false, "", ""
27 | 	}
28 | 
29 | 	for i, n := range nodes {
30 | 		if expect[i] != n.MinorName() {
31 | 			return false, expect[i], n.MinorName()
32 | 		}
33 | 	}
34 | 
35 | 	return true, "", ""
36 | }
37 | 


--------------------------------------------------------------------------------
/pkg/api/runtime/display/api.pb.gw.go:
--------------------------------------------------------------------------------
  1 | // Code generated by protoc-gen-grpc-gateway
  2 | // source: pkg/api/runtime/display/api.proto
  3 | // DO NOT EDIT!
  4 | 
  5 | /*
  6 | Package display is a reverse proxy.
  7 | 
  8 | It translates gRPC into RESTful JSON APIs.
  9 | */
 10 | package display
 11 | 
 12 | import (
 13 | 	"io"
 14 | 	"net/http"
 15 | 
 16 | 	"github.com/golang/protobuf/proto"
 17 | 	"github.com/golang/protobuf/ptypes/empty"
 18 | 	"github.com/grpc-ecosystem/grpc-gateway/runtime"
 19 | 	"github.com/grpc-ecosystem/grpc-gateway/utilities"
 20 | 	"golang.org/x/net/context"
 21 | 	"google.golang.org/grpc"
 22 | 	"google.golang.org/grpc/codes"
 23 | 	"google.golang.org/grpc/grpclog"
 24 | 	"google.golang.org/grpc/status"
 25 | )
 26 | 
 27 | var _ codes.Code
 28 | var _ io.Reader
 29 | var _ status.Status
 30 | var _ = runtime.String
 31 | var _ = utilities.NewDoubleArray
 32 | 
// request_GPUDisplay_PrintGraph_0 calls the PrintGraph RPC on client with
// an empty request and returns the response together with the header and
// trailer metadata received from the call. marshaler, req and pathParams
// are not used.
func request_GPUDisplay_PrintGraph_0(ctx context.Context, marshaler runtime.Marshaler, client GPUDisplayClient, req *http.Request, pathParams map[string]string) (proto.Message, runtime.ServerMetadata, error) {
	var protoReq empty.Empty
	var metadata runtime.ServerMetadata

	msg, err := client.PrintGraph(ctx, &protoReq, grpc.Header(&metadata.HeaderMD), grpc.Trailer(&metadata.TrailerMD))
	return msg, metadata, err

}
 41 | 
// request_GPUDisplay_PrintUsages_0 calls the PrintUsages RPC on client
// with an empty request and returns the response together with the header
// and trailer metadata received from the call. marshaler, req and
// pathParams are not used.
func request_GPUDisplay_PrintUsages_0(ctx context.Context, marshaler runtime.Marshaler, client GPUDisplayClient, req *http.Request, pathParams map[string]string) (proto.Message, runtime.ServerMetadata, error) {
	var protoReq empty.Empty
	var metadata runtime.ServerMetadata

	msg, err := client.PrintUsages(ctx, &protoReq, grpc.Header(&metadata.HeaderMD), grpc.Trailer(&metadata.TrailerMD))
	return msg, metadata, err

}
 50 | 
// request_GPUDisplay_Version_0 calls the Version RPC on client with an
// empty request and returns the response together with the header and
// trailer metadata received from the call. marshaler, req and pathParams
// are not used.
func request_GPUDisplay_Version_0(ctx context.Context, marshaler runtime.Marshaler, client GPUDisplayClient, req *http.Request, pathParams map[string]string) (proto.Message, runtime.ServerMetadata, error) {
	var protoReq empty.Empty
	var metadata runtime.ServerMetadata

	msg, err := client.Version(ctx, &protoReq, grpc.Header(&metadata.HeaderMD), grpc.Trailer(&metadata.TrailerMD))
	return msg, metadata, err

}
 59 | 
// RegisterGPUDisplayHandlerFromEndpoint is same as RegisterGPUDisplayHandler but
// automatically dials to "endpoint" and closes the connection when "ctx" gets done.
//
// If dialing fails the dial error is returned. If registration fails the
// connection is closed before returning; a failure to close is only logged.
func RegisterGPUDisplayHandlerFromEndpoint(ctx context.Context, mux *runtime.ServeMux, endpoint string, opts []grpc.DialOption) (err error) {
	conn, err := grpc.Dial(endpoint, opts...)
	if err != nil {
		return err
	}
	// The deferred function inspects the named result err to decide between
	// closing immediately (failure) and closing once ctx is done (success).
	defer func() {
		if err != nil {
			if cerr := conn.Close(); cerr != nil {
				grpclog.Printf("Failed to close conn to %s: %v", endpoint, cerr)
			}
			return
		}
		go func() {
			<-ctx.Done()
			if cerr := conn.Close(); cerr != nil {
				grpclog.Printf("Failed to close conn to %s: %v", endpoint, cerr)
			}
		}()
	}()

	return RegisterGPUDisplayHandler(ctx, mux, conn)
}
 84 | 
// RegisterGPUDisplayHandler registers the http handlers for service GPUDisplay to "mux".
// The handlers forward requests to the grpc endpoint over "conn".
//
// Three GET handlers are installed, one each for the PrintGraph,
// PrintUsages and Version patterns. Each handler follows the same steps:
// derive a cancellable context (cancelled early if the ResponseWriter
// reports that the client went away), annotate it from the HTTP request,
// invoke the RPC, and either write an HTTP error or forward the response.
// This function always returns nil.
func RegisterGPUDisplayHandler(ctx context.Context, mux *runtime.ServeMux, conn *grpc.ClientConn) error {
	client := NewGPUDisplayClient(conn)

	mux.Handle("GET", pattern_GPUDisplay_PrintGraph_0, func(w http.ResponseWriter, req *http.Request, pathParams map[string]string) {
		ctx, cancel := context.WithCancel(ctx)
		defer cancel()
		// Cancel the RPC when the HTTP client disconnects; the goroutine
		// ends on its own once ctx is done.
		if cn, ok := w.(http.CloseNotifier); ok {
			go func(done <-chan struct{}, closed <-chan bool) {
				select {
				case <-done:
				case <-closed:
					cancel()
				}
			}(ctx.Done(), cn.CloseNotify())
		}
		inboundMarshaler, outboundMarshaler := runtime.MarshalerForRequest(mux, req)
		rctx, err := runtime.AnnotateContext(ctx, mux, req)
		if err != nil {
			runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err)
			return
		}
		resp, md, err := request_GPUDisplay_PrintGraph_0(rctx, inboundMarshaler, client, req, pathParams)
		ctx = runtime.NewServerMetadataContext(ctx, md)
		if err != nil {
			runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err)
			return
		}

		forward_GPUDisplay_PrintGraph_0(ctx, mux, outboundMarshaler, w, req, resp, mux.GetForwardResponseOptions()...)

	})

	mux.Handle("GET", pattern_GPUDisplay_PrintUsages_0, func(w http.ResponseWriter, req *http.Request, pathParams map[string]string) {
		ctx, cancel := context.WithCancel(ctx)
		defer cancel()
		// Cancel the RPC when the HTTP client disconnects.
		if cn, ok := w.(http.CloseNotifier); ok {
			go func(done <-chan struct{}, closed <-chan bool) {
				select {
				case <-done:
				case <-closed:
					cancel()
				}
			}(ctx.Done(), cn.CloseNotify())
		}
		inboundMarshaler, outboundMarshaler := runtime.MarshalerForRequest(mux, req)
		rctx, err := runtime.AnnotateContext(ctx, mux, req)
		if err != nil {
			runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err)
			return
		}
		resp, md, err := request_GPUDisplay_PrintUsages_0(rctx, inboundMarshaler, client, req, pathParams)
		ctx = runtime.NewServerMetadataContext(ctx, md)
		if err != nil {
			runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err)
			return
		}

		forward_GPUDisplay_PrintUsages_0(ctx, mux, outboundMarshaler, w, req, resp, mux.GetForwardResponseOptions()...)

	})

	mux.Handle("GET", pattern_GPUDisplay_Version_0, func(w http.ResponseWriter, req *http.Request, pathParams map[string]string) {
		ctx, cancel := context.WithCancel(ctx)
		defer cancel()
		// Cancel the RPC when the HTTP client disconnects.
		if cn, ok := w.(http.CloseNotifier); ok {
			go func(done <-chan struct{}, closed <-chan bool) {
				select {
				case <-done:
				case <-closed:
					cancel()
				}
			}(ctx.Done(), cn.CloseNotify())
		}
		inboundMarshaler, outboundMarshaler := runtime.MarshalerForRequest(mux, req)
		rctx, err := runtime.AnnotateContext(ctx, mux, req)
		if err != nil {
			runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err)
			return
		}
		resp, md, err := request_GPUDisplay_Version_0(rctx, inboundMarshaler, client, req, pathParams)
		ctx = runtime.NewServerMetadataContext(ctx, md)
		if err != nil {
			runtime.HTTPError(ctx, mux, outboundMarshaler, w, req, err)
			return
		}

		forward_GPUDisplay_Version_0(ctx, mux, outboundMarshaler, w, req, resp, mux.GetForwardResponseOptions()...)

	})

	return nil
}
179 | 
// URL patterns for the three GPUDisplay handlers, built from the path
// segments "graph", "usage" and "version" respectively.
var (
	pattern_GPUDisplay_PrintGraph_0 = runtime.MustPattern(runtime.NewPattern(1, []int{2, 0}, []string{"graph"}, ""))

	pattern_GPUDisplay_PrintUsages_0 = runtime.MustPattern(runtime.NewPattern(1, []int{2, 0}, []string{"usage"}, ""))

	pattern_GPUDisplay_Version_0 = runtime.MustPattern(runtime.NewPattern(1, []int{2, 0}, []string{"version"}, ""))
)
187 | 
// Response forwarders used by the handlers; all three use
// runtime.ForwardResponseMessage.
var (
	forward_GPUDisplay_PrintGraph_0 = runtime.ForwardResponseMessage

	forward_GPUDisplay_PrintUsages_0 = runtime.ForwardResponseMessage

	forward_GPUDisplay_Version_0 = runtime.ForwardResponseMessage
)
195 | 


--------------------------------------------------------------------------------
/pkg/api/runtime/display/api.proto:
--------------------------------------------------------------------------------
 1 | syntax = 'proto3';
 2 | 
 3 | package display;
 4 | 
 5 | import "google/api/annotations.proto";
 6 | import "google/protobuf/empty.proto";
 7 | 
// GPUDisplay service defines the auxiliary APIs for remote requests.
// Every method takes an empty request and is also exposed over HTTP GET.
service GPUDisplay {
  // PrintGraph returns the text graph of allocator state (GET /graph).
  rpc PrintGraph(google.protobuf.Empty) returns (GraphResponse) {
    option (google.api.http) = {
      get: "/graph"
    };
  }

  // PrintUsages returns GPU usage as a UsageResponse (GET /usage).
  rpc PrintUsages(google.protobuf.Empty) returns (UsageResponse) {
    option (google.api.http) = {
      get: "/usage"
    };
  }

  // Version returns a VersionResponse (GET /version).
  rpc Version(google.protobuf.Empty) returns (VersionResponse) {
    option (google.api.http) = {
      get: "/version"
    };
  }
}
31 | 
// GraphResponse is the reply of GPUDisplay.PrintGraph.
message GraphResponse {
  // Text graph of the allocator state.
  string graph = 1;
}
35 | 
// UsageResponse is the reply of GPUDisplay.PrintUsages.
message UsageResponse {
  // One ContainerStat per string key.
  // NOTE(review): what the key identifies is not defined here; confirm
  // against the server implementation.
  map<string, ContainerStat> usage = 1;
}
39 | 
// ContainerStat is the value type of UsageResponse.usage. It groups
// device lists and specs under string keys together with project, user
// and cluster labels.
// NOTE(review): the meaning of the map keys is not defined in this file;
// confirm against the server implementation.
message ContainerStat {
  map<string, Devices> stat = 1;
  string project = 2;
  string user = 3;
  string cluster = 4;
  map<string, Spec> spec = 5;
}
47 | 
// Devices is a list of DeviceInfo entries; it is the value type of
// ContainerStat.stat.
message Devices {
    repeated DeviceInfo dev = 1;
}
51 | 
// DeviceInfo describes one device entry inside Devices.
// NOTE(review): units and ranges of gpu, mem and device_mem are not
// stated in this file; confirm against the code that fills them in.
// Field numbers 3-9 are unused.
message DeviceInfo {
  string id = 1;
  string card_idx = 2;
  float gpu = 10;
  float mem = 11;
  repeated int32 pids = 12;
  float device_mem = 13;
}
60 | 
// VersionResponse is the reply of GPUDisplay.Version.
message VersionResponse {
  string version = 1;
}
64 | 
// Spec is the value type of ContainerStat.spec, holding a gpu and a mem
// figure.
// NOTE(review): units are not stated in this file; confirm with the
// producer.
message Spec {
    float gpu = 1;
    float mem = 2;
}
69 | 


--------------------------------------------------------------------------------
/pkg/api/runtime/vcuda/api.pb.go:
--------------------------------------------------------------------------------
  1 | // Code generated by protoc-gen-go. DO NOT EDIT.
  2 | // source: pkg/api/runtime/vcuda/api.proto
  3 | 
  4 | /*
  5 | Package vcuda is a generated protocol buffer package.
  6 | 
  7 | It is generated from these files:
  8 | 	pkg/api/runtime/vcuda/api.proto
  9 | 
 10 | It has these top-level messages:
 11 | 	VDeviceRequest
 12 | 	VDeviceResponse
 13 | */
 14 | package vcuda
 15 | 
 16 | import proto "github.com/golang/protobuf/proto"
 17 | import fmt "fmt"
 18 | import math "math"
 19 | 
 20 | import (
 21 | 	context "golang.org/x/net/context"
 22 | 	grpc "google.golang.org/grpc"
 23 | )
 24 | 
 25 | // Reference imports to suppress errors if they are not otherwise used.
 26 | var _ = proto.Marshal
 27 | var _ = fmt.Errorf
 28 | var _ = math.Inf
 29 | 
 30 | // This is a compile-time assertion to ensure that this generated file
 31 | // is compatible with the proto package it is being compiled against.
 32 | // A compilation error at this line likely means your copy of the
 33 | // proto package needs to be updated.
 34 | const _ = proto.ProtoPackageIsVersion2 // please upgrade the proto package
 35 | 
// VDeviceRequest is the generated Go type for the vcuda.VDeviceRequest
// message. It carries four string fields: bus_id, pod_uid, container_name
// and container_id.
type VDeviceRequest struct {
	BusId         string `protobuf:"bytes,1,opt,name=bus_id,json=busId" json:"bus_id,omitempty"`
	PodUid        string `protobuf:"bytes,2,opt,name=pod_uid,json=podUid" json:"pod_uid,omitempty"`
	ContainerName string `protobuf:"bytes,3,opt,name=container_name,json=containerName" json:"container_name,omitempty"`
	ContainerId   string `protobuf:"bytes,4,opt,name=container_id,json=containerId" json:"container_id,omitempty"`
}
 42 | 
 43 | func (m *VDeviceRequest) Reset()                    { *m = VDeviceRequest{} }
 44 | func (m *VDeviceRequest) String() string            { return proto.CompactTextString(m) }
 45 | func (*VDeviceRequest) ProtoMessage()               {}
 46 | func (*VDeviceRequest) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} }
 47 | 
 48 | func (m *VDeviceRequest) GetBusId() string {
 49 | 	if m != nil {
 50 | 		return m.BusId
 51 | 	}
 52 | 	return ""
 53 | }
 54 | 
 55 | func (m *VDeviceRequest) GetPodUid() string {
 56 | 	if m != nil {
 57 | 		return m.PodUid
 58 | 	}
 59 | 	return ""
 60 | }
 61 | 
 62 | func (m *VDeviceRequest) GetContainerName() string {
 63 | 	if m != nil {
 64 | 		return m.ContainerName
 65 | 	}
 66 | 	return ""
 67 | }
 68 | 
 69 | func (m *VDeviceRequest) GetContainerId() string {
 70 | 	if m != nil {
 71 | 		return m.ContainerId
 72 | 	}
 73 | 	return ""
 74 | }
 75 | 
 76 | type VDeviceResponse struct {
 77 | }
 78 | 
 79 | func (m *VDeviceResponse) Reset()                    { *m = VDeviceResponse{} }
 80 | func (m *VDeviceResponse) String() string            { return proto.CompactTextString(m) }
 81 | func (*VDeviceResponse) ProtoMessage()               {}
 82 | func (*VDeviceResponse) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{1} }
 83 | 
 84 | func init() {
 85 | 	proto.RegisterType((*VDeviceRequest)(nil), "vcuda.VDeviceRequest")
 86 | 	proto.RegisterType((*VDeviceResponse)(nil), "vcuda.VDeviceResponse")
 87 | }
 88 | 
 89 | // Reference imports to suppress errors if they are not otherwise used.
 90 | var _ context.Context
 91 | var _ grpc.ClientConn
 92 | 
 93 | // This is a compile-time assertion to ensure that this generated file
 94 | // is compatible with the grpc package it is being compiled against.
 95 | const _ = grpc.SupportPackageIsVersion4
 96 | 
 97 | // Client API for VCUDAService service
 98 | 
 99 | type VCUDAServiceClient interface {
100 | 	RegisterVDevice(ctx context.Context, in *VDeviceRequest, opts ...grpc.CallOption) (*VDeviceResponse, error)
101 | }
102 | 
103 | type vCUDAServiceClient struct {
104 | 	cc *grpc.ClientConn
105 | }
106 | 
107 | func NewVCUDAServiceClient(cc *grpc.ClientConn) VCUDAServiceClient {
108 | 	return &vCUDAServiceClient{cc}
109 | }
110 | 
111 | func (c *vCUDAServiceClient) RegisterVDevice(ctx context.Context, in *VDeviceRequest, opts ...grpc.CallOption) (*VDeviceResponse, error) {
112 | 	out := new(VDeviceResponse)
113 | 	err := grpc.Invoke(ctx, "/vcuda.VCUDAService/RegisterVDevice", in, out, c.cc, opts...)
114 | 	if err != nil {
115 | 		return nil, err
116 | 	}
117 | 	return out, nil
118 | }
119 | 
120 | // Server API for VCUDAService service
121 | 
122 | type VCUDAServiceServer interface {
123 | 	RegisterVDevice(context.Context, *VDeviceRequest) (*VDeviceResponse, error)
124 | }
125 | 
126 | func RegisterVCUDAServiceServer(s *grpc.Server, srv VCUDAServiceServer) {
127 | 	s.RegisterService(&_VCUDAService_serviceDesc, srv)
128 | }
129 | 
130 | func _VCUDAService_RegisterVDevice_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
131 | 	in := new(VDeviceRequest)
132 | 	if err := dec(in); err != nil {
133 | 		return nil, err
134 | 	}
135 | 	if interceptor == nil {
136 | 		return srv.(VCUDAServiceServer).RegisterVDevice(ctx, in)
137 | 	}
138 | 	info := &grpc.UnaryServerInfo{
139 | 		Server:     srv,
140 | 		FullMethod: "/vcuda.VCUDAService/RegisterVDevice",
141 | 	}
142 | 	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
143 | 		return srv.(VCUDAServiceServer).RegisterVDevice(ctx, req.(*VDeviceRequest))
144 | 	}
145 | 	return interceptor(ctx, in, info, handler)
146 | }
147 | 
148 | var _VCUDAService_serviceDesc = grpc.ServiceDesc{
149 | 	ServiceName: "vcuda.VCUDAService",
150 | 	HandlerType: (*VCUDAServiceServer)(nil),
151 | 	Methods: []grpc.MethodDesc{
152 | 		{
153 | 			MethodName: "RegisterVDevice",
154 | 			Handler:    _VCUDAService_RegisterVDevice_Handler,
155 | 		},
156 | 	},
157 | 	Streams:  []grpc.StreamDesc{},
158 | 	Metadata: "pkg/api/runtime/vcuda/api.proto",
159 | }
160 | 
161 | func init() { proto.RegisterFile("pkg/api/runtime/vcuda/api.proto", fileDescriptor0) }
162 | 
163 | var fileDescriptor0 = []byte{
164 | 	// 226 bytes of a gzipped FileDescriptorProto
165 | 	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x09, 0x6e, 0x88, 0x02, 0xff, 0x5c, 0xd0, 0xcf, 0x4a, 0xc3, 0x40,
166 | 	0x10, 0x06, 0x70, 0xa3, 0x36, 0xe2, 0x58, 0x5b, 0x5c, 0xa8, 0x06, 0x2f, 0x6a, 0x40, 0xf0, 0x94,
167 | 	0x80, 0x3e, 0x81, 0xda, 0x4b, 0x2f, 0x1e, 0x56, 0xda, 0x6b, 0xd8, 0x64, 0x86, 0x32, 0x48, 0x76,
168 | 	0xd7, 0xfd, 0xd3, 0x87, 0xf0, 0xa9, 0x25, 0xab, 0x44, 0xec, 0xf5, 0x37, 0x1f, 0xc3, 0x7c, 0x03,
169 | 	0x37, 0xf6, 0x63, 0x5b, 0x2b, 0xcb, 0xb5, 0x8b, 0x3a, 0x70, 0x4f, 0xf5, 0xae, 0x8b, 0xa8, 0x06,
170 | 	0xa9, 0xac, 0x33, 0xc1, 0x88, 0x49, 0x82, 0xf2, 0x2b, 0x83, 0xd9, 0x66, 0x49, 0x3b, 0xee, 0x48,
171 | 	0xd2, 0x67, 0x24, 0x1f, 0xc4, 0x02, 0xf2, 0x36, 0xfa, 0x86, 0xb1, 0xc8, 0x6e, 0xb3, 0x87, 0x53,
172 | 	0x39, 0x69, 0xa3, 0x5f, 0xa1, 0xb8, 0x82, 0x13, 0x6b, 0xb0, 0x89, 0x8c, 0xc5, 0x61, 0xf2, 0xdc,
173 | 	0x1a, 0x5c, 0x33, 0x8a, 0x7b, 0x98, 0x75, 0x46, 0x07, 0xc5, 0x9a, 0x5c, 0xa3, 0x55, 0x4f, 0xc5,
174 | 	0x51, 0x9a, 0x9f, 0x8f, 0xfa, 0xa6, 0x7a, 0x12, 0x77, 0x30, 0xfd, 0x8b, 0x31, 0x16, 0xc7, 0x29,
175 | 	0x74, 0x36, 0xda, 0x0a, 0xcb, 0x0b, 0x98, 0x8f, 0xb7, 0x78, 0x6b, 0xb4, 0xa7, 0x47, 0x09, 0xd3,
176 | 	0xcd, 0xeb, 0x7a, 0xf9, 0xfc, 0x4e, 0x6e, 0x70, 0xf1, 0x02, 0x73, 0x49, 0x5b, 0xf6, 0x81, 0xdc,
177 | 	0x6f, 0x54, 0x2c, 0xaa, 0x54, 0xa5, 0xfa, 0x5f, 0xe3, 0xfa, 0x72, 0x9f, 0x7f, 0x36, 0x96, 0x07,
178 | 	0x6d, 0x9e, 0x3e, 0xf0, 0xf4, 0x1d, 0x00, 0x00, 0xff, 0xff, 0x6a, 0xa0, 0x48, 0xf8, 0x24, 0x01,
179 | 	0x00, 0x00,
180 | }
181 | 


--------------------------------------------------------------------------------
/pkg/api/runtime/vcuda/api.proto:
--------------------------------------------------------------------------------
 1 | syntax = 'proto3';
 2 | 
 3 | package vcuda;
 4 | 
 5 | service VCUDAService {
 6 |   rpc RegisterVDevice(VDeviceRequest) returns (VDeviceResponse) {}  // unary call; the reply message has no fields
 7 | }
 8 | 
 9 | message VDeviceRequest {  // request payload of VCUDAService.RegisterVDevice
10 |     string bus_id = 1;
11 |     string pod_uid = 2;
12 |     string container_name = 3;
13 |     string container_id = 4;
14 | }
15 | 
16 | message VDeviceResponse {}  // reply of VCUDAService.RegisterVDevice; declares no fields


--------------------------------------------------------------------------------
/pkg/config/config.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package config
19 | 
20 | import (
21 | 	"time"
22 | 
23 | 	"tkestack.io/gpu-manager/pkg/types"
24 | )
25 | 
26 | // Config groups the plugin's option values together with the channel that carries vcuda requests.
27 | type Config struct {
28 | 	Driver                   string
29 | 	ExtraConfigPath          string
30 | 	QueryPort                int
31 | 	QueryAddr                string
32 | 	KubeConfig               string
33 | 	SamplePeriod             time.Duration
34 | 	Hostname                 string
35 | 	NodeLabels               map[string]string
36 | 	VirtualManagerPath       string
37 | 	DevicePluginPath         string
38 | 	VolumeConfigPath         string
39 | 	EnableShare              bool
40 | 	AllocationCheckPeriod    time.Duration
41 | 	CheckpointPath           string
42 | 	ContainerRuntimeEndpoint string
43 | 	CgroupDriver             string
44 | 	RequestTimeout           time.Duration
45 | 
46 | 	VCudaRequestsQueue chan *types.VCudaRequest // producers and consumers of this channel live outside this file
47 | }
48 | 
49 | // ExtraConfig contains extra options other than Config; it carries JSON tags for (de)serialization.
50 | type ExtraConfig struct {
51 | 	Devices []string `json:"devices,omitempty"` // JSON key "devices"; omitted from output when empty
52 | }
53 | 


--------------------------------------------------------------------------------
/pkg/device/dummy/tree.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package dummy
19 | 
20 | import (
21 | 	"tkestack.io/gpu-manager/pkg/config"
22 | 	"tkestack.io/gpu-manager/pkg/device"
23 | )
24 | 
25 | func init() {
26 | 	device.Register("dummy", NewDummyTree) // expose the dummy tree constructor under the name "dummy"
27 | }
28 | 
29 | // DummyTree is a field-less device.GPUTree implementation, registered under the name "dummy".
30 | type DummyTree struct {
31 | }
32 | 
33 | var _ device.GPUTree = &DummyTree{}
34 | 
35 | // NewDummyTree returns a new, empty DummyTree; the config argument is ignored.
36 | func NewDummyTree(_ *config.Config) device.GPUTree {
37 | 	return &DummyTree{}
38 | }
39 | 
40 | // Init satisfies device.GPUTree; the input string is ignored and nothing is initialized.
41 | func (t *DummyTree) Init(_ string) {
42 | }
43 | 
44 | // Update satisfies device.GPUTree and does nothing.
45 | func (t *DummyTree) Update() {
46 | 	// no state to refresh
47 | }
48 | 


--------------------------------------------------------------------------------
/pkg/device/nvidia/node.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package nvidia
 19 | 
 20 | import (
 21 | 	"fmt"
 22 | 	"math/bits"
 23 | 
 24 | 	"k8s.io/klog"
 25 | 
 26 | 	"tkestack.io/nvml"
 27 | )
 28 | 
 29 | // SchedulerCache holds the allocatable resources (cores and memory) tracked for a GPU node.
 30 | type SchedulerCache struct {
 31 | 	Cores  int64
 32 | 	Memory int64
 33 | }
 34 | 
 35 | // DeviceMeta contains the metadata of a GPU device.
 36 | type DeviceMeta struct {
 37 | 	ID          int // assigned from the package-level nodeIndex counter by NewNvidiaNode
 38 | 	MinorID     int // formatted into the device name by MinorName
 39 | 	UsedMemory  uint64
 40 | 	TotalMemory uint64
 41 | 	Pids        []uint
 42 | 	BusId       string
 43 | 	Utilization uint
 44 | 	UUID        string
 45 | }
 46 | 
 47 | // NvidiaNode is one node of the Nvidia GPU topology tree.
 48 | type NvidiaNode struct {
 49 | 	Meta            DeviceMeta
 50 | 	AllocatableMeta SchedulerCache
 51 | 
 52 | 	Parent   *NvidiaNode
 53 | 	Children []*NvidiaNode
 54 | 	Mask     uint32 // bitmask; each set bit is an index into tree.leaves (see GetAvailableLeaves)
 55 | 
 56 | 	pendingReset bool
 57 | 	vchildren    map[int]*NvidiaNode // child nodes keyed by Meta.ID, filled by setParent
 58 | 	ntype        nvml.GpuTopologyLevel
 59 | 	tree         *NvidiaTree
 60 | }
 61 | 
 62 | var (
 63 | 	// Marked "test only" by the author; NewNvidiaNode reads it as the next Meta.ID and then increments it.
 64 | 	nodeIndex = 0
 65 | )
 66 | 
 67 | // NewNvidiaNode returns a node that belongs to tree t, has an unknown
 68 | // topology level, an empty vchildren map and the next free Meta.ID.
 69 | func NewNvidiaNode(t *NvidiaTree) *NvidiaNode {
 70 | 	// Take the next ID from the package-level counter and advance it; the
 71 | 	// counter is a plain int, so nothing here synchronizes concurrent callers.
 72 | 	id := nodeIndex
 73 | 	nodeIndex++
 74 | 
 75 | 	return &NvidiaNode{
 76 | 		Meta:      DeviceMeta{ID: id},
 77 | 		ntype:     nvml.TOPOLOGY_UNKNOWN,
 78 | 		tree:      t,
 79 | 		vchildren: make(map[int]*NvidiaNode),
 80 | 	}
 81 | }
 82 | 
 83 | func (n *NvidiaNode) setParent(p *NvidiaNode) {
 84 | 	n.Parent = p
 85 | 	p.vchildren[n.Meta.ID] = n // index n by ID in the parent's vchildren map; p must be non-nil, p.Children is not touched
 86 | }
 87 | 
 88 | // MinorName returns the device name built by formatting Meta.MinorID with NamePattern.
 89 | func (n *NvidiaNode) MinorName() string {
 90 | 	return fmt.Sprintf(NamePattern, n.Meta.MinorID)
 91 | }
 92 | 
 93 | // Type returns this node's nvml.GpuTopologyLevel converted to int.
 94 | func (n *NvidiaNode) Type() int {
 95 | 	return int(n.ntype)
 96 | }
 97 | 
 98 | // GetAvailableLeaves returns the tree leaves whose bits are set in n.Mask,
 99 | // ordered from the lowest set bit to the highest.
100 | func (n *NvidiaNode) GetAvailableLeaves() []*NvidiaNode {
101 | 	var picked []*NvidiaNode
102 | 
103 | 	// Peel off the lowest set bit each round; its position indexes n.tree.leaves.
104 | 	for remaining := n.Mask; remaining != 0; {
105 | 		idx := uint32(bits.TrailingZeros32(remaining))
106 | 		leaf := n.tree.leaves[idx]
107 | 		klog.V(2).Infof("Pick up %d mask %b", idx, leaf.Mask)
108 | 		picked = append(picked, leaf)
109 | 		remaining ^= one << idx
110 | 	}
111 | 
112 | 	return picked
113 | }
114 | 
115 | // Available returns the count of available leaves of this
116 | // NvidiaNode, i.e. the number of bits set in its Mask.
117 | func (n *NvidiaNode) Available() int {
118 | 	return bits.OnesCount32(n.Mask)
119 | }
120 | 
121 | func (n *NvidiaNode) String() string {
122 | 	// TOPOLOGY_INTERNAL prints as GPU<ID>; other known levels print a fixed label; anything else prints "ROOT".
123 | 	switch n.ntype {
124 | 	case nvml.TOPOLOGY_INTERNAL:
125 | 		return fmt.Sprintf("GPU%d", n.Meta.ID)
126 | 	case nvml.TOPOLOGY_SINGLE:
127 | 		return "PIX"
128 | 	case nvml.TOPOLOGY_MULTIPLE:
129 | 		return "PXB"
130 | 	case nvml.TOPOLOGY_HOSTBRIDGE:
131 | 		return "PHB"
132 | 	case nvml.TOPOLOGY_CPU:
133 | 		return "CPU"
134 | 	case nvml.TOPOLOGY_SYSTEM:
135 | 		return "SYS"
136 | 	}
137 | 	return "ROOT"
138 | }
139 | 


--------------------------------------------------------------------------------
/pkg/device/nvidia/sort.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package nvidia
 19 | 
 20 | import (
 21 | 	"sort"
 22 | 
 23 | 	"tkestack.io/gpu-manager/pkg/types"
 24 | )
 25 | 
 26 | // LessFunc is a function that reports whether NvidiaNode p1 sorts before p2.
 27 | type LessFunc func(p1, p2 *NvidiaNode) bool
 28 | 
 29 | var (
 30 | 	// ByType orders nodes by ascending Type() value.
 31 | 	ByType = func(p1, p2 *NvidiaNode) bool {
 32 | 		return p1.Type() < p2.Type()
 33 | 	}
 34 | 
 35 | 	// ByAvailable orders nodes by ascending count of available leaves.
 36 | 	ByAvailable = func(p1, p2 *NvidiaNode) bool {
 37 | 		return p1.Available() < p2.Available()
 38 | 	}
 39 | 
 40 | 	// ByID orders nodes by ascending Meta.ID.
 41 | 	ByID = func(p1, p2 *NvidiaNode) bool {
 42 | 		return p1.Meta.ID < p2.Meta.ID
 43 | 	}
 44 | 
 45 | 	// ByMinorID orders nodes by ascending Meta.MinorID.
 46 | 	ByMinorID = func(p1, p2 *NvidiaNode) bool {
 47 | 		return p1.Meta.MinorID < p2.Meta.MinorID
 48 | 	}
 49 | 
 50 | 	// ByMemory orders nodes by ascending Meta.UsedMemory (memory already in use).
 51 | 	ByMemory = func(p1, p2 *NvidiaNode) bool {
 52 | 		return p1.Meta.UsedMemory < p2.Meta.UsedMemory
 53 | 	}
 54 | 
 55 | 	// ByPids orders nodes by ascending number of entries in Meta.Pids.
 56 | 	ByPids = func(p1, p2 *NvidiaNode) bool {
 57 | 		return len(p1.Meta.Pids) < len(p2.Meta.Pids)
 58 | 	}
 59 | 
 60 | 	// ByAllocatableCores orders nodes by ascending AllocatableMeta.Cores.
 61 | 	ByAllocatableCores = func(p1, p2 *NvidiaNode) bool {
 62 | 		return p1.AllocatableMeta.Cores < p2.AllocatableMeta.Cores
 63 | 	}
 64 | 
 65 | 	// ByAllocatableMemory orders nodes by AllocatableMeta.Memory divided by types.MemoryBlockSize, not by the raw value.
 66 | 	ByAllocatableMemory = func(p1, p2 *NvidiaNode) bool {
 67 | 		return p1.AllocatableMeta.Memory/types.MemoryBlockSize < p2.AllocatableMeta.Memory/types.MemoryBlockSize
 68 | 	}
 69 | 
 70 | 	// PrintSorter sorts nodes for printing: by type, then available leaves, then minor ID.
 71 | 	PrintSorter = &printSort{
 72 | 		less: []LessFunc{ByType, ByAvailable, ByMinorID},
 73 | 	}
 74 | )
 75 | 
 76 | type printSort struct {
 77 | 	data []*NvidiaNode // slice currently being sorted; assigned by Sort
 78 | 	less []LessFunc    // comparators consulted in order by Less
 79 | }
 80 | 
 81 | func (p *printSort) Sort(d []*NvidiaNode) {
 82 | 	p.data = d // stored on the receiver, so the sorter keeps referencing d after the call
 83 | 	sort.Sort(p) // sorts d in place through Len/Swap/Less
 84 | }
 85 | 
 86 | func (p *printSort) Len() int {
 87 | 	return len(p.data) // sort.Interface: number of nodes being sorted
 88 | }
 89 | 
 90 | func (p *printSort) Swap(i, j int) {
 91 | 	p.data[i], p.data[j] = p.data[j], p.data[i] // sort.Interface: exchange the two nodes in place
 92 | }
 93 | 
 94 | func (p *printSort) Less(i, j int) bool {
 95 | 	var k int
 96 | 
 97 | 	for k = 0; k < len(p.less)-1; k++ {
 98 | 		less := p.less[k]
 99 | 		switch {
100 | 		case less(p.data[i], p.data[j]):
101 | 			return true
102 | 		case less(p.data[j], p.data[i]):
103 | 			return false
104 | 		}
105 | 	}
106 | 
107 | 	return p.less[k](p.data[i], p.data[j])
108 | }
109 | 


--------------------------------------------------------------------------------
/pkg/device/nvidia/sort_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package nvidia
19 | 
20 | import (
21 | 	"flag"
22 | 	"testing"
23 | 
24 | 	"tkestack.io/gpu-manager/pkg/types"
25 | )
26 | 
27 | func init() {
28 | 	flag.Set("v", "4") // returned error is ignored
29 | 	flag.Set("logtostderr", "true") // returned error is ignored
30 | }
31 | 
32 | func TestSort(t *testing.T) {
33 | 	flag.Parse()
34 | 	// Build a six-GPU tree from the topology matrix below.
35 | 	obj := NewNvidiaTree(nil)
36 | 	tree, _ := obj.(*NvidiaTree)
37 | 	testCase1 :=
38 | 		`    GPU0    GPU1    GPU2    GPU3    GPU4    GPU5
39 | GPU0      X      PIX     PHB     PHB     SOC     SOC
40 | GPU1     PIX      X      PHB     PHB     SOC     SOC
41 | GPU2     PHB     PHB      X      PIX     SOC     SOC
42 | GPU3     PHB     PHB     PIX      X      SOC     SOC
43 | GPU4     SOC     SOC     SOC     SOC      X      PIX
44 | GPU5     SOC     SOC     SOC     SOC     PIX      X
45 | `
46 | 	tree.Init(testCase1)
47 | 	for idx, n := range tree.Leaves() {
48 | 		n.AllocatableMeta.Cores = HundredCore
49 | 		n.AllocatableMeta.Memory = 1024 - int64(idx)
50 | 	}
51 | 
52 | 	// Occupy the last leaf, sort with every comparator chained, and expect GPU5 to come first.
53 | 	expectLeaves := []string{"GPU5", "GPU0", "GPU1", "GPU2", "GPU3", "GPU4"}
54 | 	leaves := tree.Leaves()
55 | 	tree.MarkOccupied(leaves[5], 100, 1*types.MemoryBlockSize)
56 | 	ps := &printSort{
57 | 		less: []LessFunc{ByAllocatableCores,
58 | 			ByAvailable,
59 | 			ByType,
60 | 			ByAllocatableMemory,
61 | 			ByMinorID,
62 | 			ByPids,
63 | 			ByMemory},
64 | 	}
65 | 	ps.Sort(leaves)
66 | 	for i, s := range expectLeaves {
67 | 		if s != leaves[i].String() {
68 | 			t.Fatalf("sort went wrong at index %d: want %s, got %s", i, s, leaves[i].String())
69 | 		}
70 | 	}
71 | }
72 | 


--------------------------------------------------------------------------------
/pkg/device/nvidia/tree_test.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package nvidia
 19 | 
 20 | import (
 21 | 	"flag"
 22 | 	"testing"
 23 | 
 24 | 	"tkestack.io/gpu-manager/pkg/types"
 25 | )
 26 | 
 27 | func init() {
 28 | 	flag.Set("v", "4") // returned error is ignored
 29 | 	flag.Set("logtostderr", "true") // returned error is ignored
 30 | }
 31 | 
 32 | func TestTree(t *testing.T) {
 33 | 	flag.Parse()
 34 | 	testCase1 :=
 35 | 		`    GPU0    GPU1    GPU2    GPU3    GPU4    GPU5
 36 | GPU0      X      PIX     PHB     PHB     SOC     SOC
 37 | GPU1     PIX      X      PHB     PHB     SOC     SOC
 38 | GPU2     PHB     PHB      X      PIX     SOC     SOC
 39 | GPU3     PHB     PHB     PIX      X      SOC     SOC
 40 | GPU4     SOC     SOC     SOC     SOC      X      PIX
 41 | GPU5     SOC     SOC     SOC     SOC     PIX      X
 42 | `
 43 | 	testTree(t, testCase1, 6) // the matrix above lists six GPUs
 44 | 
 45 | 	testCase2 :=
 46 | 		` GPU0
 47 | GPU0   x`
 48 | 	testTree(t, testCase2, 1) // single-GPU matrix
 49 | }
 50 | 
 51 | func testTree(t *testing.T, testCase string, nodeNum int) {
 52 | 	// Build the tree from the topology matrix and give every leaf full resources.
 53 | 	obj := NewNvidiaTree(nil)
 54 | 	tree, _ := obj.(*NvidiaTree)
 55 | 	tree.Init(testCase)
 56 | 	for _, n := range tree.Leaves() {
 57 | 		n.AllocatableMeta.Cores = HundredCore
 58 | 		n.AllocatableMeta.Memory = 1024
 59 | 	}
 60 | 
 61 | 	//test Leaves(), Total() and Available()
 62 | 	leaves := tree.Leaves()
 63 | 	if tree.Available() != nodeNum || len(leaves) != nodeNum || tree.Total() != nodeNum {
 64 | 		t.Fatalf("available leaves number wrong")
 65 | 	}
 66 | 
 67 | 	// Root().GetAvailableLeaves() must yield every leaf in order; a short or empty result now fails too.
 68 | 	root := tree.Root()
 69 | 	availableLeaves := root.GetAvailableLeaves()
 70 | 	for i, l := range leaves {
 71 | 		if i >= len(availableLeaves) || availableLeaves[i] != l {
 72 | 			t.Fatalf("get available leaves wrong")
 73 | 		}
 74 | 	}
 75 | 
 76 | 	//test MarkOccupied() and MarkFree() with half core
 77 | 	tree.MarkOccupied(leaves[0], 50, 1*types.MemoryBlockSize)
 78 | 	if tree.Available() != (nodeNum - 1) {
 79 | 		t.Fatalf("available leaves number wrong after MarkOccupied")
 80 | 	}
 81 | 
 82 | 	tree.MarkFree(leaves[0], 50, 1*types.MemoryBlockSize)
 83 | 	if tree.Available() != nodeNum {
 84 | 		t.Fatalf("available leaves number wrong after MarkFree")
 85 | 	}
 86 | 
 87 | 	//test MarkOccupied() and MarkFree() with one core
 88 | 	tree.MarkOccupied(leaves[0], 100, 1*types.MemoryBlockSize)
 89 | 	if tree.Available() != (nodeNum - 1) {
 90 | 		t.Fatalf("available leaves number wrong after MarkOccupied")
 91 | 	}
 92 | 
 93 | 	tree.MarkFree(leaves[0], 100, 1*types.MemoryBlockSize)
 94 | 	if tree.Available() != nodeNum {
 95 | 		t.Fatalf("available leaves number wrong after MarkFree")
 96 | 	}
 97 | 
 98 | 	//test Query()
 99 | 	if len(leaves) > 0 && tree.Query("/dev/nvidia0") != leaves[0] {
100 | 		t.Fatalf("method Query get wrong node")
101 | 	}
102 | }
103 | 


--------------------------------------------------------------------------------
/pkg/device/nvidia/tree_util.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package nvidia
19 | 
20 | import (
21 | 	"strings"
22 | 
23 | 	"tkestack.io/nvml"
24 | )
25 | 
26 | func parseToGpuTopologyLevel(str string) nvml.GpuTopologyLevel {
27 | 	// Exact link labels are matched first; the "GPU" prefix test comes last.
28 | 	switch {
29 | 	case str == "PIX":
30 | 		return nvml.TOPOLOGY_SINGLE
31 | 	case str == "PXB":
32 | 		return nvml.TOPOLOGY_MULTIPLE
33 | 	case str == "PHB":
34 | 		return nvml.TOPOLOGY_HOSTBRIDGE
35 | 	case str == "SOC":
36 | 		return nvml.TOPOLOGY_CPU
37 | 	case strings.HasPrefix(str, "GPU"):
38 | 		return nvml.TOPOLOGY_INTERNAL
39 | 	}
40 | 
41 | 	// Every other input maps to the unknown level.
42 | 	return nvml.TOPOLOGY_UNKNOWN
43 | }
44 | 


--------------------------------------------------------------------------------
/pkg/device/register/register.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package register
19 | 
20 | import (
21 | 	// Register test device
22 | 	_ "tkestack.io/gpu-manager/pkg/device/dummy"
23 | 	// Register nvidia device
24 | 	_ "tkestack.io/gpu-manager/pkg/device/nvidia"
25 | )
26 | 


--------------------------------------------------------------------------------
/pkg/device/types.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package device
19 | 
20 | import (
21 | 	"tkestack.io/gpu-manager/pkg/config"
22 | 
23 | 	"k8s.io/klog"
24 | )
25 | 
26 | // GPUTree is the interface a GPU tree structure must implement.
27 | type GPUTree interface {
28 | 	Init(input string)
29 | 	Update()
30 | }
31 | 
32 | // NewFunc is a constructor that builds a GPUTree from a *config.Config.
33 | type NewFunc func(cfg *config.Config) GPUTree
34 | 
35 | var (
36 | 	factory = make(map[string]NewFunc) // name -> constructor; written by Register, read by NewFuncForName
37 | )
38 | 
39 | // Register stores item under name so that it can be retrieved
40 | // by calling NewFuncForName() later.
41 | func Register(name string, item NewFunc) {
42 | 	if _, ok := factory[name]; ok {
43 | 		return // first registration wins; a duplicate name is ignored without any log or error
44 | 	}
45 | 
46 | 	klog.V(2).Infof("Register NewFunc with name %s", name)
47 | 
48 | 	factory[name] = item
49 | }
50 | 
51 | // NewFuncForName looks up the constructor registered under name.
52 | // It logs at verbosity 2 and returns nil when no entry exists.
53 | func NewFuncForName(name string) NewFunc {
54 | 	item, found := factory[name]
55 | 	if !found {
56 | 		klog.V(2).Infof("Can not find NewFunc with name %s", name)
57 | 		return nil
58 | 	}
59 | 
60 | 	return item
61 | }
62 | 


--------------------------------------------------------------------------------
/pkg/flags/flags.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package flags
19 | 
20 | import (
21 | 	goflag "flag"
22 | 	"strings"
23 | 
24 | 	"github.com/spf13/pflag"
25 | )
26 | 
27 | // WordSepNormalizeFunc rewrites every "_" in a flag name to "-".
28 | func WordSepNormalizeFunc(f *pflag.FlagSet, name string) pflag.NormalizedName {
29 | 	// strings.Replace hands back name unchanged when it holds no "_",
30 | 	// so a single unconditional call covers both cases.
31 | 	normalized := strings.Replace(name, "_", "-", -1)
32 | 	return pflag.NormalizedName(normalized)
33 | }
34 | 
35 | // InitFlags installs the flag-name normalizer, imports selected Go flags into pflag and parses the command line.
36 | func InitFlags() {
37 | 	pflag.CommandLine.SetNormalizeFunc(WordSepNormalizeFunc)
38 | 	// Only klog flags will be added; the parameter is named f so it does not shadow the goflag package alias.
39 | 	goflag.CommandLine.VisitAll(func(f *goflag.Flag) {
40 | 		switch f.Name {
41 | 		case "logtostderr", "alsologtostderr", "v", "stderrthreshold",
42 | 			"vmodule", "log_backtrace_at", "log_dir":
43 | 			pflag.CommandLine.AddGoFlag(f)
44 | 		}
45 | 	})
46 | 
47 | 	pflag.Parse()
48 | }
49 | 


--------------------------------------------------------------------------------
/pkg/logs/logs.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package logs
19 | 
20 | import (
21 | 	"log"
22 | 	"time"
23 | 
24 | 	"google.golang.org/grpc/grpclog"
25 | 	"k8s.io/klog"
26 | )
27 | 
28 | // klogWriter redirects writer-style (standard log) and grpclog-style logging calls into klog.
29 | type klogWriter struct{}
30 | 
31 | // Write implements io.Writer: data is logged via klog.Info and always reported as fully written.
32 | func (gw klogWriter) Write(data []byte) (n int, err error) {
33 | 	klog.Info(string(data))
34 | 	return len(data), nil
35 | }
36 | 
37 | // InitLogs routes the standard log package and grpclog through klog and
38 | // starts a background goroutine that flushes klog once per second.
39 | func InitLogs() {
40 | 	bridge := klogWriter{}
41 | 	log.SetFlags(0)
42 | 	log.SetOutput(bridge)
43 | 	grpclog.SetLogger(bridge)
44 | 	// The default klog flush interval is 30 seconds, which is frighteningly long.
45 | 	go func() {
46 | 		for range time.Tick(time.Second) {
47 | 			klog.Flush()
48 | 		}
49 | 	}()
50 | }
51 | 
52 | // FlushLogs calls klog.Flush to flush all pending log I/O.
53 | func FlushLogs() {
54 | 	klog.Flush()
55 | }
56 | 
57 | // Fatal logs args at fatal severity via klog.FatalDepth with a call depth of 1.
58 | func (gw klogWriter) Fatal(args ...interface{}) {
59 | 	klog.FatalDepth(1, args...)
60 | }
61 | 
62 | // Fatalf logs a formatted message at fatal severity via klog.Fatalf.
63 | func (gw klogWriter) Fatalf(format string, args ...interface{}) {
64 | 	klog.Fatalf(format, args...)
65 | }
66 | 
67 | // Fatalln logs args at fatal severity via klog.Fatalln.
68 | func (gw klogWriter) Fatalln(args ...interface{}) {
69 | 	klog.Fatalln(args...)
70 | }
71 | 
72 | // Print logs args at info severity via klog.InfoDepth with a call depth of 1.
73 | func (gw klogWriter) Print(args ...interface{}) {
74 | 	klog.InfoDepth(1, args...)
75 | }
76 | 
77 | // Printf logs a formatted info message via klog.V(2).Infof, i.e. only at verbosity 2 or higher.
78 | func (gw klogWriter) Printf(format string, args ...interface{}) {
79 | 	klog.V(2).Infof(format, args...)
80 | }
81 | 
82 | // Println logs args at info severity via klog.Info.
83 | func (gw klogWriter) Println(args ...interface{}) {
84 | 	klog.Info(args...)
85 | }
86 | 


--------------------------------------------------------------------------------
/pkg/runtime/runtime.go:
--------------------------------------------------------------------------------
  1 | package runtime
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"fmt"
  6 | 	"os"
  7 | 	"path/filepath"
  8 | 	"strconv"
  9 | 	"strings"
 10 | 	"time"
 11 | 
 12 | 	"golang.org/x/net/context"
 13 | 	"google.golang.org/grpc"
 14 | 	v1 "k8s.io/api/core/v1"
 15 | 	criapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
 16 | 	"k8s.io/klog"
 17 | 	"k8s.io/kubectl/pkg/util/qos"
 18 | 
 19 | 	"tkestack.io/gpu-manager/pkg/services/watchdog"
 20 | 	"tkestack.io/gpu-manager/pkg/types"
 21 | 	"tkestack.io/gpu-manager/pkg/utils"
 22 | 	"tkestack.io/gpu-manager/pkg/utils/cgroup"
 23 | )
 24 | 
 25 | type ContainerRuntimeInterface interface {
 26 | 	// GetPidsInContainers returns the process ids found for the given container id
 27 | 	GetPidsInContainers(containerID string) ([]int, error)
 28 | 	// InspectContainer returns the CRI status of the container with the given id
 29 | 	InspectContainer(containerID string) (*criapi.ContainerStatus, error)
 30 | 	// RuntimeName returns the container runtime name
 31 | 	RuntimeName() string
 32 | }
 33 | 
 34 | type containerRuntimeManager struct {
 35 | 	cgroupDriver   string                      // "systemd" or "cgroupfs"; selects how cgroup paths are rendered
 36 | 	runtimeName    string                      // runtime name reported by the CRI Version call
 37 | 	requestTimeout time.Duration               // timeout applied to each CRI request
 38 | 	client         criapi.RuntimeServiceClient // gRPC client of the CRI runtime service
 39 | }
 40 | 
 41 | var _ ContainerRuntimeInterface = (*containerRuntimeManager)(nil) // compile-time interface check
 42 | 
 43 | var (
 44 | 	containerRoot = cgroup.NewCgroupName([]string{}, "kubepods") // root cgroup name that pod cgroup names are built on
 45 | )
 46 | 
 47 | // NewContainerRuntimeManager dials the CRI endpoint (blocking, 5s timeout) and
 48 | // queries its version to learn the runtime name; on failure nothing is leaked.
 49 | func NewContainerRuntimeManager(cgroupDriver, endpoint string, requestTimeout time.Duration) (*containerRuntimeManager, error) {
 50 | 	dialOptions := []grpc.DialOption{grpc.WithInsecure(), grpc.WithDialer(utils.UnixDial), grpc.WithBlock(), grpc.WithTimeout(time.Second * 5)}
 51 | 	conn, err := grpc.Dial(endpoint, dialOptions...)
 52 | 	if err != nil {
 53 | 		return nil, err
 54 | 	}
 55 | 	client := criapi.NewRuntimeServiceClient(conn)
 56 | 	m := &containerRuntimeManager{
 57 | 		cgroupDriver:   cgroupDriver,
 58 | 		client:         client,
 59 | 		requestTimeout: requestTimeout,
 60 | 	}
 61 | 
 62 | 	ctx, cancel := context.WithTimeout(context.Background(), m.requestTimeout)
 63 | 	defer cancel()
 64 | 	resp, err := client.Version(ctx, &criapi.VersionRequest{Version: "0.1.0"})
 65 | 	if err != nil {
 66 | 		conn.Close() // the probe failed: release the connection instead of leaking it
 67 | 		return nil, err
 68 | 	}
 69 | 
 70 | 	klog.V(2).Infof("Container runtime is %s", resp.RuntimeName)
 71 | 	m.runtimeName = resp.RuntimeName
 72 | 	return m, nil
 73 | }
 74 | 
 75 | // GetPidsInContainers returns the pids listed in the cgroup procs files found
 76 | // under the container's cgroup directory. The owning pod is looked up through
 77 | // the pod namespace/name labels of the CRI container status.
 78 | func (m *containerRuntimeManager) GetPidsInContainers(containerID string) ([]int, error) {
 79 | 	ctx, cancel := context.WithTimeout(context.Background(), m.requestTimeout)
 80 | 	defer cancel()
 81 | 
 82 | 	resp, err := m.client.ContainerStatus(ctx, &criapi.ContainerStatusRequest{ContainerId: containerID})
 83 | 	if err != nil {
 84 | 		klog.Errorf("can't get container %s status, %v", containerID, err)
 85 | 		return nil, err
 86 | 	}
 87 | 	// A response without a status would panic on the label lookups below.
 88 | 	if resp.Status == nil {
 89 | 		err = fmt.Errorf("container %s has no status", containerID)
 90 | 		klog.Errorf("can't get container %s status, %v", containerID, err)
 91 | 		return nil, err
 92 | 	}
 93 | 
 94 | 	ns := resp.Status.Labels[types.PodNamespaceLabelKey]
 95 | 	podName := resp.Status.Labels[types.PodNameLabelKey]
 96 | 
 97 | 	pod, err := watchdog.GetPod(ns, podName)
 98 | 	if err != nil {
 99 | 		klog.Errorf("can't get pod %s/%s, %v", ns, podName, err)
100 | 		return nil, err
101 | 	}
102 | 
103 | 	cgroupPath, err := m.getCgroupName(pod, containerID)
104 | 	if err != nil {
105 | 		klog.Errorf("can't get cgroup parent, %v", err)
106 | 		return nil, err
107 | 	}
108 | 
109 | 	pids := make([]int, 0)
110 | 	baseDir := filepath.Clean(filepath.Join(types.CGROUP_BASE, cgroupPath))
111 | 	// Walk errors are not propagated; entries without file info are skipped.
112 | 	filepath.Walk(baseDir, func(path string, info os.FileInfo, err error) error {
113 | 		if info == nil || info.IsDir() || info.Name() != types.CGROUP_PROCS {
114 | 			return nil
115 | 		}
116 | 		if p, err := readProcsFile(path); err == nil {
117 | 			pids = append(pids, p...)
118 | 		}
119 | 		return nil
120 | 	})
121 | 
122 | 	return pids, nil
123 | }
124 | 
125 | // readProcsFile reads one pid per line from file, skipping lines that are not
126 | // integers. An open failure is logged and returned instead of being swallowed.
127 | func readProcsFile(file string) ([]int, error) {
128 | 	f, err := os.Open(file)
129 | 	if err != nil {
130 | 		klog.Errorf("can't read %s, %v", file, err)
131 | 		return nil, err
132 | 	}
133 | 	defer f.Close()
134 | 
135 | 	pids := make([]int, 0)
136 | 	scanner := bufio.NewScanner(f)
137 | 	for scanner.Scan() {
138 | 		if pid, err := strconv.Atoi(scanner.Text()); err == nil {
139 | 			pids = append(pids, pid)
140 | 		}
141 | 	}
142 | 	klog.V(4).Infof("Read from %s, pids: %v", file, pids)
143 | 	return pids, nil
144 | }
145 | 
146 | // getCgroupName builds the cgroup path of a container from the kubepods root,
147 | // an optional QoS level, the pod cgroup name and the container id, rendered
148 | // for the configured cgroup driver ("systemd" or "cgroupfs"; others fail).
149 | func (m *containerRuntimeManager) getCgroupName(pod *v1.Pod, containerID string) (string, error) {
150 | 	// Prefer the QoS class recorded in the pod status; compute it otherwise.
151 | 	qosClass := pod.Status.QOSClass
152 | 	if qosClass == "" {
153 | 		qosClass = qos.GetPodQOS(pod)
154 | 	}
155 | 
156 | 	var parent cgroup.CgroupName
157 | 	switch qosClass {
158 | 	case v1.PodQOSGuaranteed:
159 | 		parent = cgroup.NewCgroupName(containerRoot)
160 | 	case v1.PodQOSBurstable, v1.PodQOSBestEffort:
161 | 		// These classes get an extra level named after the lower-cased class.
162 | 		parent = cgroup.NewCgroupName(containerRoot, strings.ToLower(string(qosClass)))
163 | 	}
164 | 	podCgroup := cgroup.NewCgroupName(parent, types.PodCgroupNamePrefix+string(pod.UID))
165 | 
166 | 	if m.cgroupDriver == "systemd" {
167 | 		return fmt.Sprintf("%s/%s-%s.scope", podCgroup.ToSystemd(), cgroup.SystemdPathPrefixOfRuntime(m.runtimeName), containerID), nil
168 | 	}
169 | 	if m.cgroupDriver == "cgroupfs" {
170 | 		return fmt.Sprintf("%s/%s", podCgroup.ToCgroupfs(), containerID), nil
171 | 	}
172 | 
173 | 	return "", fmt.Errorf("unsupported cgroup driver")
174 | }
175 | 
176 | // InspectContainer asks the CRI runtime for the status of the container with
177 | // the given id, bounded by the manager's request timeout. On success the
178 | // Status field of the response is returned as-is.
179 | func (m *containerRuntimeManager) InspectContainer(containerID string) (*criapi.ContainerStatus, error) {
180 | 	ctx, cancel := context.WithTimeout(context.Background(), m.requestTimeout)
181 | 	defer cancel()
182 | 
183 | 	statusReq := &criapi.ContainerStatusRequest{ContainerId: containerID}
184 | 	resp, err := m.client.ContainerStatus(ctx, statusReq)
185 | 	if err != nil {
186 | 		return nil, err
187 | 	}
188 | 
189 | 	return resp.Status, nil
190 | }
191 | 
192 | func (m *containerRuntimeManager) RuntimeName() string { return m.runtimeName } // name reported by the CRI Version call
193 | 


--------------------------------------------------------------------------------
/pkg/runtime/runtime_stub.go:
--------------------------------------------------------------------------------
 1 | package runtime
 2 | 
 3 | import (
 4 | 	criapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
 5 | )
 6 | 
 7 | type containerRuntimeManagerStub struct { // stateless ContainerRuntimeInterface implementation
 8 | }
 9 | 
10 | var _ ContainerRuntimeInterface = (*containerRuntimeManagerStub)(nil) // compile-time interface check
11 | 
12 | // NewContainerRuntimeManagerStub returns a stub runtime manager that holds
13 | // no state and talks to no container runtime.
14 | func NewContainerRuntimeManagerStub() *containerRuntimeManagerStub { return &containerRuntimeManagerStub{} }
15 | 
16 | // GetPidsInContainers is a no-op in the stub: it reports no pids and no
17 | // error for any container id.
18 | func (m *containerRuntimeManagerStub) GetPidsInContainers(containerID string) ([]int, error) { return nil, nil }
19 | 
20 | // InspectContainer is a no-op in the stub: it returns a nil status and no
21 | // error for any container id.
22 | func (m *containerRuntimeManagerStub) InspectContainer(containerID string) (*criapi.ContainerStatus, error) { return nil, nil }
23 | 
24 | func (m *containerRuntimeManagerStub) RuntimeName() string { return "" } // the stub reports an empty runtime name
25 | 


--------------------------------------------------------------------------------
/pkg/server/server_test.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package server
 19 | 
 20 | import (
 21 | 	"context"
 22 | 	"flag"
 23 | 	"fmt"
 24 | 	"io/ioutil"
 25 | 	"net"
 26 | 	"os"
 27 | 	"path/filepath"
 28 | 	"reflect"
 29 | 	"strconv"
 30 | 	"sync"
 31 | 	"testing"
 32 | 	"time"
 33 | 
 34 | 	"tkestack.io/gpu-manager/cmd/manager/options"
 35 | 	"tkestack.io/gpu-manager/pkg/config"
 36 | 	deviceFactory "tkestack.io/gpu-manager/pkg/device"
 37 | 	"tkestack.io/gpu-manager/pkg/device/nvidia"
 38 | 	"tkestack.io/gpu-manager/pkg/runtime"
 39 | 	allocFactory "tkestack.io/gpu-manager/pkg/services/allocator"
 40 | 	"tkestack.io/gpu-manager/pkg/services/response"
 41 | 	virtual_manager "tkestack.io/gpu-manager/pkg/services/virtual-manager"
 42 | 	"tkestack.io/gpu-manager/pkg/services/watchdog"
 43 | 	"tkestack.io/gpu-manager/pkg/types"
 44 | 	"tkestack.io/gpu-manager/pkg/utils"
 45 | 
 46 | 	"github.com/pkg/errors"
 47 | 	"google.golang.org/grpc"
 48 | 	corev1 "k8s.io/api/core/v1"
 49 | 	"k8s.io/apimachinery/pkg/api/resource"
 50 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 51 | 	k8stypes "k8s.io/apimachinery/pkg/types"
 52 | 	"k8s.io/client-go/kubernetes/fake"
 53 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
 54 | )
 55 | 
 56 | func init() {
 57 | 	flag.Set("v", "4")              // set the "v" flag to 4 for the test run
 58 | 	flag.Set("logtostderr", "true") // set the "logtostderr" flag for the test run
 59 | }
 60 | 
 61 | type kubeletStub struct {
 62 | 	sync.Mutex                        // locked by Register while it updates pluginEndpoints
 63 | 	socket          string            // unix socket path the stub listens on
 64 | 	pluginEndpoints map[string]string // resource name -> endpoint, filled by Register
 65 | 	server          *grpc.Server      // gRPC server created by start
 66 | }
 67 | 
 68 | type podRawInfo struct {
 69 | 	Name       string             // pod name
 70 | 	UID        string             // pod UID
 71 | 	Containers []containerRawInfo // one entry per container of the pod
 72 | }
 73 | 
 74 | type containerRawInfo struct {
 75 | 	Name   string // container name
 76 | 	Cores  int    // value used as the container's vcore resource limit
 77 | 	Memory int    // value used as the container's vmemory resource limit
 78 | }
 79 | 
 80 | // newKubeletStub returns a kubeletStub for socket with an empty endpoint map.
 81 | func newKubeletStub(socket string) *kubeletStub {
 82 | 	stub := new(kubeletStub)
 83 | 	stub.socket = socket
 84 | 	stub.pluginEndpoints = map[string]string{}
 85 | 	return stub
 86 | }
 87 | 
 88 | // Register is a minimal deviceplugin.RegistrationServer implementation: it records r.Endpoint under r.ResourceName.
 89 | func (k *kubeletStub) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
 90 | 	k.Lock()
 91 | 	defer k.Unlock()
 92 | 	k.pluginEndpoints[r.ResourceName] = r.Endpoint
 93 | 	return &pluginapi.Empty{}, nil
 94 | }
 95 | 
 96 | // start removes any stale socket file, serves the registration service on
 97 | // the stub's unix socket in a goroutine and waits via utils.WaitForServer.
 98 | func (k *kubeletStub) start() error {
 99 | 	os.Remove(k.socket)
100 | 	listener, err := net.Listen("unix", k.socket)
101 | 	if err != nil {
102 | 		return errors.Wrap(err, "Can't listen at the socket")
103 | 	}
104 | 	grpcServer := grpc.NewServer()
105 | 	pluginapi.RegisterRegistrationServer(grpcServer, k)
106 | 	k.server = grpcServer
107 | 	go grpcServer.Serve(listener)
108 | 	// Wait till the grpcServer is ready to serve services.
109 | 	return utils.WaitForServer(k.socket)
110 | }
111 | 
112 | // stopServer stops every bundle server and srv.srv, then removes the virtual manager directory.
113 | func stopServer(srv *managerImpl) {
114 | 	for _, s := range srv.bundleServer {
115 | 		s.Stop()
116 | 	}
117 | 	srv.srv.Stop()
118 | 	os.RemoveAll(srv.config.VirtualManagerPath)
119 | }
120 | 
121 | func TestServer(t *testing.T) {
122 | 	flag.Parse()
123 | 	tempDir, _ := ioutil.TempDir("", "gpu-manager") // NOTE(review): the TempDir error is ignored
124 | 
125 | 	// initialize options and config
126 | 	opt := options.NewOptions()
127 | 	opt.VirtualManagerPath = filepath.Clean(filepath.Join(tempDir, "vm"))
128 | 	opt.DevicePluginPath = tempDir
129 | 	opt.EnableShare = true
130 | 	opt.HostnameOverride = "testnode"
131 | 	cfg := &config.Config{
132 | 		Driver:                opt.Driver,
133 | 		QueryPort:             opt.QueryPort,
134 | 		QueryAddr:             opt.QueryAddr,
135 | 		KubeConfig:            opt.KubeConfigFile,
136 | 		SamplePeriod:          time.Duration(opt.SamplePeriod) * time.Second,
137 | 		VCudaRequestsQueue:    make(chan *types.VCudaRequest, 10),
138 | 		DevicePluginPath:      opt.DevicePluginPath,
139 | 		VirtualManagerPath:    opt.VirtualManagerPath,
140 | 		VolumeConfigPath:      opt.VolumeConfigPath,
141 | 		EnableShare:           opt.EnableShare,
142 | 		Hostname:              opt.HostnameOverride,
143 | 		AllocationCheckPeriod: 5 * time.Second,
144 | 	}
145 | 
146 | 	defer func() {
147 | 		os.RemoveAll(tempDir)
148 | 	}()
149 | 
150 | 	// start the kubelet stub that the resource servers register with
151 | 	kubeletSocket := filepath.Join(cfg.DevicePluginPath, "kubelet.sock")
152 | 	kubelet := newKubeletStub(kubeletSocket)
153 | 	err := kubelet.start()
154 | 	if err != nil {
155 | 		t.Fatalf("%+v", err)
156 | 	}
157 | 	defer kubelet.server.Stop()
158 | 
159 | 	// create the manager with a stub runtime manager and a fake response manager
160 | 	srv, _ := NewManager(cfg).(*managerImpl)
161 | 	fakeRuntimeManager := runtime.NewContainerRuntimeManagerStub()
162 | 	srv.virtualManager = virtual_manager.NewVirtualManagerForTest(cfg, fakeRuntimeManager, response.NewFakeResponseManager())
163 | 	srv.virtualManager.Run()
164 | 	defer stopServer(srv)
165 | 
166 | 	treeInitFn := deviceFactory.NewFuncForName(cfg.Driver)
167 | 	obj := treeInitFn(cfg)
168 | 	tree, _ := obj.(*nvidia.NvidiaTree)
169 | 
170 | 	testCase1 :=
171 | 		`    GPU0    GPU1    GPU2    GPU3    GPU4    GPU5
172 | GPU0      X      PIX     PHB     PHB     SOC     SOC
173 | GPU1     PIX      X      PHB     PHB     SOC     SOC
174 | GPU2     PHB     PHB      X      PIX     SOC     SOC
175 | GPU3     PHB     PHB     PIX      X      SOC     SOC
176 | GPU4     SOC     SOC     SOC     SOC      X      PIX
177 | GPU5     SOC     SOC     SOC     SOC     PIX      X
178 | `
179 | 	tree.Init(testCase1)
180 | 	for _, n := range tree.Leaves() {
181 | 		n.AllocatableMeta.Cores = nvidia.HundredCore
182 | 		n.AllocatableMeta.Memory = 1024 * 1024 * 1024
183 | 		n.Meta.TotalMemory = 1024 * 1024 * 1024
184 | 	}
185 | 
186 | 	k8sClient := fake.NewSimpleClientset()
187 | 	watchdog.NewPodCacheForTest(k8sClient)
188 | 	initAllocator := allocFactory.NewFuncForName(cfg.Driver + "_test")
189 | 	srv.allocator = initAllocator(cfg, tree, k8sClient, response.NewFakeResponseManager())
190 | 	srv.setupGRPCService()
191 | 	srv.RegisterToKubelet()
192 | 	for _, rs := range srv.bundleServer {
193 | 		go rs.Run()
194 | 		if err := utils.WaitForServer(rs.SocketName()); err != nil {
195 | 			t.Fatalf("%s failed to start: %+v", rs.SocketName(), err)
196 | 		}
197 | 	}
198 | 
199 | 	// check that the bundle servers registered with the kubelet correctly
200 | 	expectEndpoints := make(map[string]string)
201 | 	expectEndpoints[types.VCoreAnnotation] = vcoreSocketName
202 | 	expectEndpoints[types.VMemoryAnnotation] = vmemorySocketName
203 | 	if !reflect.DeepEqual(expectEndpoints, kubelet.pluginEndpoints) { // NOTE(review): read without holding the stub's mutex
204 | 		t.Fatalf("register to kublet wrong, expect %v, got %v", expectEndpoints, kubelet.pluginEndpoints)
205 | 	}
206 | 
207 | 	// check that the vcore bundle server answers device plugin calls
208 | 	pluginSocket := filepath.Join(opt.DevicePluginPath, kubelet.pluginEndpoints[types.VCoreAnnotation])
209 | 	conn, err := grpc.Dial(pluginSocket, utils.DefaultDialOptions...)
210 | 	if err != nil {
211 | 		t.Fatalf("Failed to get connection: %+v", err)
212 | 	}
213 | 	defer conn.Close()
214 | 
215 | 	// pods that request GPU resources
216 | 	testCases := []podRawInfo{
217 | 		{
218 | 			Name: "pod-0",
219 | 			UID:  "uid-0",
220 | 			Containers: []containerRawInfo{
221 | 				{
222 | 					Name:   "container-0",
223 | 					Cores:  10,
224 | 					Memory: 1,
225 | 				},
226 | 				{
227 | 					Name:   "container-1",
228 | 					Cores:  10,
229 | 					Memory: 1,
230 | 				},
231 | 			},
232 | 		},
233 | 	}
234 | 	for _, cs := range testCases {
235 | 		containers := []corev1.Container{}
236 | 		for _, c := range cs.Containers {
237 | 			container := corev1.Container{
238 | 				Name: c.Name,
239 | 				Resources: corev1.ResourceRequirements{
240 | 					Limits: corev1.ResourceList{
241 | 						types.VCoreAnnotation:   resource.MustParse(fmt.Sprintf("%d", c.Cores)),
242 | 						types.VMemoryAnnotation: resource.MustParse(fmt.Sprintf("%d", c.Memory)),
243 | 					},
244 | 				},
245 | 			}
246 | 			containers = append(containers, container)
247 | 		}
248 | 		pod := &corev1.Pod{
249 | 			ObjectMeta: metav1.ObjectMeta{
250 | 				Name:        cs.Name,
251 | 				UID:         k8stypes.UID(cs.UID),
252 | 				Annotations: make(map[string]string),
253 | 			},
254 | 			Spec: corev1.PodSpec{
255 | 				Containers: containers,
256 | 			},
257 | 			Status: corev1.PodStatus{
258 | 				Phase: corev1.PodPending,
259 | 			},
260 | 		}
261 | 		pod.Annotations[types.PredicateTimeAnnotation] = "1"
262 | 		pod.Annotations[types.GPUAssigned] = "false"
263 | 		for i := range pod.Spec.Containers {
264 | 			pod.Annotations[types.PredicateGPUIndexPrefix+strconv.Itoa(i)] = "0"
265 | 		}
266 | 		pod, _ = k8sClient.CoreV1().Pods("test-ns").Create(pod) // NOTE(review): the Create error is ignored
267 | 
268 | 		// wait for podLister to sync
269 | 		time.Sleep(time.Second * 2)
270 | 
271 | 		client := pluginapi.NewDevicePluginClient(conn)
272 | 		for _, c := range pod.Spec.Containers {
273 | 			devicesIDs := []string{}
274 | 			vcore := c.Resources.Limits[types.VCoreAnnotation]
275 | 			for i := 0; i < int(vcore.Value()); i++ {
276 | 				devicesIDs = append(devicesIDs, types.VCoreAnnotation)
277 | 			}
278 | 			_, err = client.Allocate(context.Background(), &pluginapi.AllocateRequest{
279 | 				ContainerRequests: []*pluginapi.ContainerAllocateRequest{
280 | 					{
281 | 						DevicesIDs: devicesIDs,
282 | 					},
283 | 				},
284 | 			})
285 | 			if err != nil {
286 | 				t.Errorf("Failed to allocate for container %s due to %+v", c.Name, err)
287 | 			}
288 | 		}
289 | 	}
290 | }
291 | 


--------------------------------------------------------------------------------
/pkg/server/types.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package server
19 | 
20 | import (
21 | 	"google.golang.org/grpc"
22 | )
23 | 
24 | // Manager declares the operations of the server manager: Ready, Run and RegisterToKubelet.
25 | type Manager interface {
26 | 	Ready() bool
27 | 	Run() error
28 | 	RegisterToKubelet() error
29 | }
30 | 
31 | // ResourceServer is the per-resource server API used by the manager (implemented by the vcore and vmemory servers).
32 | type ResourceServer interface {
33 | 	Run() error
34 | 	Stop()
35 | 	SocketName() string
36 | 	ResourceName() string
37 | }
38 | 
39 | type resourceServerImpl struct {
40 | 	srv        *grpc.Server // gRPC server hosting the device plugin service
41 | 	socketFile string       // path of the unix socket the server listens on
42 | 
43 | 	mgr *managerImpl // manager that device plugin calls are delegated to
44 | }
45 | 


--------------------------------------------------------------------------------
/pkg/server/vcore.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package server
 19 | 
 20 | import (
 21 | 	"context"
 22 | 	"net"
 23 | 	"os"
 24 | 	"path/filepath"
 25 | 	"syscall"
 26 | 
 27 | 	"google.golang.org/grpc"
 28 | 	"k8s.io/klog"
 29 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
 30 | 
 31 | 	"tkestack.io/gpu-manager/pkg/types"
 32 | )
 33 | 
 34 | const (
 35 | 	vcoreSocketName = "vcore.sock" // socket file name, joined onto the device plugin path
 36 | )
 37 | 
 38 | type vcoreResourceServer struct { // device plugin server for the vcore resource
 39 | 	resourceServerImpl
 40 | }
 41 | 
 42 | var _ pluginapi.DevicePluginServer = &vcoreResourceServer{} // compile-time interface checks
 43 | var _ ResourceServer = &vcoreResourceServer{}
 44 | 
 45 | // newVcoreServer creates the vcore resource server for manager; its socket
 46 | // file lives in the manager's device plugin directory.
 47 | func newVcoreServer(manager *managerImpl) ResourceServer {
 48 | 	socketPath := filepath.Join(manager.config.DevicePluginPath, vcoreSocketName)
 49 | 
 50 | 	server := &vcoreResourceServer{}
 51 | 	server.srv = grpc.NewServer()
 52 | 	server.socketFile = socketPath
 53 | 	server.mgr = manager
 54 | 	return server
 55 | }
 56 | 
 57 | // SocketName returns the path of the unix socket file this server is
 58 | // configured with.
 59 | func (vr *vcoreResourceServer) SocketName() string { return vr.socketFile }
 60 | 
 61 | // ResourceName returns the resource served by this server, which is
 62 | // types.VCoreAnnotation.
 63 | func (vr *vcoreResourceServer) ResourceName() string { return types.VCoreAnnotation }
 64 | 
 65 | // Stop shuts the server down by calling Stop on the underlying gRPC
 66 | // server.
 67 | func (vr *vcoreResourceServer) Stop() { vr.srv.Stop() }
 68 | 
 69 | // Run registers the device plugin service, removes any stale socket file and
 70 | // serves on the unix socket; it returns whatever Serve returns.
 71 | func (vr *vcoreResourceServer) Run() error {
 72 | 	pluginapi.RegisterDevicePluginServer(vr.srv, vr)
 73 | 
 74 | 	// A missing socket file is fine; any other unlink failure aborts the run.
 75 | 	if err := syscall.Unlink(vr.socketFile); err != nil && !os.IsNotExist(err) {
 76 | 		return err
 77 | 	}
 78 | 
 79 | 	listener, err := net.Listen("unix", vr.socketFile)
 80 | 	if err != nil {
 81 | 		return err
 82 | 	}
 83 | 	klog.V(2).Infof("Server %s is ready at %s", types.VCoreAnnotation, vr.socketFile)
 84 | 	return vr.srv.Serve(listener)
 85 | }
 86 | 
 87 | // Allocate is part of the device plugin interface; it logs the request and delegates to the manager.
 88 | func (vr *vcoreResourceServer) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
 89 | 	klog.V(2).Infof("%+v allocation request for vcore", reqs)
 90 | 	return vr.mgr.Allocate(ctx, reqs)
 91 | }
 92 | 
 93 | func (vr *vcoreResourceServer) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
 94 | 	klog.V(2).Infof("ListAndWatch request for vcore")
 95 | 	return vr.mgr.ListAndWatchWithResourceName(types.VCoreAnnotation, e, s) // manager implementation, selected by resource name
 96 | }
 97 | 
 98 | func (vr *vcoreResourceServer) GetDevicePluginOptions(ctx context.Context, e *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
 99 | 	klog.V(2).Infof("GetDevicePluginOptions request for vcore")
100 | 	return vr.mgr.GetDevicePluginOptions(ctx, e) // delegated to the manager
101 | }
102 | 
103 | func (vr *vcoreResourceServer) PreStartContainer(ctx context.Context, req *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
104 | 	klog.V(2).Infof("PreStartContainer request for vcore")
105 | 	return vr.mgr.PreStartContainer(ctx, req) // delegated to the manager
106 | }
107 | 


--------------------------------------------------------------------------------
/pkg/server/vmemory.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package server
 19 | 
 20 | import (
 21 | 	"context"
 22 | 	"net"
 23 | 	"os"
 24 | 	"path/filepath"
 25 | 	"syscall"
 26 | 
 27 | 	"google.golang.org/grpc"
 28 | 	"k8s.io/klog"
 29 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
 30 | 
 31 | 	"tkestack.io/gpu-manager/pkg/types"
 32 | )
 33 | 
 34 | const (
 35 | 	vmemorySocketName = "vmemory.sock" // socket file name, joined onto the device plugin path
 36 | )
 37 | 
 38 | type vmemoryResourceServer struct { // device plugin server for the vmemory resource
 39 | 	resourceServerImpl
 40 | }
 41 | 
 42 | var _ pluginapi.DevicePluginServer = &vmemoryResourceServer{} // compile-time interface checks
 43 | var _ ResourceServer = &vmemoryResourceServer{}
 44 | 
 45 | // newVmemoryServer creates the vmemory resource server for manager; its
 46 | // socket file lives in the manager's device plugin directory.
 47 | func newVmemoryServer(manager *managerImpl) ResourceServer {
 48 | 	socketPath := filepath.Join(manager.config.DevicePluginPath, vmemorySocketName)
 49 | 	server := &vmemoryResourceServer{}
 50 | 	server.srv = grpc.NewServer()
 51 | 	server.socketFile = socketPath
 52 | 	server.mgr = manager
 53 | 	return server
 54 | }
 55 | 
 56 | // SocketName returns the path of the unix socket file this server is
 57 | // configured with.
 58 | func (vr *vmemoryResourceServer) SocketName() string { return vr.socketFile }
 59 | 
 60 | // ResourceName returns the resource served by this server, which is
 61 | // types.VMemoryAnnotation.
 62 | func (vr *vmemoryResourceServer) ResourceName() string { return types.VMemoryAnnotation }
 63 | 
 64 | // Stop shuts the server down by calling Stop on the underlying gRPC
 65 | // server.
 66 | func (vr *vmemoryResourceServer) Stop() { vr.srv.Stop() }
 67 | 
 68 | // Run registers the device plugin service, removes any stale socket file and
 69 | // serves on the unix socket; it returns whatever Serve returns.
 70 | func (vr *vmemoryResourceServer) Run() error {
 71 | 	pluginapi.RegisterDevicePluginServer(vr.srv, vr)
 72 | 
 73 | 	// A missing socket file is fine; any other unlink failure aborts the run.
 74 | 	if err := syscall.Unlink(vr.socketFile); err != nil && !os.IsNotExist(err) {
 75 | 		return err
 76 | 	}
 77 | 
 78 | 	listener, err := net.Listen("unix", vr.socketFile)
 79 | 	if err != nil {
 80 | 		return err
 81 | 	}
 82 | 	klog.V(2).Infof("Server %s is ready at %s", types.VMemoryAnnotation, vr.socketFile)
 83 | 	return vr.srv.Serve(listener)
 84 | }
 85 | 
 86 | // Allocate is part of the device plugin interface. For vmemory it only logs
 87 | // the request and answers with a single empty container response.
 88 | func (vr *vmemoryResourceServer) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
 89 | 	klog.V(2).Infof("%+v allocation request for vmemory", reqs)
 90 | 
 91 | 	placeholder := []*pluginapi.ContainerAllocateResponse{{}}
 92 | 	return &pluginapi.AllocateResponse{
 93 | 		ContainerResponses: placeholder,
 94 | 	}, nil
 95 | }
 96 | 
 97 | func (vr *vmemoryResourceServer) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
 98 | 	klog.V(2).Infof("ListAndWatch request for vmemory")
 99 | 	return vr.mgr.ListAndWatchWithResourceName(types.VMemoryAnnotation, e, s) // manager implementation, selected by resource name
100 | }
101 | 
102 | func (vr *vmemoryResourceServer) GetDevicePluginOptions(ctx context.Context, e *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
103 | 	klog.V(2).Infof("GetDevicePluginOptions request for vmemory")
104 | 	return &pluginapi.DevicePluginOptions{}, nil // always an empty options message; the manager is not consulted
105 | }
106 | 
107 | func (vr *vmemoryResourceServer) PreStartContainer(ctx context.Context, req *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
108 | 	klog.V(2).Infof("PreStartContainer request for vmemory")
109 | 	return &pluginapi.PreStartContainerResponse{}, nil // always an empty response; the manager is not consulted
110 | }
111 | 


--------------------------------------------------------------------------------
/pkg/services/allocator/cache/cache.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package cache
19 | 
20 | // Info contains information about the GPU resources recorded for one container.
21 | type Info struct {
22 | 	Devices []string // device entries recorded for the container
23 | 	Cores   int64    // core amount (unit is not visible in this file)
24 | 	Memory  int64    // memory amount (unit is not visible in this file)
25 | }
26 | 
27 | type containerToInfo map[string]*Info // container name -> Info
28 | 
29 | // PodCache maps pod UIDs to the per-container GPU info of that pod.
30 | type PodCache struct {
31 | 	PodGPUMapping map[string]containerToInfo // pod UID -> container name -> Info
32 | }
33 | 
34 | // NewAllocateCache returns an empty PodCache whose pod mapping is already
35 | // allocated.
36 | func NewAllocateCache() *PodCache {
37 | 	mapping := make(map[string]containerToInfo)
38 | 	return &PodCache{PodGPUMapping: mapping}
39 | }
40 | 
41 | // Pods returns the UIDs of all cached pods in unspecified order; never nil.
42 | func (pgpu *PodCache) Pods() []string {
43 | 	uids := make([]string, 0, len(pgpu.PodGPUMapping))
44 | 	for uid := range pgpu.PodGPUMapping {
45 | 		uids = append(uids, uid)
46 | 	}
47 | 	return uids
48 | }
49 | 
50 | // Insert stores cache as the GPU info of container contName in pod podUID,
51 | // replacing any previous entry. Missing maps (a nil PodGPUMapping or a nil
52 | // per-pod map) are created on demand instead of panicking.
53 | func (pgpu *PodCache) Insert(podUID, contName string, cache *Info) {
54 | 	if pgpu.PodGPUMapping == nil {
55 | 		pgpu.PodGPUMapping = make(map[string]containerToInfo)
56 | 	}
57 | 	if pgpu.PodGPUMapping[podUID] == nil {
58 | 		pgpu.PodGPUMapping[podUID] = make(containerToInfo)
59 | 	}
60 | 	pgpu.PodGPUMapping[podUID][contName] = cache
61 | }
57 | 
58 | // GetCache returns the container-name -> Info map recorded for podUID, or
59 | // nil when the pod is not in the cache. The returned map is the cache's own
60 | // map, not a copy.
61 | func (pgpu *PodCache) GetCache(podUID string) map[string]*Info {
62 | 	// Indexing a missing key yields the nil map, which is exactly the
63 | 	// "not found" result.
64 | 	containers := pgpu.PodGPUMapping[podUID]
65 | 	return containers
66 | }
67 | 
68 | // Delete removes the entry of pod uid from the cache; deleting an unknown uid is a no-op.
69 | func (pgpu *PodCache) Delete(uid string) {
70 | 	delete(pgpu.PodGPUMapping, uid)
71 | }
72 | 


--------------------------------------------------------------------------------
/pkg/services/allocator/checkpoint/manager.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package checkpoint
 19 | 
 20 | import (
 21 | 	"fmt"
 22 | 	"io/ioutil"
 23 | 	"os"
 24 | 	"path/filepath"
 25 | )
 26 | 
const (
	// Name prefix for the temporary files.
	// writeFile passes it to ioutil.TempFile as the file name pattern.
	tmpPrefix = "."
)
 31 | 
var (
	// ErrKeyNotFound is the error returned if key is not found in Store.
	// Manager.Read returns it when the checkpoint file does not exist.
	ErrKeyNotFound = fmt.Errorf("key is not found")
)
 36 | 
// Manager stores a checkpoint in a single file inside a base directory.
type Manager struct {
	// Absolute path to the base directory for storing checkpoint files.
	directoryPath string
	// File name of the storing checkpoint file.
	file string
}
 44 | 
 45 | // NewManager returns an instance of CheckpointManager.
 46 | func NewManager(path string, file string) (*Manager, error) {
 47 | 	if err := ensureDirectory(path); err != nil {
 48 | 		return nil, err
 49 | 	}
 50 | 
 51 | 	return &Manager{directoryPath: path, file: file}, nil
 52 | }
 53 | 
 54 | // Write writes the given checkpoint to file.
 55 | func (f *Manager) Write(data []byte) error {
 56 | 	if err := ensureDirectory(f.directoryPath); err != nil {
 57 | 		return err
 58 | 	}
 59 | 
 60 | 	return writeFile(f.getPathOfFile(), data)
 61 | }
 62 | 
 63 | // Read reads the checkpoint from the file.
 64 | func (f *Manager) Read() ([]byte, error) {
 65 | 	bytes, err := ioutil.ReadFile(f.getPathOfFile())
 66 | 	if os.IsNotExist(err) {
 67 | 		return bytes, ErrKeyNotFound
 68 | 	}
 69 | 	return bytes, err
 70 | }
 71 | 
// Delete deletes the checkpoint file. It delegates to removePath, which
// treats an already-missing file as success.
func (f *Manager) Delete() error {
	return removePath(f.getPathOfFile())
}
 76 | 
// getPathOfFile returns the full path of the checkpoint file, i.e. the
// configured file name joined onto the base directory.
func (f *Manager) getPathOfFile() string {
	return filepath.Join(f.directoryPath, f.file)
}
 81 | 
 82 | // ensureDirectory creates the directory if it does not exist.
 83 | func ensureDirectory(path string) error {
 84 | 	if _, err := os.Stat(path); err != nil {
 85 | 		// MkdirAll returns nil if directory already exists.
 86 | 		return os.MkdirAll(path, 0755)
 87 | 	}
 88 | 	return nil
 89 | }
 90 | 
 91 | // writeFile writes checkpoint to path in a single transaction.
 92 | func writeFile(path string, data []byte) (retErr error) {
 93 | 	// Create a temporary file in the base directory of `path` with a prefix.
 94 | 	tmpFile, err := ioutil.TempFile(filepath.Dir(path), tmpPrefix)
 95 | 	if err != nil {
 96 | 		return err
 97 | 	}
 98 | 
 99 | 	tmpPath := tmpFile.Name()
100 | 	shouldClose := true
101 | 
102 | 	defer func() {
103 | 		// Close the file.
104 | 		if shouldClose {
105 | 			if err := tmpFile.Close(); err != nil {
106 | 				if retErr == nil {
107 | 					retErr = fmt.Errorf("close error: %v", err)
108 | 				} else {
109 | 					retErr = fmt.Errorf("failed to close temp file after error %v; close error: %v", retErr, err)
110 | 				}
111 | 			}
112 | 		}
113 | 
114 | 		// Clean up the temp file on error.
115 | 		if retErr != nil && tmpPath != "" {
116 | 			if err := removePath(tmpPath); err != nil {
117 | 				retErr = fmt.Errorf("failed to remove the temporary file (%q) after error %v; remove error: %v", tmpPath, retErr, err)
118 | 			}
119 | 		}
120 | 	}()
121 | 
122 | 	// Write checkpoint.
123 | 	if _, err := tmpFile.Write(data); err != nil {
124 | 		return err
125 | 	}
126 | 
127 | 	// Sync file.
128 | 	if err := tmpFile.Sync(); err != nil {
129 | 		return err
130 | 	}
131 | 
132 | 	// Closing the file before renaming.
133 | 	err = tmpFile.Close()
134 | 	shouldClose = false
135 | 	if err != nil {
136 | 		return err
137 | 	}
138 | 
139 | 	return os.Rename(tmpPath, path)
140 | }
141 | 
142 | func removePath(path string) error {
143 | 	if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
144 | 		return err
145 | 	}
146 | 	return nil
147 | }
148 | 


--------------------------------------------------------------------------------
/pkg/services/allocator/dummy/allocator.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package dummy
 19 | 
 20 | import (
 21 | 	"context"
 22 | 	"fmt"
 23 | 	"time"
 24 | 
 25 | 	"tkestack.io/gpu-manager/pkg/config"
 26 | 	"tkestack.io/gpu-manager/pkg/device"
 27 | 	"tkestack.io/gpu-manager/pkg/services/response"
 28 | 
 29 | 	// Register test allocator controller
 30 | 	_ "tkestack.io/gpu-manager/pkg/device/dummy"
 31 | 	"tkestack.io/gpu-manager/pkg/services/allocator"
 32 | 
 33 | 	"k8s.io/client-go/kubernetes"
 34 | 	"k8s.io/klog"
 35 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
 36 | )
 37 | 
// init registers the dummy allocator constructor under the name "dummy" in the
// allocator factory.
func init() {
	allocator.Register("dummy", NewDummyAllocator)
}
 41 | 
// DummyAllocator is a stateless allocator.GPUTopoService implementation; it
// carries no fields.
type DummyAllocator struct {
}
 45 | 
// Compile-time check that *DummyAllocator implements allocator.GPUTopoService.
var _ allocator.GPUTopoService = &DummyAllocator{}
 47 | 
// NewDummyAllocator returns a new DummyAllocator. All four arguments are
// ignored; the signature exists to satisfy allocator.NewFunc.
func NewDummyAllocator(_ *config.Config, _ device.GPUTree, _ kubernetes.Interface, _ response.Manager) allocator.GPUTopoService {
	return &DummyAllocator{}
}
 52 | 
 53 | //Allocate returns /dev/fuse for dummy device
 54 | func (ta *DummyAllocator) Allocate(_ context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
 55 | 	resps := &pluginapi.AllocateResponse{}
 56 | 	for range reqs.ContainerRequests {
 57 | 		resps.ContainerResponses = append(resps.ContainerResponses, &pluginapi.ContainerAllocateResponse{
 58 | 			Devices: []*pluginapi.DeviceSpec{
 59 | 				{
 60 | 					// We use /dev/fuse for dummy device
 61 | 					ContainerPath: "/dev/fuse",
 62 | 					HostPath:      "/dev/fuse",
 63 | 					Permissions:   "mrw",
 64 | 				},
 65 | 			},
 66 | 		})
 67 | 	}
 68 | 
 69 | 	return resps, nil
 70 | }
 71 | 
// ListAndWatch is not implemented for the dummy allocator and always returns
// an error; ListAndWatchWithResourceName is the variant that sends devices.
func (ta *DummyAllocator) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
	return fmt.Errorf("not implement")
}
 76 | 
 77 | //ListAndWatchWithResourceName sends dummy device back to server
 78 | func (ta *DummyAllocator) ListAndWatchWithResourceName(resourceName string, e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
 79 | 	devs := []*pluginapi.Device{
 80 | 		{
 81 | 			ID:     fmt.Sprintf("dummy-%s-0", resourceName),
 82 | 			Health: pluginapi.Healthy,
 83 | 		},
 84 | 	}
 85 | 
 86 | 	s.Send(&pluginapi.ListAndWatchResponse{Devices: devs})
 87 | 
 88 | 	// We don't send unhealthy state
 89 | 	for {
 90 | 		time.Sleep(time.Second)
 91 | 	}
 92 | 
 93 | 	klog.V(2).Infof("ListAndWatch %s exit", resourceName)
 94 | 
 95 | 	return nil
 96 | }
 97 | 
// GetDevicePluginOptions returns an empty DevicePluginOptions and a nil error;
// both arguments are ignored.
func (ta *DummyAllocator) GetDevicePluginOptions(ctx context.Context, e *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
	return &pluginapi.DevicePluginOptions{}, nil
}
102 | 
// PreStartContainer returns an empty PreStartContainerResponse and a nil
// error; both arguments are ignored.
func (ta *DummyAllocator) PreStartContainer(ctx context.Context, req *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
	return &pluginapi.PreStartContainerResponse{}, nil
}
107 | 


--------------------------------------------------------------------------------
/pkg/services/allocator/nvidia/evaluator.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package nvidia
19 | 
20 | import (
21 | 	node "tkestack.io/gpu-manager/pkg/device/nvidia"
22 | )
23 | 
// Evaluator is the API implemented by schedule algorithms.
type Evaluator interface {
	// Evaluate takes a cores and a memory amount and returns a list of
	// NvidiaNode pointers. NOTE(review): presumably the nodes chosen to
	// satisfy the request — confirm against the implementations.
	Evaluate(cores int64, memory int64) []*node.NvidiaNode
}
28 | 


--------------------------------------------------------------------------------
/pkg/services/allocator/register/register.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package register
19 | 
20 | import (
21 | 	// Register test allocator
22 | 	_ "tkestack.io/gpu-manager/pkg/services/allocator/dummy"
23 | 	// Register nvidia allocator
24 | 	_ "tkestack.io/gpu-manager/pkg/services/allocator/nvidia"
25 | )
26 | 


--------------------------------------------------------------------------------
/pkg/services/allocator/types.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package allocator
19 | 
20 | import (
21 | 	"tkestack.io/gpu-manager/pkg/config"
22 | 	"tkestack.io/gpu-manager/pkg/device"
23 | 	"tkestack.io/gpu-manager/pkg/services/response"
24 | 
25 | 	"k8s.io/client-go/kubernetes"
26 | 	"k8s.io/klog"
27 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
28 | )
29 | 
// GPUTopoService is the server API for the GPU topology service: the standard
// device plugin server plus a ListAndWatch variant that receives a string
// (used as the resource name by the dummy allocator).
type GPUTopoService interface {
	pluginapi.DevicePluginServer
	ListAndWatchWithResourceName(string, *pluginapi.Empty, pluginapi.DevicePlugin_ListAndWatchServer) error
}
35 | 
// NewFunc represents a constructor function that creates a new GPUTopoService
// from a config, a GPU tree, a Kubernetes client and a response manager.
type NewFunc func(cfg *config.Config,
	tree device.GPUTree,
	k8sClient kubernetes.Interface,
	responseManager response.Manager) GPUTopoService
41 | 
var (
	// factory holds the registered constructors, keyed by allocator name.
	// It is written by Register and read by NewFuncForName.
	factory = make(map[string]NewFunc)
)
45 | 
46 | //Register stores NewFunc in factory
47 | func Register(name string, item NewFunc) {
48 | 	if _, ok := factory[name]; ok {
49 | 		return
50 | 	}
51 | 
52 | 	klog.V(2).Infof("Register NewFunc with name %s", name)
53 | 
54 | 	factory[name] = item
55 | }
56 | 
57 | //NewFuncForName tries to find NewFunc by name, return nil if not found
58 | func NewFuncForName(name string) NewFunc {
59 | 	if item, ok := factory[name]; ok {
60 | 		return item
61 | 	}
62 | 
63 | 	klog.V(2).Infof("Can not find NewFunc with name %s", name)
64 | 
65 | 	return nil
66 | }
67 | 


--------------------------------------------------------------------------------
/pkg/services/display/helper.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package display
19 | 
// containerToCgroup maps a container name to its cgroup string.
type containerToCgroup map[string]string
21 | 
// podGPUs represents a list of pod to GPU mappings.
type podGPUs struct {
	// podGPUMapping is keyed by pod UID; each value maps container name to
	// cgroup.
	podGPUMapping map[string]containerToCgroup
}
26 | 
27 | func newPodGPUs() *podGPUs {
28 | 	return &podGPUs{
29 | 		podGPUMapping: make(map[string]containerToCgroup),
30 | 	}
31 | }
32 | 
33 | func (pgpu *podGPUs) pods() []string {
34 | 	ret := make([]string, 0)
35 | 	for k := range pgpu.podGPUMapping {
36 | 		ret = append(ret, k)
37 | 	}
38 | 	return ret
39 | }
40 | 
41 | func (pgpu *podGPUs) insert(podUID, contName string, cgroup string) {
42 | 	if _, exists := pgpu.podGPUMapping[podUID]; !exists {
43 | 		pgpu.podGPUMapping[podUID] = make(containerToCgroup)
44 | 	}
45 | 	pgpu.podGPUMapping[podUID][contName] = cgroup
46 | }
47 | 
48 | func (pgpu *podGPUs) getCgroup(podUID, contName string) string {
49 | 	containers, exists := pgpu.podGPUMapping[podUID]
50 | 	if !exists {
51 | 		return ""
52 | 	}
53 | 	cgroup, exists := containers[contName]
54 | 	if !exists {
55 | 		return ""
56 | 	}
57 | 	return cgroup
58 | }
59 | 
60 | func (pgpu *podGPUs) delete(uid string) []string {
61 | 	var cgroups []string
62 | 
63 | 	for _, cont := range pgpu.podGPUMapping[uid] {
64 | 		cgroups = append(cgroups, cont)
65 | 	}
66 | 
67 | 	delete(pgpu.podGPUMapping, uid)
68 | 
69 | 	return cgroups
70 | }
71 | 


--------------------------------------------------------------------------------
/pkg/services/response/fake.go:
--------------------------------------------------------------------------------
 1 | package response
 2 | 
 3 | import (
 4 | 	"k8s.io/klog"
 5 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
 6 | )
 7 | 
// fakeResponseManager is an in-memory Manager implementation without locking
// and without file loading.
type fakeResponseManager struct {
	// data is keyed by pod UID; each value maps container name to response.
	data map[string]containerResponseDataMapping
}
11 | 
// Compile-time check that *fakeResponseManager implements Manager.
var _ Manager = (*fakeResponseManager)(nil)
13 | 
14 | func NewFakeResponseManager() *fakeResponseManager {
15 | 	return &fakeResponseManager{
16 | 		data: make(map[string]containerResponseDataMapping),
17 | 	}
18 | }
19 | 
// LoadFromFile is a no-op for the fake manager: path is ignored and nil is
// always returned.
func (m *fakeResponseManager) LoadFromFile(path string) error {
	return nil
}
23 | 
24 | func (m *fakeResponseManager) InsertResp(podUID, containerName string, allocResp *pluginapi.ContainerAllocateResponse) {
25 | 	podData, ok := m.data[podUID]
26 | 	if !ok {
27 | 		podData = make(containerResponseDataMapping)
28 | 		m.data[podUID] = podData
29 | 	}
30 | 
31 | 	podData[containerName] = allocResp
32 | 
33 | 	klog.V(2).Infof("Insert %s/%s allocResp", podUID, containerName)
34 | }
35 | 
36 | func (m *fakeResponseManager) DeleteResp(podUID string, containerName string) {
37 | 	podData, ok := m.data[podUID]
38 | 	if !ok {
39 | 		return
40 | 	}
41 | 
42 | 	_, ok = podData[containerName]
43 | 	if !ok {
44 | 		return
45 | 	}
46 | 
47 | 	klog.V(2).Infof("Delete %s/%s allocResp", podUID, containerName)
48 | 
49 | 	delete(podData, containerName)
50 | 
51 | 	if len(podData) == 0 {
52 | 		delete(m.data, podUID)
53 | 	}
54 | }
55 | 
56 | func (m *fakeResponseManager) GetResp(podUID string, containerName string) *pluginapi.ContainerAllocateResponse {
57 | 	podData, ok := m.data[podUID]
58 | 	if !ok {
59 | 		return nil
60 | 	}
61 | 
62 | 	resp, ok := podData[containerName]
63 | 	if !ok {
64 | 		return nil
65 | 	}
66 | 
67 | 	return resp
68 | }
69 | 
// ListAll returns the manager's internal data map itself (not a copy), so
// changes made by the caller are visible to the manager.
func (m *fakeResponseManager) ListAll() map[string]containerResponseDataMapping {
	return m.data
}
73 | 


--------------------------------------------------------------------------------
/pkg/services/response/manager.go:
--------------------------------------------------------------------------------
  1 | package response
  2 | 
  3 | import (
  4 | 	"sync"
  5 | 
  6 | 	"k8s.io/klog"
  7 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
  8 | 
  9 | 	"tkestack.io/gpu-manager/pkg/types"
 10 | 	"tkestack.io/gpu-manager/pkg/utils"
 11 | )
 12 | 
// Manager stores container allocate responses indexed by pod UID and
// container name.
type Manager interface {
	// InsertResp stores resp for the given pod/container pair.
	InsertResp(podUID, containerName string, resp *pluginapi.ContainerAllocateResponse)
	// DeleteResp removes the response of the given pod/container pair.
	DeleteResp(podUID, containerName string)
	// GetResp returns the stored response, or nil if there is none.
	GetResp(podUID, containerName string) *pluginapi.ContainerAllocateResponse
	// ListAll returns all stored responses keyed by pod UID.
	ListAll() map[string]containerResponseDataMapping
	// LoadFromFile populates the manager from the file at path.
	LoadFromFile(path string) error
}
 20 | 
// Compile-time check that *responseManager implements Manager.
var _ Manager = (*responseManager)(nil)
 22 | 
// responseManager is the mutex-guarded Manager implementation.
type responseManager struct {
	// l is locked by InsertResp, DeleteResp, GetResp and ListAll while they
	// access data.
	l sync.Mutex
	// data is keyed by pod UID; each value maps container name to response.
	data map[string]containerResponseDataMapping
}
 27 | 
// containerResponseDataMapping maps a container name to its allocate response.
type containerResponseDataMapping map[string]*pluginapi.ContainerAllocateResponse
 29 | 
 30 | func NewResponseManager() *responseManager {
 31 | 	return &responseManager{
 32 | 		data: make(map[string]containerResponseDataMapping),
 33 | 	}
 34 | }
 35 | 
 36 | func (m *responseManager) LoadFromFile(path string) error {
 37 | 	cp, err := utils.GetCheckpointData(path)
 38 | 	if err != nil {
 39 | 		return err
 40 | 	}
 41 | 
 42 | 	for _, item := range cp.PodDeviceEntries {
 43 | 		// Only vcore resource has valid response data
 44 | 		if item.ResourceName == types.VCoreAnnotation {
 45 | 			allocResp := &pluginapi.ContainerAllocateResponse{}
 46 | 			if err := allocResp.Unmarshal(item.AllocResp); err != nil {
 47 | 				return err
 48 | 			}
 49 | 
 50 | 			m.InsertResp(item.PodUID, item.ContainerName, allocResp)
 51 | 		}
 52 | 	}
 53 | 
 54 | 	return nil
 55 | }
 56 | 
 57 | func (m *responseManager) InsertResp(podUID, containerName string, allocResp *pluginapi.ContainerAllocateResponse) {
 58 | 	m.l.Lock()
 59 | 	defer m.l.Unlock()
 60 | 
 61 | 	podData, ok := m.data[podUID]
 62 | 	if !ok {
 63 | 		podData = make(containerResponseDataMapping)
 64 | 		m.data[podUID] = podData
 65 | 	}
 66 | 
 67 | 	podData[containerName] = allocResp
 68 | 
 69 | 	klog.V(2).Infof("Insert %s/%s allocResp", podUID, containerName)
 70 | }
 71 | 
 72 | func (m *responseManager) DeleteResp(podUID string, containerName string) {
 73 | 	m.l.Lock()
 74 | 	defer m.l.Unlock()
 75 | 
 76 | 	podData, ok := m.data[podUID]
 77 | 	if !ok {
 78 | 		return
 79 | 	}
 80 | 
 81 | 	_, ok = podData[containerName]
 82 | 	if !ok {
 83 | 		return
 84 | 	}
 85 | 
 86 | 	klog.V(2).Infof("Delete %s/%s allocResp", podUID, containerName)
 87 | 
 88 | 	delete(podData, containerName)
 89 | 
 90 | 	if len(podData) == 0 {
 91 | 		delete(m.data, podUID)
 92 | 	}
 93 | }
 94 | 
 95 | func (m *responseManager) GetResp(podUID string, containerName string) *pluginapi.ContainerAllocateResponse {
 96 | 	m.l.Lock()
 97 | 	defer m.l.Unlock()
 98 | 
 99 | 	podData, ok := m.data[podUID]
100 | 	if !ok {
101 | 		return nil
102 | 	}
103 | 
104 | 	resp, ok := podData[containerName]
105 | 	if !ok {
106 | 		return nil
107 | 	}
108 | 
109 | 	return resp
110 | }
111 | 
112 | func (m *responseManager) ListAll() map[string]containerResponseDataMapping {
113 | 	m.l.Lock()
114 | 	defer m.l.Unlock()
115 | 
116 | 	snapshot := make(map[string]containerResponseDataMapping)
117 | 	for uid, containerMapping := range m.data {
118 | 		podData, ok := snapshot[uid]
119 | 		if !ok {
120 | 			podData = make(containerResponseDataMapping)
121 | 			snapshot[uid] = podData
122 | 		}
123 | 
124 | 		for name, resp := range containerMapping {
125 | 			podData[name] = resp
126 | 		}
127 | 	}
128 | 
129 | 	return snapshot
130 | }
131 | 


--------------------------------------------------------------------------------
/pkg/services/volume/ldcache/ldcache.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package ldcache
 19 | 
 20 | import (
 21 | 	"bytes"
 22 | 	"encoding/binary"
 23 | 	"errors"
 24 | 	"os"
 25 | 	"path/filepath"
 26 | 	"syscall"
 27 | 	"unsafe"
 28 | )
 29 | 
// ldcachePath is the file that Open maps and parses.
const ldcachePath = "/etc/ld.so.cache"
 31 | 
// Magic strings checked by parse: magicString1 marks the old-format header
// that may precede the new one, while magicString2 and magicVersion must both
// match the new-format header.
const (
	magicString1 = "ld.so-1.7.0"
	magicString2 = "glibc-ld.so.cache"
	magicVersion = "1.1"
)
 37 | 
// Bit masks and values applied to an entry's Flags field by Lookup.
const (
	// The low byte carries the object type; Lookup keeps only ELF entries.
	flagTypeMask = 0x00ff
	flagTypeELF  = 0x0001

	// The second byte carries the architecture. Lookup puts X8664 and
	// Ppc64le entries in the 64-bit list and I386 and X32 entries in the
	// 32-bit list.
	flagArchMask    = 0xff00
	flagArchI386    = 0x0000
	flagArchX8664   = 0x0300
	flagArchX32     = 0x0800
	flagArchPpc64le = 0x0500
)
 48 | 
// ErrInvalidCache is returned by parse when the cache data is too short or
// its magic/version strings do not match.
var ErrInvalidCache = errors.New("invalid ld.so.cache file")
 50 | 
// Header1 is the old-format cache header; parse reads it when the data starts
// with magicString1.
type Header1 struct {
	Magic [len(magicString1) + 1]byte // include null delimiter
	// NLibs is the number of Entry1 records that follow; parse skips them.
	NLibs uint32
}
 55 | 
// Entry1 is an old-format cache entry. Only its size is used here, to skip
// over the old entries in parse.
type Entry1 struct {
	Flags      int32
	Key, Value uint32
}
 60 | 
// Header2 is the new-format cache header that parse decodes into
// LDCache.header.
type Header2 struct {
	// Magic and Version are compared against magicString2 and magicVersion.
	Magic   [len(magicString2)]byte
	Version [len(magicVersion)]byte
	// NLibs is the number of Entry2 records that follow the header.
	NLibs uint32
	// TableSize is decoded but not used in this file.
	TableSize uint32
	_         [3]uint32 // unused
	_         uint64    // force 8 byte alignment
}
 69 | 
// Entry2 is a new-format cache entry.
type Entry2 struct {
	// Flags is tested against the flagType*/flagArch* constants in Lookup.
	Flags int32
	// Key and Value are offsets into LDCache.libs; Lookup reads the library
	// name at Key and the null-terminated path at Value.
	Key, Value uint32
	// OSVersion and HWCap are decoded but not used in this file.
	OSVersion uint32
	HWCap     uint64
}
 76 | 
// LDCache is a parsed view of the memory-mapped ld.so.cache file.
type LDCache struct {
	// Reader reads from data; parse uses it to decode headers and entries.
	*bytes.Reader

	// data is the whole mmap'ed file (released by Close); libs is the tail of
	// data starting at the new-format header, against which entry offsets are
	// resolved.
	data, libs []byte
	header     Header2
	entries    []Entry2
}
 84 | 
 85 | func Open() (*LDCache, error) {
 86 | 	f, err := os.Open(ldcachePath)
 87 | 	if err != nil {
 88 | 		return nil, err
 89 | 	}
 90 | 	defer f.Close()
 91 | 
 92 | 	fi, err := f.Stat()
 93 | 	if err != nil {
 94 | 		return nil, err
 95 | 	}
 96 | 	d, err := syscall.Mmap(int(f.Fd()), 0, int(fi.Size()),
 97 | 		syscall.PROT_READ, syscall.MAP_PRIVATE)
 98 | 	if err != nil {
 99 | 		return nil, err
100 | 	}
101 | 
102 | 	cache := &LDCache{data: d, Reader: bytes.NewReader(d)}
103 | 	return cache, cache.parse()
104 | }
105 | 
// Close unmaps the cache data. The libs slice aliases the same mapping and
// must not be used afterwards.
func (c *LDCache) Close() error {
	return syscall.Munmap(c.data)
}
109 | 
// Magic returns the magic bytes of the parsed new-format header as a string.
func (c *LDCache) Magic() string {
	return string(c.header.Magic[:])
}
113 | 
// Version returns the version bytes of the parsed new-format header as a
// string.
func (c *LDCache) Version() string {
	return string(c.header.Version[:])
}
117 | 
// strn converts the first n bytes of b into a string. It panics, like any
// out-of-range slice expression, when n is negative or exceeds the capacity
// of b.
func strn(b []byte, n int) string {
	head := b[:n]
	return string(head)
}
121 | 
// parse decodes the mapped cache data into c.header and c.entries and sets
// c.libs.
//
// If the data starts with the old-format magic, the old header, its entries
// and the alignment padding after them are skipped first. The new-format
// header is then read from the current reader position, validated against
// magicString2/magicVersion, and followed by header.NLibs entries.
//
// It returns ErrInvalidCache when the data is not longer than an old-format
// header or when magic/version do not match, and otherwise any error from
// reading or seeking.
func (c *LDCache) parse() error {
	var header Header1

	// Check for the old format (< glibc-2.2)
	// Data no longer than a Header1 is rejected outright; this also makes the
	// strn call below safe.
	if c.Len() <= int(unsafe.Sizeof(header)) {
		return ErrInvalidCache
	}
	if strn(c.data, len(magicString1)) == magicString1 {
		if err := binary.Read(c, binary.LittleEndian, &header); err != nil {
			return err
		}
		n := int64(header.NLibs) * int64(unsafe.Sizeof(Entry1{}))
		// Whence 1 seeks relative to the current offset (io.SeekCurrent).
		offset, err := c.Seek(n, 1) // skip old entries
		if err != nil {
			return err
		}
		// Distance from offset up to the next multiple of Header2's alignment.
		n = (-offset) & int64(unsafe.Alignof(c.header)-1)
		_, err = c.Seek(n, 1) // skip padding
		if err != nil {
			return err
		}
	}

	// Size()-Len() is the reader's current offset, i.e. where the new-format
	// header begins.
	c.libs = c.data[c.Size()-int64(c.Len()):] // kv offsets start here
	if err := binary.Read(c, binary.LittleEndian, &c.header); err != nil {
		return err
	}
	if c.Magic() != magicString2 || c.Version() != magicVersion {
		return ErrInvalidCache
	}
	// NOTE(review): NLibs comes straight from the file and sizes this
	// allocation without an upper bound.
	c.entries = make([]Entry2, c.header.NLibs)
	return binary.Read(c, binary.LittleEndian, &c.entries)
}
155 | 
156 | func (c *LDCache) Lookup(libs ...string) (paths32, paths64 []string) {
157 | 	type void struct{}
158 | 	var paths *[]string
159 | 
160 | 	set := make(map[string]void)
161 | 	prefix := make([][]byte, len(libs))
162 | 
163 | 	for i := range libs {
164 | 		prefix[i] = []byte(libs[i])
165 | 	}
166 | 	for _, e := range c.entries {
167 | 		if ((e.Flags & flagTypeMask) & flagTypeELF) == 0 {
168 | 			continue
169 | 		}
170 | 		switch e.Flags & flagArchMask {
171 | 		case flagArchX8664:
172 | 			fallthrough
173 | 		case flagArchPpc64le:
174 | 			paths = &paths64
175 | 		case flagArchX32:
176 | 			fallthrough
177 | 		case flagArchI386:
178 | 			paths = &paths32
179 | 		default:
180 | 			continue
181 | 		}
182 | 		if e.Key > uint32(len(c.libs)) || e.Value > uint32(len(c.libs)) {
183 | 			continue
184 | 		}
185 | 		lib := c.libs[e.Key:]
186 | 		value := c.libs[e.Value:]
187 | 
188 | 		for _, p := range prefix {
189 | 			if bytes.HasPrefix(lib, p) {
190 | 				n := bytes.IndexByte(value, 0)
191 | 				if n < 0 {
192 | 					break
193 | 				}
194 | 				path, err := filepath.EvalSymlinks(strn(value, n))
195 | 				if err != nil {
196 | 					break
197 | 				}
198 | 				if _, ok := set[path]; ok {
199 | 					break
200 | 				}
201 | 				set[path] = void{}
202 | 				*paths = append(*paths, path)
203 | 				break
204 | 			}
205 | 		}
206 | 	}
207 | 	return
208 | }
209 | 


--------------------------------------------------------------------------------
/pkg/services/volume/util.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package volume
 19 | 
 20 | import (
 21 | 	"bufio"
 22 | 	"bytes"
 23 | 	"debug/elf"
 24 | 	"encoding/binary"
 25 | 	"io"
 26 | 	"os"
 27 | 	"os/exec"
 28 | 	"path"
 29 | 	"path/filepath"
 30 | 	"regexp"
 31 | 	"strings"
 32 | )
 33 | 
 34 | func which(bins ...string) ([]string, error) {
 35 | 	paths := make([]string, 0, len(bins))
 36 | 
 37 | 	out, _ := exec.Command("which", bins...).Output()
 38 | 	r := bufio.NewReader(bytes.NewBuffer(out))
 39 | 	for {
 40 | 		p, err := r.ReadString('\n')
 41 | 		if err == io.EOF {
 42 | 			break
 43 | 		}
 44 | 		if err != nil {
 45 | 			return nil, err
 46 | 		}
 47 | 		if p = strings.TrimSpace(p); !path.IsAbs(p) {
 48 | 			continue
 49 | 		}
 50 | 		realPath, err := filepath.EvalSymlinks(p)
 51 | 		if err != nil {
 52 | 			return nil, err
 53 | 		}
 54 | 		paths = append(paths, realPath)
 55 | 	}
 56 | 	return paths, nil
 57 | }
 58 | 
 59 | func clone(src, dst string) error {
 60 | 	// Prefer hard link, fallback to copy
 61 | 	err := os.Link(src, dst)
 62 | 	if err != nil {
 63 | 		err = fallbackCopy(src, dst)
 64 | 	}
 65 | 	return err
 66 | }
 67 | 
 68 | func fallbackCopy(src, dst string) error {
 69 | 	s, err := os.Open(src)
 70 | 	if err != nil {
 71 | 		return err
 72 | 	}
 73 | 	defer s.Close()
 74 | 
 75 | 	fi, err := s.Stat()
 76 | 	if err != nil {
 77 | 		return err
 78 | 	}
 79 | 
 80 | 	d, err := os.Create(dst)
 81 | 	if err != nil {
 82 | 		return err
 83 | 	}
 84 | 
 85 | 	if _, err := io.Copy(d, s); err != nil {
 86 | 		d.Close()
 87 | 		return err
 88 | 	}
 89 | 
 90 | 	if err := d.Chmod(fi.Mode()); err != nil {
 91 | 		d.Close()
 92 | 		return err
 93 | 	}
 94 | 
 95 | 	return d.Close()
 96 | }
 97 | 
 98 | func blacklisted(file string, obj *elf.File) (bool, error) {
 99 | 	lib := regexp.MustCompile(`^.*/lib([\w-]+)\.so[\d.]*$`)
100 | 	glcore := regexp.MustCompile(`libnvidia-e?glcore\.so`)
101 | 	gldispatch := regexp.MustCompile(`libGLdispatch\.so`)
102 | 
103 | 	if m := lib.FindStringSubmatch(file); m != nil {
104 | 		switch m[1] {
105 | 		// Blacklist EGL/OpenGL libraries issued by other vendors
106 | 		case "EGL":
107 | 			fallthrough
108 | 		case "GLESv1_CM":
109 | 			fallthrough
110 | 		case "GLESv2":
111 | 			fallthrough
112 | 		case "GL":
113 | 			deps, err := obj.DynString(elf.DT_NEEDED)
114 | 			if err != nil {
115 | 				return false, err
116 | 			}
117 | 			for _, d := range deps {
118 | 				if glcore.MatchString(d) || gldispatch.MatchString(d) {
119 | 					return false, nil
120 | 				}
121 | 			}
122 | 			return true, nil
123 | 
124 | 		// Blacklist TLS libraries using the old ABI (!= 2.3.99)
125 | 		case "nvidia-tls":
126 | 			const abi = 0x6300000003
127 | 			s, err := obj.Section(".note.ABI-tag").Data()
128 | 			if err != nil {
129 | 				return false, err
130 | 			}
131 | 			return binary.LittleEndian.Uint64(s[24:]) != abi, nil
132 | 		}
133 | 	}
134 | 	return false, nil
135 | }
136 | 


--------------------------------------------------------------------------------
/pkg/services/volume/volume.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package volume
 19 | 
 20 | import (
 21 | 	"debug/elf"
 22 | 	"encoding/json"
 23 | 	"fmt"
 24 | 	"os"
 25 | 	"path"
 26 | 	"path/filepath"
 27 | 	"strconv"
 28 | 	"strings"
 29 | 
 30 | 	"tkestack.io/gpu-manager/pkg/services/volume/ldcache"
 31 | 	"tkestack.io/gpu-manager/pkg/types"
 32 | 
 33 | 	"k8s.io/klog"
 34 | )
 35 | 
// VolumeManager manages volumes used by containers running GPU application.
type VolumeManager struct {
	// Config is decoded from the "volume" key of the JSON configuration file.
	Config  []Config `json:"volume,omitempty"`
	// cfgPath is the directory containing the configuration file
	// (filepath.Dir of the path given to NewVolumeManager).
	cfgPath string

	// cudaControlFile is the source path of the last mirrored file whose base
	// name starts with "libcuda-control.so"; empty when none was seen.
	cudaControlFile string
	// cudaSoname records the soname link paths of libcuda.so* that mirrorFiles
	// removed in share mode; key and value are the same path.
	cudaSoname      map[string]string
	// mlSoName is the libnvidia-ml.so* counterpart of cudaSoname.
	mlSoName        map[string]string
	// share enables replacing the recorded links with clones of cudaControlFile.
	share           bool
}
 46 | 
// components maps a component kind to the names to look up. Run handles the
// kinds "binaries" and "libraries"; any other key is ignored there.
type components map[string][]string

// Config contains volume details in config file.
type Config struct {
	// Name is the volume name; it is joined to BasePath to form the volume
	// directory. Run treats the name "nvidia" specially.
	Name       string     `json:"name,omitempty"`
	// Mode is decoded from "mode" but is not read anywhere in this file.
	Mode       string     `json:"mode,omitempty"`
	// Components lists what has to be mirrored into the volume.
	Components components `json:"components,omitempty"`
	// BasePath is the parent directory of the volume, decoded from "base".
	BasePath   string     `json:"base,omitempty"`
}
 56 | 
// Names of the sub-directories created inside a volume directory.
const (
	// binDir receives the files found for the "binaries" component.
	binDir   = "bin"
	// lib32Dir receives the 32-bit results of the library lookup.
	lib32Dir = "lib"
	// lib64Dir receives the 64-bit results of the library lookup.
	lib64Dir = "lib64"
)
 62 | 
// volumeDir is one sub-directory of a volume together with the source files
// that have to be mirrored into it.
type volumeDir struct {
	// name is the sub-directory name relative to the volume path.
	name  string
	// files are the source paths to mirror.
	files []string
}

// Volume contains directory and file info of volume.
type Volume struct {
	// Path is the volume directory (Config.BasePath joined with Config.Name).
	Path string
	// dirs are the sub-directories to populate below Path.
	dirs []volumeDir
}

// VolumeMap stores Volume for each type; Run keys it by Config.Name.
type VolumeMap map[string]*Volume
 76 | 
 77 | //NewVolumeManager returns a new VolumeManager
 78 | func NewVolumeManager(config string, share bool) (*VolumeManager, error) {
 79 | 	f, err := os.Open(config)
 80 | 	if err != nil {
 81 | 		return nil, err
 82 | 	}
 83 | 
 84 | 	defer f.Close()
 85 | 
 86 | 	volumeManager := &VolumeManager{
 87 | 		cfgPath:    filepath.Dir(config),
 88 | 		cudaSoname: make(map[string]string),
 89 | 		mlSoName:   make(map[string]string),
 90 | 		share:      share,
 91 | 	}
 92 | 
 93 | 	if err := json.NewDecoder(f).Decode(volumeManager); err != nil {
 94 | 		return nil, err
 95 | 	}
 96 | 
 97 | 	return volumeManager, nil
 98 | }
 99 | 
100 | //Run starts a VolumeManager
101 | func (vm *VolumeManager) Run() (err error) {
102 | 	cache, err := ldcache.Open()
103 | 	if err != nil {
104 | 		return err
105 | 	}
106 | 
107 | 	defer func() {
108 | 		if e := cache.Close(); err == nil {
109 | 			err = e
110 | 		}
111 | 	}()
112 | 
113 | 	vols := make(VolumeMap)
114 | 	for _, cfg := range vm.Config {
115 | 		vol := &Volume{
116 | 			Path: path.Join(cfg.BasePath, cfg.Name),
117 | 		}
118 | 
119 | 		if cfg.Name == "nvidia" {
120 | 			types.DriverLibraryPath = filepath.Join(cfg.BasePath, cfg.Name)
121 | 		} else {
122 | 			types.DriverOriginLibraryPath = filepath.Join(cfg.BasePath, cfg.Name)
123 | 		}
124 | 
125 | 		for t, c := range cfg.Components {
126 | 			switch t {
127 | 			case "binaries":
128 | 				bins, err := which(c...)
129 | 				if err != nil {
130 | 					return err
131 | 				}
132 | 
133 | 				klog.V(2).Infof("Find binaries: %+v", bins)
134 | 
135 | 				vol.dirs = append(vol.dirs, volumeDir{binDir, bins})
136 | 			case "libraries":
137 | 				libs32, libs64 := cache.Lookup(c...)
138 | 				klog.V(2).Infof("Find 32bit libraries: %+v", libs32)
139 | 				klog.V(2).Infof("Find 64bit libraries: %+v", libs64)
140 | 
141 | 				vol.dirs = append(vol.dirs, volumeDir{lib32Dir, libs32}, volumeDir{lib64Dir, libs64})
142 | 			}
143 | 
144 | 			vols[cfg.Name] = vol
145 | 		}
146 | 	}
147 | 
148 | 	if err := vm.mirror(vols); err != nil {
149 | 		return err
150 | 	}
151 | 
152 | 	klog.V(2).Infof("Volume manager is running")
153 | 
154 | 	return nil
155 | }
156 | 
157 | // #lizard forgives
158 | func (vm *VolumeManager) mirror(vols VolumeMap) error {
159 | 	for driver, vol := range vols {
160 | 		if exist, _ := vol.exist(); !exist {
161 | 			if err := os.MkdirAll(vol.Path, 0755); err != nil {
162 | 				return err
163 | 			}
164 | 		}
165 | 
166 | 		for _, d := range vol.dirs {
167 | 			vpath := path.Join(vol.Path, d.name)
168 | 			if err := os.MkdirAll(vpath, 0755); err != nil {
169 | 				return err
170 | 			}
171 | 
172 | 			// For each file matching the volume components (blacklist excluded), create a hardlink/copy
173 | 			// of it inside the volume directory. We also need to create soname symlinks similar to what
174 | 			// ldconfig does since our volume will only show up at runtime.
175 | 			for _, f := range d.files {
176 | 				klog.V(2).Infof("Mirror %s to %s", f, vpath)
177 | 				if err := vm.mirrorFiles(driver, vpath, f); err != nil {
178 | 					return err
179 | 				}
180 | 
181 | 				if strings.HasPrefix(path.Base(f), "libcuda.so") {
182 | 					driverStr := strings.SplitN(strings.TrimPrefix(path.Base(f), "libcuda.so."), ".", 2)
183 | 					types.DriverVersionMajor, _ = strconv.Atoi(driverStr[0])
184 | 					types.DriverVersionMinor, _ = strconv.Atoi(driverStr[1])
185 | 					klog.V(2).Infof("Driver version: %d.%d", types.DriverVersionMajor, types.DriverVersionMinor)
186 | 				}
187 | 
188 | 				if strings.HasPrefix(path.Base(f), "libcuda-control.so") {
189 | 					vm.cudaControlFile = f
190 | 				}
191 | 			}
192 | 		}
193 | 	}
194 | 
195 | 	vCudaFileFn := func(soFile string) error {
196 | 		if err := os.Remove(soFile); err != nil {
197 | 			if !os.IsNotExist(err) {
198 | 				return err
199 | 			}
200 | 		}
201 | 		if err := clone(vm.cudaControlFile, soFile); err != nil {
202 | 			return err
203 | 		}
204 | 
205 | 		klog.V(2).Infof("Vcuda %s to %s", vm.cudaControlFile, soFile)
206 | 
207 | 		l := strings.TrimRight(soFile, ".0123456789")
208 | 		if err := os.Remove(l); err != nil {
209 | 			if !os.IsNotExist(err) {
210 | 				return err
211 | 			}
212 | 		}
213 | 		if err := clone(vm.cudaControlFile, l); err != nil {
214 | 			return err
215 | 		}
216 | 		klog.V(2).Infof("Vcuda %s to %s", vm.cudaControlFile, l)
217 | 		return nil
218 | 	}
219 | 
220 | 	if vm.share && len(vm.cudaControlFile) > 0 {
221 | 		if len(vm.cudaSoname) > 0 {
222 | 			for _, f := range vm.cudaSoname {
223 | 				if err := vCudaFileFn(f); err != nil {
224 | 					return err
225 | 				}
226 | 			}
227 | 		}
228 | 
229 | 		if len(vm.mlSoName) > 0 {
230 | 			for _, f := range vm.mlSoName {
231 | 				if err := vCudaFileFn(f); err != nil {
232 | 					return err
233 | 				}
234 | 			}
235 | 		}
236 | 	}
237 | 
238 | 	return nil
239 | }
240 | 
241 | // #lizard forgives
242 | func (vm *VolumeManager) mirrorFiles(driver, vpath string, file string) error {
243 | 	obj, err := elf.Open(file)
244 | 	if err != nil {
245 | 		return fmt.Errorf("%s: %v", file, err)
246 | 	}
247 | 	defer obj.Close()
248 | 
249 | 	ok, err := blacklisted(file, obj)
250 | 	if err != nil {
251 | 		return fmt.Errorf("%s: %v", file, err)
252 | 	}
253 | 
254 | 	if ok {
255 | 		return nil
256 | 	}
257 | 
258 | 	l := path.Join(vpath, path.Base(file))
259 | 	if err := removeFile(l); err != nil {
260 | 		return err
261 | 	}
262 | 
263 | 	if err := clone(file, l); err != nil {
264 | 		return err
265 | 	}
266 | 
267 | 	soname, err := obj.DynString(elf.DT_SONAME)
268 | 	if err != nil {
269 | 		return fmt.Errorf("%s: %v", file, err)
270 | 	}
271 | 
272 | 	if len(soname) > 0 {
273 | 		l = path.Join(vpath, soname[0])
274 | 		if err := linkIfNotSameName(path.Base(file), l); err != nil && !os.IsExist(err) {
275 | 			return err
276 | 		}
277 | 
278 | 		// XXX Many applications (wrongly) assume that libcuda.so exists (e.g. with dlopen)
279 | 		// Hardcode the libcuda symlink for the time being.
280 | 		if strings.Contains(driver, "nvidia") {
281 | 			// Remove libcuda symbol link
282 | 			if vm.share && driver == "nvidia" && strings.HasPrefix(soname[0], "libcuda.so") {
283 | 				os.Remove(l)
284 | 				vm.cudaSoname[l] = l
285 | 			}
286 | 
287 | 			// Remove libnvidia-ml symbol link
288 | 			if vm.share && driver == "nvidia" && strings.HasPrefix(soname[0], "libnvidia-ml.so") {
289 | 				os.Remove(l)
290 | 				vm.mlSoName[l] = l
291 | 			}
292 | 
293 | 			// XXX GLVND requires this symlink for indirect GLX support
294 | 			// It won't be needed once we have an indirect GLX vendor neutral library.
295 | 			if strings.HasPrefix(soname[0], "libGLX_nvidia") {
296 | 				l = strings.Replace(l, "GLX_nvidia", "GLX_indirect", 1)
297 | 				if err := linkIfNotSameName(path.Base(file), l); err != nil && !os.IsExist(err) {
298 | 					return err
299 | 				}
300 | 			}
301 | 		}
302 | 	}
303 | 
304 | 	return nil
305 | }
306 | 
307 | func (v *Volume) exist() (bool, error) {
308 | 	_, err := os.Stat(v.Path)
309 | 	if os.IsNotExist(err) {
310 | 		return false, nil
311 | 	}
312 | 
313 | 	return true, err
314 | }
315 | 
316 | func (v *Volume) remove() error {
317 | 	return os.RemoveAll(v.Path)
318 | }
319 | 
320 | func removeFile(file string) error {
321 | 	if err := os.Remove(file); err != nil {
322 | 		if !os.IsNotExist(err) {
323 | 			return err
324 | 		}
325 | 	}
326 | 
327 | 	return nil
328 | }
329 | 
330 | func linkIfNotSameName(src, dst string) error {
331 | 	if path.Base(src) != path.Base(dst) {
332 | 		if err := removeFile(dst); err != nil {
333 | 			if !os.IsNotExist(err) {
334 | 				return err
335 | 			}
336 | 		}
337 | 
338 | 		l := strings.TrimRight(dst, ".0123456789")
339 | 		if err := removeFile(l); err != nil {
340 | 			if !os.IsExist(err) {
341 | 				return err
342 | 			}
343 | 		}
344 | 
345 | 		if err := os.Symlink(src, l); err != nil && !os.IsExist(err) {
346 | 			return err
347 | 		}
348 | 
349 | 		if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) {
350 | 			return err
351 | 		}
352 | 	}
353 | 
354 | 	return nil
355 | }
356 | 


--------------------------------------------------------------------------------
/pkg/services/watchdog/label.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package watchdog
 19 | 
 20 | import (
 21 | 	"os"
 22 | 	"regexp"
 23 | 	"time"
 24 | 
 25 | 	"k8s.io/apimachinery/pkg/api/errors"
 26 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 27 | 	"k8s.io/apimachinery/pkg/util/wait"
 28 | 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
 29 | 	"k8s.io/klog"
 30 | 	"tkestack.io/nvml"
 31 | )
 32 | 
const (
	// gpuModelLabel is the label key whose value NewNodeLabeler derives from
	// the GPU device name instead of taking it from the supplied label map.
	gpuModelLabel = "gaia.tencent.com/gpu-model"
)
 36 | 
// labelFunc produces the value of a node label.
type labelFunc interface {
	// GetLabel returns the label value; nodeLabeler.Run skips empty values.
	GetLabel() string
}

// nodeLabeler applies a set of labels to one node.
type nodeLabeler struct {
	// hostName is the name of the node to label.
	hostName    string
	// client is used to get and update the node object.
	client      v1core.CoreV1Interface
	// labelMapper maps a label key to the producer of its value.
	labelMapper map[string]labelFunc
}

// modelFunc is the labelFunc that queries the GPU model through nvml.
type modelFunc struct{}
// stringFunc is a labelFunc that returns a fixed string.
type stringFunc string

// modelFn is the shared modelFunc instance used for gpuModelLabel.
var modelFn = modelFunc{}
 51 | 
 52 | func (m modelFunc) GetLabel() (model string) {
 53 | 	if err := nvml.Init(); err != nil {
 54 | 		klog.Warningf("Can't initialize nvml library, %v", err)
 55 | 		return
 56 | 	}
 57 | 
 58 | 	defer nvml.Shutdown()
 59 | 
 60 | 	// Assume all devices on this node are the same model
 61 | 	dev, err := nvml.DeviceGetHandleByIndex(0)
 62 | 	if err != nil {
 63 | 		klog.Warningf("Can't get device 0 information, %v", err)
 64 | 		return
 65 | 	}
 66 | 
 67 | 	rawName, err := dev.DeviceGetName()
 68 | 	if err != nil {
 69 | 		klog.Warningf("Can't get device name, %v", err)
 70 | 		return
 71 | 	}
 72 | 
 73 | 	klog.V(4).Infof("GPU name: %s", rawName)
 74 | 
 75 | 	return getTypeName(rawName)
 76 | }
 77 | 
 78 | func (s stringFunc) GetLabel() string {
 79 | 	return string(s)
 80 | }
 81 | 
 82 | var modelNameSplitPattern = regexp.MustCompile("\\s+")
 83 | 
 84 | func getTypeName(name string) string {
 85 | 	splits := modelNameSplitPattern.Split(name, -1)
 86 | 
 87 | 	if len(splits) > 2 {
 88 | 		return splits[1]
 89 | 	}
 90 | 
 91 | 	klog.V(4).Infof("GPU name splits: %v", splits)
 92 | 
 93 | 	return ""
 94 | }
 95 | 
 96 | //NewNodeLabeler returns a new nodeLabeler
 97 | func NewNodeLabeler(client v1core.CoreV1Interface, hostname string, labels map[string]string) *nodeLabeler {
 98 | 	if len(hostname) == 0 {
 99 | 		hostname, _ = os.Hostname()
100 | 	}
101 | 
102 | 	klog.V(2).Infof("Labeler for hostname %s", hostname)
103 | 
104 | 	labelMapper := make(map[string]labelFunc)
105 | 	for k, v := range labels {
106 | 		if k == gpuModelLabel {
107 | 			labelMapper[k] = modelFn
108 | 		} else {
109 | 			labelMapper[k] = stringFunc(v)
110 | 		}
111 | 	}
112 | 
113 | 	return &nodeLabeler{
114 | 		hostName:    hostname,
115 | 		client:      client,
116 | 		labelMapper: labelMapper,
117 | 	}
118 | }
119 | 
120 | func (nl *nodeLabeler) Run() error {
121 | 	err := wait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
122 | 		node, err := nl.client.Nodes().Get(nl.hostName, metav1.GetOptions{})
123 | 		if err != nil {
124 | 			return false, err
125 | 		}
126 | 
127 | 		for k, fn := range nl.labelMapper {
128 | 			l := fn.GetLabel()
129 | 			if len(l) == 0 {
130 | 				klog.Warningf("Empty label for %s", k)
131 | 				continue
132 | 			}
133 | 
134 | 			klog.V(2).Infof("Label %s %s=%s", nl.hostName, k, l)
135 | 			node.Labels[k] = l
136 | 		}
137 | 
138 | 		_, updateErr := nl.client.Nodes().Update(node)
139 | 		if updateErr != nil {
140 | 			if errors.IsConflict(updateErr) {
141 | 				return false, nil
142 | 			}
143 | 			return true, updateErr
144 | 		}
145 | 
146 | 		return true, nil
147 | 	})
148 | 
149 | 	if err != nil {
150 | 		return err
151 | 	}
152 | 
153 | 	klog.V(2).Infof("Auto label is running")
154 | 
155 | 	return nil
156 | }
157 | 


--------------------------------------------------------------------------------
/pkg/services/watchdog/label_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package watchdog
19 | 
20 | import (
21 | 	"flag"
22 | 	"testing"
23 | 	"time"
24 | 
25 | 	"k8s.io/api/core/v1"
26 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27 | 	"k8s.io/apimachinery/pkg/util/wait"
28 | 	"k8s.io/client-go/kubernetes/fake"
29 | )
30 | 
// init raises the log verbosity and sends logs to stderr for the tests.
// Errors from flag.Set are deliberately discarded.
func init() {
	_ = flag.Set("v", "4")
	_ = flag.Set("logtostderr", "true")
}
35 | 
36 | func TestNodeLabeler(t *testing.T) {
37 | 	flag.Parse()
38 | 	nodeName := "testnode"
39 | 	testKey := "testkey"
40 | 	testValue := "testvalue"
41 | 	labels := make(map[string]string)
42 | 	labels[testKey] = testValue
43 | 
44 | 	// create node with fake client
45 | 	k8sclient := fake.NewSimpleClientset()
46 | 	node := &v1.Node{
47 | 		ObjectMeta: metav1.ObjectMeta{
48 | 			Name:   nodeName,
49 | 			Labels: make(map[string]string),
50 | 		},
51 | 	}
52 | 	k8sclient.CoreV1().Nodes().Create(node)
53 | 
54 | 	// create nodeLabeler and run
55 | 	nodeLabeler := NewNodeLabeler(k8sclient.CoreV1(), nodeName, labels)
56 | 	go nodeLabeler.Run()
57 | 
58 | 	// check if nodeLabeler work well
59 | 	err := wait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
60 | 		node, err := k8sclient.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
61 | 		if err != nil {
62 | 			return false, err
63 | 		}
64 | 		if v, ok := node.Labels[testKey]; !ok || v != testValue {
65 | 			return false, nil
66 | 		}
67 | 		return true, nil
68 | 	})
69 | 	if err != nil {
70 | 		t.Fatalf("test failed: %s", err.Error())
71 | 	}
72 | }
73 | 


--------------------------------------------------------------------------------
/pkg/services/watchdog/watchdog.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package watchdog
 19 | 
 20 | import (
 21 | 	"fmt"
 22 | 	"time"
 23 | 
 24 | 	"tkestack.io/gpu-manager/pkg/utils"
 25 | 
 26 | 	"k8s.io/api/core/v1"
 27 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 28 | 	"k8s.io/apimachinery/pkg/fields"
 29 | 	"k8s.io/client-go/informers"
 30 | 	informerCore "k8s.io/client-go/informers/core/v1"
 31 | 	"k8s.io/client-go/kubernetes"
 32 | 	"k8s.io/klog"
 33 | )
 34 | 
const (
	// podHostField is the field name used in the field selector that
	// restricts the pod informer to one host.
	podHostField = "spec.nodeName"
)
 38 | 
// PodCache contains a podInformer of pod.
type PodCache struct {
	// podInformer backs both the store listing and the lister lookups.
	podInformer informerCore.PodInformer
}

var (
	// podCache is the package-wide cache, set by NewPodCache or
	// NewPodCacheForTest; it is nil until one of them has been called.
	podCache *PodCache
)
 47 | 
 48 | //NewPodCache creates a new podCache
 49 | func NewPodCache(client kubernetes.Interface, hostName string) {
 50 | 	podCache = new(PodCache)
 51 | 
 52 | 	factory := informers.NewSharedInformerFactoryWithOptions(client, time.Minute,
 53 | 		informers.WithTweakListOptions(func(options *metav1.ListOptions) {
 54 | 			options.FieldSelector = fields.OneTermEqualSelector(podHostField, hostName).String()
 55 | 		}))
 56 | 	podCache.podInformer = factory.Core().V1().Pods()
 57 | 
 58 | 	ch := make(chan struct{})
 59 | 	go podCache.podInformer.Informer().Run(ch)
 60 | 
 61 | 	for !podCache.podInformer.Informer().HasSynced() {
 62 | 		time.Sleep(time.Second)
 63 | 	}
 64 | 	klog.V(2).Infof("Pod cache is running")
 65 | }
 66 | 
 67 | //NewPodCacheForTest creates a new podCache for testing
 68 | func NewPodCacheForTest(client kubernetes.Interface) {
 69 | 	podCache = new(PodCache)
 70 | 
 71 | 	informers := informers.NewSharedInformerFactory(client, 0)
 72 | 	podCache.podInformer = informers.Core().V1().Pods()
 73 | 	podCache.podInformer.Informer().AddEventHandler(podCache)
 74 | 	ch := make(chan struct{})
 75 | 	informers.Start(ch)
 76 | 
 77 | 	for !podCache.podInformer.Informer().HasSynced() {
 78 | 		time.Sleep(time.Second)
 79 | 	}
 80 | 	klog.V(2).Infof("Pod cache is running")
 81 | }
 82 | 
// OnAdd is a callback function for podInformer, do nothing for now.
// PodCache is registered as event handler in NewPodCacheForTest.
func (p *PodCache) OnAdd(obj interface{}) {}

// OnUpdate is a callback function for podInformer, do nothing for now.
func (p *PodCache) OnUpdate(oldObj, newObj interface{}) {}

// OnDelete is a callback function for podInformer, do nothing for now.
func (p *PodCache) OnDelete(obj interface{}) {}
 91 | 
 92 | //GetActivePods get all active pods from podCache and returns them.
 93 | func GetActivePods() map[string]*v1.Pod {
 94 | 	if podCache == nil {
 95 | 		return nil
 96 | 	}
 97 | 
 98 | 	activePods := make(map[string]*v1.Pod)
 99 | 
100 | 	for _, item := range podCache.podInformer.Informer().GetStore().List() {
101 | 		pod, ok := item.(*v1.Pod)
102 | 		if !ok {
103 | 			continue
104 | 		}
105 | 
106 | 		if podIsTerminated(pod) {
107 | 			continue
108 | 		}
109 | 
110 | 		if !utils.IsGPURequiredPod(pod) {
111 | 			continue
112 | 		}
113 | 
114 | 		activePods[string(pod.UID)] = pod
115 | 	}
116 | 
117 | 	return activePods
118 | }
119 | 
120 | func GetPod(namespace, name string) (*v1.Pod, error) {
121 | 	pod, err := podCache.podInformer.Lister().Pods(namespace).Get(name)
122 | 	if err != nil {
123 | 		return nil, err
124 | 	}
125 | 
126 | 	if podIsTerminated(pod) {
127 | 		return nil, fmt.Errorf("terminated pod")
128 | 	}
129 | 
130 | 	if !utils.IsGPURequiredPod(pod) {
131 | 		return nil, fmt.Errorf("no gpu pod")
132 | 	}
133 | 
134 | 	return pod, nil
135 | }
136 | 
137 | func podIsTerminated(pod *v1.Pod) bool {
138 | 	return pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(pod.Status.ContainerStatuses))
139 | }
140 | 
141 | // notRunning returns true if every status is terminated or waiting, or the status list
142 | // is empty.
143 | func notRunning(statuses []v1.ContainerStatus) bool {
144 | 	for _, status := range statuses {
145 | 		if status.State.Terminated == nil && status.State.Waiting == nil {
146 | 			return false
147 | 		}
148 | 	}
149 | 	return true
150 | }
151 | 


--------------------------------------------------------------------------------
/pkg/services/watchdog/watchdog_test.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package watchdog
19 | 
20 | import (
21 | 	"flag"
22 | 	"fmt"
23 | 	"testing"
24 | 	"time"
25 | 
26 | 	"tkestack.io/gpu-manager/pkg/types"
27 | 
28 | 	"k8s.io/api/core/v1"
29 | 	"k8s.io/apimachinery/pkg/api/resource"
30 | 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
31 | 	k8stypes "k8s.io/apimachinery/pkg/types"
32 | 	"k8s.io/apimachinery/pkg/util/wait"
33 | 	"k8s.io/client-go/kubernetes/fake"
34 | )
35 | 
// init raises the log verbosity and sends logs to stderr for the tests.
// Errors from flag.Set are deliberately discarded.
func init() {
	_ = flag.Set("v", "4")
	_ = flag.Set("logtostderr", "true")
}
40 | 
41 | func TestWatchdog(t *testing.T) {
42 | 	flag.Parse()
43 | 	podName := "testpod"
44 | 	podUID := "testuid"
45 | 	ns := "test-ns"
46 | 	containerName := "test-container"
47 | 	// create pod with fake client
48 | 	k8sclient := fake.NewSimpleClientset()
49 | 	pod := &v1.Pod{
50 | 		ObjectMeta: metav1.ObjectMeta{
51 | 			Name: podName,
52 | 			UID:  k8stypes.UID(podUID),
53 | 		},
54 | 		Spec: v1.PodSpec{Containers: []v1.Container{
55 | 			{
56 | 				Name: containerName,
57 | 				Resources: v1.ResourceRequirements{
58 | 					Limits: v1.ResourceList{
59 | 						types.VCoreAnnotation:   resource.MustParse(fmt.Sprintf("%d", 1)),
60 | 						types.VMemoryAnnotation: resource.MustParse(fmt.Sprintf("%d", 1)),
61 | 					},
62 | 				},
63 | 			},
64 | 		}},
65 | 		Status: v1.PodStatus{Phase: v1.PodRunning},
66 | 	}
67 | 	k8sclient.CoreV1().Pods(ns).Create(pod)
68 | 
69 | 	// create watchdog and run
70 | 	NewPodCacheForTest(k8sclient)
71 | 
72 | 	// check if watchdog work well
73 | 	err := wait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
74 | 		activepods := GetActivePods()
75 | 		if v, ok := activepods[podUID]; !ok || v.Name != podName {
76 | 			t.Logf("can't find pod %s", podName)
77 | 			return false, nil
78 | 		}
79 | 		return true, nil
80 | 	})
81 | 	if err != nil {
82 | 		t.Fatalf("test failed: %s", err.Error())
83 | 	}
84 | }
85 | 


--------------------------------------------------------------------------------
/pkg/types/types.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package types
 19 | 
 20 | import (
 21 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
 22 | )
 23 | 
const (
	// Keys in the tencent.com domain. NOTE(review): despite the "Annotation"
	// suffix, VCoreAnnotation and VMemoryAnnotation are also used as container
	// resource names in the watchdog test; confirm the intended use of the
	// others against their callers.
	VDeviceAnnotation       = "tencent.com/vcuda-device"
	VCoreAnnotation         = "tencent.com/vcuda-core"
	VCoreLimitAnnotation    = "tencent.com/vcuda-core-limit"
	VMemoryAnnotation       = "tencent.com/vcuda-memory"
	PredicateTimeAnnotation = "tencent.com/predicate-time"
	// PredicateGPUIndexPrefix is a key prefix; presumably an index is
	// appended to it - TODO confirm.
	PredicateGPUIndexPrefix = "tencent.com/predicate-gpu-idx-"
	GPUAssigned             = "tencent.com/gpu-assigned"
	ClusterNameAnnotation   = "clusterName"

	// VCUDA_MOUNTPOINT is a fixed path; presumably where vcuda data is
	// mounted - TODO confirm.
	VCUDA_MOUNTPOINT = "/etc/vcuda"

	// MemoryBlockSize is 268435456 = 256 * 1024 * 1024, i.e. 256MB.
	MemoryBlockSize = 268435456

	// Socket and checkpoint file names, followed by message/type strings for
	// pre-start and admission failures.
	KubeletSocket                 = "kubelet.sock"
	VDeviceSocket                 = "vcuda.sock"
	CheckPointFileName            = "kubelet_internal_checkpoint"
	PreStartContainerCheckErrMsg  = "PreStartContainer check failed"
	PreStartContainerCheckErrType = "PreStartContainerCheckErr"
	UnexpectedAdmissionErrType    = "UnexpectedAdmissionError"
)
 46 | 
// NVIDIA device node paths.
const (
	NvidiaCtlDevice    = "/dev/nvidiactl"
	NvidiaUVMDevice    = "/dev/nvidia-uvm"
	// NvidiaFullpathRE matches "/dev/nvidia" followed by zero or more digits
	// and captures the digits.
	NvidiaFullpathRE   = `^/dev/nvidia([0-9]*)$`
	NvidiaDevicePrefix = "/dev/nvidia"
)

const (
	// ManagerSocket is the socket path of the GPU manager.
	ManagerSocket = "/var/run/gpu-manager.sock"
)

const (
	// CGROUP_BASE is the memory cgroup hierarchy root used by this project.
	CGROUP_BASE  = "/sys/fs/cgroup/memory"
	// CGROUP_PROCS is the name of the per-cgroup process list file.
	CGROUP_PROCS = "cgroup.procs"
)
 62 | 
// VCudaRequest carries the data of one vcuda request for a container.
type VCudaRequest struct {
	// PodUID and ContainerName identify the container the request is for.
	PodUID           string
	// AllocateResponse is the device plugin allocation response of the container.
	AllocateResponse *pluginapi.ContainerAllocateResponse
	ContainerName    string
	// Deprecated
	Cores int64
	// Deprecated
	Memory int64
	// Done carries an error value; presumably the request outcome is sent on
	// it - TODO confirm against the consumer.
	Done   chan error
}
 73 | 
// DevicesPerNUMA maps an int64 key to a list of device IDs; going by the
// name the key is presumably a NUMA node ID - TODO confirm.
type DevicesPerNUMA map[int64][]string

// PodDevicesEntry is one checkpoint entry with a flat list of device IDs.
type PodDevicesEntry struct {
	PodUID        string
	ContainerName string
	ResourceName  string
	DeviceIDs     []string
	// AllocResp holds raw bytes; presumably the serialized allocation
	// response - TODO confirm.
	AllocResp     []byte
}

// PodDevicesEntryNUMA is the variant of PodDevicesEntry whose device IDs are
// grouped in a DevicesPerNUMA map.
type PodDevicesEntryNUMA struct {
	PodUID        string
	ContainerName string
	ResourceName  string
	DeviceIDs     DevicesPerNUMA
	AllocResp     []byte
}

// CheckpointNUMA is the checkpoint payload built from PodDevicesEntryNUMA
// entries.
type CheckpointNUMA struct {
	PodDeviceEntries  []PodDevicesEntryNUMA
	RegisteredDevices map[string][]string
}

// Checkpoint is the checkpoint payload built from PodDevicesEntry entries.
type Checkpoint struct {
	PodDeviceEntries  []PodDevicesEntry
	RegisteredDevices map[string][]string
}

// CheckpointDataNUMA wraps a CheckpointNUMA under the JSON key "Data".
type CheckpointDataNUMA struct {
	Data *CheckpointNUMA `json:"Data"`
}

// CheckpointData wraps a Checkpoint under the JSON key "Data".
type CheckpointData struct {
	Data *Checkpoint `json:"Data"`
}
109 | 
// Package-level driver state, written by the volume manager.
var (
	// DriverVersionMajor and DriverVersionMinor are parsed from the file name
	// of the mirrored libcuda.so.<major>.<minor> library.
	DriverVersionMajor      int
	DriverVersionMinor      int
	// DriverLibraryPath is the directory of the volume named "nvidia".
	DriverLibraryPath       string
	// DriverOriginLibraryPath is the directory of a volume with any other name.
	DriverOriginLibraryPath string
)
116 | 
// Label keys in the io.kubernetes.* namespace, plus the "pod" cgroup name
// prefix. NOTE(review): presumably these are the labels the container runtime
// attaches to containers - confirm against the runtime code.
const (
	ContainerNameLabelKey = "io.kubernetes.container.name"
	PodNamespaceLabelKey  = "io.kubernetes.pod.namespace"
	PodNameLabelKey       = "io.kubernetes.pod.name"
	PodUIDLabelKey        = "io.kubernetes.pod.uid"
	PodCgroupNamePrefix   = "pod"
)
124 | 


--------------------------------------------------------------------------------
/pkg/utils/cgroup/cgroup.go:
--------------------------------------------------------------------------------
 1 | package cgroup
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"path"
 6 | 	"strings"
 7 | 
 8 | 	cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
 9 | 
10 | 	"k8s.io/klog"
11 | )
12 | 
// CgroupName is the abstract name of a cgroup prior to any driver specific conversion.
// It is specified as a list of strings from its individual components, such as:
// {"kubepods", "burstable", "pod1234-abcd-5678-efgh"}
// Use ToSystemd or ToCgroupfs to obtain a driver specific path.
type CgroupName []string

const (
	// systemdSuffix is the cgroup name suffix for systemd; ToSystemd appends
	// it to the joined name before expanding the slice.
	systemdSuffix string = ".slice"
)
22 | 
23 | // NewCgroupName composes a new cgroup name.
24 | // Use RootCgroupName as base to start at the root.
25 | // This function does some basic check for invalid characters at the name.
26 | func NewCgroupName(base CgroupName, components ...string) CgroupName {
27 | 	for _, component := range components {
28 | 		// Forbit using "_" in internal names. When remapping internal
29 | 		// names to systemd cgroup driver, we want to remap "-" => "_",
30 | 		// so we forbid "_" so that we can always reverse the mapping.
31 | 		if strings.Contains(component, "/") || strings.Contains(component, "_") {
32 | 			panic(fmt.Errorf("invalid character in component [%q] of CgroupName", component))
33 | 		}
34 | 	}
35 | 	// copy data from the base cgroup to eliminate cases where CgroupNames share underlying slices.  See #68416
36 | 	baseCopy := make([]string, len(base))
37 | 	copy(baseCopy, base)
38 | 	return CgroupName(append(baseCopy, components...))
39 | }
40 | 
41 | // cgroupName.ToSystemd converts the internal cgroup name to a systemd name.
42 | // For example, the name {"kubepods", "burstable", "pod1234-abcd-5678-efgh"} becomes
43 | // "/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd_5678_efgh.slice"
44 | // This function always expands the systemd name into the cgroupfs form. If only
45 | // the last part is needed, use path.Base(...) on it to discard the rest.
46 | func (cgroupName CgroupName) ToSystemd() string {
47 | 	if len(cgroupName) == 0 || (len(cgroupName) == 1 && cgroupName[0] == "") {
48 | 		return "/"
49 | 	}
50 | 	newparts := []string{}
51 | 	for _, part := range cgroupName {
52 | 		part = escapeSystemdCgroupName(part)
53 | 		newparts = append(newparts, part)
54 | 	}
55 | 
56 | 	result, err := cgroupsystemd.ExpandSlice(strings.Join(newparts, "-") + systemdSuffix)
57 | 	if err != nil {
58 | 		// Should never happen...
59 | 		panic(fmt.Errorf("error converting cgroup name [%v] to systemd format: %v", cgroupName, err))
60 | 	}
61 | 	return result
62 | }
63 | 
// escapeSystemdCgroupName returns part with every "-" replaced by "_".
func escapeSystemdCgroupName(part string) string {
	pieces := strings.Split(part, "-")
	return strings.Join(pieces, "_")
}
67 | 
68 | func (cgroupName CgroupName) ToCgroupfs() string {
69 | 	return "/" + path.Join(cgroupName...)
70 | }
71 | 
72 | func SystemdPathPrefixOfRuntime(runtimeName string) string {
73 | 	switch runtimeName {
74 | 	case "cri-o":
75 | 		return "crio"
76 | 	case "containerd":
77 | 		return "cri-containerd"
78 | 	default:
79 | 		klog.Infof("prefix of container runtime %s was not tested. Maybe not correct!", runtimeName)
80 | 		return runtimeName
81 | 	}
82 | }
83 | 


--------------------------------------------------------------------------------
/pkg/utils/util.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package utils
 19 | 
 20 | import (
 21 | 	"context"
 22 | 	"encoding/json"
 23 | 	"fmt"
 24 | 	"io/ioutil"
 25 | 	"net"
 26 | 	"path/filepath"
 27 | 	"regexp"
 28 | 	"sort"
 29 | 	"strconv"
 30 | 	"strings"
 31 | 	"time"
 32 | 
 33 | 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
 34 | 
 35 | 	nvtree "tkestack.io/gpu-manager/pkg/device/nvidia"
 36 | 	"tkestack.io/gpu-manager/pkg/types"
 37 | 
 38 | 	"github.com/fsnotify/fsnotify"
 39 | 	"github.com/pkg/errors"
 40 | 	"google.golang.org/grpc"
 41 | 	v1 "k8s.io/api/core/v1"
 42 | 	apierr "k8s.io/apimachinery/pkg/api/errors"
 43 | 	"k8s.io/klog"
 44 | )
 45 | 
// Constants used in this package.
const (
	// TruncateLen is not referenced in this file.
	// NOTE(review): presumably a maximum length used by callers when
	// truncating names — confirm against its users.
	TruncateLen = 31
	// kubePrefix is the leading token of the container name prefix built by
	// MakeContainerNamePrefix.
	kubePrefix  = "k8s"
)
 51 | 
var (
	// DefaultDialOptions contains the default dial options used in grpc dial:
	// grpc.WithInsecure, grpc.WithDialer(UnixDial) so that targets are dialed
	// as unix sockets, and grpc.WithBlock.
	DefaultDialOptions = []grpc.DialOption{grpc.WithInsecure(), grpc.WithDialer(UnixDial), grpc.WithBlock()}
)
 56 | 
 57 | //UnixDial dials to a unix socket using net.DialTimeout
 58 | func UnixDial(addr string, timeout time.Duration) (net.Conn, error) {
 59 | 	return net.DialTimeout("unix", addr, timeout)
 60 | }
 61 | 
 62 | //IsValidGPUPath checks if path is valid Nvidia GPU device path
 63 | func IsValidGPUPath(path string) bool {
 64 | 	return regexp.MustCompile(types.NvidiaFullpathRE).MatchString(path)
 65 | }
 66 | 
 67 | //GetGPUMinorID returns id in Nvidia GPU device path
 68 | func GetGPUMinorID(path string) (int, error) {
 69 | 	str := regexp.MustCompile(types.NvidiaFullpathRE).FindStringSubmatch(path)
 70 | 
 71 | 	if len(str) != 2 {
 72 | 		return -1, fmt.Errorf("not match pattern %s", types.NvidiaFullpathRE)
 73 | 	}
 74 | 
 75 | 	id, _ := strconv.ParseInt(str[1], 10, 32)
 76 | 
 77 | 	return int(id), nil
 78 | }
 79 | 
 80 | //GetGPUData get cores, memory and device names from annotations
 81 | func GetGPUData(annotations map[string]string) (gpuUtil int64, gpuMemory int64, deviceNames []string) {
 82 | 	for k, v := range annotations {
 83 | 		switch {
 84 | 		case strings.HasSuffix(k, types.VCoreAnnotation):
 85 | 			gpuUtil, _ = strconv.ParseInt(v, 10, 64)
 86 | 		case strings.HasSuffix(k, types.VMemoryAnnotation):
 87 | 			gpuMemory, _ = strconv.ParseInt(v, 10, 64)
 88 | 		case strings.HasSuffix(k, types.VDeviceAnnotation):
 89 | 			deviceNames = strings.Split(annotations[k], ",")
 90 | 		}
 91 | 	}
 92 | 
 93 | 	return gpuUtil, gpuMemory, deviceNames
 94 | }
 95 | 
 96 | //NewFSWatcher returns a file watcher created by fsnotify.NewWatcher
 97 | func NewFSWatcher(files ...string) (*fsnotify.Watcher, error) {
 98 | 	watcher, err := fsnotify.NewWatcher()
 99 | 	if err != nil {
100 | 		return nil, err
101 | 	}
102 | 
103 | 	for _, f := range files {
104 | 		err = watcher.Add(f)
105 | 		if err != nil {
106 | 			watcher.Close()
107 | 			return nil, err
108 | 		}
109 | 	}
110 | 
111 | 	return watcher, nil
112 | }
113 | 
114 | // WaitForServer checks if grpc server is alive
115 | // by making grpc blocking connection to the server socket
116 | func WaitForServer(socket string) error {
117 | 	conn, err := grpc.DialContext(context.Background(), socket, DefaultDialOptions...)
118 | 	if err == nil {
119 | 		conn.Close()
120 | 		return nil
121 | 	}
122 | 	return errors.Wrapf(err, "Failed dial context at %s", socket)
123 | }
124 | 
125 | func GetCheckpointData(devicePluginPath string) (*types.Checkpoint, error) {
126 | 	cpFile := filepath.Join(devicePluginPath, types.CheckPointFileName)
127 | 	data, err := ioutil.ReadFile(cpFile)
128 | 	if err != nil {
129 | 		return nil, err
130 | 	}
131 | 	klog.V(4).Infof("Try NUMA checkpoint data format")
132 | 	cpNUMAData := &types.CheckpointDataNUMA{}
133 | 	err = json.Unmarshal(data, cpNUMAData)
134 | 	if err != nil {
135 | 		klog.V(4).Infof("Failed NUMA checkpoint data format")
136 | 	} else { // flat deviceids
137 | 		v2DeivcesEntryies := make([]types.PodDevicesEntry, len(cpNUMAData.Data.PodDeviceEntries))
138 | 		for i, v := range cpNUMAData.Data.PodDeviceEntries {
139 | 			v2PodDevicesEntry := types.PodDevicesEntry{
140 | 				PodUID:        v.PodUID,
141 | 				ContainerName: v.ContainerName,
142 | 				ResourceName:  v.ResourceName,
143 | 				DeviceIDs:     make([]string, 0),
144 | 				AllocResp:     v.AllocResp,
145 | 			}
146 | 			for _, devices := range v.DeviceIDs {
147 | 				v2PodDevicesEntry.DeviceIDs = append(v2PodDevicesEntry.DeviceIDs, devices...)
148 | 			}
149 | 			v2DeivcesEntryies[i] = v2PodDevicesEntry
150 | 		}
151 | 		cpV1Data := &types.Checkpoint{}
152 | 		cpV1Data.RegisteredDevices = cpNUMAData.Data.RegisteredDevices
153 | 		cpV1Data.PodDeviceEntries = v2DeivcesEntryies
154 | 		return cpV1Data, nil
155 | 	}
156 | 
157 | 	klog.V(4).Infof("Try v2 checkpoint data format")
158 | 	cpV2Data := &types.CheckpointData{}
159 | 	err = json.Unmarshal(data, cpV2Data)
160 | 	if err != nil {
161 | 		return nil, err
162 | 	}
163 | 
164 | 	if cpV2Data.Data != nil {
165 | 		return cpV2Data.Data, nil
166 | 	}
167 | 
168 | 	klog.V(4).Infof("Try v1 checkpoint data format")
169 | 	cpV1Data := &types.Checkpoint{}
170 | 	err = json.Unmarshal(data, cpV1Data)
171 | 	if err != nil {
172 | 		return nil, err
173 | 	}
174 | 
175 | 	return cpV1Data, nil
176 | }
177 | 
// IsStringSliceEqual reports whether a and b contain the same strings with
// the same multiplicities, regardless of order.
//
// The comparison sorts private copies of both slices, so the caller's
// slices are left untouched. Previously the arguments were sorted in place,
// which silently reordered the caller's data.
func IsStringSliceEqual(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}

	// Copy before sorting to avoid mutating the arguments.
	sortedA := append([]string(nil), a...)
	sortedB := append([]string(nil), b...)
	sort.Strings(sortedA)
	sort.Strings(sortedB)

	for i := range sortedA {
		if sortedA[i] != sortedB[i] {
			return false
		}
	}
	return true
}
191 | 
192 | func ShouldRetry(err error) bool {
193 | 	return apierr.IsConflict(err) || apierr.IsServerTimeout(err)
194 | }
195 | 
196 | func MakeContainerNamePrefix(containerName string) string {
197 | 	return fmt.Sprintf("/%s_%s_", kubePrefix, containerName)
198 | }
199 | 
200 | func IsGPURequiredPod(pod *v1.Pod) bool {
201 | 	vcore := GetGPUResourceOfPod(pod, types.VCoreAnnotation)
202 | 	vmemory := GetGPUResourceOfPod(pod, types.VMemoryAnnotation)
203 | 
204 | 	// Check if pod request for GPU resource
205 | 	if vcore <= 0 || (vcore < nvtree.HundredCore && vmemory <= 0) {
206 | 		klog.V(4).Infof("Pod %s in namespace %s does not Request for GPU resource",
207 | 			pod.Name,
208 | 			pod.Namespace)
209 | 		return false
210 | 	}
211 | 
212 | 	return true
213 | }
214 | 
215 | func IsGPURequiredContainer(c *v1.Container) bool {
216 | 	klog.V(4).Infof("Determine if the container %s needs GPU resource", c.Name)
217 | 
218 | 	vcore := GetGPUResourceOfContainer(c, types.VCoreAnnotation)
219 | 	vmemory := GetGPUResourceOfContainer(c, types.VMemoryAnnotation)
220 | 
221 | 	// Check if container request for GPU resource
222 | 	if vcore <= 0 || (vcore < nvtree.HundredCore && vmemory <= 0) {
223 | 		klog.V(4).Infof("Container %s does not Request for GPU resource", c.Name)
224 | 		return false
225 | 	}
226 | 
227 | 	return true
228 | }
229 | 
230 | func GetGPUResourceOfPod(pod *v1.Pod, resourceName v1.ResourceName) uint {
231 | 	var total uint
232 | 	containers := pod.Spec.Containers
233 | 	for _, container := range containers {
234 | 		if val, ok := container.Resources.Limits[resourceName]; ok {
235 | 			total += uint(val.Value())
236 | 		}
237 | 	}
238 | 	return total
239 | }
240 | 
241 | func ShouldDelete(pod *v1.Pod) bool {
242 | 	for _, status := range pod.Status.ContainerStatuses {
243 | 		if status.State.Waiting != nil &&
244 | 			strings.Contains(status.State.Waiting.Message, types.PreStartContainerCheckErrMsg) {
245 | 			return true
246 | 		}
247 | 	}
248 | 	if pod.Status.Reason == types.UnexpectedAdmissionErrType {
249 | 		return true
250 | 	}
251 | 	return false
252 | }
253 | 
254 | func IsGPUPredicatedPod(pod *v1.Pod) (predicated bool) {
255 | 	klog.V(4).Infof("Determine if the pod %s needs GPU resource", pod.Name)
256 | 	var ok bool
257 | 
258 | 	// Check if pod request for GPU resource
259 | 	if GetGPUResourceOfPod(pod, types.VCoreAnnotation) <= 0 || GetGPUResourceOfPod(pod, types.VMemoryAnnotation) <= 0 {
260 | 		klog.V(4).Infof("Pod %s in namespace %s does not Request for GPU resource",
261 | 			pod.Name,
262 | 			pod.Namespace)
263 | 		return predicated
264 | 	}
265 | 
266 | 	// Check if pod already has predicate time
267 | 	if _, ok = pod.ObjectMeta.Annotations[types.PredicateTimeAnnotation]; !ok {
268 | 		klog.V(4).Infof("No predicate time for pod %s in namespace %s",
269 | 			pod.Name,
270 | 			pod.Namespace)
271 | 		return predicated
272 | 	}
273 | 
274 | 	// Check if pod has already been assigned
275 | 	if assigned, ok := pod.ObjectMeta.Annotations[types.GPUAssigned]; !ok {
276 | 		klog.V(4).Infof("No assigned flag for pod %s in namespace %s",
277 | 			pod.Name,
278 | 			pod.Namespace)
279 | 		return predicated
280 | 	} else if assigned == "true" {
281 | 		klog.V(4).Infof("pod %s in namespace %s has already been assigned",
282 | 			pod.Name,
283 | 			pod.Namespace)
284 | 		return predicated
285 | 	}
286 | 	predicated = true
287 | 	return predicated
288 | }
289 | 
290 | // Check if pod has already been assigned
291 | func IsGPUAssignedPod(pod *v1.Pod) bool {
292 | 	if assigned, ok := pod.ObjectMeta.Annotations[types.GPUAssigned]; !ok {
293 | 		klog.V(4).Infof("No assigned flag for pod %s in namespace %s",
294 | 			pod.Name,
295 | 			pod.Namespace)
296 | 		return false
297 | 	} else if assigned == "false" {
298 | 		klog.V(4).Infof("pod %s in namespace %s has not been assigned",
299 | 			pod.Name,
300 | 			pod.Namespace)
301 | 		return false
302 | 	}
303 | 
304 | 	return true
305 | }
306 | 
307 | func GetPredicateTimeOfPod(pod *v1.Pod) (predicateTime uint64) {
308 | 	if predicateTimeStr, ok := pod.ObjectMeta.Annotations[types.PredicateTimeAnnotation]; ok {
309 | 		u64, err := strconv.ParseUint(predicateTimeStr, 10, 64)
310 | 		if err != nil {
311 | 			klog.Warningf("Failed to parse predicate Timestamp %s due to %v", predicateTimeStr, err)
312 | 		} else {
313 | 			predicateTime = u64
314 | 		}
315 | 	} else {
316 | 		// If predicate time not found, use createionTimestamp instead
317 | 		predicateTime = uint64(pod.ObjectMeta.CreationTimestamp.UnixNano())
318 | 	}
319 | 
320 | 	return predicateTime
321 | }
322 | 
323 | func GetGPUResourceOfContainer(container *v1.Container, resourceName v1.ResourceName) uint {
324 | 	var count uint
325 | 	if val, ok := container.Resources.Limits[resourceName]; ok {
326 | 		count = uint(val.Value())
327 | 	}
328 | 	return count
329 | }
330 | 
331 | func GetContainerIndexByName(pod *v1.Pod, containerName string) (int, error) {
332 | 	containerIndex := -1
333 | 	for i, c := range pod.Spec.Containers {
334 | 		if c.Name == containerName {
335 | 			containerIndex = i
336 | 			break
337 | 		}
338 | 	}
339 | 
340 | 	if containerIndex == -1 {
341 | 		return containerIndex, fmt.Errorf("failed to get index of container %s in pod %s", containerName, pod.UID)
342 | 	}
343 | 	return containerIndex, nil
344 | }
345 | 
346 | func GetVirtualControllerMountPath(resp *pluginapi.ContainerAllocateResponse) string {
347 | 	for _, mnt := range resp.Mounts {
348 | 		if mnt.ContainerPath == types.VCUDA_MOUNTPOINT {
349 | 			return mnt.HostPath
350 | 		}
351 | 	}
352 | 
353 | 	return ""
354 | }
355 | 


--------------------------------------------------------------------------------
/pkg/version/.gitattributes:
--------------------------------------------------------------------------------
1 | base.go export-subst
2 | 


--------------------------------------------------------------------------------
/pkg/version/base.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package version
19 | 
// Version components of this build. Get combines gitMajor and gitMinor into
// "<major>.<minor>" and reports gitCommit as the commit.
var (
	gitMajor  = "0"
	gitMinor  = "1"
	gitCommit = "8cd842ed00fca0efbf7907fc40ee3f6085187f5c"
)
25 | 


--------------------------------------------------------------------------------
/pkg/version/verflags.go:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Tencent is pleased to support the open source community by making TKEStack available.
  3 |  *
  4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
  7 |  * this file except in compliance with the License. You may obtain a copy of the
  8 |  * License at
  9 |  *
 10 |  * https://opensource.org/licenses/Apache-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
 15 |  * specific language governing permissions and limitations under the License.
 16 |  */
 17 | 
 18 | package version
 19 | 
 20 | import (
 21 | 	"fmt"
 22 | 	"os"
 23 | 	"strconv"
 24 | 
 25 | 	flag "github.com/spf13/pflag"
 26 | )
 27 | 
// versionValue is the value of the version flag. It has three states:
// false, true and raw.
type versionValue int

// Possible states of a versionValue.
const (
	VersionFalse versionValue = 0
	VersionTrue  versionValue = 1
	VersionRaw   versionValue = 2
)

// strRawVersion is the flag argument that selects VersionRaw.
const strRawVersion = "raw"
 37 | 
// IsBoolFlag always returns true.
// NOTE(review): presumably this lets flag parsers accept the flag without an
// explicit argument — confirm against the flag library in use.
func (v *versionValue) IsBoolFlag() bool {
	return true
}
 41 | 
 42 | func (v *versionValue) Get() interface{} {
 43 | 	return versionValue(*v)
 44 | }
 45 | 
 46 | func (v *versionValue) Set(s string) error {
 47 | 	if s == strRawVersion {
 48 | 		*v = VersionRaw
 49 | 		return nil
 50 | 	}
 51 | 	boolVal, err := strconv.ParseBool(s)
 52 | 	if boolVal {
 53 | 		*v = VersionTrue
 54 | 	} else {
 55 | 		*v = VersionFalse
 56 | 	}
 57 | 	return err
 58 | }
 59 | 
 60 | func (v *versionValue) String() string {
 61 | 	if *v == VersionRaw {
 62 | 		return strRawVersion
 63 | 	}
 64 | 	return fmt.Sprintf("%v", bool(*v == VersionTrue))
 65 | }
 66 | 
// Type returns the type name of the flag, "version", as required by the
// pflag.Value interface.
func (v *versionValue) Type() string {
	return "version"
}
 71 | 
 72 | func VersionVar(p *versionValue, name string, value versionValue, usage string) {
 73 | 	*p = value
 74 | 	flag.Var(p, name, usage)
 75 | 	// "--version" will be treated as "--version=true"
 76 | 	flag.Lookup(name).NoOptDefVal = "true"
 77 | }
 78 | 
 79 | func Version(name string, value versionValue, usage string) *versionValue {
 80 | 	p := new(versionValue)
 81 | 	VersionVar(p, name, value, usage)
 82 | 	return p
 83 | }
 84 | 
var (
	// versionFlag is the "version" flag, registered at package
	// initialization with a default of VersionFalse.
	versionFlag = Version("version", VersionFalse, "Print version information and quit")
)
 88 | 
 89 | // PrintAndExitIfRequested will check if the -version flag was passed
 90 | // and, if so, print the version and exit.
 91 | func PrintAndExitIfRequested() {
 92 | 	if *versionFlag == VersionRaw {
 93 | 		fmt.Printf("%#v\n", Get())
 94 | 		os.Exit(0)
 95 | 	} else if *versionFlag == VersionTrue {
 96 | 		fmt.Printf("Nvidia Manager %s\n", Get())
 97 | 		os.Exit(0)
 98 | 	}
 99 | }
100 | 


--------------------------------------------------------------------------------
/pkg/version/version.go:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Tencent is pleased to support the open source community by making TKEStack available.
 3 |  *
 4 |  * Copyright (C) 2012-2019 Tencent. All Rights Reserved.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 7 |  * this file except in compliance with the License. You may obtain a copy of the
 8 |  * License at
 9 |  *
10 |  * https://opensource.org/licenses/Apache-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |  * WARRANTIES OF ANY KIND, either express or implied.  See the License for the
15 |  * specific language governing permissions and limitations under the License.
16 |  */
17 | 
18 | package version
19 | 
20 | import (
21 | 	"fmt"
22 | )
23 | 
// Info contains version information.
type Info struct {
	// Version is the version string; Get fills it with "<major>.<minor>".
	Version string
	// Commit is the commit identifier; Get fills it with gitCommit.
	Commit  string
}
29 | 
// String returns info as a human-friendly string.
// Only the Commit field is returned; Version is not included.
func (info Info) String() string {
	return info.Commit
}
34 | 
35 | // Get returns the overall codebase version. It's for detecting
36 | // what code a binary was built from.
37 | func Get() Info {
38 | 	return Info{
39 | 		Version: fmt.Sprintf("%s.%s", gitMajor, gitMinor),
40 | 		Commit:  gitCommit,
41 | 	}
42 | }
43 | 


--------------------------------------------------------------------------------
/revive.toml:
--------------------------------------------------------------------------------
 1 | # When set to false, ignores files with "GENERATED" header, similar to golint
 2 | ignoreGeneratedHeader = true
 3 | severity = "warning"
 4 | confidence = 0.8
 5 | errorCode = 0
 6 | warningCode = 0
 7 | 
 8 | [rule.context-as-argument]
 9 | [rule.context-keys-type]
10 | [rule.dot-imports]
11 | [rule.error-return]
12 | [rule.error-strings]
13 | [rule.error-naming]
14 | [rule.if-return]
15 | [rule.increment-decrement]
16 | [rule.var-declaration]
17 | [rule.package-comments]
18 | [rule.range]
19 | [rule.receiver-naming]
20 | [rule.time-naming]
21 | [rule.indent-error-flow]
22 | [rule.errorf]
23 | [rule.superfluous-else]
24 | [rule.unreachable-code]
25 | [rule.modifies-parameter]
26 | [rule.unnecessary-stmt]
27 | [rule.confusing-results]
28 | [rule.get-return]
29 | [rule.blank-imports]
30 | [rule.redefines-builtin-id]
31 | [rule.empty-lines]
32 | [rule.call-to-gc]
33 | [rule.atomic]
34 | [rule.waitgroup-by-value]
35 | [rule.range-val-in-closure]
36 | [rule.constant-logical-expr]
37 | [rule.modifies-value-receiver]
38 | [rule.bool-literal-in-expr]
39 | [rule.argument-limit]
40 |     arguments =[8]
41 | [rule.function-result-limit]
42 |     arguments =[8]
43 | [rule.imports-blacklist]
44 | 


--------------------------------------------------------------------------------
/staging/src/google/protobuf/empty.proto:
--------------------------------------------------------------------------------
 1 | // Protocol Buffers - Google's data interchange format
 2 | // Copyright 2008 Google Inc.  All rights reserved.
 3 | // https://developers.google.com/protocol-buffers/
 4 | //
 5 | // Redistribution and use in source and binary forms, with or without
 6 | // modification, are permitted provided that the following conditions are
 7 | // met:
 8 | //
 9 | //     * Redistributions of source code must retain the above copyright
10 | // notice, this list of conditions and the following disclaimer.
11 | //     * Redistributions in binary form must reproduce the above
12 | // copyright notice, this list of conditions and the following disclaimer
13 | // in the documentation and/or other materials provided with the
14 | // distribution.
15 | //     * Neither the name of Google Inc. nor the names of its
16 | // contributors may be used to endorse or promote products derived from
17 | // this software without specific prior written permission.
18 | //
19 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
31 | syntax = "proto3";
32 | 
33 | package google.protobuf;
34 | 
35 | option csharp_namespace = "Google.Protobuf.WellKnownTypes";
36 | option go_package = "github.com/golang/protobuf/ptypes/empty";
37 | option java_package = "com.google.protobuf";
38 | option java_outer_classname = "EmptyProto";
39 | option java_multiple_files = true;
40 | option objc_class_prefix = "GPB";
41 | option cc_enable_arenas = true;
42 | 
43 | // A generic empty message that you can re-use to avoid defining duplicated
44 | // empty messages in your APIs. A typical example is to use it as the request
45 | // or the response type of an API method. For instance:
46 | //
47 | //     service Foo {
48 | //       rpc Bar(google.protobuf.Empty) returns (google.protobuf.Empty);
49 | //     }
50 | //
51 | // The JSON representation for `Empty` is empty JSON object `{}`.
52 | message Empty {}


--------------------------------------------------------------------------------