├── LICENSE
├── README.md
├── doc
    └── optional_arg.png
├── email_conf.json
└── scramble4gpu.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Wei Wang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # GPUSnatcher
 2 | 
 3 | GPUSnatcher是用来抢占显卡的脚本，主要用于实验室显卡资源紧张且自己亟需使用显卡的情况：运行该脚本即可自动抢占一个或多个显卡。
 4 | 
 5 | **建议将scramble4gpu.py更改为train.py，并设置仅自己可见，防止被打...**
 6 | **请勿恶意抢占!!**
 7 | 
 8 | ## 依赖
 9 | 
10 | - numpy
11 | - torch or tensorflow
12 | 
13 | ## 使用
14 | 
15 | - **请先配置Email，否则当抢占GPU之后，不能发送Email**
16 | ```
17 | git clone https://github.com/wilmerwang/GPUSnatcher.git
18 | cd GPUSnatcher
19 | 
20 | # ./email_conf.json
21 | # 建议使用qq邮箱服务,如果用的其他邮箱服务器，请自行设置
22 | {
23 |   "host": "smtp.qq.com",  # qq邮箱server
24 |   "user": "2xxxxxxx6@qq.com",  # 要登陆的qq账号
25 |   "pwd": "xxxxxxxxxxxxxxxx",  # SMTP授权码, qq邮箱--> 设置 --> 账号 --> IMAP/SMTP服务开启 --> 生成授权码
26 |   "sender": "2xxxxxxx6@qq.com",  # 发送者
27 |   "receiver": "2xxxxxxx6@qq.com"  # 接收邮箱,可以是列表比如["a@qq.com", "b@qq.com"]
28 | }
29 | ```
30 | 
31 | - 配置之后运行程序
32 | ```shell
33 | python scramble4gpu.py
34 | ```
35 | 
36 | ### 可选参数
37 | 
38 | - -p --proportion 显卡空闲内存 / 全部内存 的阈值，取值在0-1之间。仅当该比值严格大于p时显卡才会被选中，因此p越接近1，要求显卡越空闲（p取1时不会选中任何显卡）。默认为0.8。
39 | - -n --gpu_nums 需要抢占的GPU数量，建议不要抢太多，容易挨揍。默认是1。
40 | - -t --times 抢占显卡之后，到自动释放显卡为止的等待时间，单位为秒。默认是1800秒（即30分钟）。
41 | - -e --email_conf email的配置参数，默认在./email_conf.json
42 | 
43 | 当想自己设置以上参数的时候：
44 | 
45 | ```shell
46 | # 查看参数详情
47 | python scramble4gpu.py -h
48 | 
49 | # 查找Free显存大于0.9的显卡，抢占4个，1800秒后自动释放,email配置路径为./email_conf.json
50 | python scramble4gpu.py -p 0.9 -n 4 -t 1800 -e ./email_conf.json
51 | ```
52 | 


--------------------------------------------------------------------------------
/doc/optional_arg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wilmerwang/GPUSnatcher/d4fc6ffae2f151d7caa59bac7fdf921b3ffe732a/doc/optional_arg.png


--------------------------------------------------------------------------------
/email_conf.json:
--------------------------------------------------------------------------------
1 | {
2 |   "host": "smtp.qq.com",
3 |   "user": "2xxxxxxx6@qq.com",
4 |   "pwd": "xxxxxxxxxxxxxxxx",
5 |   "sender": "2xxxxxxx6@qq.com",
6 |   "receiver": "2xxxxxxx6@qq.com"
7 | }
8 | 


--------------------------------------------------------------------------------
/scramble4gpu.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import os
  3 | import sys
  4 | import time
  5 | import argparse
  6 | import random
  7 | import multiprocessing
  8 | import json
  9 | import socket
 10 | from smtplib import SMTP_SSL
 11 | from email.mime.text import MIMEText
 12 | from email.utils import formataddr
 13 | 
 14 | import numpy as np
 15 | try:
 16 |     import torch
 17 | except ImportError:
 18 |     try:
 19 |         import tensorflow as tf
 20 |     except ImportError:
 21 |         print("No pytorch and tensorflow module, please install one of these!")
 22 |         sys.exit()
 23 |     
 24 | 
 25 | 
 26 | def set_parser():
 27 |     parser = argparse.ArgumentParser(description='..')
 28 |     parser.add_argument('-p', '--proportion', type=float, default=0.8,
 29 |                         help='The ratio of gpu free memory to total memory')
 30 |     parser.add_argument('-n', '--gpu_nums', type=int, default=1,
 31 |                         help='The numbers of GPU to scramble')
 32 |     parser.add_argument('-t', '--times', type=int, default=1800,
 33 |                         help='Sleep time if scramble gpu')
 34 |     parser.add_argument('-e', '--email_conf', type=str, default='./email_conf.json',
 35 |                         help='The path to email config')
 36 |     args = parser.parse_args()
 37 | 
 38 |     return args
 39 | 
 40 | 
 41 | def parse(qargs, results):
 42 |     result_np = []
 43 |     for line in results[1:]:
 44 |         result_np.append([''.join(filter(str.isdigit, word)) for word in line.split(',')])
 45 |     result_np = np.array(result_np)
 46 | 
 47 |     return result_np
 48 | 
 49 | 
 50 | def query_gpu():
 51 |     qargs = ['index', 'memory.free', 'memory.total']
 52 |     cmd = 'nvidia-smi --query-gpu={} --format=csv,noheader'.format(','.join(qargs))
 53 |     results = os.popen(cmd).readlines()
 54 | 
 55 |     return parse(qargs, results), results[0].strip()
 56 | 
 57 | 
 58 | class GPUManager(object):
 59 |     def __init__(self, args):
 60 |         self._args = args
 61 | 
 62 |     def choose_free_gpu(self):
 63 |         qresult, qindex = query_gpu()
 64 |         qresult = qresult.astype('int')
 65 | 
 66 |         if qresult.shape[0] == 0:
 67 |             print('No GPU, Check it.')
 68 |         else:
 69 |             qresult_sort_index = np.argsort(-qresult[:, 1])
 70 |             idex = [i for i in qresult_sort_index if qresult[i][1]/qresult[i][2] > self._args.proportion]
 71 |             gpus_index = qresult[:, 0][idex]
 72 |             gpus_memory = qresult[:, 1][idex]
 73 |             return gpus_index, gpus_memory
 74 | 
 75 | 
 76 | def compute_storage_size(memory):
 77 |     return pow(memory * 1024 * 1024 / 8, 1/3) * 0.9
 78 | 
 79 | 
 80 | def worker(gpus_id, size):
 81 |     try:
 82 |         a = torch.zeros([size, size, size], dtype=torch.double, device=gpus_id)
 83 |         while True:
 84 |             torch.mul(a[0], a[0])
 85 |     except Exception:
 86 |         os.environ["CUDA_VISIBLE_DEVICES"] = str(gpus_id)
 87 |         a = tf.zeros([size, size, size], dtype=tf.dtypes.float64)
 88 |         while True:
 89 |             tf.matmul(a[0], a[0])
 90 | 
 91 | 
 92 | class EmailSender(object):
 93 |     def __init__(self, host_server, user, pwd, sender):
 94 |         self.host_server = host_server
 95 |         self.user = user
 96 |         self.pwd = pwd
 97 |         self.sender = sender
 98 | 
 99 |     def send_email(self, receiver, subject, content):
100 |         receiver = [receiver] if isinstance(receiver, str) else receiver
101 |         message = MIMEText(content, 'plain', 'utf-8')
102 |         message['Subject'] = subject
103 |         message['From'] = formataddr(("GPUSnatcher", self.sender))
104 |         message['To'] = ", ".join(receiver)
105 | 
106 |         try:
107 |             smtp_obj = SMTP_SSL(self.host_server)
108 |             smtp_obj.ehlo(self.host_server)
109 |             smtp_obj.login(self.user, self.pwd)
110 |             smtp_obj.sendmail(self.sender, receiver, message.as_string())
111 |             smtp_obj.quit()
112 |             print("The mail was sent successfully.")
113 |         except Exception as e:
114 |             print(e)
115 | 
116 | 
def main(args, ids):
    """Occupy free GPUs, mail a notice, hold them for a while, then release.

    Polls for free GPUs once a minute and starts one ``worker`` process per
    GPU until ``args.gpu_nums`` GPUs have been taken. It then sleeps
    ``args.times`` seconds. All worker processes are terminated and joined on
    the way out, whether the function ends normally or through an exception.

    Args:
        args: Parsed options; ``email_conf``, ``gpu_nums`` and ``times`` are
            read here, and the object is also handed to ``GPUManager``.
        ids: Caller-owned list; the index of every GPU a worker was started
            on is appended to it.
    """
    with open(args.email_conf, "r") as conf_file:
        email_conf = json.load(conf_file)

    mailer = EmailSender(
        email_conf['host'],
        email_conf['user'],
        email_conf['pwd'],
        email_conf['sender'],
    )
    manager = GPUManager(args)
    processes = []

    try:
        while True:
            gpus_free, gpus_memory = manager.choose_free_gpu()

            if len(gpus_free) != 0:
                # How many GPUs are still needed to reach the requested count.
                sca_nums = args.gpu_nums - len(processes)
                if sca_nums > 0:
                    sizes = [int(compute_storage_size(mem)) for mem in gpus_memory]
                    for gpus_id, size in zip(gpus_free[:sca_nums], sizes[:sca_nums]):
                        ids.append(gpus_id)
                        print("Scramble GPU {}".format(gpus_id))
                        proc = multiprocessing.Process(target=worker, args=(gpus_id, size))
                        proc.start()
                        processes.append(proc)
                        time.sleep(5)

                host = socket.gethostname()
                taken = ', '.join(gpus_free[:sca_nums].astype('str'))
                subject = f"{host}: GPU {taken} has been scrambled"
                content = f"{host}: GPU {taken} has been scrambled, and will be released in {args.times//60} minutes!"
                mailer.send_email(email_conf['receiver'], subject, content)

            if len(ids) >= args.gpu_nums:
                break
            time.sleep(60)

        # Enough GPUs are held: keep them for the configured time. This stays
        # inside the try block so the same except/finally handling applies.
        time.sleep(args.times)

    except Exception as e:
        print(e)

    finally:
        # Terminate everything first, then reap, so no join blocks on a
        # worker that is still running.
        for proc in processes:
            if proc.is_alive():
                proc.terminate()
        for proc in processes:
            proc.join()
167 | 
168 | 
if __name__ == "__main__":
    # Parse the command line first, then create the list that main() fills
    # with the indices of the GPUs it occupies.
    args = set_parser()
    ids = []
    main(args, ids)
173 | 


--------------------------------------------------------------------------------