├── LICENSE ├── README.md ├── doc └── optional_arg.png ├── email_conf.json └── scramble4gpu.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Wei Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPUSnatcher 2 | 3 | GPUSnatcher是用来抢占显卡的脚本,主要是在实验室显卡资源紧张且自己亟需使用显卡的情况下,使用该脚本自动抢占一个或多个显卡。 4 | 5 | **建议将scramble4gpu.py更改为train.py,并设置仅自己可见,防止被打...** 6 | **请勿恶意抢占!!** 7 | 8 | ## 依赖 9 | 10 | - numpy 11 | - torch or tensorflow 12 | 13 | ## 使用 14 | 15 | - **请先配置Email,否则当抢占GPU之后,不能发送Email** 16 | ``` 17 | git clone https://github.com/wilmerwang/GPUSnatcher.git 18 | cd GPUSnatcher 19 | 20 | # ./email_conf.json 21 | # 建议使用qq邮箱服务,如果用的其他邮箱服务器,请自行设置 22 | { 23 | "host": "smtp.qq.com", # qq邮箱server 24 | "user": "2xxxxxxx6@qq.com", # 要登录的qq账号 25 | "pwd": "xxxxxxxxxxxxxxxx", # SMTP授权码, qq邮箱--> 设置 --> 账号 --> IMAP/SMTP服务开启 --> 生成授权码 26 | "sender": "2xxxxxxx6@qq.com", # 发送者 27 | "receiver": "2xxxxxxx6@qq.com" # 接收邮箱,可以是列表比如["a@qq.com", "b@qq.com"] 28 | } 29 | ``` 30 | 31 | - 配置之后运行程序 32 | ```shell 33 | python scramble4gpu.py 34 | ``` 35 | 36 | ### 可选参数 37 | 38 | - -p --proportion 显卡空闲内存 / 全部内存 的阈值,取值在0-1之间。当p取1的时候,表示仅仅列出完全没有被使用的显卡。默认为0.8。 39 | - -n --gpu_nums 需要抢占的GPU数量,建议不要抢太多,容易挨揍。默认是1。 40 | - -t --times 抢占显卡之后,自动释放显卡前的等待时间,单位为秒。默认是1800秒(30分钟)。 41 | - -e --email_conf email的配置参数,默认在./email_conf.json 42 | 43 | 当想自己设置以上参数的时候: 44 | 45 | ```shell 46 | # 查看参数详情 47 | python scramble4gpu.py -h 48 | 49 | # 查找空闲显存占比大于0.9的显卡,抢占4个,1800秒后自动释放,email配置路径为./email_conf.json 50 | python scramble4gpu.py -p 0.9 -n 4 -t 1800 -e ./email_conf.json 51 | ``` 52 | -------------------------------------------------------------------------------- /doc/optional_arg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wilmerwang/GPUSnatcher/d4fc6ffae2f151d7caa59bac7fdf921b3ffe732a/doc/optional_arg.png -------------------------------------------------------------------------------- /email_conf.json: -------------------------------------------------------------------------------- 1
# -*- coding: utf-8 -*-
"""Occupy idle NVIDIA GPUs for a fixed time and announce it by e-mail.

The script polls ``nvidia-smi``, picks the GPUs whose free/total memory ratio
reaches a threshold, starts one worker process per GPU that allocates most of
the free memory and keeps the device busy, sends a notification mail, and
releases everything after ``--times`` seconds.

The e-mail settings are read from a JSON file (``./email_conf.json`` by
default) with the following keys::

    {
      "host": "smtp.qq.com",
      "user": "2xxxxxxx6@qq.com",
      "pwd": "xxxxxxxxxxxxxxxx",
      "sender": "2xxxxxxx6@qq.com",
      "receiver": "2xxxxxxx6@qq.com"
    }

``receiver`` may be a single address or a list of addresses.
"""
import os
import sys
import time
import argparse
import random  # noqa: F401  (unused here; kept because the original module imports it)
import multiprocessing
import json
import socket
import subprocess
from smtplib import SMTP_SSL
from email.mime.text import MIMEText
from email.utils import formataddr

import numpy as np

# Exactly one backend is needed.  A missing backend is reported by main()
# instead of killing the interpreter at import time, so the helpers in this
# module stay importable.
try:
    import torch
except ImportError:
    torch = None

tf = None
if torch is None:
    try:
        import tensorflow as tf
    except ImportError:
        tf = None


def set_parser():
    """Parse the command line.

    Returns:
        argparse.Namespace with ``proportion`` (float threshold for
        free/total memory), ``gpu_nums`` (number of GPUs to occupy),
        ``times`` (seconds to hold the GPUs; it is passed to ``time.sleep``)
        and ``email_conf`` (path of the JSON mail configuration).
    """
    parser = argparse.ArgumentParser(description='..')
    parser.add_argument('-p', '--proportion', type=float, default=0.8,
                        help='The ratio of gpu free memory to total memory')
    parser.add_argument('-n', '--gpu_nums', type=int, default=1,
                        help='The numbers of GPU to scramble')
    parser.add_argument('-t', '--times', type=int, default=1800,
                        help='Sleep time if scramble gpu')
    parser.add_argument('-e', '--email_conf', type=str, default='./email_conf.json',
                        help='The path to email config')
    args = parser.parse_args()

    return args


def parse(qargs, results):
    """Turn ``nvidia-smi`` CSV lines into a 2-D array of digit strings.

    Every line is split on ``,`` and each field is reduced to its digits
    (``"11178 MiB"`` -> ``"11178"``).  A line is kept only when it has one
    field per queried property and every field contains at least one digit.
    That rule drops a CSV header, blank lines and rows with unavailable
    values, and - unlike unconditionally skipping the first line - it does
    not lose the first GPU when the output has no header.

    Args:
        qargs: names of the queried properties; only the count is used.
        results: iterable of output lines.

    Returns:
        numpy string array of shape ``(rows, len(qargs))``; ``rows`` may be 0.
    """
    width = len(qargs)
    rows = []
    for line in results:
        fields = [''.join(filter(str.isdigit, word)) for word in line.split(',')]
        if len(fields) == width and all(fields):
            rows.append(fields)

    if not rows:
        # Keep the result two-dimensional so callers can slice columns safely.
        return np.empty((0, width), dtype=str)
    return np.array(rows)


def query_gpu():
    """Query index, free memory and total memory of every GPU.

    Returns:
        Tuple ``(table, first_line)``: ``table`` is the array produced by
        :func:`parse`; ``first_line`` is the stripped first output line, or
        ``''`` when ``nvidia-smi`` is missing or printed nothing.
    """
    qargs = ['index', 'memory.free', 'memory.total']
    cmd = ['nvidia-smi',
           '--query-gpu={}'.format(','.join(qargs)),
           '--format=csv,noheader']
    try:
        completed = subprocess.run(cmd, stdout=subprocess.PIPE,
                                   universal_newlines=True, check=False)
        results = completed.stdout.splitlines()
    except OSError:
        # nvidia-smi is not installed or cannot be executed.
        results = []

    first_line = results[0].strip() if results else ''
    return parse(qargs, results), first_line


class GPUManager(object):
    """Select the GPUs that are free enough to be occupied."""

    def __init__(self, args):
        # Only ``args.proportion`` is read by this class.
        self._args = args

    @staticmethod
    def _select(qresult, proportion):
        """Return ``(indices, free_memory)`` of rows with free/total >= proportion.

        ``qresult`` has one row per GPU: ``[index, memory.free, memory.total]``.
        The result is ordered by free memory, largest first.
        """
        table = np.asarray(qresult).astype('int')
        if table.size == 0:
            empty = np.array([], dtype=int)
            return empty, empty.copy()

        index, free, total = table[:, 0], table[:, 1], table[:, 2]
        order = np.argsort(-free)
        # ``>=`` so that a proportion of 1 selects completely free GPUs;
        # rows reporting zero total memory are skipped to avoid dividing by 0.
        keep = [i for i in order
                if total[i] > 0 and free[i] / total[i] >= proportion]
        return index[keep], free[keep]

    def choose_free_gpu(self):
        """Query the GPUs and return the free ones.

        Returns:
            Tuple ``(gpus_index, gpus_memory)`` of integer arrays.  Both are
            empty when no GPU is found or none passes the threshold, so the
            result can always be unpacked.
        """
        qresult, _ = query_gpu()
        if qresult.shape[0] == 0:
            print('No GPU, Check it.')
        return self._select(qresult, self._args.proportion)


def compute_storage_size(memory):
    """Return the edge length of a float64 cube filling ~0.9**3 of ``memory``.

    ``memory`` is converted with ``* 1024 * 1024``, divided by 8 (bytes per
    float64) and the cube root is scaled by 0.9 to leave some headroom.
    """
    return pow(memory * 1024 * 1024 / 8, 1/3) * 0.9


def worker(gpus_id, size):
    """Allocate a ``size**3`` float64 tensor on one GPU and keep it busy.

    Runs forever; the parent process terminates it.

    Args:
        gpus_id: GPU index (any integer type, including numpy integers).
        size: edge length of the cubic tensor.

    Raises:
        RuntimeError: if neither torch nor tensorflow is available.
    """
    gpu = int(gpus_id)
    size = int(size)

    if torch is not None:
        # Build an explicit device; a numpy integer is not a valid device.
        block = torch.zeros([size, size, size], dtype=torch.double,
                            device=torch.device('cuda', gpu))
        while True:
            torch.mul(block[0], block[0])
    elif tf is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        block = tf.zeros([size, size, size], dtype=tf.dtypes.float64)
        while True:
            tf.matmul(block[0], block[0])
    else:
        raise RuntimeError(
            "No pytorch and tensorflow module, please install one of these!")


class EmailSender(object):
    """Send plain-text mails through an SMTP-over-SSL server."""

    def __init__(self, host_server, user, pwd, sender):
        self.host_server = host_server
        self.user = user
        self.pwd = pwd
        self.sender = sender

    def send_email(self, receiver, subject, content):
        """Send one mail; failures are printed, never raised.

        Args:
            receiver: one address or a list of addresses.
            subject: mail subject.
            content: plain-text body.
        """
        receiver = [receiver] if isinstance(receiver, str) else list(receiver)
        message = MIMEText(content, 'plain', 'utf-8')
        message['Subject'] = subject
        message['From'] = formataddr(("GPUSnatcher", self.sender))
        message['To'] = ", ".join(receiver)

        try:
            # The context manager sends QUIT and closes the socket even when
            # login or sendmail fails.
            with SMTP_SSL(self.host_server) as smtp_obj:
                smtp_obj.ehlo(self.host_server)
                smtp_obj.login(self.user, self.pwd)
                smtp_obj.sendmail(self.sender, receiver, message.as_string())
            print("The mail was sent successfully.")
        except Exception as e:
            # Best effort: a failed notification must not stop the main loop.
            print(e)


def main(args, ids):
    """Occupy ``args.gpu_nums`` GPUs, hold them ``args.times`` seconds, release.

    Args:
        args: namespace from :func:`set_parser`.
        ids: list that receives the indices of the occupied GPUs (mutated).
    """
    if torch is None and tf is None:
        print("No pytorch and tensorflow module, please install one of these!")
        sys.exit()

    with open(args.email_conf, "r", encoding="utf-8") as f:
        email_conf = json.load(f)
    email_sender = EmailSender(email_conf['host'],
                               email_conf['user'],
                               email_conf['pwd'],
                               email_conf['sender'])

    gpu_manager = GPUManager(args)
    processes = []

    try:
        while True:
            gpus_free, gpus_memory = gpu_manager.choose_free_gpu()
            sca_nums = args.gpu_nums - len(processes)

            if sca_nums > 0:
                grabbed = []
                for gpus_id, memory in zip(gpus_free, gpus_memory):
                    if len(grabbed) >= sca_nums:
                        break
                    gpus_id = int(gpus_id)
                    if gpus_id in ids:
                        # Already held by one of our workers.
                        continue
                    size = int(compute_storage_size(memory))
                    ids.append(gpus_id)
                    print("Scramble GPU {}".format(gpus_id))
                    p = multiprocessing.Process(target=worker, args=(gpus_id, size))
                    p.start()
                    processes.append(p)
                    grabbed.append(gpus_id)
                    time.sleep(5)

                if grabbed:
                    hostname = socket.gethostname()
                    gpu_ids = ', '.join(str(i) for i in grabbed)
                    subject = f"{hostname}: GPU {gpu_ids} has been scrambled"
                    content = f"{hostname}: GPU {gpu_ids} has been scrambled, and will be released in {args.times//60} minutes!"
                    email_sender.send_email(email_conf['receiver'], subject, content)

            if len(ids) >= args.gpu_nums:
                # Everything requested is held: keep it for the configured time.
                time.sleep(args.times)
                break
            time.sleep(60)

    except Exception as e:
        print(e)

    finally:
        # Always release the GPUs, also on Ctrl-C.
        for p in processes:
            if p.is_alive():
                p.terminate()
        for p in processes:
            p.join()


if __name__ == '__main__':
    ids = []
    args = set_parser()
    main(args, ids)