├── .gitignore ├── README.md ├── LICENCE ├── slaver.py └── master.py /.gitignore: -------------------------------------------------------------------------------- 1 | username_to_wechatname.txt 2 | itchat.pkl 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 用微信监控多个服务器的GPU运行情况 2 | 3 | ## 使用方法: 4 | 5 | 1. 切换到Python 3环境 6 | 7 | 2. 安装依赖: 8 | 9 | ```shell 10 | pip install itchat 11 | pip install psutil 12 | pip install requests 13 | ``` 14 | 15 | 3. 选择一个服务器作为master服务器,运行 16 | 17 | ```shell 18 | python master.py --address <主服务器IP地址> 19 | ``` 20 | 21 | 4. 在多个GPU服务器上运行 22 | 23 | ```shell 24 | python slaver.py --address <主服务器IP地址> 25 | ``` 26 | 27 | ## 查询功能 28 | 29 | 有效指令: 30 | - user:查看用户使用情况 31 | - server:查看服务器列表 32 | - gpu:查看所有GPU使用情况 33 | - gpu <完整IP地址或后缀>:查看指定服务器GPU使用情况 34 | 35 | ## 警报功能: 36 | 37 | 检测长时间占用GPU内存但是没有运行的进程,自动发送微信消息给相应用户。 38 | 39 | 使用该功能需要添加用户的微信账号,修改备注名称,并将服务器账号名与微信备注名成对记录在<`username_to_wechatname.txt`>中,比如: 40 | 41 | ```shell 42 | xiaoming 小明 43 | lilei 李雷 44 | hanmeimie 韩梅梅 45 | david David 46 | ``` 47 | 48 | 警报判定依据为长期满足以下条件: 49 | 50 | - 占用GPU内存大于一定阈值(如1000M) 51 | - 进程所在GPU使用率以及进程自身的CPU使用率均低于一定阈值(如10%) 52 | - 距离上次该进程触发警报过去了一段时间(如半小时) 53 | 54 | 以上数值可根据需要在启动master服务时更改,详见帮助信息: 55 | 56 | ```python 57 | python master.py -h 58 | ``` 59 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 WarBean 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /slaver.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pwd 3 | import time 4 | import json 5 | import psutil 6 | import argparse 7 | import requests 8 | import subprocess 9 | 10 | def get_owner(pid): 11 | try: 12 | for line in open('/proc/%d/status' % pid): 13 | if line.startswith('Uid:'): 14 | uid = int(line.split()[1]) 15 | return pwd.getpwuid(uid).pw_name 16 | except: 17 | return None 18 | 19 | def get_info(): 20 | info = { 'gpu': [], 'process': [] } 21 | msg = subprocess.Popen('nvidia-smi', stdout = subprocess.PIPE).stdout.read().decode() 22 | msg = msg.strip().split('\n') 23 | 24 | lino = 8 25 | while True: 26 | status = re.findall('.*\d+%.*\d+C.*\d+W / +\d+W.* +(\d+)MiB / +(\d+)MiB.* +(\d+)%.*', msg[lino]) 27 | if status == []: break 28 | mem_usage, mem_total, percent = status[0] 29 | info['gpu'].append({ 30 | 'mem_usage': float(mem_usage), 31 | 'mem_total': float(mem_total), 32 | 'percent': float(percent), 33 | }) 34 | lino += 3 35 | 36 | lino = -1 37 | while True: 38 | lino -= 1 39 | status = re.findall('\| +(\d+) +(\d+) +\w+ +([^ ]*) +(\d+)MiB \|', msg[lino]) 40 | if status == []: break 41 | gpuid, pid, program, mem_usage = status[0] 42 | username = get_owner(int(pid)) 43 | if username is None: 44 | print('进程已经不存在') 45 | continue 46 | wechatname = name_dict.get(username, username) 47 | try: 48 | p = psutil.Process(int(pid)) 49 | p.cpu_percent() 50 | time.sleep(0.5) 51 | cpu_percent = p.cpu_percent() 52 | except psutil.NoSuchProcess: 53 | print('进程已经不存在') 54 | continue 55 | info['process'].append({ 56 | 'gpuid': int(gpuid), 57 | 'pid': int(pid), 58 | 'program': program, 59 | 'cpu_percent': cpu_percent, 60 | 'mem_usage': float(mem_usage), 61 | 'username': username, 62 | 'wechatname': wechatname, 63 | }) 64 | info['process'].reverse() 65 | 66 | return info 67 | 68 | def running_mean(mean_info, curr_info, decay): 69 | def merge(a, b): return a * decay + b * (1 - decay) 70 | new_info = { 'gpu': [], 'process': [] } 71 | for mean_gi, curr_gi in zip(mean_info['gpu'], curr_info['gpu']): 72 | new_info['gpu'].append({ 73 | 'mem_usage': merge(mean_gi['mem_usage'], curr_gi['mem_usage']), 74 | 'mem_total': merge(mean_gi['mem_total'], curr_gi['mem_total']), 75 | 'percent': merge(mean_gi['percent'], curr_gi['percent']), 76 | }) 77 | mean_pi_dict = { (pi['gpuid'], pi['pid'], pi['program'], pi['username']): pi for pi in mean_info['process'] } 78 | curr_pi_dict = { (pi['gpuid'], pi['pid'], pi['program'], pi['username']): pi for pi in curr_info['process'] } 79 | mean_pi_keys = set(mean_pi_dict.keys()) 80 | curr_pi_keys = set(curr_pi_dict.keys()) 81 | for key in sorted(set.union(mean_pi_keys, curr_pi_keys)): 82 | if key in mean_pi_keys and key in curr_pi_keys: 83 | mean_pi = mean_pi_dict[key] 84 | curr_pi = curr_pi_dict[key] 85 | mean_pi['mem_usage'] = merge(mean_pi['mem_usage'], curr_pi['mem_usage']) 86 | mean_pi['cpu_percent'] = merge(mean_pi['cpu_percent'], curr_pi['cpu_percent']) 87 | new_info['process'].append(mean_pi) 88 | elif key not in mean_pi_keys: 89 | curr_pi = curr_pi_dict[key] 90 | new_info['process'].append(curr_pi) 91 | return new_info 92 | 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('--address', required = True, help = 'master服务器IP地址') 95 | parser.add_argument('--port', default = '5678', help = 'master服务器端口,默认5678') 96 | opt = parser.parse_args() 97 | 98 | url = 'http://%s:%s' % (opt.address, opt.port) 99 | name_dict = dict([ 100 | line.strip().split() 101 | for line in open('username_to_wechatname.txt', encoding = 'utf8') 102 | ]) 103 | mean_info = None 104 | 105 | while True: 106 | curr_info = get_info() 107 | if mean_info is None: 108 | mean_info = curr_info 109 | else: 110 | mean_info = running_mean(mean_info, curr_info, 0.9) 111 | data = json.dumps(mean_info) 112 | try: 113 | response = requests.get(url, data = data) 114 | print('HTTP状态码:', response.status_code) 115 | except Exception as e: 116 | print(e) 117 | time.sleep(1) 118 | -------------------------------------------------------------------------------- /master.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import json 4 | import itchat 5 | import argparse 6 | from threading import Thread, Lock 7 | from http.server import HTTPServer 8 | from http.server import BaseHTTPRequestHandler 9 | 10 | class CustomHandler(BaseHTTPRequestHandler): 11 | alert_record = { } 12 | 13 | def do_GET(self): 14 | length = int(self.headers['content-length']) 15 | info = json.loads(self.rfile.read(length).decode()) 16 | slaver_address, _ = self.client_address 17 | lock.acquire() 18 | info_record[slaver_address] = info 19 | lock.release() 20 | alert_waste(info, self.alert_record) 21 | self.send_response(200) 22 | self.end_headers() 23 | 24 | def log_message(self, format, *args): 25 | return 26 | 27 | def http_func(): 28 | server = HTTPServer((opt.address, opt.port), CustomHandler) 29 | print("监听服务开启,按退出") 30 | server.serve_forever() 31 | 32 | def alert_condition(mem_usage, gpu_percent, cpu_percent, wechatname, pid, alert_record): 33 | curr_hour = int(time.strftime('%H')) 34 | if curr_hour < opt.beg_hour or curr_hour >= opt.end_hour: return False 35 | if mem_usage < opt.mem_usage_threshold: return False 36 | if cpu_percent > opt.cpu_percent_threshold and gpu_percent > opt.gpu_percent_threshold: return False 37 | curr_time = time.time() 38 | if (wechatname, pid) not in alert_record: 39 | alert_record[(wechatname, pid)] = curr_time 40 | return True 41 | if curr_time - alert_record[(wechatname, pid)] > opt.interval: 42 | alert_record[(wechatname, pid)] = curr_time 43 | return True 44 | return False 45 | 46 | def alert_waste(info, alert_record): 47 | for slaver_address in sorted(info_record.keys()): 48 | gi_list = info_record[slaver_address]['gpu'] 49 | pi_list = info_record[slaver_address]['process'] 50 | for pi in pi_list: 51 | gi = gi_list[pi['gpuid']] 52 | if alert_condition(pi['mem_usage'], gi['percent'], pi['cpu_percent'], pi['wechatname'], pi['pid'], alert_record): 53 | alerting = [ 54 | '检测到程序长时间高内存消耗且低负载空转:', 55 | '所在服务器:%s' % slaver_address, 56 | 'GPU id:%d' % pi['gpuid'], 57 | 'PID:%d' % pi['pid'], 58 | '程序名:%s' % pi['program'], 59 | '进程GPU内存占用:%dM' % pi['mem_usage'], 60 | '所在GPU内存占用:%dM/%dM' % (gi['mem_usage'], gi['mem_total']), 61 | '所在GPU使用率:%d%%' % gi['percent'], 62 | '进程CPU使用率:%d%%' % pi['cpu_percent'], 63 | ] 64 | print('向<%s>发送警报:\n\t%s' % (pi['wechatname'], '\n\t'.join(alerting))) 65 | friend = itchat.search_friends(remarkName = pi['wechatname']) 66 | if len(friend) == 0: 67 | print('不存在微信好友:<%s>' % pi['wechatname']) 68 | continue 69 | friend[0].send('\n'.join(alerting)) 70 | 71 | def report_server(): 72 | report = ['服务器列表:'] 73 | lock.acquire() 74 | for slaver_address in sorted(info_record.keys()): 75 | report.append(slaver_address) 76 | lock.release() 77 | report = '\n'.join(report) 78 | return report 79 | 80 | def report_gpu(slaver_address = None): 81 | report = [] 82 | lock.acquire() 83 | if slaver_address is None: 84 | address_list = sorted(info_record.keys()) 85 | else: 86 | address_list = [slaver_address] 87 | for slaver_address in address_list: 88 | report.append('服务器地址: %s' % slaver_address) 89 | gi_list = info_record[slaver_address]['gpu'] 90 | pi_list = info_record[slaver_address]['process'] 91 | for gpuid, gi in enumerate(gi_list): 92 | report.append('GPU%d 显存%dM/%dM 使用率%d%%' % ( 93 | gpuid, int(gi['mem_usage']), int(gi['mem_total']), int(gi['percent']) 94 | )) 95 | report.append('进程列表') 96 | for pi in pi_list: 97 | report.append('GPU%d %s %s 显存%dM CPU占比%d%%' % ( 98 | pi['gpuid'], pi['username'], pi['wechatname'], int(pi['mem_usage']), int(pi['cpu_percent']) 99 | )) 100 | report.append('=' * 10) 101 | if report != []: del report[-1] 102 | lock.release() 103 | report = '\n'.join(report) 104 | return report 105 | 106 | def report_user(): 107 | usage_dict = { } 108 | lock.acquire() 109 | for slaver_address in sorted(info_record.keys()): 110 | pi_list = info_record[slaver_address]['process'] 111 | for pi in pi_list: 112 | wechatname = pi['wechatname'] 113 | mem_usage = pi['mem_usage'] 114 | usage_dict[wechatname] = usage_dict.get(wechatname, 0) + mem_usage 115 | lock.release() 116 | usage_list = sorted(usage_dict.items(), key = lambda x: x[1]) 117 | report = ['用户显存占用排序:'] + ['%s : %dM' % (n, u) for n, u in usage_list] 118 | report = '\n'.join(report) 119 | return report 120 | 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument('--address', required = True, help = 'master服务器IP地址') 123 | parser.add_argument('--port', type = int, default = '5678', help = 'master服务器端口,默认5678') 124 | parser.add_argument('--interval', type = int, default = '1800', help = '警报间隔时间,默认1800秒') 125 | parser.add_argument('--mem_usage_threshold', type = int, default = '1000', help = '警报功能GPU内存阈值') 126 | parser.add_argument('--gpu_percent_threshold', type = int, default = '10', help = '警报功能GPU使用率阈值') 127 | parser.add_argument('--cpu_percent_threshold', type = int, default = '10', help = '警报功能CPU使用率阈值') 128 | parser.add_argument('--beg_hour', type = int, default = '9', help = '警报功能在几点开启') 129 | parser.add_argument('--end_hour', type = int, default = '17', help = '警报功能在几点关闭') 130 | opt = parser.parse_args() 131 | 132 | info_record = { } 133 | lock = Lock() 134 | 135 | http_thread = Thread(target = http_func) 136 | http_thread.setDaemon(True) 137 | http_thread.start() 138 | 139 | @itchat.msg_register(itchat.content.TEXT) 140 | def receive_text(msg): 141 | print('收到指令: %s' % msg.text) 142 | error = '\n'.join([ 143 | '请使用有效指令:', 144 | 'user:查看用户使用情况', 145 | 'server:查看服务器列表', 146 | 'gpu:查看所有GPU使用情况', 147 | 'gpu <完整IP地址或后缀>:查看指定服务器GPU使用情况', 148 | ]) 149 | if msg.text == 'server': 150 | return report_server() 151 | if msg.text.startswith('gpu'): 152 | if msg.text == 'gpu': 153 | return report_gpu() 154 | tokens = msg.text.split() 155 | if len(tokens) != 2: 156 | return error 157 | slaver_address = tokens[1] 158 | if not re.fullmatch('\d+\.\d+\.\d+\.\d+', slaver_address): 159 | candidates = [add for add in info_record.keys() if add.endswith(slaver_address)] 160 | if len(candidates) == 0: 161 | return '服务器%s不存在' % slaver_address 162 | if len(candidates) > 1: 163 | report = ['请指明是以下哪个服务器:'] + candidates 164 | return '\n'.join(report) 165 | slaver_address = candidates[0] 166 | if slaver_address not in info_record: 167 | return '服务器%s不存在' % slaver_address 168 | return report_gpu(slaver_address) 169 | if msg.text in ['用户', 'user']: 170 | return report_user() 171 | return error 172 | 173 | itchat.auto_login(enableCmdQR = 2, hotReload = True) 174 | itchat.run() 175 | --------------------------------------------------------------------------------