├── .gitignore
├── README.md
├── LICENCE
├── slaver.py
└── master.py


/.gitignore:
--------------------------------------------------------------------------------
1 | username_to_wechatname.txt
2 | itchat.pkl
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 用微信监控多个服务器的GPU运行情况
 2 | 
 3 | ## 使用方法：
 4 | 
 5 | 1. 切换到Python 3环境
 6 | 
 7 | 2. 安装依赖：
 8 | 
 9 | ```shell
10 | pip install itchat
11 | pip install psutil
12 | pip install requests
13 | ```
14 | 
15 | 3. 选择一个服务器作为master服务器，运行
16 | 
17 | ```shell
18 | python master.py --address <主服务器IP地址>
19 | ```
20 | 
21 | 4. 在多个GPU服务器上运行
22 | 
23 | ```shell
24 | python slaver.py --address <主服务器IP地址>
25 | ```
26 | 
27 | ## 查询功能
28 | 
29 | 有效指令：
30 | - user：查看用户使用情况
31 | - server：查看服务器列表
32 | - gpu：查看所有GPU使用情况
33 | - gpu <完整IP地址或后缀>：查看指定服务器GPU使用情况
34 | 
35 | ## 警报功能：
36 | 
37 | 检测长时间占用GPU内存但是没有运行的进程，自动发送微信消息给相应用户。
38 | 
39 | 使用该功能需要添加用户的微信账号，修改备注名称，并将服务器账号名与微信备注名成对记录在<`username_to_wechatname.txt`>中，比如：
40 | 
41 | ```shell
42 | xiaoming 小明
43 | lilei 李雷
 44 | hanmeimei 韩梅梅
45 | david David
46 | ```
47 | 
48 | 警报判定依据为长期满足以下条件：
49 | 
50 | - 占用GPU内存大于一定阈值（如1000M）
51 | - 进程所在GPU使用率以及进程自身的CPU使用率均低于一定阈值（如10%）
52 | - 距离上次该进程触发警报过去了一段时间（如半小时）
53 | 
54 | 以上数值可根据需要在启动master服务时更改，详见帮助信息：
55 | 
 56 | ```shell
57 | python master.py -h
58 | ```
59 | 


--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 WarBean
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/slaver.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import pwd
  3 | import time
  4 | import json
  5 | import psutil
  6 | import argparse
  7 | import requests
  8 | import subprocess
  9 | 
 10 | def get_owner(pid):
 11 |     try:
 12 |         for line in open('/proc/%d/status' % pid):
 13 |             if line.startswith('Uid:'):
 14 |                 uid = int(line.split()[1])
 15 |                 return pwd.getpwuid(uid).pw_name
 16 |     except:
 17 |         return None
 18 | 
 19 | def get_info():
 20 |     info = { 'gpu': [], 'process': [] }
 21 |     msg = subprocess.Popen('nvidia-smi', stdout = subprocess.PIPE).stdout.read().decode()
 22 |     msg = msg.strip().split('\n')
 23 | 
 24 |     lino = 8
 25 |     while True:
 26 |         status = re.findall('.*\d+%.*\d+C.*\d+W / +\d+W.* +(\d+)MiB / +(\d+)MiB.* +(\d+)%.*', msg[lino])
 27 |         if status == []: break
 28 |         mem_usage, mem_total, percent = status[0]
 29 |         info['gpu'].append({
 30 |             'mem_usage': float(mem_usage),
 31 |             'mem_total': float(mem_total),
 32 |             'percent': float(percent),
 33 |         })
 34 |         lino += 3
 35 | 
 36 |     lino = -1
 37 |     while True:
 38 |         lino -= 1
 39 |         status = re.findall('\| +(\d+) +(\d+) +\w+ +([^ ]*) +(\d+)MiB \|', msg[lino])
 40 |         if status == []: break
 41 |         gpuid, pid, program, mem_usage = status[0]
 42 |         username = get_owner(int(pid))
 43 |         if username is None:
 44 |             print('进程已经不存在')
 45 |             continue
 46 |         wechatname = name_dict.get(username, username)
 47 |         try:
 48 |             p = psutil.Process(int(pid))
 49 |             p.cpu_percent()
 50 |             time.sleep(0.5)
 51 |             cpu_percent = p.cpu_percent()
 52 |         except psutil.NoSuchProcess:
 53 |             print('进程已经不存在')
 54 |             continue
 55 |         info['process'].append({
 56 |             'gpuid': int(gpuid),
 57 |             'pid': int(pid),
 58 |             'program': program,
 59 |             'cpu_percent': cpu_percent,
 60 |             'mem_usage': float(mem_usage),
 61 |             'username': username,
 62 |             'wechatname': wechatname,
 63 |         })
 64 |     info['process'].reverse()
 65 | 
 66 |     return info
 67 | 
 68 | def running_mean(mean_info, curr_info, decay):
 69 |     def merge(a, b): return a * decay + b * (1 - decay)
 70 |     new_info = { 'gpu': [], 'process': [] }
 71 |     for mean_gi, curr_gi in zip(mean_info['gpu'], curr_info['gpu']):
 72 |         new_info['gpu'].append({
 73 |             'mem_usage': merge(mean_gi['mem_usage'], curr_gi['mem_usage']),
 74 |             'mem_total': merge(mean_gi['mem_total'], curr_gi['mem_total']),
 75 |             'percent': merge(mean_gi['percent'], curr_gi['percent']),
 76 |         })
 77 |     mean_pi_dict = { (pi['gpuid'], pi['pid'], pi['program'], pi['username']): pi for pi in mean_info['process'] }
 78 |     curr_pi_dict = { (pi['gpuid'], pi['pid'], pi['program'], pi['username']): pi for pi in curr_info['process'] }
 79 |     mean_pi_keys = set(mean_pi_dict.keys())
 80 |     curr_pi_keys = set(curr_pi_dict.keys())
 81 |     for key in sorted(set.union(mean_pi_keys, curr_pi_keys)):
 82 |         if key in mean_pi_keys and key in curr_pi_keys:
 83 |             mean_pi = mean_pi_dict[key]
 84 |             curr_pi = curr_pi_dict[key]
 85 |             mean_pi['mem_usage'] = merge(mean_pi['mem_usage'], curr_pi['mem_usage'])
 86 |             mean_pi['cpu_percent'] = merge(mean_pi['cpu_percent'], curr_pi['cpu_percent'])
 87 |             new_info['process'].append(mean_pi)
 88 |         elif key not in mean_pi_keys:
 89 |             curr_pi = curr_pi_dict[key]
 90 |             new_info['process'].append(curr_pi)
 91 |     return new_info
 92 | 
 93 | parser = argparse.ArgumentParser()
 94 | parser.add_argument('--address', required = True, help = 'master服务器IP地址')
 95 | parser.add_argument('--port', default = '5678', help = 'master服务器端口，默认5678')
 96 | opt = parser.parse_args()
 97 | 
 98 | url = 'http://%s:%s' % (opt.address, opt.port)
 99 | name_dict = dict([
100 |     line.strip().split()
101 |     for line in open('username_to_wechatname.txt', encoding = 'utf8')
102 | ])
103 | mean_info = None
104 | 
105 | while True:
106 |     curr_info = get_info()
107 |     if mean_info is None:
108 |         mean_info = curr_info
109 |     else:
110 |         mean_info = running_mean(mean_info, curr_info, 0.9)
111 |     data = json.dumps(mean_info)
112 |     try:
113 |         response = requests.get(url, data = data)
114 |         print('HTTP状态码:', response.status_code)
115 |     except Exception as e:
116 |         print(e)
117 |     time.sleep(1)
118 | 


--------------------------------------------------------------------------------
/master.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import time
  3 | import json
  4 | import itchat
  5 | import argparse
  6 | from threading import Thread, Lock
  7 | from http.server import HTTPServer
  8 | from http.server import BaseHTTPRequestHandler
  9 | 
 10 | class CustomHandler(BaseHTTPRequestHandler):
 11 |     alert_record = { }
 12 | 
 13 |     def do_GET(self):
 14 |         length = int(self.headers['content-length'])
 15 |         info = json.loads(self.rfile.read(length).decode())
 16 |         slaver_address, _ = self.client_address
 17 |         lock.acquire()
 18 |         info_record[slaver_address] = info
 19 |         lock.release()
 20 |         alert_waste(info, self.alert_record)
 21 |         self.send_response(200)
 22 |         self.end_headers()
 23 | 
 24 |     def log_message(self, format, *args):
 25 |         return
 26 | 
 27 | def http_func():
 28 |     server = HTTPServer((opt.address, opt.port), CustomHandler)
 29 |     print("监听服务开启，按<Ctrl-C>退出")
 30 |     server.serve_forever()
 31 | 
 32 | def alert_condition(mem_usage, gpu_percent, cpu_percent, wechatname, pid, alert_record):
 33 |     curr_hour = int(time.strftime('%H'))
 34 |     if curr_hour < opt.beg_hour or curr_hour >= opt.end_hour: return False
 35 |     if mem_usage < opt.mem_usage_threshold: return False
 36 |     if cpu_percent > opt.cpu_percent_threshold and gpu_percent > opt.gpu_percent_threshold: return False
 37 |     curr_time = time.time()
 38 |     if (wechatname, pid) not in alert_record:
 39 |         alert_record[(wechatname, pid)] = curr_time
 40 |         return True
 41 |     if curr_time - alert_record[(wechatname, pid)] > opt.interval:
 42 |         alert_record[(wechatname, pid)] = curr_time
 43 |         return True
 44 |     return False
 45 | 
 46 | def alert_waste(info, alert_record):
 47 |     for slaver_address in sorted(info_record.keys()):
 48 |         gi_list = info_record[slaver_address]['gpu']
 49 |         pi_list = info_record[slaver_address]['process']
 50 |         for pi in pi_list:
 51 |             gi = gi_list[pi['gpuid']]
 52 |             if alert_condition(pi['mem_usage'], gi['percent'], pi['cpu_percent'], pi['wechatname'], pi['pid'], alert_record):
 53 |                 alerting = [
 54 |                     '检测到程序长时间高内存消耗且低负载空转：',
 55 |                     '所在服务器：%s' % slaver_address,
 56 |                     'GPU id：%d' % pi['gpuid'],
 57 |                     'PID：%d' % pi['pid'],
 58 |                     '程序名：%s' % pi['program'],
 59 |                     '进程GPU内存占用：%dM' % pi['mem_usage'],
 60 |                     '所在GPU内存占用：%dM/%dM' % (gi['mem_usage'], gi['mem_total']),
 61 |                     '所在GPU使用率：%d%%' % gi['percent'],
 62 |                     '进程CPU使用率：%d%%' % pi['cpu_percent'],
 63 |                 ]
 64 |                 print('向<%s>发送警报：\n\t%s' % (pi['wechatname'], '\n\t'.join(alerting)))
 65 |                 friend = itchat.search_friends(remarkName = pi['wechatname'])
 66 |                 if len(friend) == 0:
 67 |                     print('不存在微信好友：<%s>' % pi['wechatname'])
 68 |                     continue
 69 |                 friend[0].send('\n'.join(alerting))
 70 | 
 71 | def report_server():
 72 |     report = ['服务器列表：']
 73 |     lock.acquire()
 74 |     for slaver_address in sorted(info_record.keys()):
 75 |         report.append(slaver_address)
 76 |     lock.release()
 77 |     report = '\n'.join(report)
 78 |     return report
 79 | 
 80 | def report_gpu(slaver_address = None):
 81 |     report = []
 82 |     lock.acquire()
 83 |     if slaver_address is None:
 84 |         address_list = sorted(info_record.keys())
 85 |     else:
 86 |         address_list = [slaver_address]
 87 |     for slaver_address in address_list:
 88 |         report.append('服务器地址: %s' % slaver_address)
 89 |         gi_list = info_record[slaver_address]['gpu']
 90 |         pi_list = info_record[slaver_address]['process']
 91 |         for gpuid, gi in enumerate(gi_list):
 92 |             report.append('GPU%d 显存%dM/%dM 使用率%d%%' % (
 93 |                 gpuid, int(gi['mem_usage']), int(gi['mem_total']), int(gi['percent'])
 94 |             ))
 95 |         report.append('进程列表')
 96 |         for pi in pi_list:
 97 |             report.append('GPU%d %s %s 显存%dM CPU占比%d%%' % (
 98 |                 pi['gpuid'], pi['username'], pi['wechatname'], int(pi['mem_usage']), int(pi['cpu_percent'])
 99 |             ))
100 |         report.append('=' * 10)
101 |     if report != []: del report[-1]
102 |     lock.release()
103 |     report = '\n'.join(report)
104 |     return report
105 | 
def report_user():
    """Return GPU memory held per WeChat name, summed over all servers.

    Returns:
        A multi-line string: a header line, then one ``name : <n>M`` line
        per user in ascending order of total memory.
    """
    usage_dict = { }
    # ``with`` releases the lock even if a snapshot lacks an expected key;
    # with bare acquire()/release() such an error left the lock held.
    with lock:
        for slaver_address in sorted(info_record):
            for pi in info_record[slaver_address]['process']:
                wechatname = pi['wechatname']
                usage_dict[wechatname] = usage_dict.get(wechatname, 0) + pi['mem_usage']
    usage_list = sorted(usage_dict.items(), key = lambda item: item[1])
    report = ['用户显存占用排序：'] + ['%s : %dM' % (name, usage) for name, usage in usage_list]
    return '\n'.join(report)
120 | 
# Command-line options.  Integer options use integer defaults directly
# rather than strings that argparse then has to convert.
parser = argparse.ArgumentParser()
parser.add_argument('--address', required = True, help = 'master服务器IP地址')
parser.add_argument('--port', type = int, default = 5678, help = 'master服务器端口，默认5678')
parser.add_argument('--interval', type = int, default = 1800, help = '警报间隔时间，默认1800秒')
parser.add_argument('--mem_usage_threshold', type = int, default = 1000, help = '警报功能GPU内存阈值')
parser.add_argument('--gpu_percent_threshold', type = int, default = 10, help = '警报功能GPU使用率阈值')
parser.add_argument('--cpu_percent_threshold', type = int, default = 10, help = '警报功能CPU使用率阈值')
parser.add_argument('--beg_hour', type = int, default = 9, help = '警报功能在几点开启')
parser.add_argument('--end_hour', type = int, default = 17, help = '警报功能在几点关闭')
opt = parser.parse_args()

# Latest snapshot per slaver, keyed by the slaver's IP address.  Written by
# the HTTP handler and read by the report functions.
info_record = { }
# Taken by the HTTP handler when storing a snapshot and by the report
# functions while reading ``info_record``.
lock = Lock()

# Daemon thread: the listener must not keep the process alive once the main
# thread exits.  ``daemon = True`` replaces the deprecated ``setDaemon``.
http_thread = Thread(target = http_func, daemon = True)
http_thread.start()
138 | 
@itchat.msg_register(itchat.content.TEXT)
def receive_text(msg):
    """Answer a WeChat text command with the matching report.

    Supported commands (surrounding whitespace is ignored):

    * ``server`` -- list known servers;
    * ``gpu`` -- GPU usage of every server;
    * ``gpu <address>`` -- GPU usage of one server, where ``<address>`` is
      a full dotted IP or any suffix that identifies exactly one server;
    * ``user`` / ``用户`` -- GPU memory per user.

    Args:
        msg: Incoming message object; only ``msg.text`` is read.

    Returns:
        The reply text: a report, a disambiguation list, a "server does
        not exist" message, or the usage help for anything unrecognised.
    """
    print('收到指令: %s' % msg.text)
    error = '\n'.join([
        '请使用有效指令：',
        'user：查看用户使用情况',
        'server：查看服务器列表',
        'gpu：查看所有GPU使用情况',
        'gpu <完整IP地址或后缀>：查看指定服务器GPU使用情况',
    ])
    text = msg.text.strip()
    if text == 'server':
        return report_server()
    if text in ['用户', 'user']:
        return report_user()
    if not text.startswith('gpu'):
        return error
    if text == 'gpu':
        return report_gpu()

    tokens = text.split()
    if len(tokens) != 2 or tokens[0] != 'gpu':
        return error
    slaver_address = tokens[1]

    # Snapshot the known addresses under the lock: the HTTP thread may add a
    # server at any time, and iterating the live dict could fail midway.
    # The lock is released before report_gpu() takes it again.
    with lock:
        known_addresses = list(info_record)

    if not re.fullmatch(r'\d+\.\d+\.\d+\.\d+', slaver_address):
        # Not a full dotted address: treat it as a suffix.
        candidates = [add for add in known_addresses if add.endswith(slaver_address)]
        if not candidates:
            return '服务器%s不存在' % slaver_address
        if len(candidates) > 1:
            return '\n'.join(['请指明是以下哪个服务器：'] + candidates)
        slaver_address = candidates[0]
    if slaver_address not in known_addresses:
        return '服务器%s不存在' % slaver_address
    return report_gpu(slaver_address)
172 | 
173 | itchat.auto_login(enableCmdQR = 2, hotReload = True)
174 | itchat.run()
175 | 


--------------------------------------------------------------------------------