├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── setup.py └── slurm_gres_viz ├── __init__.py ├── args.py ├── displayer.py ├── main.py ├── parsers.py ├── pre_main.py ├── slurm_objects.py └── visualizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.egg-info/ 3 | dist/ 4 | deb_dist/ 5 | *.tar.gz 6 | *.tar 7 | build/ 8 | # hello.* 9 | 10 | test* 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Hyeo-geon Lee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SLURM_allocated_gres_visualizer 2 | **The app for visualizing allocated GPUs by SLURM** 3 | 4 | ![image](https://user-images.githubusercontent.com/25451196/222977415-c8b992e6-d46d-4856-9a26-558505e64956.png) 5 | 6 | When you are using Slurm and you want to check which gpus are allocated, you must have done something like 7 | - `ssh` to each computing node and run `nvidia-smi`. Then, repeat it. 8 | - Run `scontrol show job -d | grep GRES` and roll your eyeballs. 9 | 10 | 11 | both of which are very tedious. This project can solve this. 12 | 13 | # Requirements 14 | 15 | ## Packages 16 | - matplotlib 17 | - sty 18 | - prometheus-client 19 | - requests 20 | - pandas 21 | - bs4 22 | 23 | ## Slurm 24 | - Be sure that `slurmctld`(master) and `slurmd`(nodes) are active so that there are no problems for running `scontrol show nodes` or `scontrol show job`. 25 | - Be sure that `AutoDetect=nvml` for all computing nodes to avoid GPU index mismatch. 26 | - For all computing nodes, `node-exporter` are available at port `9100` and `dcgm-exporter` at `9400`. 
27 | 28 | # Installation 29 | ```bash 30 | git clone https://github.com/Haawron/SLURM_allocated_gres_visualizer.git 31 | cd SLURM_allocated_gres_visualizer 32 | /usr/bin/python3 setup.py install # be sure to be without conda 33 | ``` 34 | 35 | # Usage 36 | ```bash 37 | slurm-gres-viz 38 | 39 | # GPU options 40 | slurm-gres-viz -i # stars are replaced to indices 41 | slurm-gres-viz -gm -gu # VRAM and GPU util 42 | slurm-gres-viz -f # Full information of GPUs 43 | slurm-gres-viz -m # mine: shows only my GPUs 44 | 45 | # others 46 | slurm-gres-viz -l 1 # looping every 1 second (same as nvidia-smi) 47 | ``` 48 | 49 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24.3 2 | matplotlib==3.6.3 3 | sty==1.0.4 4 | prometheus-client 5 | requests 6 | pandas==2.0.0rc0 7 | bs4 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from pip._internal.req import parse_requirements 3 | import os 4 | 5 | 6 | requirements = parse_requirements('requirements.txt', session=False) 7 | required_packages = [str(package.__dict__.get('req', package.__dict__['requirement'])) for package in requirements] 8 | 9 | setup( 10 | name="slurm_gres_viz", 11 | version="2.1.1", 12 | author="Hyogun Lee(Haawron)", 13 | author_email="gunsbrother@khu.ac.kr", 14 | python_requires='>=3.6', 15 | install_requires=required_packages, 16 | description="The app for visualizing allocated GPUs by SLURM", 17 | license="MIT", 18 | url="https://github.com/Haawron/SLURM_allocated_gres_visualizer", 19 | packages=['slurm_gres_viz'], 20 | package_dir={'slurm_gres_viz': 'slurm_gres_viz'}, 21 | entry_points={ 22 | 'console_scripts' : [ 23 | f'slurm-gres-viz=slurm_gres_viz.main:{"forced_main" if 
bool(os.environ.get("FORCE_ONLY_MINE", False)) else "main"}' 24 | ] # todo: main function을 여러 개 만들고 main class가 옵션을 args가 아니라 init에서 받아와야 함 25 | }, 26 | classifiers=[ 27 | 'Environment :: Console', 28 | 'Intended Audience :: End Users', 29 | 'Operating System :: POSIX', 30 | 'Programming Language :: Python', 31 | ], 32 | ) 33 | -------------------------------------------------------------------------------- /slurm_gres_viz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haawron/SLURM_allocated_gres_visualizer/da92f81c6996959977d6f2682f1052a2ac881de1/slurm_gres_viz/__init__.py -------------------------------------------------------------------------------- /slurm_gres_viz/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def rate_in_range(value): 5 | value = float(value) 6 | if value <= 0: 7 | raise argparse.ArgumentTypeError("Interval must be positive") 8 | elif value < 1: 9 | raise argparse.ArgumentTypeError("Interval smaller than 1s is not allowed") 10 | return value 11 | 12 | 13 | parser = argparse.ArgumentParser(description='SLURM Allocated GRES Visualizer') 14 | 15 | # gpu loggings 16 | parser.add_argument('-m', '--only-mine', action='store_true', 17 | help='asd') 18 | parser.add_argument('-f', '--full', action='store_true', 19 | help='asd') 20 | parser.add_argument('-i', '--index', action='store_true', 21 | help='Use Gres\' indices instead of stars(*)') 22 | parser.add_argument('-gm', '--gpu-memory', action='store_true', 23 | help='asd') 24 | parser.add_argument('-gu', '--gpu-util', action='store_true', 25 | help='asd') 26 | 27 | # iterate 28 | parser.add_argument('-l', '--loop', type=rate_in_range, default=-1, 29 | help='asd') 30 | 31 | # test 32 | parser.add_argument('-t', '--test-from-log', action='store_true', 33 | help='Test mode') 34 | 35 | args = parser.parse_args() 36 | 
-------------------------------------------------------------------------------- /slurm_gres_viz/displayer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Tuple 2 | import os 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from sty import fg, ef, bg 8 | 9 | if __name__.startswith('slurm_gres_viz'): 10 | from .slurm_objects import Node, Job, GPU 11 | from .args import args 12 | else: 13 | from slurm_objects import Node, Job, GPU 14 | from args import args 15 | from pprint import pprint 16 | 17 | 18 | cmap = plt.get_cmap('jet') 19 | RED = (255, 50, 0) 20 | YELLOW = (200, 200, 0) 21 | 22 | 23 | class Displayer: 24 | def __init__(self, nodes:List[Node], jobs:List[Job], **display_options): 25 | self.dashboard = DashBoard(nodes, jobs, **display_options) 26 | self.legend = Legend(jobs, **display_options) 27 | 28 | def show(self): 29 | self.dashboard.show() 30 | self.legend.show() 31 | 32 | 33 | class DashBoard: # Upper body 34 | def __init__(self, 35 | nodes:List[Node], jobs:List[Job], 36 | 37 | show_index:bool=False, show_gpu_memory:bool=False, show_gpu_util:bool=False, 38 | show_only_mine:bool=False 39 | ): 40 | self.nodes = nodes 41 | self.jobs = jobs 42 | 43 | self.show_index = show_index 44 | self.show_gpu_memory = show_gpu_memory 45 | self.show_gpu_util = show_gpu_util 46 | self.show_only_mine = show_only_mine 47 | 48 | self.max_num_node_gpus = max(map(lambda node: node.num_gpus_total, self.nodes)) 49 | self.delimiter_within_gpu = '|' 50 | if sum([self.show_index, self.show_gpu_memory, self.show_gpu_util]) <= 1: 51 | self.delimiter_between_gpu = '' 52 | else: 53 | self.delimiter_between_gpu = ' ' 54 | self.char_fill_hidden = '#' 55 | self.all_mine_masks = self.get_mine_mask() 56 | self.all_occupancy_masks = self.get_occupancy_mask() 57 | self.all_gpu_items = self.build_items() 58 | self.all_gpu_items = self.stylize_items(self.all_gpu_items) 59 | 
self.widths = self.calculate_widths() 60 | 61 | def show(self): 62 | lines = [ 63 | f'{node.name:{self.widths["nodename"]}}: ' 64 | f'[GPU] [{node.num_gpus_alloc}/{node.num_gpus_total}] {self.delimiter_between_gpu.join(gpu_items)} ' 65 | f'[CPU] {node.num_cpus_alloc:>{self.widths["cpu"]}}/{node.num_cpus_total:{self.widths["cpu"]}} ' 66 | f'[MEM] {node.mem_alloc:>{self.widths["mem"]-3}.0f}/{node.mem_total:{self.widths["mem"]}.2f} GiB' 67 | for node, gpu_items in zip(self.nodes, self.all_gpu_items.values()) 68 | ] 69 | body = '\n'.join(lines) 70 | print(body) 71 | 72 | def build_items(self): 73 | all_gpu_items:Dict[str,List[str]] = {} 74 | for node in self.nodes: 75 | mine_masks = self.all_mine_masks[node.name] 76 | occupancy_masks = self.all_occupancy_masks[node.name] 77 | gpu_items:List[str] = [] 78 | for gpu_idx in range(self.max_num_node_gpus): 79 | is_mine = mine_masks[gpu_idx] 80 | is_occupied = occupancy_masks[gpu_idx] 81 | will_be_hidden = self.show_only_mine and not is_mine 82 | if gpu_idx >= node.num_gpus_total: # pseudo item to align, as colorizer's width varies aligning with width does not work 83 | gpu_items.append(' '*len(gpu_item)) 84 | else: 85 | gpu_item = [] 86 | if any([self.show_index, self.show_gpu_memory, self.show_gpu_util]): 87 | if self.show_index: 88 | gpu_item.append(f'{gpu_idx}') 89 | if self.show_gpu_memory: 90 | gpu_item.append(f'{node.gpus[gpu_idx].vram_alloc:>4.1f}/{node.gpus[gpu_idx].vram_total:4.1f}GiB') 91 | if self.show_gpu_util: 92 | util = int(round(node.gpus[gpu_idx].util, 0)) 93 | gpu_item.append(f'{util:>2d}%' if util < 100 else '100') 94 | content = self.delimiter_within_gpu.join(gpu_item) 95 | if is_occupied: 96 | if will_be_hidden: 97 | content = self.char_fill_hidden * len(content) 98 | else: # idle GPUs 99 | content = '-' * len(content) 100 | gpu_item = '[' + content + ']' 101 | else: 102 | if is_occupied: 103 | if will_be_hidden: 104 | gpu_item = self.char_fill_hidden 105 | else: 106 | gpu_item = '*' 107 | else: # idle 
GPUs 108 | gpu_item = '-' 109 | gpu_items.append(gpu_item) 110 | all_gpu_items[node.name] = gpu_items 111 | return all_gpu_items 112 | 113 | def stylize_items(self, all_gpu_items): 114 | for job in self.jobs: 115 | color = get_color_from_idx(int(job.id)) 116 | is_mine = os.environ['USER'] in job.userid 117 | for nodename, tres_dict in job.tres_dict.items(): 118 | for gpu_idx in tres_dict['gpus']: 119 | will_be_hidden = self.show_only_mine and not is_mine 120 | if not will_be_hidden: 121 | content = colorize(all_gpu_items[nodename][gpu_idx], color) 122 | if is_mine: 123 | content = make_bold(content) 124 | all_gpu_items[nodename][gpu_idx] = content 125 | 126 | # not occupied -> colored into gray 127 | gray = tuple(100 for _ in range(3)) 128 | for nodename, occupancy_masks in self.all_occupancy_masks.items(): 129 | for gpu_idx, is_occupied in enumerate(occupancy_masks): 130 | if not is_occupied: # idle GPUs 131 | all_gpu_items[nodename][gpu_idx] = colorize(all_gpu_items[nodename][gpu_idx], gray) 132 | 133 | # TODO: 비정상(not in IDLE, MIXED, ALLOCATED) 노드 취소선 134 | for node in self.nodes: 135 | if any([invalid_state in node.states for invalid_state in ['DOWN', 'INVALID']]): 136 | for gpu_idx in range(node.num_gpus_total): 137 | all_gpu_items[node.name][gpu_idx] = colorize(all_gpu_items[node.name][gpu_idx], RED, True) 138 | elif 'DRAIN' in node.states: 139 | for gpu_idx in range(node.num_gpus_total): 140 | all_gpu_items[node.name][gpu_idx] = colorize(all_gpu_items[node.name][gpu_idx], YELLOW, True) 141 | else: # valid node 142 | pass 143 | return all_gpu_items 144 | 145 | def calculate_widths(self): 146 | widths = { 147 | 'nodename': max(map(lambda node: len(node.name), self.nodes)), 148 | 'cpu': max(map(lambda node: np.log10(node.num_cpus_total).astype(int)+1, self.nodes)), 149 | 'mem': 6 150 | # why don't we have gpu items' width? 
151 | # => as colorizer's width varies aligning with width does not work 152 | } 153 | return widths 154 | 155 | def get_mine_mask(self): 156 | all_mine_masks:Dict[str,List[bool]] = {node.name: [False]*self.max_num_node_gpus for node in self.nodes} 157 | for job in self.jobs: 158 | is_mine = os.environ['USER'] in job.userid 159 | if is_mine: 160 | for nodename, tres_dict in job.tres_dict.items(): 161 | for gpu_idx in tres_dict['gpus']: 162 | all_mine_masks[nodename][gpu_idx] = True 163 | return all_mine_masks 164 | 165 | def get_occupancy_mask(self): 166 | all_occupancy_masks:Dict[str,List[bool]] = {node.name: [False]*self.max_num_node_gpus for node in self.nodes} 167 | for job in self.jobs: 168 | for nodename, tres_dict in job.tres_dict.items(): 169 | for gpu_idx in tres_dict['gpus']: 170 | all_occupancy_masks[nodename][gpu_idx] = True 171 | return all_occupancy_masks 172 | 173 | 174 | class Legend: # Lower body 175 | def __init__(self, 176 | jobs:List[Job], 177 | 178 | show_index:bool=False, show_gpu_memory:bool=False, show_gpu_util:bool=False, 179 | show_only_mine:bool=False 180 | ): 181 | self.jobs = jobs 182 | self.space_placeholder = '@' # not to be splitted by str.split 183 | self.delimiter_column = ' ' 184 | 185 | self.show_index = show_index 186 | self.show_gpu_memory = show_gpu_memory 187 | self.show_gpu_util = show_gpu_util 188 | self.show_only_mine = show_only_mine 189 | 190 | self.default_colnames = ['colors', 'user_id', 'job_id', 'job_arr_id', 'job_arr_task_id', 'job_name', 'node_name', 'gpus', 'cpus', 'mem'] 191 | self.default_display_colnames = [colname.replace('job_arr_task_id', 'arr_idx').upper() for colname in self.default_colnames if colname != 'job_arr_id'] 192 | self.default_aligns = pd.Series(['<', '<', '>', '<', '<', '<', '^', '^', '>', '>'], self.default_colnames) 193 | 194 | self.df, self.display_colnames, self.aligns = self.build_df() 195 | self.widths = self.calculate_widths(self.df, self.display_colnames) 196 | 197 | def show(self): 198 
| if not self.df.empty: 199 | df_s = self.df.to_string(max_colwidth=0, index=False) 200 | lines = [line.split() for line in df_s.split('\n')] 201 | lines[0] = self.display_colnames 202 | else: 203 | lines = [self.display_colnames] 204 | s = [] 205 | for line in lines: 206 | ss = [] 207 | for elem, colname in zip(line, self.df.columns): 208 | ss.append(f'{elem:{self.aligns[colname]}{self.widths[colname]}}'.replace(self.space_placeholder, ' ')) 209 | ss = self.delimiter_column.join(ss) 210 | s.append(ss) 211 | whole_width = self.widths.sum() + (self.widths.shape[0]-1)*len(self.delimiter_column) 212 | print() 213 | print(f'{" LEGEND ":=^{whole_width}}') 214 | print('\n'.join(s)) 215 | 216 | def build_df(self): 217 | records = self.build_records_from_jobs(self.jobs) 218 | df = pd.DataFrame.from_records(records, columns=self.default_colnames[1:]) 219 | if self.show_only_mine: 220 | df = df[df['user_id'].str.contains(os.environ['USER'])] 221 | color_legend = df['job_id'].map(lambda jid: colorize('********', get_color_from_idx(int(jid)))) # before the column job_id overwritten 222 | df['job_id'] = df['job_arr_id'].fillna(df['job_id']) # firstly with job_arr_id, and overwrite with job_id only for none rows 223 | del df['job_arr_id'] 224 | df['gpus'] = df['gpus'].replace('', pd.NA).fillna('-') 225 | df['mem'] = df['mem'].astype(str) + f'{self.space_placeholder}GiB' 226 | # inserting the color legend 227 | df.insert(0, 'colors', color_legend) 228 | # masking multi-node jobs 229 | duplicates = df.duplicated(subset=['job_id', 'job_arr_task_id'], keep='first') 230 | df.loc[duplicates, ['colors', 'user_id', 'job_id', 'job_arr_task_id', 'job_name']] = self.space_placeholder 231 | 232 | no_arr_job = df['job_arr_task_id'].replace(self.space_placeholder, pd.NA).isna().all() 233 | display_colnames = self.default_display_colnames.copy() 234 | aligns = self.default_aligns.copy() 235 | if no_arr_job: 236 | del df['job_arr_task_id'] 237 | del aligns['job_arr_task_id'] 238 | 
display_colnames.remove('ARR_IDX') 239 | else: 240 | df['job_arr_task_id'] = df['job_arr_task_id'].fillna(self.space_placeholder) 241 | 242 | return df, display_colnames, aligns 243 | 244 | def build_records_from_jobs(self, jobs): 245 | records = [] 246 | for job in jobs: 247 | for nodename, tres_dict in job.tres_dict.items(): 248 | record = [ 249 | job.userid, job.id, job.arrayjobid, job.arraytaskid, job.name, nodename, 250 | ','.join(map(str, tres_dict['gpus'])), len(tres_dict['cpus']), int(tres_dict['mem']) 251 | ] 252 | records.append(record) 253 | return records 254 | 255 | def calculate_widths(self, df, display_colnames): 256 | tmp_df_for_calculating_width = pd.concat([df.astype(str), pd.DataFrame([display_colnames], columns=df.columns)], ignore_index=True) 257 | widths = tmp_df_for_calculating_width.applymap(lambda elem: len(str(elem))).max() 258 | widths['colors'] = 8 259 | return widths 260 | 261 | 262 | def get_color_from_idx(idx:int): 263 | color = cmap(((11*idx) % 256) / 256)[:-1] # RGB 264 | color = list(map(lambda x: int(x*255), color)) 265 | return color 266 | 267 | 268 | def colorize(source:str, color:List[int], background:bool=False): 269 | if not background: 270 | output = fg(*color) + source + fg.rs 271 | else: 272 | output = bg(*color) + source + bg.rs 273 | return output 274 | 275 | 276 | def make_bold(source:str): 277 | output = ef.b + source + ef.rs 278 | return output 279 | -------------------------------------------------------------------------------- /slurm_gres_viz/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from pathlib import Path 5 | 6 | if __name__ == '__main__': # for test 7 | from args import args 8 | from visualizer import SlurmTresVisualizer 9 | else: # slurm_gres_viz.main 10 | from .args import args 11 | from .visualizer import SlurmTresVisualizer 12 | 13 | 14 | # TODO: GPU 정보 받아오는 건 GPU 옵션 받았을 때만 해야 함 15 | # TODO: 했는데도 느려서 프로파일링 해야 함 16 | 17 | 
def get_display_options():
    """Collect display flags from the parsed CLI args into kwargs for the visualizer.

    ``--full`` implies index, GPU-memory and GPU-util display all at once.
    """
    return {
        'show_index': args.full or args.index,
        'show_gpu_memory': args.full or args.gpu_memory,
        'show_gpu_util': args.full or args.gpu_util,
        'show_only_mine': args.only_mine,
    }


def looper(func):  # decorator
    """Run ``func`` once, or repeatedly every ``args.loop`` seconds.

    A negative ``args.loop`` (the default, -1) means "run once and exit";
    otherwise the dashboard is refreshed in an endless loop, like ``nvidia-smi -l``.
    """
    from functools import wraps

    @wraps(func)  # preserve the wrapped function's name/docstring for debugging
    def wrapper(**display_options):
        if args.loop < 0:
            func(**display_options)
        else:
            while True:
                func(**display_options)
                print('\n\n')
                time.sleep(args.loop)
    return wrapper


@looper
def run(**display_options):
    """Query Slurm via ``scontrol`` and render one dashboard frame.

    ``scontrol show`` separates records with a blank line, hence the split
    on ``'\\n\\n'``.
    """
    strings = {
        'node_strings': os.popen('scontrol show nodes').read().strip().split('\n\n'),
        'job_strings': os.popen('scontrol show job -d -a').read().strip().split('\n\n'),
    }
    viz = SlurmTresVisualizer(**strings, **display_options)
    viz.show()


def main():
    """Default console entry point (wired up in setup.py ``console_scripts``)."""
    display_options = get_display_options()
    run(**display_options)


def forced_main():
    """Entry point that forces ``--only-mine`` for every non-admin user."""
    display_options = get_display_options()
    # .get(): USER may be unset (cron jobs, containers) — don't crash with KeyError.
    if 'admin' not in os.environ.get('USER', ''):
        display_options['show_only_mine'] = True
    run(**display_options)


if __name__ == '__main__':  # testing
    if args.test_from_log:
        # Replay saved `scontrol` output from test/logs/<case>/{node,job}.log
        # instead of talking to a live Slurm cluster.
        strings = {}
        for p_case_dir in Path('test/logs').glob('**/*'):
            for obj in ['node', 'job']:
                p_obj = (p_case_dir / obj).with_suffix('.log')
                if p_obj.is_file():
                    with p_obj.open() as f:
                        strings[f'{obj}_strings'] = f.read().strip().split('\n\n')
                else:
                    break
            else:  # no break: both node.log and job.log were found for this case
                print(p_case_dir.name)
                SlurmTresVisualizer(**strings)
                print()
    else:
        # Profile a single live run (see TODO above: rendering is still slow).
        from cProfile import Profile
        from pstats import Stats
        profiler = Profile()
        profiler.runcall(main)
        stats = Stats(profiler)
        stats.strip_dirs()
        stats.sort_stats('tottime')
        stats.print_stats(20)
        stats.sort_stats('cumulative')
        stats.print_stats(20)
-------------------------------------------------------------------------------- /slurm_gres_viz/parsers.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | import re 3 | import os 4 | import csv 5 | 6 | 7 | def parse_jobstring(job_string:str): 8 | """Parse job string for a single job. 9 | """ 10 | userid, = re.findall(r'UserId=(\S+)', job_string) 11 | jobid, = re.findall(r'^JobId=(\d+)', job_string) # Why ^? => not to capture ArrayJobId 12 | arrayjobid, = re.findall(r'ArrayJobId=(\d+)', job_string) or (None,) 13 | arraytaskid, = re.findall(r'ArrayTaskId=(\d+)', job_string) or (None,) 14 | jobname, = re.findall(r'JobName=(.*)', job_string) 15 | job_tres_strings = re.findall(r'\s(Nodes=.*)', job_string) # \s: white-space-like char 16 | job_tres_dict = dict(sum([list(job_tres_string_to_dict(job_tres_string).items()) for job_tres_string in job_tres_strings], [])) # sum up tres dicts 17 | return userid, jobid, arrayjobid, arraytaskid, jobname, job_tres_dict 18 | 19 | 20 | def parse_nodestring(node_string:str): 21 | nodename, = re.findall(r'NodeName=(\S+)', node_string) # \S: non-white-space-like char 22 | state, = re.findall(r'State=([\w\+]+)', node_string) 23 | num_cpus_alloc, = re.findall(r'CPUAlloc=(\d+)', node_string) 24 | num_cpus_total, = re.findall(r'CPUTot=(\d+)', node_string) 25 | num_gpus_alloc, = re.findall(r'AllocTRES=.*gres/gpu=(\d)', node_string) or [0] 26 | num_gpus_total, = re.findall(r'Gres=[a-zA-Z]+:(\d)', node_string) 27 | mem_alloc, = re.findall(r'AllocMem=(\d+)', node_string) 28 | mem_total, = re.findall(r'RealMemory=(\d+)', node_string) 29 | return nodename, state, int(num_cpus_alloc), int(num_cpus_total), int(num_gpus_alloc), int(num_gpus_total), MiB2GiB(float(mem_alloc)), MiB2GiB(float(mem_total)) 30 | 31 | 32 | def job_tres_string_to_dict(job_tres_string:str) -> Dict[str,List[Union[int,List[int],float]]]: 33 | """Convert TRES string of a job to python object 34 | 35 | 
Parameters 36 | ---------- 37 | job_tres_string : str 38 | * ex1) `"Nodes=node1 CPU_IDs=0-31 Mem=0 GRES=gpu(IDX:4-7)"` 39 | * ex2) `"Nodes=vll[2-3] CPU_IDs=8-11 Mem=5120 GRES=gpu:1(IDX:2)` 40 | 41 | Returns 42 | ------- 43 | Dict[str,List[int]] 44 | converted allocated TRES infos in appropriate python object 45 | * ex1) `{'node1': {'cpus': [0, 1, ..., 31], 'gpus': [4, 5, 6, 7]}, 'mem': 0}` 46 | * ex2) `{'vll2': {'cpus': [8, 9, 10, 11], 'gpus': [2], 'mem': 5120}, 'vll3': {'cpus': [8, 9, 10, 11], 'gpus': [2], 'mem': 5120}}` 47 | """ 48 | nodenames, = re.findall(r'Nodes=(\S+)', job_tres_string) 49 | nodenames = resolve_hostname_expr(nodenames) 50 | # nodenames = os.popen(f'scontrol show hostname {nodenames}').read().split() 51 | cpu_indices, = re.findall(r'CPU_IDs=([-,\d]+)', job_tres_string) or [''] 52 | gpu_indices, = re.findall(r'IDX:([-,\d]+)\)', job_tres_string) or [''] 53 | cpu_indices = resolve_index_expr(cpu_indices) 54 | gpu_indices = resolve_index_expr(gpu_indices) 55 | mem, = map(MiB2GiB, map(int, re.findall(r'Mem=(\d+)', job_tres_string))) 56 | job_tres_dict = { 57 | nodename: {'cpus': cpu_indices, 'gpus': gpu_indices, 'mem': mem} 58 | for nodename in nodenames 59 | } 60 | return job_tres_dict 61 | 62 | 63 | def resolve_hostname_expr(expr:str) -> List[str]: 64 | """# Vars (example) 65 | ex0) A single-node job 66 | @expr: `"batch1"` 67 | @return: `["batch1"]` 68 | 69 | ex1) 70 | @expr: `"batch[1,3-5]"` 71 | @return: `["batch1", "batch3", "batch4", "batch5"]` 72 | 73 | ex2) Dashed hostname 74 | @expr: `"debug-g[1-4]"` 75 | @return: `["debug-g1", "debug-g2", "debug-g3", "debug-g4"]` 76 | 77 | ex3) Multiple host ranges 78 | @expr: `"debug-g[1,3-4],batch[1-2]"` 79 | @return: `["debug-g1", "debug-g3", "debug-g4", "batch1", "batch2"]` 80 | """ 81 | # TODO: csv랑 re랑 비교 82 | # (?!...): negative lookahead assertion, matched only when ... does not follow 83 | # [^\[]*\]: ...] 
84 | # ==> `,...]` will be ignored 85 | splitted_host_ranges = re.split(r',(?![^\[]*\])', expr) 86 | all_hostnames = [] 87 | for splitted_host_range in splitted_host_ranges: 88 | m = re.match(r'(?P[\w-]+)(\[(?P[\d,-]+)\])?', splitted_host_range) 89 | if m['range'] is not None: 90 | indices = resolve_index_expr(m['range']) 91 | hostnames = [f"{m['hostname_root']}{i}" for i in indices] 92 | else: 93 | hostnames = [m['hostname_root']] 94 | all_hostnames += hostnames 95 | return all_hostnames 96 | 97 | 98 | def resolve_index_expr(expr:str) -> List[int]: 99 | '''# Vars (example) 100 | @expr: `"0-1,3"` 101 | @return: `[0, 1, 3]` 102 | ''' 103 | if expr: 104 | # this function is called many times and slow speed of os.popen(...).read() matters 105 | # indices:List[str] = os.popen(f'scontrol show hostname [{expr_string}]').read().split() 106 | comma_splitted = expr.split(',') 107 | indices_lists = list(map(resolve_element_expr, comma_splitted)) 108 | indices = sum(indices_lists, []) 109 | return sorted(set(map(int, indices))) 110 | else: 111 | return [] 112 | 113 | 114 | def resolve_element_expr(element_expr:str): 115 | """# Vars (example) 116 | ex1) 117 | @element_expr: `"0-3"` 118 | @return: `[0, 1, 2, 3]` 119 | 120 | ex2) 121 | @element_expr: `"4"` 122 | @return: `[4]` 123 | """ 124 | dash_splitted = list(map(int, element_expr.split('-'))) 125 | if len(dash_splitted) == 1: 126 | return dash_splitted 127 | elif len(dash_splitted) == 2: 128 | x, y = dash_splitted 129 | return list(range(x, y+1)) 130 | else: 131 | raise 132 | 133 | 134 | def MiB2GiB(MiB:int) -> float: 135 | return MiB / 1024 136 | -------------------------------------------------------------------------------- /slurm_gres_viz/pre_main.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import re 4 | import matplotlib.pyplot as plt 5 | from sty import fg 6 | import pprint 7 | if __name__ == '__main__': # for test 8 | from args 
def prettify_gres(jobs, node_attrs):
    """Print one line per node: the colorized GPU stars plus CPU/memory usage."""
    star_strings = get_stars(jobs, node_attrs)
    # pad every nodename to the longest one so the columns line up
    name_width = max(map(len, star_strings))
    for name, stars in star_strings.items():
        res_string = get_res_strings(name, node_attrs)
        print(f'{name:<{name_width}}: [GPUs] {stars} {res_string}')
[max(len(column_names[-2]), *[len(','.join(str(e) for e in list(job['cpus'].values())[0])) for job in jobs])]\ 59 | + [max(len(column_names[-1]), *[len(','.join(str(e) for e in list(job['mem'].values())[0])) for job in jobs])]\ 60 | 61 | delimiter = ' ' 62 | width = sum(widths) + (len(column_names)-1) * len(delimiter) 63 | print(f'\n{" LEGENDS ":=^{width}}') 64 | 65 | jobs_and_colors = get_jobs_and_colors(jobs) 66 | indent = sum(widths[:-4]) + (len(column_names)-5) * len(delimiter) 67 | header = delimiter.join([f'{column_name:{width}s}' for column_name, width in zip(column_names, widths) if width]) 68 | lines = [] 69 | for job, color in jobs_and_colors: 70 | # line_elems = [f'{color}********{bcolors.CEND}'] 71 | line_elems = [colorize('********', color)] 72 | for key, width in zip(keys, widths[1:-4]): 73 | if job[key] is not None: 74 | if key == 'jobid' and job['arrayjobid'] is not None: 75 | line_elems += [f"{job['arrayjobid']:<{width}}"] 76 | else: 77 | line_elems += [f"{job[key]:<{width}}"] 78 | elif width != 0: # this job does not have the value but some others do 79 | line_elems += [' ' * width] 80 | line_elems += [render_resource_string(job, indent, [widths[i] for i in range(-4, -1)] + [max(len(list(job['mem'].values())[0]) for job in jobs)])] 81 | line = delimiter.join(line_elems) 82 | lines += [line] 83 | body = '\n'.join(lines) 84 | print(header) 85 | print(body) 86 | 87 | 88 | def get_stars(jobs, node_attrs): 89 | jobs_and_colors = get_jobs_and_colors(jobs) 90 | stars = {nodename: get_gres_components(attr['num_gpus']) for nodename, attr in node_attrs.items()} 91 | for job, color in jobs_and_colors: 92 | for nodename, gpu_indices in job['gpus'].items(): 93 | for gpu_idx in gpu_indices: 94 | # stars[nodename][gpu_idx] = f'{color}{stars[nodename][gpu_idx]}{bcolors.CEND}' 95 | stars[nodename][gpu_idx] = colorize(f'{stars[nodename][gpu_idx]}', color) 96 | stars = {nodename: ''.join(star_components) for nodename, star_components in stars.items()} 97 | 98 | # 
def get_res_strings(nodename, node_attrs):
    """Render the `[CPUs] alloc/total [Mem] alloc/total GiB` suffix for one node.

    Column widths are computed over ALL nodes so that the per-node lines
    printed by `prettify_gres` stay aligned.
    mem_size / alloc_mem are assumed to already be in GiB -- TODO confirm
    against `get_node_attrs`.
    """
    attrs = node_attrs[nodename]
    # widths = longest rendered value across every node, per column
    # (total-CPU width was measured from alloc_cpus before: copy-paste bug)
    width_cpus = max(len(str(a['num_cpus'])) for a in node_attrs.values())
    width_alloc_cpus = max(len(str(a['alloc_cpus'])) for a in node_attrs.values())
    width_mem = max(len(str(a['mem_size'])) for a in node_attrs.values())
    width_alloc_mem = max(len(str(a['alloc_mem'])) for a in node_attrs.values())
    string_cpu = f'[CPUs] {attrs["alloc_cpus"]:>{width_alloc_cpus}}/{attrs["num_cpus"]:>{width_cpus}}'
    string_mem = f'[Mem] {float(attrs["alloc_mem"]):{width_alloc_mem}g}/{attrs["mem_size"]:{width_mem}g} GiB'
    return ' ' + string_cpu + ' ' + string_mem
def check_job_parent_jobarray(job_string):
    """Return whether `job_string` describes the parent entry of a job array.

    The parent entry is the array job itself (JobId == ArrayJobId) while it
    is RUNNING or PENDING. Plain, non-array jobs return False.
    """
    jobid, = re.findall(r'^JobId=(\d+)', job_string)
    # non-array jobs carry no ArrayJobId at all -- without the fallback the
    # one-element unpack raises ValueError instead of reaching the None check
    # (same pattern as get_running_job_with_gres_attrs)
    arrayjobid, = re.findall(r'ArrayJobId=(\d+)', job_string) or (None,)
    jobstate, = re.findall(r'JobState=([A-Z]+)', job_string)
    if arrayjobid is None \
            or jobstate not in ['RUNNING', 'PENDING'] \
            or jobid != arrayjobid:
        return False
    else:
        return True
def parse_exp(exp_string):
    """Expand a Slurm index expression, e.g. '0-1,3' -> [0, 1, 3].

    Order follows the expression; elements are not de-duplicated or sorted.
    """
    indices = []
    for token in exp_string.split(','):      # '0-1,3' -> ['0-1', '3']
        if '-' in token:                     # range element, e.g. '0-1'
            lo, hi = (int(part) for part in token.split('-'))
            indices.extend(range(lo, hi + 1))
        else:                                # single element, e.g. '3'
            indices.append(int(token))
    return indices
slurm.conf?? 24 | if dcgm_stat is None or 'DCGM_FI_DEV_GPU_UTIL' not in dcgm_stat: 25 | self.util = 0 26 | self.vram_alloc = 0 27 | self.vram_total = 0 28 | self.invalid = True 29 | else: 30 | self.util = float(dcgm_stat['DCGM_FI_DEV_GPU_UTIL']) 31 | self.vram_alloc = MiB2GiB(float(dcgm_stat['DCGM_FI_DEV_FB_USED'])) 32 | self.vram_total = MiB2GiB(float(dcgm_stat['DCGM_FI_DEV_FB_FREE'])) + self.vram_alloc 33 | self.invalid = False 34 | 35 | 36 | class Node: 37 | def __init__(self, node_string:str, node_ip_dict:Union[Dict[str,str],None], request_exporter:bool=False): 38 | """# Vars (example) 39 | @nodename: `"vll3"` 40 | @num_cpus: `96` 41 | @num_gpus: `8` 42 | @mem_total: `336833` 43 | @public_ip: `"xxx.xxx.xxx.xxx"` 44 | @gpu_infos: `[{"gpuname": "", "gpuutil": 83, "vram_used": 18832, "vram_total": 24080}, ...]` 45 | @cpu_load: `2.10` 46 | @mem_used: `61440` 47 | """ 48 | # getting infos from node_string (fast) 49 | self.node_string = node_string 50 | nodename, state, num_cpus_alloc, num_cpus_total, num_gpus_alloc, num_gpus_total, mem_alloc, mem_total = parse_nodestring(self.node_string) 51 | self.name = nodename # node_string[v], exporter 52 | self.states:List[str] = state.split('+') # ex: IDLE+DRAIN 53 | self.is_state_ok = all([invalid_state not in self.states for invalid_state in INVALID_NODE_STATES]) 54 | self.mem_alloc = mem_alloc # node_string[v], exporter 55 | self.mem_total = mem_total # node_string 56 | 57 | self.num_cpus_total = num_cpus_total # node_string 58 | self.num_cpus_alloc = num_cpus_alloc # node_string 59 | self.num_gpus_alloc = num_gpus_alloc # node_string 60 | self.num_gpus_total = num_gpus_total # node_string 61 | 62 | # ========================================================== 63 | # getting infos from exporters (slow) 64 | # todo: show 옵션을 받아오고, node가 정상인 상태에서만 가져와야 됨 65 | 66 | self.request_exporter = request_exporter 67 | if self.request_exporter: 68 | if self.is_state_ok: 69 | self.public_ip = node_ip_dict[self.name] # given 70 | # 
self.node_metrics = self.get_node_metrics() 71 | self.gpu_metrics, self.gpus = self.get_gpu_metrics() 72 | if any([gpu.invalid for gpu in self.gpus]): 73 | self.is_state_ok = False 74 | else: 75 | self.gpus = [GPU() for _ in range(self.num_gpus_total)] 76 | # self.cpu_loads = [ 77 | # self.node_metrics['node_load1'].samples[0].value, 78 | # self.node_metrics['node_load5'].samples[0].value, 79 | # self.node_metrics['node_load15'].samples[0].value, 80 | # ] # node_string, exporter[v] 81 | 82 | # def get_node_metrics(self) -> dict: 83 | # response = requests.get(f'http://{self.public_ip}:9100/metrics') # node exporter 84 | # if response.ok: 85 | # metrics = self.html2metrics(response.text) 86 | # return {metric.name: metric for metric in metrics} 87 | # else: 88 | # raise # The metric server does not respond 89 | 90 | def get_gpu_metrics(self) -> Tuple[dict, List[GPU]]: 91 | response = requests.get(f'http://{self.public_ip}:9400/metrics', timeout=.1) # dcgm exporter 92 | if response.ok: 93 | gpu_metrics = self.html2metrics(response.text) 94 | gpus = self.metrics2gpu_objs(gpu_metrics) 95 | return gpu_metrics, gpus 96 | else: 97 | raise # The metric server does not respond 98 | 99 | def metrics2gpu_objs(self, metrics) -> List[GPU]: 100 | gpu_indices = [] 101 | for metric in metrics: 102 | if metric.samples and 'gpu' in metric.samples[0].labels: 103 | gpu_indices.append(metric.samples[0].labels['gpu']) 104 | num_gpus = len(set(gpu_indices)) 105 | dcgm_stats:List[Dict[str,float]] = [{} for _ in range(num_gpus)] 106 | for metric in metrics: 107 | if metric.samples and 'gpu' in metric.samples[0].labels: 108 | sample = metric.samples[0] 109 | gpu_idx = int(sample.labels['gpu']) 110 | dcgm_stats[gpu_idx][sample.name] = sample.value 111 | return [GPU(dcgm_stat) for dcgm_stat in dcgm_stats] 112 | 113 | def html2metrics(self, html): 114 | soup = BeautifulSoup(html, 'html.parser') 115 | metrics = list(text_string_to_metric_families(soup.get_text())) 116 | return metrics 117 | 
    def get_node_infos(self):
        """Build one Node object per `scontrol show nodes` record.

        Exporter metrics (GPU memory / utilization) are fetched only when the
        caller asked for them; in that case each Node constructor issues a
        blocking HTTP request, so the nodes are built concurrently.
        """
        # exporter data is only needed for the GPU-memory / GPU-util displays
        request_exporter = self.show_gpu_memory or self.show_gpu_util
        # NOTE(review): assumes every Slurm nodename appears in /etc/hosts -- confirm
        node_ip_dict = get_ips_from_etchosts() if request_exporter else None
        def get_node(node_string):
            # closure over node_ip_dict / request_exporter for ThreadPool.map
            return Node(
                node_string=node_string, node_ip_dict=node_ip_dict,
                request_exporter=request_exporter
            )
        if request_exporter:
            # one thread per node: Node.__init__ blocks on the exporter HTTP call
            with ThreadPool(len(self.node_strings)) as t:
                nodes = t.map(get_node, self.node_strings)
        else:
            # no network I/O involved -- sequential construction is fast enough
            nodes = [get_node(node_string) for node_string in self.node_strings]
        return nodes
def get_ips_from_etchosts() -> Dict[str, str]:
    """Parse /etc/hosts into a {hostname: ip} mapping.

    Only the first hostname token after each IPv4 address is captured; when
    a hostname appears on several lines, the last occurrence wins.
    """
    with open('/etc/hosts') as f:
        hosts_text = f.read()
    pair_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*([\w-]*)'
    mapping: Dict[str, str] = {}
    for ip, hostname in re.findall(pair_pattern, hosts_text):
        mapping[hostname] = ip
    return mapping