├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── setup.py └── slurm_gres_viz ├── __init__.py ├── args.py ├── displayer.py ├── main.py ├── parsers.py ├── pre_main.py ├── slurm_objects.py └── visualizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.egg-info/ 3 | dist/ 4 | deb_dist/ 5 | *.tar.gz 6 | *.tar 7 | build/ 8 | # hello.* 9 | 10 | test* 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Hyeo-geon Lee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SLURM_allocated_gres_visualizer 2 | **The app for visualizing allocated GPUs by SLURM** 3 | 4 | ![image](https://user-images.githubusercontent.com/25451196/222977415-c8b992e6-d46d-4856-9a26-558505e64956.png) 5 | 6 | When you are using Slurm and you want to check which gpus are allocated, you must have done something like 7 | - `ssh` to each computing node and run `nvidia-smi`. Then, repeat it. 8 | - Run `scontrol show job -d | grep GRES` and roll your eyeballs. 9 | 10 | 11 | both of which are very tedious. This project can solve this. 12 | 13 | # Requirements 14 | 15 | ## Packages 16 | - matplotlib 17 | - sty 18 | - prometheus-client 19 | - requests 20 | - pandas 21 | - bs4 22 | 23 | ## Slurm 24 | - Be sure that `slurmctld`(master) and `slurmd`(nodes) are active so that there are no problems for running `scontrol show nodes` or `scontrol show job`. 25 | - Be sure that `AutoDetect=nvml` for all computing nodes to avoid GPU index mismatch. 26 | - For all computing nodes, `node-exporter` are available at port `9100` and `dcgm-exporter` at `9400`. 
27 | 28 | # Installation 29 | ```bash 30 | git clone https://github.com/Haawron/SLURM_allocated_gres_visualizer.git 31 | cd SLURM_allocated_gres_visualizer 32 | /usr/bin/python3 setup.py install # be sure to be without conda 33 | ``` 34 | 35 | # Usage 36 | ```bash 37 | slurm-gres-viz 38 | 39 | # GPU options 40 | slurm-gres-viz -i # stars are replaced to indices 41 | slurm-gres-viz -gm -gu # VRAM and GPU util 42 | slurm-gres-viz -f # Full information of GPUs 43 | slurm-gres-viz -m # mine: shows only my GPUs 44 | 45 | # others 46 | slurm-gres-viz -l 1 # looping every 1 second (same as nvidia-smi) 47 | ``` 48 | 49 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24.3 2 | matplotlib==3.6.3 3 | sty==1.0.4 4 | prometheus-client 5 | requests 6 | pandas==2.0.0rc0 7 | bs4 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from pip._internal.req import parse_requirements 3 | import os 4 | 5 | 6 | requirements = parse_requirements('requirements.txt', session=False) 7 | required_packages = [str(package.__dict__.get('req', package.__dict__['requirement'])) for package in requirements] 8 | 9 | setup( 10 | name="slurm_gres_viz", 11 | version="2.1.1", 12 | author="Hyogun Lee(Haawron)", 13 | author_email="gunsbrother@khu.ac.kr", 14 | python_requires='>=3.6', 15 | install_requires=required_packages, 16 | description="The app for visualizing allocated GPUs by SLURM", 17 | license="MIT", 18 | url="https://github.com/Haawron/SLURM_allocated_gres_visualizer", 19 | packages=['slurm_gres_viz'], 20 | package_dir={'slurm_gres_viz': 'slurm_gres_viz'}, 21 | entry_points={ 22 | 'console_scripts' : [ 23 | f'slurm-gres-viz=slurm_gres_viz.main:{"forced_main" if 
bool(os.environ.get("FORCE_ONLY_MINE", False)) else "main"}' 24 | ] # todo: main function을 여러 개 만들고 main class가 옵션을 args가 아니라 init에서 받아와야 함 25 | }, 26 | classifiers=[ 27 | 'Environment :: Console', 28 | 'Intended Audience :: End Users', 29 | 'Operating System :: POSIX', 30 | 'Programming Language :: Python', 31 | ], 32 | ) 33 | -------------------------------------------------------------------------------- /slurm_gres_viz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Haawron/SLURM_allocated_gres_visualizer/da92f81c6996959977d6f2682f1052a2ac881de1/slurm_gres_viz/__init__.py -------------------------------------------------------------------------------- /slurm_gres_viz/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def rate_in_range(value): 5 | value = float(value) 6 | if value <= 0: 7 | raise argparse.ArgumentTypeError("Interval must be positive") 8 | elif value < 1: 9 | raise argparse.ArgumentTypeError("Interval smaller than 1s is not allowed") 10 | return value 11 | 12 | 13 | parser = argparse.ArgumentParser(description='SLURM Allocated GRES Visualizer') 14 | 15 | # gpu loggings 16 | parser.add_argument('-m', '--only-mine', action='store_true', 17 | help='asd') 18 | parser.add_argument('-f', '--full', action='store_true', 19 | help='asd') 20 | parser.add_argument('-i', '--index', action='store_true', 21 | help='Use Gres\' indices instead of stars(*)') 22 | parser.add_argument('-gm', '--gpu-memory', action='store_true', 23 | help='asd') 24 | parser.add_argument('-gu', '--gpu-util', action='store_true', 25 | help='asd') 26 | 27 | # iterate 28 | parser.add_argument('-l', '--loop', type=rate_in_range, default=-1, 29 | help='asd') 30 | 31 | # test 32 | parser.add_argument('-t', '--test-from-log', action='store_true', 33 | help='Test mode') 34 | 35 | args = parser.parse_args() 36 | 
-------------------------------------------------------------------------------- /slurm_gres_viz/displayer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Tuple 2 | import os 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from sty import fg, ef, bg 8 | 9 | if __name__.startswith('slurm_gres_viz'): 10 | from .slurm_objects import Node, Job, GPU 11 | from .args import args 12 | else: 13 | from slurm_objects import Node, Job, GPU 14 | from args import args 15 | from pprint import pprint 16 | 17 | 18 | cmap = plt.get_cmap('jet') 19 | RED = (255, 50, 0) 20 | YELLOW = (200, 200, 0) 21 | 22 | 23 | class Displayer: 24 | def __init__(self, nodes:List[Node], jobs:List[Job], **display_options): 25 | self.dashboard = DashBoard(nodes, jobs, **display_options) 26 | self.legend = Legend(jobs, **display_options) 27 | 28 | def show(self): 29 | self.dashboard.show() 30 | self.legend.show() 31 | 32 | 33 | class DashBoard: # Upper body 34 | def __init__(self, 35 | nodes:List[Node], jobs:List[Job], 36 | 37 | show_index:bool=False, show_gpu_memory:bool=False, show_gpu_util:bool=False, 38 | show_only_mine:bool=False 39 | ): 40 | self.nodes = nodes 41 | self.jobs = jobs 42 | 43 | self.show_index = show_index 44 | self.show_gpu_memory = show_gpu_memory 45 | self.show_gpu_util = show_gpu_util 46 | self.show_only_mine = show_only_mine 47 | 48 | self.max_num_node_gpus = max(map(lambda node: node.num_gpus_total, self.nodes)) 49 | self.delimiter_within_gpu = '|' 50 | if sum([self.show_index, self.show_gpu_memory, self.show_gpu_util]) <= 1: 51 | self.delimiter_between_gpu = '' 52 | else: 53 | self.delimiter_between_gpu = ' ' 54 | self.char_fill_hidden = '#' 55 | self.all_mine_masks = self.get_mine_mask() 56 | self.all_occupancy_masks = self.get_occupancy_mask() 57 | self.all_gpu_items = self.build_items() 58 | self.all_gpu_items = self.stylize_items(self.all_gpu_items) 59 | 
self.widths = self.calculate_widths() 60 | 61 | def show(self): 62 | lines = [ 63 | f'{node.name:{self.widths["nodename"]}}: ' 64 | f'[GPU] [{node.num_gpus_alloc}/{node.num_gpus_total}] {self.delimiter_between_gpu.join(gpu_items)} ' 65 | f'[CPU] {node.num_cpus_alloc:>{self.widths["cpu"]}}/{node.num_cpus_total:{self.widths["cpu"]}} ' 66 | f'[MEM] {node.mem_alloc:>{self.widths["mem"]-3}.0f}/{node.mem_total:{self.widths["mem"]}.2f} GiB' 67 | for node, gpu_items in zip(self.nodes, self.all_gpu_items.values()) 68 | ] 69 | body = '\n'.join(lines) 70 | print(body) 71 | 72 | def build_items(self): 73 | all_gpu_items:Dict[str,List[str]] = {} 74 | for node in self.nodes: 75 | mine_masks = self.all_mine_masks[node.name] 76 | occupancy_masks = self.all_occupancy_masks[node.name] 77 | gpu_items:List[str] = [] 78 | for gpu_idx in range(self.max_num_node_gpus): 79 | is_mine = mine_masks[gpu_idx] 80 | is_occupied = occupancy_masks[gpu_idx] 81 | will_be_hidden = self.show_only_mine and not is_mine 82 | if gpu_idx >= node.num_gpus_total: # pseudo item to align, as colorizer's width varies aligning with width does not work 83 | gpu_items.append(' '*len(gpu_item)) 84 | else: 85 | gpu_item = [] 86 | if any([self.show_index, self.show_gpu_memory, self.show_gpu_util]): 87 | if self.show_index: 88 | gpu_item.append(f'{gpu_idx}') 89 | if self.show_gpu_memory: 90 | gpu_item.append(f'{node.gpus[gpu_idx].vram_alloc:>4.1f}/{node.gpus[gpu_idx].vram_total:4.1f}GiB') 91 | if self.show_gpu_util: 92 | util = int(round(node.gpus[gpu_idx].util, 0)) 93 | gpu_item.append(f'{util:>2d}%' if util < 100 else '100') 94 | content = self.delimiter_within_gpu.join(gpu_item) 95 | if is_occupied: 96 | if will_be_hidden: 97 | content = self.char_fill_hidden * len(content) 98 | else: # idle GPUs 99 | content = '-' * len(content) 100 | gpu_item = '[' + content + ']' 101 | else: 102 | if is_occupied: 103 | if will_be_hidden: 104 | gpu_item = self.char_fill_hidden 105 | else: 106 | gpu_item = '*' 107 | else: # idle 
GPUs 108 | gpu_item = '-' 109 | gpu_items.append(gpu_item) 110 | all_gpu_items[node.name] = gpu_items 111 | return all_gpu_items 112 | 113 | def stylize_items(self, all_gpu_items): 114 | for job in self.jobs: 115 | color = get_color_from_idx(int(job.id)) 116 | is_mine = os.environ['USER'] in job.userid 117 | for nodename, tres_dict in job.tres_dict.items(): 118 | for gpu_idx in tres_dict['gpus']: 119 | will_be_hidden = self.show_only_mine and not is_mine 120 | if not will_be_hidden: 121 | content = colorize(all_gpu_items[nodename][gpu_idx], color) 122 | if is_mine: 123 | content = make_bold(content) 124 | all_gpu_items[nodename][gpu_idx] = content 125 | 126 | # not occupied -> colored into gray 127 | gray = tuple(100 for _ in range(3)) 128 | for nodename, occupancy_masks in self.all_occupancy_masks.items(): 129 | for gpu_idx, is_occupied in enumerate(occupancy_masks): 130 | if not is_occupied: # idle GPUs 131 | all_gpu_items[nodename][gpu_idx] = colorize(all_gpu_items[nodename][gpu_idx], gray) 132 | 133 | # TODO: 비정상(not in IDLE, MIXED, ALLOCATED) 노드 취소선 134 | for node in self.nodes: 135 | if any([invalid_state in node.states for invalid_state in ['DOWN', 'INVALID']]): 136 | for gpu_idx in range(node.num_gpus_total): 137 | all_gpu_items[node.name][gpu_idx] = colorize(all_gpu_items[node.name][gpu_idx], RED, True) 138 | elif 'DRAIN' in node.states: 139 | for gpu_idx in range(node.num_gpus_total): 140 | all_gpu_items[node.name][gpu_idx] = colorize(all_gpu_items[node.name][gpu_idx], YELLOW, True) 141 | else: # valid node 142 | pass 143 | return all_gpu_items 144 | 145 | def calculate_widths(self): 146 | widths = { 147 | 'nodename': max(map(lambda node: len(node.name), self.nodes)), 148 | 'cpu': max(map(lambda node: np.log10(node.num_cpus_total).astype(int)+1, self.nodes)), 149 | 'mem': 6 150 | # why don't we have gpu items' width? 
151 | # => as colorizer's width varies aligning with width does not work 152 | } 153 | return widths 154 | 155 | def get_mine_mask(self): 156 | all_mine_masks:Dict[str,List[bool]] = {node.name: [False]*self.max_num_node_gpus for node in self.nodes} 157 | for job in self.jobs: 158 | is_mine = os.environ['USER'] in job.userid 159 | if is_mine: 160 | for nodename, tres_dict in job.tres_dict.items(): 161 | for gpu_idx in tres_dict['gpus']: 162 | all_mine_masks[nodename][gpu_idx] = True 163 | return all_mine_masks 164 | 165 | def get_occupancy_mask(self): 166 | all_occupancy_masks:Dict[str,List[bool]] = {node.name: [False]*self.max_num_node_gpus for node in self.nodes} 167 | for job in self.jobs: 168 | for nodename, tres_dict in job.tres_dict.items(): 169 | for gpu_idx in tres_dict['gpus']: 170 | all_occupancy_masks[nodename][gpu_idx] = True 171 | return all_occupancy_masks 172 | 173 | 174 | class Legend: # Lower body 175 | def __init__(self, 176 | jobs:List[Job], 177 | 178 | show_index:bool=False, show_gpu_memory:bool=False, show_gpu_util:bool=False, 179 | show_only_mine:bool=False 180 | ): 181 | self.jobs = jobs 182 | self.space_placeholder = '@' # not to be splitted by str.split 183 | self.delimiter_column = ' ' 184 | 185 | self.show_index = show_index 186 | self.show_gpu_memory = show_gpu_memory 187 | self.show_gpu_util = show_gpu_util 188 | self.show_only_mine = show_only_mine 189 | 190 | self.default_colnames = ['colors', 'user_id', 'job_id', 'job_arr_id', 'job_arr_task_id', 'job_name', 'node_name', 'gpus', 'cpus', 'mem'] 191 | self.default_display_colnames = [colname.replace('job_arr_task_id', 'arr_idx').upper() for colname in self.default_colnames if colname != 'job_arr_id'] 192 | self.default_aligns = pd.Series(['<', '<', '>', '<', '<', '<', '^', '^', '>', '>'], self.default_colnames) 193 | 194 | self.df, self.display_colnames, self.aligns = self.build_df() 195 | self.widths = self.calculate_widths(self.df, self.display_colnames) 196 | 197 | def show(self): 198 
| if not self.df.empty: 199 | df_s = self.df.to_string(max_colwidth=0, index=False) 200 | lines = [line.split() for line in df_s.split('\n')] 201 | lines[0] = self.display_colnames 202 | else: 203 | lines = [self.display_colnames] 204 | s = [] 205 | for line in lines: 206 | ss = [] 207 | for elem, colname in zip(line, self.df.columns): 208 | ss.append(f'{elem:{self.aligns[colname]}{self.widths[colname]}}'.replace(self.space_placeholder, ' ')) 209 | ss = self.delimiter_column.join(ss) 210 | s.append(ss) 211 | whole_width = self.widths.sum() + (self.widths.shape[0]-1)*len(self.delimiter_column) 212 | print() 213 | print(f'{" LEGEND ":=^{whole_width}}') 214 | print('\n'.join(s)) 215 | 216 | def build_df(self): 217 | records = self.build_records_from_jobs(self.jobs) 218 | df = pd.DataFrame.from_records(records, columns=self.default_colnames[1:]) 219 | if self.show_only_mine: 220 | df = df[df['user_id'].str.contains(os.environ['USER'])] 221 | color_legend = df['job_id'].map(lambda jid: colorize('********', get_color_from_idx(int(jid)))) # before the column job_id overwritten 222 | df['job_id'] = df['job_arr_id'].fillna(df['job_id']) # firstly with job_arr_id, and overwrite with job_id only for none rows 223 | del df['job_arr_id'] 224 | df['gpus'] = df['gpus'].replace('', pd.NA).fillna('-') 225 | df['mem'] = df['mem'].astype(str) + f'{self.space_placeholder}GiB' 226 | # inserting the color legend 227 | df.insert(0, 'colors', color_legend) 228 | # masking multi-node jobs 229 | duplicates = df.duplicated(subset=['job_id', 'job_arr_task_id'], keep='first') 230 | df.loc[duplicates, ['colors', 'user_id', 'job_id', 'job_arr_task_id', 'job_name']] = self.space_placeholder 231 | 232 | no_arr_job = df['job_arr_task_id'].replace(self.space_placeholder, pd.NA).isna().all() 233 | display_colnames = self.default_display_colnames.copy() 234 | aligns = self.default_aligns.copy() 235 | if no_arr_job: 236 | del df['job_arr_task_id'] 237 | del aligns['job_arr_task_id'] 238 | 
display_colnames.remove('ARR_IDX') 239 | else: 240 | df['job_arr_task_id'] = df['job_arr_task_id'].fillna(self.space_placeholder) 241 | 242 | return df, display_colnames, aligns 243 | 244 | def build_records_from_jobs(self, jobs): 245 | records = [] 246 | for job in jobs: 247 | for nodename, tres_dict in job.tres_dict.items(): 248 | record = [ 249 | job.userid, job.id, job.arrayjobid, job.arraytaskid, job.name, nodename, 250 | ','.join(map(str, tres_dict['gpus'])), len(tres_dict['cpus']), int(tres_dict['mem']) 251 | ] 252 | records.append(record) 253 | return records 254 | 255 | def calculate_widths(self, df, display_colnames): 256 | tmp_df_for_calculating_width = pd.concat([df.astype(str), pd.DataFrame([display_colnames], columns=df.columns)], ignore_index=True) 257 | widths = tmp_df_for_calculating_width.applymap(lambda elem: len(str(elem))).max() 258 | widths['colors'] = 8 259 | return widths 260 | 261 | 262 | def get_color_from_idx(idx:int): 263 | color = cmap(((11*idx) % 256) / 256)[:-1] # RGB 264 | color = list(map(lambda x: int(x*255), color)) 265 | return color 266 | 267 | 268 | def colorize(source:str, color:List[int], background:bool=False): 269 | if not background: 270 | output = fg(*color) + source + fg.rs 271 | else: 272 | output = bg(*color) + source + bg.rs 273 | return output 274 | 275 | 276 | def make_bold(source:str): 277 | output = ef.b + source + ef.rs 278 | return output 279 | -------------------------------------------------------------------------------- /slurm_gres_viz/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from pathlib import Path 5 | 6 | if __name__ == '__main__': # for test 7 | from args import args 8 | from visualizer import SlurmTresVisualizer 9 | else: # slurm_gres_viz.main 10 | from .args import args 11 | from .visualizer import SlurmTresVisualizer 12 | 13 | 14 | # TODO: GPU 정보 받아오는 건 GPU 옵션 받았을 때만 해야 함 15 | # TODO: 했는데도 느려서 프로파일링 해야 함 16 | 17 | 
def get_display_options():
    """Collect display flags from the parsed CLI args into kwargs for the visualizer.

    ``--full`` implies index, GPU-memory and GPU-util display all at once.
    """
    return {
        'show_index': args.full or args.index,
        'show_gpu_memory': args.full or args.gpu_memory,
        'show_gpu_util': args.full or args.gpu_util,
        'show_only_mine': args.only_mine,
    }


def looper(func):  # decorator
    """Run ``func`` once, or repeatedly every ``args.loop`` seconds.

    A negative ``args.loop`` (the default, -1) means "run once and exit";
    otherwise the dashboard is refreshed in an endless loop, like ``nvidia-smi -l``.
    """
    from functools import wraps

    @wraps(func)  # preserve the wrapped function's name/docstring for debugging
    def wrapper(**display_options):
        if args.loop < 0:
            func(**display_options)
        else:
            while True:
                func(**display_options)
                print('\n\n')
                time.sleep(args.loop)
    return wrapper


@looper
def run(**display_options):
    """Query Slurm via ``scontrol`` and render one dashboard frame.

    ``scontrol show`` separates records with a blank line, hence the split
    on ``'\\n\\n'``.
    """
    strings = {
        'node_strings': os.popen('scontrol show nodes').read().strip().split('\n\n'),
        'job_strings': os.popen('scontrol show job -d -a').read().strip().split('\n\n'),
    }
    viz = SlurmTresVisualizer(**strings, **display_options)
    viz.show()


def main():
    """Default console entry point (wired up in setup.py ``console_scripts``)."""
    display_options = get_display_options()
    run(**display_options)


def forced_main():
    """Entry point that forces ``--only-mine`` for every non-admin user."""
    display_options = get_display_options()
    # .get(): USER may be unset (cron jobs, containers) — don't crash with KeyError.
    if 'admin' not in os.environ.get('USER', ''):
        display_options['show_only_mine'] = True
    run(**display_options)


if __name__ == '__main__':  # testing
    if args.test_from_log:
        # Replay saved `scontrol` output from test/logs/<case>/{node,job}.log
        # instead of talking to a live Slurm cluster.
        strings = {}
        for p_case_dir in Path('test/logs').glob('**/*'):
            for obj in ['node', 'job']:
                p_obj = (p_case_dir / obj).with_suffix('.log')
                if p_obj.is_file():
                    with p_obj.open() as f:
                        strings[f'{obj}_strings'] = f.read().strip().split('\n\n')
                else:
                    break
            else:  # no break: both node.log and job.log were found for this case
                print(p_case_dir.name)
                SlurmTresVisualizer(**strings)
                print()
    else:
        # Profile a single live run (see TODO above: rendering is still slow).
        from cProfile import Profile
        from pstats import Stats
        profiler = Profile()
        profiler.runcall(main)
        stats = Stats(profiler)
        stats.strip_dirs()
        stats.sort_stats('tottime')
        stats.print_stats(20)
        stats.sort_stats('cumulative')
        stats.print_stats(20)
-------------------------------------------------------------------------------- /slurm_gres_viz/parsers.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | import re 3 | import os 4 | import csv 5 | 6 | 7 | def parse_jobstring(job_string:str): 8 | """Parse job string for a single job. 9 | """ 10 | userid, = re.findall(r'UserId=(\S+)', job_string) 11 | jobid, = re.findall(r'^JobId=(\d+)', job_string) # Why ^? => not to capture ArrayJobId 12 | arrayjobid, = re.findall(r'ArrayJobId=(\d+)', job_string) or (None,) 13 | arraytaskid, = re.findall(r'ArrayTaskId=(\d+)', job_string) or (None,) 14 | jobname, = re.findall(r'JobName=(.*)', job_string) 15 | job_tres_strings = re.findall(r'\s(Nodes=.*)', job_string) # \s: white-space-like char 16 | job_tres_dict = dict(sum([list(job_tres_string_to_dict(job_tres_string).items()) for job_tres_string in job_tres_strings], [])) # sum up tres dicts 17 | return userid, jobid, arrayjobid, arraytaskid, jobname, job_tres_dict 18 | 19 | 20 | def parse_nodestring(node_string:str): 21 | nodename, = re.findall(r'NodeName=(\S+)', node_string) # \S: non-white-space-like char 22 | state, = re.findall(r'State=([\w\+]+)', node_string) 23 | num_cpus_alloc, = re.findall(r'CPUAlloc=(\d+)', node_string) 24 | num_cpus_total, = re.findall(r'CPUTot=(\d+)', node_string) 25 | num_gpus_alloc, = re.findall(r'AllocTRES=.*gres/gpu=(\d)', node_string) or [0] 26 | num_gpus_total, = re.findall(r'Gres=[a-zA-Z]+:(\d)', node_string) 27 | mem_alloc, = re.findall(r'AllocMem=(\d+)', node_string) 28 | mem_total, = re.findall(r'RealMemory=(\d+)', node_string) 29 | return nodename, state, int(num_cpus_alloc), int(num_cpus_total), int(num_gpus_alloc), int(num_gpus_total), MiB2GiB(float(mem_alloc)), MiB2GiB(float(mem_total)) 30 | 31 | 32 | def job_tres_string_to_dict(job_tres_string:str) -> Dict[str,List[Union[int,List[int],float]]]: 33 | """Convert TRES string of a job to python object 34 | 35 | 
Parameters 36 | ---------- 37 | job_tres_string : str 38 | * ex1) `"Nodes=node1 CPU_IDs=0-31 Mem=0 GRES=gpu(IDX:4-7)"` 39 | * ex2) `"Nodes=vll[2-3] CPU_IDs=8-11 Mem=5120 GRES=gpu:1(IDX:2)` 40 | 41 | Returns 42 | ------- 43 | Dict[str,List[int]] 44 | converted allocated TRES infos in appropriate python object 45 | * ex1) `{'node1': {'cpus': [0, 1, ..., 31], 'gpus': [4, 5, 6, 7]}, 'mem': 0}` 46 | * ex2) `{'vll2': {'cpus': [8, 9, 10, 11], 'gpus': [2], 'mem': 5120}, 'vll3': {'cpus': [8, 9, 10, 11], 'gpus': [2], 'mem': 5120}}` 47 | """ 48 | nodenames, = re.findall(r'Nodes=(\S+)', job_tres_string) 49 | nodenames = resolve_hostname_expr(nodenames) 50 | # nodenames = os.popen(f'scontrol show hostname {nodenames}').read().split() 51 | cpu_indices, = re.findall(r'CPU_IDs=([-,\d]+)', job_tres_string) or [''] 52 | gpu_indices, = re.findall(r'IDX:([-,\d]+)\)', job_tres_string) or [''] 53 | cpu_indices = resolve_index_expr(cpu_indices) 54 | gpu_indices = resolve_index_expr(gpu_indices) 55 | mem, = map(MiB2GiB, map(int, re.findall(r'Mem=(\d+)', job_tres_string))) 56 | job_tres_dict = { 57 | nodename: {'cpus': cpu_indices, 'gpus': gpu_indices, 'mem': mem} 58 | for nodename in nodenames 59 | } 60 | return job_tres_dict 61 | 62 | 63 | def resolve_hostname_expr(expr:str) -> List[str]: 64 | """# Vars (example) 65 | ex0) A single-node job 66 | @expr: `"batch1"` 67 | @return: `["batch1"]` 68 | 69 | ex1) 70 | @expr: `"batch[1,3-5]"` 71 | @return: `["batch1", "batch3", "batch4", "batch5"]` 72 | 73 | ex2) Dashed hostname 74 | @expr: `"debug-g[1-4]"` 75 | @return: `["debug-g1", "debug-g2", "debug-g3", "debug-g4"]` 76 | 77 | ex3) Multiple host ranges 78 | @expr: `"debug-g[1,3-4],batch[1-2]"` 79 | @return: `["debug-g1", "debug-g3", "debug-g4", "batch1", "batch2"]` 80 | """ 81 | # TODO: csv랑 re랑 비교 82 | # (?!...): negative lookahead assertion, matched only when ... does not follow 83 | # [^\[]*\]: ...] 
84 | # ==> `,...]` will be ignored 85 | splitted_host_ranges = re.split(r',(?![^\[]*\])', expr) 86 | all_hostnames = [] 87 | for splitted_host_range in splitted_host_ranges: 88 | m = re.match(r'(?P[\w-]+)(\[(?P[\d,-]+)\])?', splitted_host_range) 89 | if m['range'] is not None: 90 | indices = resolve_index_expr(m['range']) 91 | hostnames = [f"{m['hostname_root']}{i}" for i in indices] 92 | else: 93 | hostnames = [m['hostname_root']] 94 | all_hostnames += hostnames 95 | return all_hostnames 96 | 97 | 98 | def resolve_index_expr(expr:str) -> List[int]: 99 | '''# Vars (example) 100 | @expr: `"0-1,3"` 101 | @return: `[0, 1, 3]` 102 | ''' 103 | if expr: 104 | # this function is called many times and slow speed of os.popen(...).read() matters 105 | # indices:List[str] = os.popen(f'scontrol show hostname [{expr_string}]').read().split() 106 | comma_splitted = expr.split(',') 107 | indices_lists = list(map(resolve_element_expr, comma_splitted)) 108 | indices = sum(indices_lists, []) 109 | return sorted(set(map(int, indices))) 110 | else: 111 | return [] 112 | 113 | 114 | def resolve_element_expr(element_expr:str): 115 | """# Vars (example) 116 | ex1) 117 | @element_expr: `"0-3"` 118 | @return: `[0, 1, 2, 3]` 119 | 120 | ex2) 121 | @element_expr: `"4"` 122 | @return: `[4]` 123 | """ 124 | dash_splitted = list(map(int, element_expr.split('-'))) 125 | if len(dash_splitted) == 1: 126 | return dash_splitted 127 | elif len(dash_splitted) == 2: 128 | x, y = dash_splitted 129 | return list(range(x, y+1)) 130 | else: 131 | raise 132 | 133 | 134 | def MiB2GiB(MiB:int) -> float: 135 | return MiB / 1024 136 | -------------------------------------------------------------------------------- /slurm_gres_viz/pre_main.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import re 4 | import matplotlib.pyplot as plt 5 | from sty import fg 6 | import pprint 7 | if __name__ == '__main__': # for test 8 | from args 
def prettify_gres(jobs, node_attrs):
    """Print one line per node: the colorized GPU stars plus CPU/memory usage."""
    star_strings = get_stars(jobs, node_attrs)
    # pad every nodename to the longest one so the columns line up
    name_width = max(map(len, star_strings))
    for name, stars in star_strings.items():
        res_string = get_res_strings(name, node_attrs)
        print(f'{name:<{name_width}}: [GPUs] {stars} {res_string}')
[max(len(column_names[-2]), *[len(','.join(str(e) for e in list(job['cpus'].values())[0])) for job in jobs])]\ 59 | + [max(len(column_names[-1]), *[len(','.join(str(e) for e in list(job['mem'].values())[0])) for job in jobs])]\ 60 | 61 | delimiter = ' ' 62 | width = sum(widths) + (len(column_names)-1) * len(delimiter) 63 | print(f'\n{" LEGENDS ":=^{width}}') 64 | 65 | jobs_and_colors = get_jobs_and_colors(jobs) 66 | indent = sum(widths[:-4]) + (len(column_names)-5) * len(delimiter) 67 | header = delimiter.join([f'{column_name:{width}s}' for column_name, width in zip(column_names, widths) if width]) 68 | lines = [] 69 | for job, color in jobs_and_colors: 70 | # line_elems = [f'{color}********{bcolors.CEND}'] 71 | line_elems = [colorize('********', color)] 72 | for key, width in zip(keys, widths[1:-4]): 73 | if job[key] is not None: 74 | if key == 'jobid' and job['arrayjobid'] is not None: 75 | line_elems += [f"{job['arrayjobid']:<{width}}"] 76 | else: 77 | line_elems += [f"{job[key]:<{width}}"] 78 | elif width != 0: # this job does not have the value but some others do 79 | line_elems += [' ' * width] 80 | line_elems += [render_resource_string(job, indent, [widths[i] for i in range(-4, -1)] + [max(len(list(job['mem'].values())[0]) for job in jobs)])] 81 | line = delimiter.join(line_elems) 82 | lines += [line] 83 | body = '\n'.join(lines) 84 | print(header) 85 | print(body) 86 | 87 | 88 | def get_stars(jobs, node_attrs): 89 | jobs_and_colors = get_jobs_and_colors(jobs) 90 | stars = {nodename: get_gres_components(attr['num_gpus']) for nodename, attr in node_attrs.items()} 91 | for job, color in jobs_and_colors: 92 | for nodename, gpu_indices in job['gpus'].items(): 93 | for gpu_idx in gpu_indices: 94 | # stars[nodename][gpu_idx] = f'{color}{stars[nodename][gpu_idx]}{bcolors.CEND}' 95 | stars[nodename][gpu_idx] = colorize(f'{stars[nodename][gpu_idx]}', color) 96 | stars = {nodename: ''.join(star_components) for nodename, star_components in stars.items()} 97 | 98 | # 
def get_res_strings(nodename, node_attrs):
    """Render the `[CPUs] alloc/total [Mem] alloc/total GiB` suffix for one node.

    Column widths are computed over ALL nodes so that the per-node lines
    printed by `prettify_gres` stay aligned.
    mem_size / alloc_mem are assumed to already be in GiB -- TODO confirm
    against `get_node_attrs`.
    """
    attrs = node_attrs[nodename]
    # widths = longest rendered value across every node, per column
    # (total-CPU width was measured from alloc_cpus before: copy-paste bug)
    width_cpus = max(len(str(a['num_cpus'])) for a in node_attrs.values())
    width_alloc_cpus = max(len(str(a['alloc_cpus'])) for a in node_attrs.values())
    width_mem = max(len(str(a['mem_size'])) for a in node_attrs.values())
    width_alloc_mem = max(len(str(a['alloc_mem'])) for a in node_attrs.values())
    string_cpu = f'[CPUs] {attrs["alloc_cpus"]:>{width_alloc_cpus}}/{attrs["num_cpus"]:>{width_cpus}}'
    string_mem = f'[Mem] {float(attrs["alloc_mem"]):{width_alloc_mem}g}/{attrs["mem_size"]:{width_mem}g} GiB'
    return ' ' + string_cpu + ' ' + string_mem
def check_job_parent_jobarray(job_string):
    """Return whether `job_string` describes the parent entry of a job array.

    The parent entry is the array job itself (JobId == ArrayJobId) while it
    is RUNNING or PENDING. Plain, non-array jobs return False.
    """
    jobid, = re.findall(r'^JobId=(\d+)', job_string)
    # non-array jobs carry no ArrayJobId at all -- without the fallback the
    # one-element unpack raises ValueError instead of reaching the None check
    # (same pattern as get_running_job_with_gres_attrs)
    arrayjobid, = re.findall(r'ArrayJobId=(\d+)', job_string) or (None,)
    jobstate, = re.findall(r'JobState=([A-Z]+)', job_string)
    if arrayjobid is None \
            or jobstate not in ['RUNNING', 'PENDING'] \
            or jobid != arrayjobid:
        return False
    else:
        return True
def parse_exp(exp_string):
    """Expand a Slurm index expression, e.g. '0-1,3' -> [0, 1, 3].

    Order follows the expression; elements are not de-duplicated or sorted.
    """
    indices = []
    for token in exp_string.split(','):      # '0-1,3' -> ['0-1', '3']
        if '-' in token:                     # range element, e.g. '0-1'
            lo, hi = (int(part) for part in token.split('-'))
            indices.extend(range(lo, hi + 1))
        else:                                # single element, e.g. '3'
            indices.append(int(token))
    return indices
slurm.conf?? 24 | if dcgm_stat is None or 'DCGM_FI_DEV_GPU_UTIL' not in dcgm_stat: 25 | self.util = 0 26 | self.vram_alloc = 0 27 | self.vram_total = 0 28 | self.invalid = True 29 | else: 30 | self.util = float(dcgm_stat['DCGM_FI_DEV_GPU_UTIL']) 31 | self.vram_alloc = MiB2GiB(float(dcgm_stat['DCGM_FI_DEV_FB_USED'])) 32 | self.vram_total = MiB2GiB(float(dcgm_stat['DCGM_FI_DEV_FB_FREE'])) + self.vram_alloc 33 | self.invalid = False 34 | 35 | 36 | class Node: 37 | def __init__(self, node_string:str, node_ip_dict:Union[Dict[str,str],None], request_exporter:bool=False): 38 | """# Vars (example) 39 | @nodename: `"vll3"` 40 | @num_cpus: `96` 41 | @num_gpus: `8` 42 | @mem_total: `336833` 43 | @public_ip: `"xxx.xxx.xxx.xxx"` 44 | @gpu_infos: `[{"gpuname": "", "gpuutil": 83, "vram_used": 18832, "vram_total": 24080}, ...]` 45 | @cpu_load: `2.10` 46 | @mem_used: `61440` 47 | """ 48 | # getting infos from node_string (fast) 49 | self.node_string = node_string 50 | nodename, state, num_cpus_alloc, num_cpus_total, num_gpus_alloc, num_gpus_total, mem_alloc, mem_total = parse_nodestring(self.node_string) 51 | self.name = nodename # node_string[v], exporter 52 | self.states:List[str] = state.split('+') # ex: IDLE+DRAIN 53 | self.is_state_ok = all([invalid_state not in self.states for invalid_state in INVALID_NODE_STATES]) 54 | self.mem_alloc = mem_alloc # node_string[v], exporter 55 | self.mem_total = mem_total # node_string 56 | 57 | self.num_cpus_total = num_cpus_total # node_string 58 | self.num_cpus_alloc = num_cpus_alloc # node_string 59 | self.num_gpus_alloc = num_gpus_alloc # node_string 60 | self.num_gpus_total = num_gpus_total # node_string 61 | 62 | # ========================================================== 63 | # getting infos from exporters (slow) 64 | # todo: show 옵션을 받아오고, node가 정상인 상태에서만 가져와야 됨 65 | 66 | self.request_exporter = request_exporter 67 | if self.request_exporter: 68 | if self.is_state_ok: 69 | self.public_ip = node_ip_dict[self.name] # given 70 | # 
self.node_metrics = self.get_node_metrics() 71 | self.gpu_metrics, self.gpus = self.get_gpu_metrics() 72 | if any([gpu.invalid for gpu in self.gpus]): 73 | self.is_state_ok = False 74 | else: 75 | self.gpus = [GPU() for _ in range(self.num_gpus_total)] 76 | # self.cpu_loads = [ 77 | # self.node_metrics['node_load1'].samples[0].value, 78 | # self.node_metrics['node_load5'].samples[0].value, 79 | # self.node_metrics['node_load15'].samples[0].value, 80 | # ] # node_string, exporter[v] 81 | 82 | # def get_node_metrics(self) -> dict: 83 | # response = requests.get(f'http://{self.public_ip}:9100/metrics') # node exporter 84 | # if response.ok: 85 | # metrics = self.html2metrics(response.text) 86 | # return {metric.name: metric for metric in metrics} 87 | # else: 88 | # raise # The metric server does not respond 89 | 90 | def get_gpu_metrics(self) -> Tuple[dict, List[GPU]]: 91 | response = requests.get(f'http://{self.public_ip}:9400/metrics', timeout=.1) # dcgm exporter 92 | if response.ok: 93 | gpu_metrics = self.html2metrics(response.text) 94 | gpus = self.metrics2gpu_objs(gpu_metrics) 95 | return gpu_metrics, gpus 96 | else: 97 | raise # The metric server does not respond 98 | 99 | def metrics2gpu_objs(self, metrics) -> List[GPU]: 100 | gpu_indices = [] 101 | for metric in metrics: 102 | if metric.samples and 'gpu' in metric.samples[0].labels: 103 | gpu_indices.append(metric.samples[0].labels['gpu']) 104 | num_gpus = len(set(gpu_indices)) 105 | dcgm_stats:List[Dict[str,float]] = [{} for _ in range(num_gpus)] 106 | for metric in metrics: 107 | if metric.samples and 'gpu' in metric.samples[0].labels: 108 | sample = metric.samples[0] 109 | gpu_idx = int(sample.labels['gpu']) 110 | dcgm_stats[gpu_idx][sample.name] = sample.value 111 | return [GPU(dcgm_stat) for dcgm_stat in dcgm_stats] 112 | 113 | def html2metrics(self, html): 114 | soup = BeautifulSoup(html, 'html.parser') 115 | metrics = list(text_string_to_metric_families(soup.get_text())) 116 | return metrics 117 | 
    def get_node_infos(self):
        """Build one Node object per `scontrol show nodes` record.

        Exporter metrics (GPU memory / utilization) are fetched only when the
        caller asked for them; in that case each Node constructor issues a
        blocking HTTP request, so the nodes are built concurrently.
        """
        # exporter data is only needed for the GPU-memory / GPU-util displays
        request_exporter = self.show_gpu_memory or self.show_gpu_util
        # NOTE(review): assumes every Slurm nodename appears in /etc/hosts -- confirm
        node_ip_dict = get_ips_from_etchosts() if request_exporter else None
        def get_node(node_string):
            # closure over node_ip_dict / request_exporter for ThreadPool.map
            return Node(
                node_string=node_string, node_ip_dict=node_ip_dict,
                request_exporter=request_exporter
            )
        if request_exporter:
            # one thread per node: Node.__init__ blocks on the exporter HTTP call
            with ThreadPool(len(self.node_strings)) as t:
                nodes = t.map(get_node, self.node_strings)
        else:
            # no network I/O involved -- sequential construction is fast enough
            nodes = [get_node(node_string) for node_string in self.node_strings]
        return nodes
def get_ips_from_etchosts() -> Dict[str, str]:
    """Parse /etc/hosts into a {hostname: ip} mapping.

    Only the first hostname token after each IPv4 address is captured; when
    a hostname appears on several lines, the last occurrence wins.
    """
    with open('/etc/hosts') as f:
        hosts_text = f.read()
    pair_pattern = r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*([\w-]*)'
    mapping: Dict[str, str] = {}
    for ip, hostname in re.findall(pair_pattern, hosts_text):
        mapping[hostname] = ip
    return mapping