├── slurmui ├── __init__.py ├── slurmui.tcss ├── slurmui_cli.py ├── debug_strings.py └── slurmui.py ├── .gitignore ├── asset ├── demo.png └── verbose_info.png ├── pyproject.toml └── README.md /slurmui/__init__.py: -------------------------------------------------------------------------------- 1 | from .slurmui import run_ui -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build/ 3 | dist/ 4 | slurmui.egg-info/ 5 | -------------------------------------------------------------------------------- /asset/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShenhanQian/slurmui/HEAD/asset/demo.png -------------------------------------------------------------------------------- /asset/verbose_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShenhanQian/slurmui/HEAD/asset/verbose_info.png -------------------------------------------------------------------------------- /slurmui/slurmui.tcss: -------------------------------------------------------------------------------- 1 | Header { 2 | height: 1; 3 | text-style: bold; 4 | background: $boost; 5 | } 6 | 7 | Footer { 8 | text-style: bold; 9 | background: #303030; 10 | } 11 | 12 | Static { 13 | width: 1fr; 14 | } 15 | 16 | DataTable { 17 | background: $boost; 18 | } 19 | 20 | Tabs { 21 | background: $boost; 22 | } 23 | 24 | Tab { 25 | text-style: bold; 26 | padding-left: 1; 27 | padding-right: 3; 28 | } 29 | 30 | .time-tab { 31 | text-style: none; 32 | color: $foreground; 33 | } 34 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "slurmui" 3 | version = "1.1.17" 4 | description = 
"Terminal UI for Slurm" 5 | authors = [ 6 | {name = "Shenhan Qian", email = "shenhan.qian@tum.de"} 7 | ] 8 | requires-python = ">=3.6" 9 | license = {text = "MIT"} 10 | classifiers = [ 11 | "Programming Language :: Python :: 3", 12 | "License :: OSI Approved :: MIT License", 13 | "Operating System :: OS Independent" 14 | ] 15 | dependencies = [ 16 | "textual>=0.61.0", 17 | "pandas", 18 | "gpustat", 19 | ] 20 | [project.scripts] 21 | slurmui = "slurmui.slurmui_cli:slurmui_cli" 22 | 23 | [build-system] 24 | requires = ["hatchling"] 25 | build-backend = "hatchling.build" 26 | -------------------------------------------------------------------------------- /slurmui/slurmui_cli.py: -------------------------------------------------------------------------------- 1 | from slurmui import run_ui 2 | from argparse import ArgumentParser 3 | 4 | def slurmui_cli(): 5 | # adding arguments later 6 | parser = ArgumentParser("SLURM UI") 7 | parser.add_argument("-v", "--verbose", action='store_true') 8 | parser.add_argument("-c", "--cluster", help="Specify the name of the cluster") 9 | parser.add_argument("-i", "--interval", help="Specify the interval in seconds to refresh the UI", type=int, default=5) 10 | parser.add_argument("-r", "--history-range", help="Specify the time range of history jobs to load", type=str, default="1 week") 11 | args = parser.parse_args() 12 | run_ui(verbose=args.verbose, cluster=args.cluster, interval=args.interval) 13 | 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SlurmUI 2 | 3 | Enhanced command-line UI to ease working with slurm. 4 | Written in Python, derived from [SlurmUI](https://github.com/SirWyver/slurmui). 5 | 6 | Viewing and managing 7 | - GPUs 8 | - Jobs in the history 9 | - Jobs in the queue 10 | - Logs for current and past jobs 11 | 12 |
13 | 14 |
15 | 16 | ## Install and run 17 | ```shell 18 | pip install -U git+https://github.com/ShenhanQian/slurmui.git 19 | slurmui 20 | ``` 21 | Optional arguments: 22 | - `-i` update interval in seconds. (5 by default. Set to 0 to disable). 23 | - `-v` verbose mode (printing info and error to the info panel). 24 | - `-r` time range of history jobs to load (1 week by default) 25 | - `-c` cluster name (deprecated as the latest version of SlurmUI does not have cluster-specific configuration). 26 | 27 | ## Basics 28 | Under the hood, SlurmUI relies on three basic slurm commands: 29 | - `sinfo` for information of nodes, GPUs, etc. 30 | - `squeue` for current jobs in the queue 31 | - `sacct` for history jobs 32 | 33 | Make sure you can get meaningful output from these commands on your cluster before trying SlurmUI. 34 | 35 | To debug, you could run `slurmui` (e.g., via `srun`) with `-i 0` to disable auto update and `-v` to force verbose logging. Then, you will see the full commands that slurmui sends to slurm in the info panel. 36 | 37 |
38 | 39 |
40 | 41 | ## Supported Clusters 42 | - [TUM CVG](https://cvg.cit.tum.de/) 43 | - [TUM VCG](https://www.niessnerlab.org/) 44 | - [LRZ AI](https://doku.lrz.de/lrz-ai-systems-11484278.html) 45 | 46 | > [!NOTE] 47 | > If SlurmUI does not work on your cluster, try the debugging suggestions in [Basics](README.md#basics) and feel free to open an issue. 48 | 49 | ## Contributions 50 | Open to contribution including but not limited to: 51 | - Improving startup/launch speed 52 | - Enhancing multithreading and concurrency handling 53 | - Strengthening crash recovery and process resiliency 54 | - Expanding features or addressing edge cases 55 | -------------------------------------------------------------------------------- /slurmui/debug_strings.py: -------------------------------------------------------------------------------- 1 | ### ---> FOR DEBUGGING 2 | 3 | SQUEUE_DEBUG = """ 4 | JOBID PA NAME USER STATE TIME TIME_L NODE NODELIST(REASON) 5 | 190689 in interactive bob RUNNING 9:46 6:00:00 1 lothlann 6 | 190663 su triplane3_ll bob RUNNING 2:56:36 4-00:00:00 1 himring 7 | 190662 su triplane2_ll bob RUNNING 2:56:40 4-00:00:00 1 himring 8 | 190661 su triplane1_ll bob RUNNING 2:56:46 4-00:00:00 1 himring 9 | 190660 su triplane1_l bob RUNNING 2:57:04 4-00:00:00 1 himring 10 | 190659 su triplane2_l bob RUNNING 2:57:10 4-00:00:00 1 balrog 11 | 190658 su triplane3_l bob RUNNING 2:57:28 4-00:00:00 1 balrog 12 | 190657 su triplane3_m bob RUNNING 2:57:31 4-00:00:00 1 balrog 13 | 190656 su triplane2_m bob RUNNING 2:57:36 4-00:00:00 1 balrog 14 | 190655 su triplane1_m bob RUNNING 2:57:39 4-00:00:00 1 balrog 15 | 190654 su triplane1_m bob RUNNING 2:57:43 4-00:00:00 1 angmar 16 | 190651 su triplane0_m bob RUNNING 3:03:06 4-00:00:00 1 valinor 17 | 190650 su triplane0_ll bob RUNNING 3:03:13 4-00:00:00 1 valinor 18 | 190649 su triplane0_l bob RUNNING 3:03:17 4-00:00:00 1 valinor 19 | """ 20 | 21 | SINFO_DEBUG = """ 22 | HOSTNAMES GRES GRES_USED STATE 23 | andram gpu:a100:4,mps:a100:400 
gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix 24 | andram gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix 25 | angmar gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 26 | angmar gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 27 | balar gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix 28 | balar gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix 29 | balrog gpu:rtx_3090:8,mps:rtx_3090:800 gpu:rtx_3090:7(IDX:0-3,5-7),mps:rtx_3090:0(IDX:N/A) mix 30 | balrog gpu:rtx_3090:8,mps:rtx_3090:800 gpu:rtx_3090:7(IDX:0-3,5-7),mps:rtx_3090:0(IDX:N/A) mix 31 | char gpu:gtx_1080:4,mps:gtx_1080:800 gpu:gtx_1080:4(IDX:0-3),mps:gtx_1080:0(IDX:N/A) mix 32 | char gpu:gtx_1080:4,mps:gtx_1080:800 gpu:gtx_1080:4(IDX:0-3),mps:gtx_1080:0(IDX:N/A) mix 33 | daidalos gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 34 | daidalos gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 35 | doriath gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix 36 | doriath gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix 37 | erebor gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 38 | erebor gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 39 | eriador gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 40 | eriador gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 41 | falas gpu:a100:4,mps:a100:400 gpu:a100:3(IDX:0-2),mps:a100:0(IDX:N/A) mix 42 | falas gpu:a100:4,mps:a100:400 gpu:a100:3(IDX:0-2),mps:a100:0(IDX:N/A) mix 43 | gimli gpu:rtx_3090:8,mps:rtx_3090:800 gpu:rtx_3090:8(IDX:0-7),mps:rtx_3090:0(IDX:N/A) mix 44 | gimli gpu:rtx_3090:8,mps:rtx_3090:800 gpu:rtx_3090:8(IDX:0-7),mps:rtx_3090:0(IDX:N/A) mix 45 | gondor 
gpu:rtx_2080:9,mps:rtx_2080:900 gpu:rtx_2080:9(IDX:0-8),mps:rtx_2080:0(IDX:N/A) mix 46 | gondor gpu:rtx_2080:9,mps:rtx_2080:900 gpu:rtx_2080:9(IDX:0-8),mps:rtx_2080:0(IDX:N/A) mix 47 | himring gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 48 | himring gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 49 | hithlum gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 50 | hithlum gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 51 | ikarus gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 52 | ikarus gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 53 | lothlann gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix 54 | lothlann gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix 55 | moria gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix 56 | moria gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix 57 | pegasus gpu:gtx_1080:8,mps:gtx_1080:800 gpu:gtx_1080:0(IDX:N/A),mps:gtx_1080:0(IDX:N/A) idle 58 | pegasus gpu:gtx_1080:8,mps:gtx_1080:800 gpu:gtx_1080:0(IDX:N/A),mps:gtx_1080:0(IDX:N/A) idle 59 | ramdal gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix 60 | ramdal gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix 61 | seti gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:0(IDX:N/A),mps:rtx_2080:0(IDX:N/A) drain 62 | seti gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:0(IDX:N/A),mps:rtx_2080:0(IDX:N/A) drain 63 | sorona gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix 64 | sorona gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix 65 | tarsonis gpu:gtx_1080:4,mps:gtx_1080:400 
gpu:gtx_1080:0(IDX:N/A),mps:gtx_1080:0(IDX:N/A) idle 66 | tarsonis gpu:gtx_1080:4,mps:gtx_1080:400 gpu:gtx_1080:0(IDX:N/A),mps:gtx_1080:0(IDX:N/A) idle 67 | umoja gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:6(IDX:0,2-4,6-7),mps:rtx_2080:0(IDX:N/A) mix 68 | umoja gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:6(IDX:0,2-4,6-7),mps:rtx_2080:0(IDX:N/A) mix 69 | valinor gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 70 | valinor gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix 71 | """ -------------------------------------------------------------------------------- /slurmui/slurmui.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import io 3 | import importlib.metadata 4 | from textual.app import App, ComposeResult 5 | from textual.binding import Binding 6 | from textual.widgets import Header, Footer, RichLog, DataTable, Tabs, Tab 7 | from textual.containers import Container 8 | from rich.text import Text 9 | import subprocess 10 | import pandas as pd 11 | import re 12 | import os 13 | import threading 14 | from functools import wraps 15 | from datetime import datetime, timedelta 16 | import time 17 | import socket 18 | import signal 19 | 20 | 21 | DEBUG = False 22 | if DEBUG: 23 | from slurmui.debug_strings import SINFO_DEBUG, SQUEUE_DEBUG 24 | 25 | 26 | def run_in_thread(func): 27 | """Decorator to run a function in a separate thread""" 28 | @wraps(func) 29 | def wrapper(*args, **kwargs): 30 | thread = threading.Thread(target=func, args=args, kwargs=kwargs) 31 | thread.start() 32 | return wrapper 33 | 34 | def handle_error(func): 35 | """Decorator to wrap action methods with try-except logic""" 36 | @wraps(func) 37 | def wrapper(self, *args, **kwargs): 38 | try: 39 | return func(self, *args, **kwargs) 40 | except Exception as e: 41 | self.info_log.write(f"Error ({func.__name__}): {e}") 42 | os.system(f"echo 
'Error ({func.__name__}): {e}'") 43 | return wrapper 44 | 45 | 46 | class SlurmUI(App): 47 | # configuration that can be set from the command line 48 | verbose = False 49 | cluster = None 50 | interval = 10 51 | history_range = "1 week" # Default history range 52 | 53 | # internal states 54 | STAGE = { 55 | "action": "job", 56 | "job": { 57 | # "sort_column": 0, 58 | # "sort_ascending": True, 59 | }, 60 | "history": { 61 | "sort_column": 0, 62 | "sort_ascending": False, 63 | }, 64 | "node": { 65 | # "sort_column": 0, 66 | # "sort_ascending": True, 67 | }, 68 | } 69 | stats = {} 70 | selected_jobid = [] 71 | only_available_nodes = True 72 | only_completed_history = False 73 | only_running_jobs = False 74 | only_my_jobs = True 75 | 76 | theme = "textual-dark" 77 | selected_text_style = "bold on orange3" 78 | border_type = "solid" 79 | border_color = "white" 80 | 81 | CSS_PATH = "slurmui.tcss" 82 | TITLE = f"SlurmUI (v{importlib.metadata.version('slurmui')})" 83 | 84 | BINDINGS = [ 85 | Binding("g", "display_nodes", "GPUs"), 86 | Binding("h", "display_history_jobs", "History"), 87 | Binding("j", "display_jobs", "Jobs"), 88 | Binding("q", "abort", "Abort"), 89 | Binding("space", "select", "Select"), 90 | Binding("v", "select_inverse", "Inverse"), 91 | Binding("r", "refresh", "Refresh"), 92 | Binding("enter", "confirm", "Confirm", priority=True, key_display='enter'), 93 | Binding("s", "sort", "Sort"), 94 | Binding("d", "delete", "Delete"), 95 | Binding("G", "print_gpustat", "GPU"), 96 | Binding("l", "display_job_log", "Log"), 97 | Binding("L", "open_with_less", "Open with less"), 98 | Binding("J", "toggle_user_range", "State"), 99 | Binding("H", "toggle_history_range", "Range"), 100 | ] 101 | 102 | def compose(self) -> ComposeResult: 103 | self.header = Header() 104 | self.footer = Footer() 105 | 106 | self.tab_nodes = Tab("GPUs", id="node") 107 | self.tab_history = Tab("History", id="history") 108 | self.tab_jobs = Tab("Jobs", id="job") 109 | self.tab_time = 
Tab("Time", id="time", disabled=True, classes="time-tab") 110 | self.tabs = Tabs(self.tab_nodes, self.tab_history, self.tab_jobs, self.tab_time, id="tabs") 111 | self.tabs.can_focus = False 112 | 113 | self.node_table = DataTable(id="node_table") 114 | self.job_table = DataTable(id="job_table") 115 | self.history_table = DataTable(id="history_table") 116 | self.tables = { 117 | "job": self.job_table, 118 | "node": self.node_table, 119 | "history": self.history_table 120 | } 121 | self.active_table = "job" 122 | 123 | self.info_log = RichLog(wrap=True, highlight=True, id="info_log", auto_scroll=True) 124 | self.info_log.can_focus = False 125 | self.info_log.border_title = "Info" 126 | 127 | self.job_log = RichLog(wrap=True, highlight=True, id="job_log", auto_scroll=False) 128 | self.job_log_position = None 129 | 130 | 131 | yield self.header 132 | yield self.tabs 133 | yield Container(self.job_table, self.node_table, self.history_table, self.info_log, self.job_log) 134 | yield self.footer 135 | 136 | def on_mount(self): 137 | pass 138 | 139 | def on_ready(self) -> None: 140 | self.rewrite_table("job") 141 | self.rewrite_table("node") 142 | self.rewrite_table("history") 143 | self.tabs.active = 'job' 144 | if self.interval > 0: 145 | self.set_interval(self.interval, self.auto_refresh) 146 | 147 | def on_tabs_tab_activated(self, message): 148 | tab_id = message.tab.id 149 | if self.verbose: 150 | self.info_log.write(f"Tab activated: {tab_id}") 151 | 152 | self.STAGE.update({"action": tab_id}) 153 | self.update_table(tab_id) 154 | self.switch_display(tab_id) 155 | self.refresh_bindings() 156 | 157 | @handle_error 158 | def check_action(self, action: str, parameters): 159 | """Check if an action may run.""" 160 | if action == "display_nodes" and self.STAGE['action'] not in ['node', 'history', 'job']: 161 | return False 162 | elif action == "display_history_jobs" and self.STAGE['action'] not in ['node', 'history', 'job']: 163 | return False 164 | elif action == 
"display_jobs" and self.STAGE['action'] not in ['node', 'history', 'job']: 165 | return False 166 | elif action == "abort": 167 | pass 168 | elif action == "select" and self.STAGE['action'] not in ['job', 'select']: 169 | return False 170 | elif action == "select_inverse" and self.STAGE['action'] not in ['job', 'select']: 171 | return False 172 | elif action == "refresh" and self.STAGE['action'] not in ['job', 'history', 'node', 'job_log']: 173 | return False 174 | elif action == "confirm" and self.STAGE['action'] != 'delete': 175 | return False 176 | elif action == "sort" and self.STAGE['action'] not in ['job', 'history', 'node']: 177 | return False 178 | elif action == "delete" and self.STAGE['action'] not in ['job', 'select']: 179 | return False 180 | elif action == "print_gpustat" and self.STAGE['action'] != 'job': 181 | return False 182 | elif action == "display_job_log" and self.STAGE['action'] not in ['job', 'history']: 183 | return False 184 | elif action == "open_with_less" and self.STAGE['action'] not in ['job', 'job_log', 'history']: 185 | return False 186 | elif action == "toggle_user_range" and self.STAGE['action'] != 'job': 187 | return False 188 | elif action == "toggle_history_range" and self.STAGE['action'] != 'history': 189 | return False 190 | return True 191 | 192 | @handle_error 193 | def action_display_nodes(self): 194 | if self.STAGE[self.active_table]['updating']: 195 | return 196 | if self.STAGE["action"] in ["history", "job"]: 197 | self.STAGE.update({"action": "node"}) 198 | self.update_table("node") 199 | self.refresh_bindings() 200 | self.tabs.active = "node" 201 | elif self.STAGE["action"] == "node": 202 | self.only_available_nodes = not self.only_available_nodes 203 | self.rewrite_table("node", keep_state=True) 204 | self.switch_display("node") 205 | 206 | @handle_error 207 | def action_display_history_jobs(self): 208 | if self.STAGE[self.active_table]['updating']: 209 | return 210 | if self.STAGE["action"] in ["node", "job"]: 211 | 
self.STAGE.update({"action": "history"}) 212 | self.update_table("history") 213 | self.refresh_bindings() 214 | self.tabs.active = "history" 215 | elif self.STAGE["action"] == "history": 216 | self.only_completed_history = not self.only_completed_history 217 | self.rewrite_table("history", keep_state=True) 218 | self.switch_display("history") 219 | 220 | @handle_error 221 | def action_display_jobs(self): 222 | if self.STAGE[self.active_table]['updating']: 223 | return 224 | if self.STAGE["action"] in ["node", "history"]: 225 | self.STAGE.update({"action": "job"}) 226 | self.update_table("job") 227 | self.refresh_bindings() 228 | self.tabs.active = "job" 229 | elif self.STAGE["action"] == "job": 230 | self.only_running_jobs = not self.only_running_jobs 231 | self.rewrite_table("job", keep_state=True) 232 | self.refresh_bindings() 233 | self.switch_display("job") 234 | 235 | @handle_error 236 | def action_abort(self): 237 | if self.STAGE["action"] == "delete": 238 | self.info_log.write("Delete: aborted") 239 | self.selected_jobid = [] 240 | self.STAGE.pop("job_id", None) 241 | self.STAGE.pop("job_name", None) 242 | elif self.STAGE["action"] == "job_log": 243 | self.job_log_position = None 244 | self.STAGE.pop("job_id", None) 245 | self.STAGE.pop("job_name", None) 246 | self.STAGE.pop("log_fn", None) 247 | elif self.STAGE["action"] == "select": 248 | self.info_log.write("Select: none") 249 | self.selected_jobid = [] 250 | elif self.STAGE["action"] in ["node", "history"]: 251 | self.tabs.active = "job" 252 | action = self.tabs.active 253 | self.STAGE['action'] = action 254 | self.update_table(action) 255 | self.switch_display(action) 256 | self.refresh() 257 | 258 | @handle_error 259 | def action_select(self): 260 | if (self.STAGE["action"] == "job" and not self.selected_jobid) or self.STAGE["action"] == "select": 261 | i = self.tables[self.active_table].cursor_coordinate[0] 262 | value = str(self.tables[self.active_table].get_cell_at((i, 0))) 263 | 264 | job_id = 
self._get_selected_job() 265 | if job_id in self.selected_jobid: 266 | self.selected_jobid.remove(job_id) 267 | self.tables[self.active_table].update_cell_at((i, 0), value) 268 | else: 269 | self.selected_jobid.append(job_id) 270 | self.tables[self.active_table].update_cell_at((i, 0), Text(str(value), style=self.selected_text_style)) 271 | 272 | if self.selected_jobid: 273 | self.STAGE["action"] = "select" 274 | self.info_log.write(f"Select: {' '.join(self.selected_jobid)}") 275 | else: 276 | self.STAGE["action"] = "job" 277 | self.info_log.write(f"Select: none") 278 | self.tables[self.active_table].action_cursor_down() 279 | self.refresh_bindings() 280 | 281 | @handle_error 282 | def action_select_inverse(self): 283 | assert self.STAGE["action"] in ["job", "select"] 284 | for i in range(len(self.tables[self.active_table].rows)): 285 | job_id = str(self.tables[self.active_table].get_cell_at((i, 0))) 286 | 287 | if job_id in self.selected_jobid: 288 | self.selected_jobid.remove(job_id) 289 | self.tables[self.active_table].update_cell_at((i, 0), job_id) 290 | else: 291 | self.selected_jobid.append(job_id) 292 | self.tables[self.active_table].update_cell_at((i, 0), Text(str(job_id), style=self.selected_text_style)) 293 | self.tables[self.active_table].move_cursor(row=i, column=0) 294 | if self.selected_jobid: 295 | self.STAGE["action"] = "select" 296 | self.info_log.write(f"Select: {' '.join(self.selected_jobid)}") 297 | else: 298 | self.STAGE["action"] = "job" 299 | self.info_log.write(f"Select: none") 300 | self.refresh_bindings() 301 | 302 | @run_in_thread 303 | @handle_error 304 | def auto_refresh(self): 305 | if self.verbose: 306 | self.info_log.write("Auto-refreshing...") 307 | if self.STAGE["action"] == "job": 308 | self.update_table("job") 309 | elif self.STAGE["action"] == "job_log": 310 | self.update_log(self.STAGE["log_fn"]) 311 | elif self.STAGE["action"] == "node": 312 | self.update_table("node") 313 | # elif self.STAGE["action"] == "history": 314 | # 
self.update_table("history") 315 | self.update_status() 316 | 317 | @handle_error 318 | def action_refresh(self): 319 | if self.STAGE["action"] == "job": 320 | self.rewrite_table("job", keep_state=True) 321 | elif self.STAGE["action"] == "job_log": 322 | self.update_log(self.STAGE["log_fn"]) 323 | elif self.STAGE["action"] == "node": 324 | self.rewrite_table("node", keep_state=True) 325 | elif self.STAGE["action"] == "history": 326 | self.rewrite_table("history", keep_state=True) 327 | self.update_status() 328 | 329 | @handle_error 330 | def action_confirm(self): 331 | # job to delete 332 | if self.STAGE["action"] == "delete": 333 | perform_scancel(self.STAGE['job_id']) 334 | self.info_log.write(f"Delete: {self.STAGE['job_id']}? succeeded") 335 | self.selected_jobid = [] 336 | self.update_table("job") 337 | self.STAGE["action"] = "job" 338 | self.refresh_bindings() 339 | 340 | @handle_error 341 | def action_sort(self): 342 | sort_column = self.tables[self.active_table].cursor_column 343 | if sort_column != self.STAGE[self.STAGE["action"]].get("sort_column"): 344 | self.STAGE[self.STAGE["action"]]["sort_ascending"] = False 345 | else: 346 | self.STAGE[self.STAGE["action"]]["sort_ascending"] = not self.STAGE[self.STAGE["action"]].get("sort_ascending", True) 347 | self.STAGE[self.STAGE["action"]]['sort_column'] = sort_column 348 | 349 | self.rewrite_table(self.active_table, keep_state=True) 350 | self.tables[self.active_table].move_cursor(row=0, column=sort_column) 351 | 352 | @handle_error 353 | def action_delete(self): 354 | if self.STAGE["action"] == "job": 355 | job_id = self._get_selected_job() 356 | self.info_log.write(f"Delete: {job_id}? press <> to confirm") 357 | self.STAGE.update({"action": "delete", "job_id": job_id}) 358 | elif self.STAGE["action"] == "select": 359 | self.info_log.write(f"Delete: {' '.join(self.selected_jobid)}? 
press <> to confirm") 360 | self.STAGE.update({"action": "delete", "job_id": ' '.join(self.selected_jobid)}) 361 | self.refresh_bindings() 362 | 363 | @handle_error 364 | def action_print_gpustat(self): 365 | if self.STAGE["action"] == "job": 366 | job_id = self._get_selected_job() 367 | gpustat = subprocess.check_output(f"""srun --jobid {job_id} gpustat""", shell=True, timeout=3).decode("utf-8").rstrip() 368 | self.info_log.write(gpustat) 369 | 370 | @handle_error 371 | def action_display_job_log(self): 372 | if self.STAGE["action"] in ["job", "history"]: 373 | job_id = self._get_selected_job() 374 | log_fn = self._get_log_fn(job_id) 375 | assert os.path.exists(log_fn), f"Log file not found: {log_fn}" 376 | self.STAGE.update({"action": "job_log", "log_fn": log_fn}) 377 | self.update_log(log_fn) 378 | self.switch_display("job_log") 379 | self.refresh_bindings() 380 | 381 | @handle_error 382 | def action_open_with_less(self): 383 | if self.STAGE["action"] in ["job", "job_log", "history"]: 384 | if 'log_fn' not in self.STAGE: 385 | job_id = self._get_selected_job() 386 | log_fn = self._get_log_fn(job_id) 387 | else: 388 | log_fn = self.STAGE['log_fn'] 389 | assert os.path.exists(log_fn), f"Log file not found: {log_fn}" 390 | with self.suspend(): 391 | # Save the current SIGINT handler 392 | original_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN) 393 | try: 394 | subprocess.run(['less', '+G', log_fn]) 395 | finally: 396 | # Restore the original SIGINT handler 397 | signal.signal(signal.SIGINT, original_sigint) 398 | self.refresh() 399 | 400 | @handle_error 401 | def action_toggle_user_range(self): 402 | if self.STAGE["action"] == "job": 403 | self.only_my_jobs = not self.only_my_jobs 404 | self.rewrite_table("job", keep_state=True) 405 | self.refresh_bindings() 406 | self.switch_display("job") 407 | 408 | @handle_error 409 | def action_toggle_history_range(self): 410 | if self.STAGE[self.active_table]['updating']: 411 | return 412 | if self.history_range == "1 
week": 413 | self.history_range = "1 month" 414 | elif self.history_range == "1 month": 415 | self.history_range = "4 months" 416 | elif self.history_range == "4 months": 417 | self.history_range = "1 year" 418 | else: 419 | self.history_range = "1 week" 420 | self.rewrite_table("history", keep_state=True) 421 | self.switch_display("history") 422 | 423 | def print_tab_prompt(self, tab_id): 424 | if not self.verbose: 425 | self.info_log.clear() 426 | 427 | if tab_id == "node": 428 | info = f"Press 'g' to toggle nodes: {'Available' if self.only_available_nodes else 'All'}" 429 | elif tab_id == "history": 430 | info = f"Press 'h' to toggle job states: {'Completed' if self.only_completed_history else 'All'}\t| " \ 431 | + f"Press 'H' to toggle history range: {self.history_range}" 432 | elif tab_id == "job": 433 | info = f"Press 'j' to toggle job states: {'Running' if self.only_running_jobs else 'All'}\t|" \ 434 | + f" Press 'J' to toggle user range: {'Me' if self.only_my_jobs else 'All'}" 435 | self.info_log.write(info) 436 | 437 | def switch_display(self, action): 438 | if self.verbose: 439 | self.info_log.write(f"Switch display: {action}") 440 | if action == "node": 441 | self.node_table.styles.height = "80%" 442 | self.active_table = action 443 | self.tables[self.active_table].focus() 444 | self.info_log.styles.height="20%" 445 | self.info_log.styles.border = (self.border_type, self.border_color) 446 | self.print_tab_prompt(action) 447 | 448 | self.job_table.styles.height = "0%" 449 | self.history_table.styles.height = "0%" 450 | self.job_log.styles.height="0%" 451 | self.job_log.styles.border = (self.border_type, self.border_color) 452 | self.job_log.clear() 453 | elif action == "history": 454 | self.history_table.styles.height = "80%" 455 | self.active_table = action 456 | self.tables[self.active_table].focus() 457 | self.info_log.styles.height="20%" 458 | self.info_log.styles.border = (self.border_type, self.border_color) 459 | self.print_tab_prompt(action) 460 | 
461 | self.job_table.styles.height = "0%" 462 | self.node_table.styles.height = "0%" 463 | self.job_log.styles.height="0%" 464 | self.job_log.styles.border = (self.border_type, self.border_color) 465 | self.job_log.clear() 466 | elif action == "job": 467 | self.job_table.styles.height = "80%" 468 | self.active_table = action 469 | self.tables[self.active_table].focus() 470 | self.info_log.styles.border = (self.border_type, self.border_color) 471 | self.info_log.styles.height="20%" 472 | self.print_tab_prompt(action) 473 | 474 | self.history_table.styles.height = "0%" 475 | self.node_table.styles.height = "0%" 476 | self.job_log.styles.border = (self.border_type, self.border_color) 477 | self.job_log.styles.height="0%" 478 | self.job_log.clear() 479 | elif action == "job_log": 480 | self.job_log.styles.height="100%" 481 | self.job_log.styles.border = (self.border_type, self.border_color) 482 | self.job_log.focus() 483 | 484 | self.job_table.styles.height="0%" 485 | self.node_table.styles.height="0%" 486 | self.history_table.styles.height="0%" 487 | self.info_log.styles.height="0%" 488 | self.info_log.styles.border = ("none", self.border_color) 489 | else: 490 | raise ValueError(f"Invalid action: {action}") 491 | 492 | @run_in_thread 493 | @handle_error 494 | def update_status(self): 495 | self.title = f"SlurmUI (v{importlib.metadata.version('slurmui')})" 496 | 497 | njobs = self.stats.get("njobs", 0) 498 | njobs_running = self.stats.get("njobs_running", 0) 499 | self.tab_jobs.label = f"Jobs: {njobs_running}/{njobs}" 500 | 501 | ngpus_avail = self.stats.get("ngpus_avail", 0) 502 | ngpus = self.stats.get("ngpus", 0) 503 | self.tab_nodes.label = f"GPUs: {ngpus_avail}/{ngpus}" 504 | 505 | nhistory = self.stats.get("nhistory", 0) 506 | nhistory_completed = self.stats.get("nhistory_completed", 0) 507 | self.tab_history.label = f"History: {nhistory_completed}/{nhistory}" 508 | 509 | self.tab_time.label = f"{socket.gethostname()} | {datetime.now().strftime('%H:%M:%S')}" 510 
| 511 | @handle_error 512 | def query_jobs(self, sort_column=None, sort_ascending=True): 513 | squeue_df = self.get_squeue(self.cluster, self.only_my_jobs, self.only_running_jobs) 514 | if sort_column is not None: 515 | squeue_df = squeue_df.sort_values(squeue_df.columns[sort_column], ascending=sort_ascending) 516 | 517 | self.stats['njobs'] = len(squeue_df) 518 | self.stats['njobs_running'] = sum(1 for row in squeue_df.iterrows() if row[1]['STATE'] == 'RUNNING') 519 | return squeue_df 520 | 521 | @run_in_thread 522 | @handle_error 523 | def rewrite_table(self, table_type, keep_state=False): 524 | if table_type not in self.STAGE: 525 | self.STAGE[table_type] = {} 526 | if self.STAGE[table_type].get('updating', False): 527 | return 528 | self.STAGE[table_type]['updating'] = True 529 | 530 | if 'sort_column' in self.STAGE[table_type]: 531 | sort_column = self.STAGE[table_type]['sort_column'] 532 | else: 533 | sort_column = None 534 | if 'sort_ascending' in self.STAGE[table_type]: 535 | sort_ascending = self.STAGE[table_type]['sort_ascending'] 536 | else: 537 | sort_ascending = True 538 | 539 | df = self.query_table_data(table_type, sort_column, sort_ascending) 540 | self.update_status() 541 | 542 | table = self.tables[table_type] 543 | if keep_state: 544 | cursor_column = table.cursor_column 545 | cursor_row = table.cursor_row 546 | else: 547 | cursor_column = 0 548 | cursor_row = 0 549 | 550 | table.clear(columns=True) 551 | table.add_columns(*df.columns) 552 | 553 | for _, row in df.iterrows(): 554 | table_row = [str(row[col]) for col in df.columns] 555 | table.add_row(*table_row) 556 | 557 | cursor_row = min(cursor_row, len(table.rows) - 1) 558 | cursor_column = min(cursor_column, len(table.columns) - 1) 559 | table.move_cursor(row=cursor_row, column=cursor_column) 560 | 561 | time.sleep(0.3) 562 | self.STAGE[table_type]['updating'] = False 563 | 564 | @run_in_thread 565 | @handle_error 566 | def update_table(self, table_type): 567 | if 
self.STAGE[table_type].get('updating', False): 568 | return 569 | self.STAGE[table_type]['updating'] = True 570 | if 'sort_column' in self.STAGE[table_type]: 571 | sort_column = self.STAGE[table_type]['sort_column'] 572 | else: 573 | sort_column = None 574 | if 'sort_ascending' in self.STAGE[table_type]: 575 | sort_ascending = self.STAGE[table_type]['sort_ascending'] 576 | else: 577 | sort_ascending = True 578 | 579 | df = self.query_table_data(table_type, sort_column, sort_ascending) 580 | self.update_status() 581 | 582 | table = self.tables[table_type] 583 | if not table.columns: 584 | table.add_columns(*df.columns) 585 | for _, row in df.iterrows(): 586 | table_row = [str(row[col]) for col in df.columns] 587 | table.add_row(*table_row) 588 | return 589 | 590 | for row_index, (_, row) in enumerate(df.iterrows()): 591 | table_row = [str(row[col]) for col in df.columns] 592 | if row_index < len(table.rows): 593 | for col_index, cell in enumerate(table_row): 594 | if table.get_cell_at((row_index, col_index)) != cell: 595 | table.update_cell_at((row_index, col_index), cell) 596 | else: 597 | table.add_row(*table_row) 598 | 599 | while len(table.rows) > len(df): 600 | row_key, _ = table.coordinate_to_cell_key((len(table.rows) - 1, 0)) 601 | table.remove_row(row_key) 602 | 603 | self.STAGE[table_type]['updating'] = False 604 | 605 | @handle_error 606 | def query_table_data(self, table_type, sort_column=None, sort_ascending=True): 607 | if table_type == "job": 608 | return self.query_jobs(sort_column, sort_ascending) 609 | elif table_type == "node": 610 | return self.query_gpus(sort_column, sort_ascending) 611 | elif table_type == "history": 612 | return self.query_history(sort_column, sort_ascending) 613 | else: 614 | raise ValueError(f"Invalid table type: {table_type}") 615 | 616 | @handle_error 617 | def _get_selected_job(self): 618 | row_idx = self.tables[self.active_table].cursor_row 619 | row = self.tables[self.active_table].get_row_at(row_idx) 620 | job_id = 
str(row[0]) 621 | return job_id 622 | 623 | @handle_error 624 | def update_log(self, log_fn): 625 | self.job_log.border_title = f"{log_fn}" 626 | current_scroll_y = self.job_log.scroll_offset[1] 627 | 628 | if not self.job_log_position: 629 | with open(log_fn, 'r') as f: 630 | self.job_log_position = max(sum(len(line) for line in f) - 2**12, 0) # read the last 4KB 631 | 632 | with open(log_fn, 'r') as log_file: 633 | log_file.seek(self.job_log_position) 634 | new_lines = log_file.readlines()[1:] # drop the first line because it can be incomplete 635 | self.job_log_position = log_file.tell() 636 | else: 637 | with open(log_fn, 'r') as log_file: 638 | log_file.seek(self.job_log_position) 639 | new_lines = log_file.readlines() 640 | self.job_log_position = log_file.tell() 641 | 642 | update_scroll = current_scroll_y == self.job_log.max_scroll_y 643 | 644 | for line in new_lines: 645 | self.job_log.write(line) 646 | 647 | if update_scroll: 648 | self.job_log.scroll_end(animate=False) 649 | 650 | @handle_error 651 | def _get_log_fn(self, job_id): 652 | if self.STAGE["action"] == "history": 653 | response_string = subprocess.check_output(f"""sacct -j {job_id} --format=StdOut -P""", shell=True).decode("utf-8") 654 | formatted_string = response_string.split("\n")[1].strip() 655 | formatted_string = formatted_string.replace("%j", job_id) 656 | elif self.STAGE["action"] in ["job", "job_log"]: 657 | response_string = subprocess.check_output(f"""scontrol show job {job_id} | grep StdOut""", shell=True).decode("utf-8") 658 | formatted_string = response_string.split("=")[-1].strip() 659 | else: 660 | raise ValueError(f"Cannot get log file for action: {self.STAGE['action']}") 661 | return formatted_string 662 | 663 | @handle_error 664 | def query_gpus(self, sort_column=None, sort_ascending=True): 665 | overview_df = self.get_sinfo(self.cluster) 666 | self.stats['ngpus'] = overview_df["GPUs (Total)"].sum() 667 | self.stats['ngpus_avail'] = overview_df["GPUs (Avail)"].sum() 668 | if 
self.only_available_nodes: 669 | # filter out nodes with no available GPUs 670 | overview_df = overview_df[overview_df["GPUs (Avail)"] > 0] 671 | 672 | # hide columns for simplicity 673 | overview_df = overview_df.drop(columns=["GPUs (Total)", "GPUs (Avail)"]) 674 | 675 | if sort_column is not None: 676 | overview_df = overview_df.sort_values(overview_df.columns[sort_column],ascending=sort_ascending) 677 | return overview_df 678 | 679 | @handle_error 680 | def query_history(self, sort_column=None, sort_ascending=True): 681 | starttime = self.get_history_starttime() 682 | sacct_df = self.get_sacct(starttime=starttime) 683 | if sort_column is not None: 684 | sacct_df = sacct_df.sort_values(sacct_df.columns[sort_column], ascending=sort_ascending) 685 | 686 | self.stats['nhistory'] = len(sacct_df) 687 | 688 | if self.only_completed_history: 689 | sacct_df = sacct_df[sacct_df["State"] == "COMPLETED"] 690 | 691 | self.stats['nhistory_completed'] = len(sacct_df) 692 | return sacct_df 693 | 694 | @handle_error 695 | def get_history_starttime(self): 696 | if self.history_range == "1 week": 697 | return (datetime.now() - timedelta(weeks=1)).strftime('%Y-%m-%d') 698 | elif self.history_range == "1 month": 699 | return (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d') 700 | elif self.history_range == "4 months": 701 | return (datetime.now() - timedelta(days=120)).strftime('%Y-%m-%d') 702 | elif self.history_range == "1 year": 703 | return (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d') 704 | else: 705 | return "2024-11-26" 706 | 707 | @handle_error 708 | def get_squeue(self, cluster=None, only_my_jobs=True, only_running_jobs=False): 709 | sep = "|" 710 | if DEBUG: 711 | response_string = SQUEUE_DEBUG 712 | else: 713 | args = f"{sep},".join([ 714 | "JOBID:18", 715 | "USERNAME:10", 716 | "PRIORITY:8", 717 | "PARTITION:80", 718 | "NAME:200", 719 | "STATE:8", 720 | "TimeUsed:10", 721 | "StartTime:30", 722 | "TimeLimit:15", 723 | "tres-alloc:100", 724 | 
"ReasonList:100", 725 | ]) 726 | query_string = f"""squeue --Format="{args}" -S T""" 727 | if self.verbose: 728 | self.info_log.write(query_string) 729 | 730 | if only_my_jobs: 731 | query_string += " --me" 732 | if only_running_jobs: 733 | query_string += " --state=RUNNING" 734 | response_string = subprocess.check_output(query_string, shell=True).decode("utf-8") 735 | compact_string = re.sub(' +', '', response_string) 736 | data = io.StringIO(compact_string) 737 | df = pd.read_csv(data, sep=sep) 738 | 739 | # right align time 740 | max_length = df["TIME"].str.len().max() 741 | df.loc[:, "TIME"] = df.loc[:, "TIME"].apply(lambda x: f"{x:>{max_length}}") 742 | 743 | # remove years from start time 744 | df.loc[:, "START_TIME"] = df.loc[:, "START_TIME"].apply(lambda x: simplify_start_time(x)) 745 | 746 | # simplify tres 747 | # df.loc[:, "TRES_ALLOC"] = df.loc[:, "TRES_ALLOC"].apply(simplify_tres) 748 | return df 749 | 750 | @handle_error 751 | def get_sinfo(self, cluster): 752 | if DEBUG: 753 | response_string = SINFO_DEBUG 754 | else: 755 | args = f",".join([ 756 | "Partition:25", 757 | "NodeHost", 758 | "Gres:500", 759 | "GresUsed:500", 760 | "StateCompact", 761 | "FreeMem", 762 | "Memory", 763 | "CPUsState", 764 | "Features:200" 765 | ]) 766 | query_string = f"""sinfo -O {args}""" 767 | if self.verbose: 768 | self.info_log.write(query_string) 769 | 770 | response_string = subprocess.check_output(query_string, shell=True).decode("utf-8") 771 | 772 | formatted_string = re.sub(' +', ' ', response_string) 773 | data = io.StringIO(formatted_string) 774 | df = pd.read_csv(data, sep=" ") 775 | overview_df = [] 776 | for row in df.iterrows(): 777 | # overview_df = overview_df[['Partition', 'Host', "Device", "State", "Mem (GB)", "CPUs", "GPUs", "Free IDX", "Feature"]] 778 | 779 | if row[1]['GRES'] != "(null)": 780 | device, ngpus = self.parse_gres(row[1]['GRES'], cluster) 781 | else: 782 | continue 783 | 784 | node_available = row[1]["STATE"] in ["mix", "idle", "alloc"] 785 
| if not node_available: 786 | gpu_avail_idx = [] 787 | else: 788 | device, gpu_avail_idx = self.parse_gres_used(row[1]['GRES_USED'], ngpus, cluster) 789 | ngpus_avail = len(gpu_avail_idx) 790 | 791 | host_info = OrderedDict() 792 | 793 | host_info['Partition'] = str(row[1]["PARTITION"]) 794 | host_info['Host'] = str(row[1]["HOSTNAMES"]) 795 | host_info['Device'] = device 796 | host_info['State'] = str(row[1]["STATE"]) 797 | 798 | try: 799 | mem_avail = int(row[1]["FREE_MEM"]) // 1024 800 | except: 801 | mem_avail = row[1]["FREE_MEM"] 802 | # host_info['Mem (Avail)'] = mem_avail 803 | try: 804 | mem_total = int(row[1]["MEMORY"]) // 1024 805 | except: 806 | mem_total = row[1]["MEMORY"] 807 | # host_info['Mem (Total)'] = mem_total 808 | host_info['Mem (GB)'] = f"{mem_avail}/{mem_total}" 809 | 810 | cpu_info = row[1]["CPUS(A/I/O/T)"].split("/") 811 | ncpus_avail = cpu_info[1] 812 | ncpus_total = cpu_info[3] 813 | # host_info['CPUs (Avail)'] = ncpus_avail 814 | # host_info['CPUs (Total)'] = ncpus_total 815 | host_info['CPUs'] = f"{ncpus_avail}/{ncpus_total}" 816 | 817 | host_info['GPUs (Total)'] = ngpus 818 | host_info['GPUs (Avail)'] = ngpus_avail 819 | host_info["GPUs"] = f"{host_info['GPUs (Avail)']}/{ngpus}" 820 | host_info['GPUs (Avail IDX)'] = f"[{','.join(str(idx) for idx in gpu_avail_idx)}]" 821 | 822 | features = row[1]["AVAIL_FEATURES"] 823 | if ',' in features: 824 | unnamed_features = [] 825 | for feature in features.split(","): 826 | if ':' in feature: 827 | name, value = feature.split(':') 828 | host_info[name] = value 829 | else: 830 | unnamed_features.append(feature) 831 | else: 832 | unnamed_features = [features] 833 | host_info['Feature'] = ','.join(unnamed_features) 834 | 835 | overview_df.append(host_info) 836 | overview_df = pd.DataFrame.from_records(overview_df).drop_duplicates("Host") 837 | return overview_df 838 | 839 | @handle_error 840 | def parse_gres(self, gres_str, cluster=None): 841 | match = 
re.match(r"([^:]+)(?::([^:()]+))?:([^:(,]+)(?:\(S:([^)]+)\))?", gres_str) 842 | 843 | if match: 844 | groups = match.groups() 845 | if self.verbose: 846 | self.info_log.write(f"Parsed gres: {groups} from {gres_str}") 847 | _, device, ngpus, _ = groups 848 | ngpus = int(ngpus) 849 | else: 850 | error_msg = f"Error parsing gres: {gres_str}" 851 | raise ValueError(error_msg) 852 | 853 | return device, ngpus 854 | 855 | @handle_error 856 | def parse_gres_used(self, gres_used_str, num_total, cluster=None): 857 | match = re.match(r"([^:]+)(?::([^:]+))?:([^:(,]+)(?:\(IDX:([^)]+)\))?", gres_used_str) 858 | if match: 859 | groups = match.groups() 860 | if self.verbose: 861 | self.info_log.write(f"Parsed gres_used: {groups} from {gres_used_str}") 862 | _, device, ngpus_used, alloc_str = groups 863 | ngpus_used = int(ngpus_used) 864 | else: 865 | error_msg = f"Error parsing gres_used: {gres_used_str}" 866 | raise ValueError(error_msg) 867 | 868 | gpu_used_idx = [] 869 | if alloc_str: 870 | for gpu_ids in alloc_str.split(","): 871 | if "-" in gpu_ids: 872 | start, end = gpu_ids.split("-") 873 | for i in range(int(start), int(end)+1): 874 | gpu_used_idx.append(i) 875 | else: 876 | if gpu_ids == "N/A": 877 | pass 878 | else: 879 | gpu_used_idx.append(int(gpu_ids)) 880 | assert ngpus_used == len(gpu_used_idx), f"Number of used GPUs {ngpus_used} does not match parsed indices {gpu_used_idx} in gres_used: {gres_used_str}" 881 | 882 | gpu_avail_idx = [idx for idx in range(num_total) if idx not in gpu_used_idx] 883 | return device, gpu_avail_idx 884 | 885 | @handle_error 886 | def get_sacct(self, starttime="2024-11-26", endtime="now"): 887 | args = f",".join([ 888 | "JobID", 889 | "JobName", 890 | "State", 891 | "Start", 892 | "Elapsed", 893 | "NodeList", 894 | "AllocTRES", 895 | "Partition", 896 | "StdOut" 897 | ]) 898 | query_string = f"""sacct --format={args} -P -X --starttime={starttime} --endtime={endtime}""" 899 | if self.verbose: 900 | self.info_log.write(query_string) 901 | 
902 | response_string = subprocess.check_output( 903 | query_string, 904 | shell=True 905 | ).decode("utf-8") 906 | data = io.StringIO(response_string) 907 | df = pd.read_csv(data, sep='|') 908 | 909 | # Strip whitespace from column names 910 | df.columns = df.columns.str.strip() 911 | 912 | # Strip whitespace from each string element in the DataFrame 913 | for col in df.select_dtypes(['object']).columns: 914 | df[col] = df[col].str.strip() 915 | return df 916 | 917 | def perform_scancel(job_id): 918 | os.system(f"""scancel {job_id}""") 919 | 920 | def remove_first_line(input_string): 921 | lines = input_string.split('\n') 922 | return '\n'.join(lines[1:]) 923 | 924 | def simplify_start_time(start_time): 925 | try: 926 | if start_time != "nan": 927 | start_time = datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%S").strftime("%m-%d %H:%M") 928 | except Exception as e: 929 | pass 930 | return start_time 931 | 932 | def simplify_tres(tres): 933 | tres_ = [] 934 | for x in str(tres).split(","): 935 | if 'billing=' in x: 936 | continue 937 | tres_.append(x) 938 | return ",".join(tres_) 939 | 940 | def read_log(fn, num_lines=100): 941 | with open(os.path.expanduser(fn), 'r') as f: 942 | txt_lines = list(f.readlines()[-num_lines:]) 943 | 944 | return txt_lines 945 | 946 | def run_ui(verbose=False, cluster=None, interval=10, history_range="1 week"): 947 | # if debug: 948 | # # global for quick debugging 949 | # global DEBUG 950 | # DEBUG = True 951 | app = SlurmUI() 952 | app.verbose = verbose 953 | app.cluster = cluster 954 | app.interval = interval 955 | app.history_range = history_range 956 | app.run() 957 | 958 | 959 | if __name__ == "__main__": 960 | run_ui() 961 | --------------------------------------------------------------------------------