├── slurmui
│   ├── __init__.py
│   ├── slurmui.tcss
│   ├── slurmui_cli.py
│   ├── debug_strings.py
│   └── slurmui.py
├── .gitignore
├── asset
│   ├── demo.png
│   └── verbose_info.png
├── pyproject.toml
└── README.md
/slurmui/__init__.py:
--------------------------------------------------------------------------------
1 | from .slurmui import run_ui
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | build/
3 | dist/
4 | slurmui.egg-info/
5 |
--------------------------------------------------------------------------------
/asset/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShenhanQian/slurmui/HEAD/asset/demo.png
--------------------------------------------------------------------------------
/asset/verbose_info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShenhanQian/slurmui/HEAD/asset/verbose_info.png
--------------------------------------------------------------------------------
/slurmui/slurmui.tcss:
--------------------------------------------------------------------------------
1 | Header {
2 | height: 1;
3 | text-style: bold;
4 | background: $boost;
5 | }
6 |
7 | Footer {
8 | text-style: bold;
9 | background: #303030;
10 | }
11 |
12 | Static {
13 | width: 1fr;
14 | }
15 |
16 | DataTable {
17 | background: $boost;
18 | }
19 |
20 | Tabs {
21 | background: $boost;
22 | }
23 |
24 | Tab {
25 | text-style: bold;
26 | padding-left: 1;
27 | padding-right: 3;
28 | }
29 |
30 | .time-tab {
31 | text-style: none;
32 | color: $foreground;
33 | }
34 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "slurmui"
3 | version = "1.1.17"
4 | description = "Terminal UI for Slurm"
5 | authors = [
6 | {name = "Shenhan Qian", email = "shenhan.qian@tum.de"}
7 | ]
8 | requires-python = ">=3.8"
9 | license = {text = "MIT"}
10 | classifiers = [
11 | "Programming Language :: Python :: 3",
12 | "License :: OSI Approved :: MIT License",
13 | "Operating System :: OS Independent"
14 | ]
15 | dependencies = [
16 | "textual>=0.61.0",
17 | "pandas",
18 | "gpustat",
19 | ]
20 | [project.scripts]
21 | slurmui = "slurmui.slurmui_cli:slurmui_cli"
22 |
23 | [build-system]
24 | requires = ["hatchling"]
25 | build-backend = "hatchling.build"
26 |
--------------------------------------------------------------------------------
/slurmui/slurmui_cli.py:
--------------------------------------------------------------------------------
1 | from slurmui import run_ui
2 | from argparse import ArgumentParser
3 |
4 | def slurmui_cli():
5 |     # parse command-line options and pass them to run_ui
6 | parser = ArgumentParser("SLURM UI")
7 | parser.add_argument("-v", "--verbose", action='store_true')
8 | parser.add_argument("-c", "--cluster", help="Specify the name of the cluster")
9 |     parser.add_argument("-i", "--interval", help="Specify the interval in seconds to refresh the UI", type=int, default=10)
10 | parser.add_argument("-r", "--history-range", help="Specify the time range of history jobs to load", type=str, default="1 week")
11 | args = parser.parse_args()
12 |     run_ui(verbose=args.verbose, cluster=args.cluster, interval=args.interval, history_range=args.history_range)
13 |
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SlurmUI
2 |
3 | An enhanced command-line UI that eases working with Slurm.
4 | Written in Python, derived from [SlurmUI](https://github.com/SirWyver/slurmui).
5 |
6 | View and manage:
7 | - GPUs
8 | - Jobs in history
9 | - Jobs in the queue
10 | - Logs for current and past jobs
11 |
12 |
13 | ![demo](asset/demo.png)
14 |
15 |
16 | ## Install and run
17 | ```shell
18 | pip install -U git+https://github.com/ShenhanQian/slurmui.git
19 | slurmui
20 | ```
21 | Optional arguments (see the example below):
22 | - `-i` update interval in seconds (10 by default; set to 0 to disable auto-refresh).
23 | - `-v` verbose mode (print info and error messages to the info panel).
24 | - `-r` time range of history jobs to load ("1 week" by default).
25 | - `-c` cluster name (deprecated; the latest version of SlurmUI has no cluster-specific configuration).
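
For example, to start with auto-refresh disabled, verbose logging, and a one-month history window:

```shell
slurmui -v -i 0 -r "1 month"
```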
26 |
27 | ## Basics
28 | Under the hood, SlurmUI relies on three basic Slurm commands:
29 | - `sinfo` for information about nodes, GPUs, etc.
30 | - `squeue` for current jobs in the queue
31 | - `sacct` for history jobs
32 |
33 | Make sure you can get meaningful output from these commands on your cluster before trying SlurmUI (see the sketch below).
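
The calls SlurmUI builds on top of these commands request many fields with fixed column widths; simplified versions look roughly like the following, and running them by hand is a quick sanity check (if they fail or print nothing useful, SlurmUI cannot work either):

```shell
sinfo -O Partition,NodeHost,Gres,GresUsed,StateCompact,FreeMem,Memory,CPUsState
squeue --Format="JobID,UserName,Partition,Name,State,TimeUsed,TimeLimit" --me
sacct --format=JobID,JobName,State,Start,Elapsed,NodeList,Partition -P -X --starttime=2024-11-26
```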
34 |
35 | To debug, you can run `slurmui` with `-i 0` to disable auto-update and `-v` to force verbose logging. The info panel will then show the full commands that SlurmUI sends to Slurm.
36 |
37 | ![verbose_info](asset/verbose_info.png)
38 |
39 |
40 |
41 | ## Supported Clusters
42 | - [TUM CVG](https://cvg.cit.tum.de/)
43 | - [TUM VCG](https://www.niessnerlab.org/)
44 | - [LRZ AI](https://doku.lrz.de/lrz-ai-systems-11484278.html)
45 |
46 | > [!NOTE]
47 | > If SlurmUI does not work on your cluster, try the debugging suggestions in [Basics](README.md#basics) and feel free to open an issue.
48 |
49 | ## Contributions
50 | Contributions are welcome, including but not limited to:
51 | - Improving startup/launch speed
52 | - Enhancing multithreading and concurrency handling
53 | - Strengthening crash recovery and process resiliency
54 | - Expanding features or addressing edge cases
55 |
--------------------------------------------------------------------------------
/slurmui/debug_strings.py:
--------------------------------------------------------------------------------
1 | ### ---> FOR DEBUGGING
2 |
3 | SQUEUE_DEBUG = """
4 | JOBID PA NAME USER STATE TIME TIME_L NODE NODELIST(REASON)
5 | 190689 in interactive bob RUNNING 9:46 6:00:00 1 lothlann
6 | 190663 su triplane3_ll bob RUNNING 2:56:36 4-00:00:00 1 himring
7 | 190662 su triplane2_ll bob RUNNING 2:56:40 4-00:00:00 1 himring
8 | 190661 su triplane1_ll bob RUNNING 2:56:46 4-00:00:00 1 himring
9 | 190660 su triplane1_l bob RUNNING 2:57:04 4-00:00:00 1 himring
10 | 190659 su triplane2_l bob RUNNING 2:57:10 4-00:00:00 1 balrog
11 | 190658 su triplane3_l bob RUNNING 2:57:28 4-00:00:00 1 balrog
12 | 190657 su triplane3_m bob RUNNING 2:57:31 4-00:00:00 1 balrog
13 | 190656 su triplane2_m bob RUNNING 2:57:36 4-00:00:00 1 balrog
14 | 190655 su triplane1_m bob RUNNING 2:57:39 4-00:00:00 1 balrog
15 | 190654 su triplane1_m bob RUNNING 2:57:43 4-00:00:00 1 angmar
16 | 190651 su triplane0_m bob RUNNING 3:03:06 4-00:00:00 1 valinor
17 | 190650 su triplane0_ll bob RUNNING 3:03:13 4-00:00:00 1 valinor
18 | 190649 su triplane0_l bob RUNNING 3:03:17 4-00:00:00 1 valinor
19 | """
20 |
21 | SINFO_DEBUG = """
22 | HOSTNAMES GRES GRES_USED STATE
23 | andram gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix
24 | andram gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix
25 | angmar gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
26 | angmar gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
27 | balar gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix
28 | balar gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix
29 | balrog gpu:rtx_3090:8,mps:rtx_3090:800 gpu:rtx_3090:7(IDX:0-3,5-7),mps:rtx_3090:0(IDX:N/A) mix
30 | balrog gpu:rtx_3090:8,mps:rtx_3090:800 gpu:rtx_3090:7(IDX:0-3,5-7),mps:rtx_3090:0(IDX:N/A) mix
31 | char gpu:gtx_1080:4,mps:gtx_1080:800 gpu:gtx_1080:4(IDX:0-3),mps:gtx_1080:0(IDX:N/A) mix
32 | char gpu:gtx_1080:4,mps:gtx_1080:800 gpu:gtx_1080:4(IDX:0-3),mps:gtx_1080:0(IDX:N/A) mix
33 | daidalos gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
34 | daidalos gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
35 | doriath gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix
36 | doriath gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix
37 | erebor gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
38 | erebor gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
39 | eriador gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
40 | eriador gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
41 | falas gpu:a100:4,mps:a100:400 gpu:a100:3(IDX:0-2),mps:a100:0(IDX:N/A) mix
42 | falas gpu:a100:4,mps:a100:400 gpu:a100:3(IDX:0-2),mps:a100:0(IDX:N/A) mix
43 | gimli gpu:rtx_3090:8,mps:rtx_3090:800 gpu:rtx_3090:8(IDX:0-7),mps:rtx_3090:0(IDX:N/A) mix
44 | gimli gpu:rtx_3090:8,mps:rtx_3090:800 gpu:rtx_3090:8(IDX:0-7),mps:rtx_3090:0(IDX:N/A) mix
45 | gondor gpu:rtx_2080:9,mps:rtx_2080:900 gpu:rtx_2080:9(IDX:0-8),mps:rtx_2080:0(IDX:N/A) mix
46 | gondor gpu:rtx_2080:9,mps:rtx_2080:900 gpu:rtx_2080:9(IDX:0-8),mps:rtx_2080:0(IDX:N/A) mix
47 | himring gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
48 | himring gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
49 | hithlum gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
50 | hithlum gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
51 | ikarus gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
52 | ikarus gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
53 | lothlann gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix
54 | lothlann gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix
55 | moria gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix
56 | moria gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix
57 | pegasus gpu:gtx_1080:8,mps:gtx_1080:800 gpu:gtx_1080:0(IDX:N/A),mps:gtx_1080:0(IDX:N/A) idle
58 | pegasus gpu:gtx_1080:8,mps:gtx_1080:800 gpu:gtx_1080:0(IDX:N/A),mps:gtx_1080:0(IDX:N/A) idle
59 | ramdal gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix
60 | ramdal gpu:a100:4,mps:a100:400 gpu:a100:4(IDX:0-3),mps:a100:0(IDX:N/A) mix
61 | seti gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:0(IDX:N/A),mps:rtx_2080:0(IDX:N/A) drain
62 | seti gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:0(IDX:N/A),mps:rtx_2080:0(IDX:N/A) drain
63 | sorona gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix
64 | sorona gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:8(IDX:0-7),mps:rtx_2080:0(IDX:N/A) mix
65 | tarsonis gpu:gtx_1080:4,mps:gtx_1080:400 gpu:gtx_1080:0(IDX:N/A),mps:gtx_1080:0(IDX:N/A) idle
66 | tarsonis gpu:gtx_1080:4,mps:gtx_1080:400 gpu:gtx_1080:0(IDX:N/A),mps:gtx_1080:0(IDX:N/A) idle
67 | umoja gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:6(IDX:0,2-4,6-7),mps:rtx_2080:0(IDX:N/A) mix
68 | umoja gpu:rtx_2080:8,mps:rtx_2080:800 gpu:rtx_2080:6(IDX:0,2-4,6-7),mps:rtx_2080:0(IDX:N/A) mix
69 | valinor gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
70 | valinor gpu:rtx_a6000:8,mps:rtx_a6000:800 gpu:rtx_a6000:8(IDX:0-7),mps:rtx_a6000:0(IDX:N/A) mix
71 | """
--------------------------------------------------------------------------------
/slurmui/slurmui.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | import io
3 | import importlib.metadata
4 | from textual.app import App, ComposeResult
5 | from textual.binding import Binding
6 | from textual.widgets import Header, Footer, RichLog, DataTable, Tabs, Tab
7 | from textual.containers import Container
8 | from rich.text import Text
9 | import subprocess
10 | import pandas as pd
11 | import re
12 | import os
13 | import threading
14 | from functools import wraps
15 | from datetime import datetime, timedelta
16 | import time
17 | import socket
18 | import signal
19 |
20 |
21 | DEBUG = False
22 | if DEBUG:
23 | from slurmui.debug_strings import SINFO_DEBUG, SQUEUE_DEBUG
24 |
25 |
26 | def run_in_thread(func):
27 | """Decorator to run a function in a separate thread"""
28 | @wraps(func)
29 | def wrapper(*args, **kwargs):
30 | thread = threading.Thread(target=func, args=args, kwargs=kwargs)
31 | thread.start()
32 | return wrapper
33 |
34 | def handle_error(func):
35 | """Decorator to wrap action methods with try-except logic"""
36 | @wraps(func)
37 | def wrapper(self, *args, **kwargs):
38 | try:
39 | return func(self, *args, **kwargs)
40 | except Exception as e:
41 | self.info_log.write(f"Error ({func.__name__}): {e}")
42 | os.system(f"echo 'Error ({func.__name__}): {e}'")
43 | return wrapper
44 |
45 |
46 | class SlurmUI(App):
47 | # configuration that can be set from the command line
48 | verbose = False
49 | cluster = None
50 | interval = 10
51 | history_range = "1 week" # Default history range
52 |
53 | # internal states
54 | STAGE = {
55 | "action": "job",
56 | "job": {
57 | # "sort_column": 0,
58 | # "sort_ascending": True,
59 | },
60 | "history": {
61 | "sort_column": 0,
62 | "sort_ascending": False,
63 | },
64 | "node": {
65 | # "sort_column": 0,
66 | # "sort_ascending": True,
67 | },
68 | }
69 | stats = {}
70 | selected_jobid = []
71 | only_available_nodes = True
72 | only_completed_history = False
73 | only_running_jobs = False
74 | only_my_jobs = True
75 |
76 | theme = "textual-dark"
77 | selected_text_style = "bold on orange3"
78 | border_type = "solid"
79 | border_color = "white"
80 |
81 | CSS_PATH = "slurmui.tcss"
82 | TITLE = f"SlurmUI (v{importlib.metadata.version('slurmui')})"
83 |
84 | BINDINGS = [
85 | Binding("g", "display_nodes", "GPUs"),
86 | Binding("h", "display_history_jobs", "History"),
87 | Binding("j", "display_jobs", "Jobs"),
88 | Binding("q", "abort", "Abort"),
89 | Binding("space", "select", "Select"),
90 | Binding("v", "select_inverse", "Inverse"),
91 | Binding("r", "refresh", "Refresh"),
92 | Binding("enter", "confirm", "Confirm", priority=True, key_display='enter'),
93 | Binding("s", "sort", "Sort"),
94 | Binding("d", "delete", "Delete"),
95 | Binding("G", "print_gpustat", "GPU"),
96 | Binding("l", "display_job_log", "Log"),
97 | Binding("L", "open_with_less", "Open with less"),
98 | Binding("J", "toggle_user_range", "State"),
99 | Binding("H", "toggle_history_range", "Range"),
100 | ]
101 |
102 | def compose(self) -> ComposeResult:
103 | self.header = Header()
104 | self.footer = Footer()
105 |
106 | self.tab_nodes = Tab("GPUs", id="node")
107 | self.tab_history = Tab("History", id="history")
108 | self.tab_jobs = Tab("Jobs", id="job")
109 | self.tab_time = Tab("Time", id="time", disabled=True, classes="time-tab")
110 | self.tabs = Tabs(self.tab_nodes, self.tab_history, self.tab_jobs, self.tab_time, id="tabs")
111 | self.tabs.can_focus = False
112 |
113 | self.node_table = DataTable(id="node_table")
114 | self.job_table = DataTable(id="job_table")
115 | self.history_table = DataTable(id="history_table")
116 | self.tables = {
117 | "job": self.job_table,
118 | "node": self.node_table,
119 | "history": self.history_table
120 | }
121 | self.active_table = "job"
122 |
123 | self.info_log = RichLog(wrap=True, highlight=True, id="info_log", auto_scroll=True)
124 | self.info_log.can_focus = False
125 | self.info_log.border_title = "Info"
126 |
127 | self.job_log = RichLog(wrap=True, highlight=True, id="job_log", auto_scroll=False)
128 | self.job_log_position = None
129 |
130 |
131 | yield self.header
132 | yield self.tabs
133 | yield Container(self.job_table, self.node_table, self.history_table, self.info_log, self.job_log)
134 | yield self.footer
135 |
136 | def on_mount(self):
137 | pass
138 |
139 | def on_ready(self) -> None:
140 | self.rewrite_table("job")
141 | self.rewrite_table("node")
142 | self.rewrite_table("history")
143 | self.tabs.active = 'job'
144 | if self.interval > 0:
145 | self.set_interval(self.interval, self.auto_refresh)
146 |
147 | def on_tabs_tab_activated(self, message):
148 | tab_id = message.tab.id
149 | if self.verbose:
150 | self.info_log.write(f"Tab activated: {tab_id}")
151 |
152 | self.STAGE.update({"action": tab_id})
153 | self.update_table(tab_id)
154 | self.switch_display(tab_id)
155 | self.refresh_bindings()
156 |
157 | @handle_error
158 | def check_action(self, action: str, parameters):
159 | """Check if an action may run."""
160 | if action == "display_nodes" and self.STAGE['action'] not in ['node', 'history', 'job']:
161 | return False
162 | elif action == "display_history_jobs" and self.STAGE['action'] not in ['node', 'history', 'job']:
163 | return False
164 | elif action == "display_jobs" and self.STAGE['action'] not in ['node', 'history', 'job']:
165 | return False
166 | elif action == "abort":
167 | pass
168 | elif action == "select" and self.STAGE['action'] not in ['job', 'select']:
169 | return False
170 | elif action == "select_inverse" and self.STAGE['action'] not in ['job', 'select']:
171 | return False
172 | elif action == "refresh" and self.STAGE['action'] not in ['job', 'history', 'node', 'job_log']:
173 | return False
174 | elif action == "confirm" and self.STAGE['action'] != 'delete':
175 | return False
176 | elif action == "sort" and self.STAGE['action'] not in ['job', 'history', 'node']:
177 | return False
178 | elif action == "delete" and self.STAGE['action'] not in ['job', 'select']:
179 | return False
180 | elif action == "print_gpustat" and self.STAGE['action'] != 'job':
181 | return False
182 | elif action == "display_job_log" and self.STAGE['action'] not in ['job', 'history']:
183 | return False
184 | elif action == "open_with_less" and self.STAGE['action'] not in ['job', 'job_log', 'history']:
185 | return False
186 | elif action == "toggle_user_range" and self.STAGE['action'] != 'job':
187 | return False
188 | elif action == "toggle_history_range" and self.STAGE['action'] != 'history':
189 | return False
190 | return True
191 |
192 | @handle_error
193 | def action_display_nodes(self):
194 | if self.STAGE[self.active_table]['updating']:
195 | return
196 | if self.STAGE["action"] in ["history", "job"]:
197 | self.STAGE.update({"action": "node"})
198 | self.update_table("node")
199 | self.refresh_bindings()
200 | self.tabs.active = "node"
201 | elif self.STAGE["action"] == "node":
202 | self.only_available_nodes = not self.only_available_nodes
203 | self.rewrite_table("node", keep_state=True)
204 | self.switch_display("node")
205 |
206 | @handle_error
207 | def action_display_history_jobs(self):
208 | if self.STAGE[self.active_table]['updating']:
209 | return
210 | if self.STAGE["action"] in ["node", "job"]:
211 | self.STAGE.update({"action": "history"})
212 | self.update_table("history")
213 | self.refresh_bindings()
214 | self.tabs.active = "history"
215 | elif self.STAGE["action"] == "history":
216 | self.only_completed_history = not self.only_completed_history
217 | self.rewrite_table("history", keep_state=True)
218 | self.switch_display("history")
219 |
220 | @handle_error
221 | def action_display_jobs(self):
222 | if self.STAGE[self.active_table]['updating']:
223 | return
224 | if self.STAGE["action"] in ["node", "history"]:
225 | self.STAGE.update({"action": "job"})
226 | self.update_table("job")
227 | self.refresh_bindings()
228 | self.tabs.active = "job"
229 | elif self.STAGE["action"] == "job":
230 | self.only_running_jobs = not self.only_running_jobs
231 | self.rewrite_table("job", keep_state=True)
232 | self.refresh_bindings()
233 | self.switch_display("job")
234 |
235 | @handle_error
236 | def action_abort(self):
237 | if self.STAGE["action"] == "delete":
238 | self.info_log.write("Delete: aborted")
239 | self.selected_jobid = []
240 | self.STAGE.pop("job_id", None)
241 | self.STAGE.pop("job_name", None)
242 | elif self.STAGE["action"] == "job_log":
243 | self.job_log_position = None
244 | self.STAGE.pop("job_id", None)
245 | self.STAGE.pop("job_name", None)
246 | self.STAGE.pop("log_fn", None)
247 | elif self.STAGE["action"] == "select":
248 | self.info_log.write("Select: none")
249 | self.selected_jobid = []
250 | elif self.STAGE["action"] in ["node", "history"]:
251 | self.tabs.active = "job"
252 | action = self.tabs.active
253 | self.STAGE['action'] = action
254 | self.update_table(action)
255 | self.switch_display(action)
256 | self.refresh()
257 |
258 | @handle_error
259 | def action_select(self):
260 | if (self.STAGE["action"] == "job" and not self.selected_jobid) or self.STAGE["action"] == "select":
261 | i = self.tables[self.active_table].cursor_coordinate[0]
262 | value = str(self.tables[self.active_table].get_cell_at((i, 0)))
263 |
264 | job_id = self._get_selected_job()
265 | if job_id in self.selected_jobid:
266 | self.selected_jobid.remove(job_id)
267 | self.tables[self.active_table].update_cell_at((i, 0), value)
268 | else:
269 | self.selected_jobid.append(job_id)
270 | self.tables[self.active_table].update_cell_at((i, 0), Text(str(value), style=self.selected_text_style))
271 |
272 | if self.selected_jobid:
273 | self.STAGE["action"] = "select"
274 | self.info_log.write(f"Select: {' '.join(self.selected_jobid)}")
275 | else:
276 | self.STAGE["action"] = "job"
277 | self.info_log.write(f"Select: none")
278 | self.tables[self.active_table].action_cursor_down()
279 | self.refresh_bindings()
280 |
281 | @handle_error
282 | def action_select_inverse(self):
283 | assert self.STAGE["action"] in ["job", "select"]
284 | for i in range(len(self.tables[self.active_table].rows)):
285 | job_id = str(self.tables[self.active_table].get_cell_at((i, 0)))
286 |
287 | if job_id in self.selected_jobid:
288 | self.selected_jobid.remove(job_id)
289 | self.tables[self.active_table].update_cell_at((i, 0), job_id)
290 | else:
291 | self.selected_jobid.append(job_id)
292 | self.tables[self.active_table].update_cell_at((i, 0), Text(str(job_id), style=self.selected_text_style))
293 | self.tables[self.active_table].move_cursor(row=i, column=0)
294 | if self.selected_jobid:
295 | self.STAGE["action"] = "select"
296 | self.info_log.write(f"Select: {' '.join(self.selected_jobid)}")
297 | else:
298 | self.STAGE["action"] = "job"
299 | self.info_log.write(f"Select: none")
300 | self.refresh_bindings()
301 |
302 | @run_in_thread
303 | @handle_error
304 | def auto_refresh(self):
305 | if self.verbose:
306 | self.info_log.write("Auto-refreshing...")
307 | if self.STAGE["action"] == "job":
308 | self.update_table("job")
309 | elif self.STAGE["action"] == "job_log":
310 | self.update_log(self.STAGE["log_fn"])
311 | elif self.STAGE["action"] == "node":
312 | self.update_table("node")
313 | # elif self.STAGE["action"] == "history":
314 | # self.update_table("history")
315 | self.update_status()
316 |
317 | @handle_error
318 | def action_refresh(self):
319 | if self.STAGE["action"] == "job":
320 | self.rewrite_table("job", keep_state=True)
321 | elif self.STAGE["action"] == "job_log":
322 | self.update_log(self.STAGE["log_fn"])
323 | elif self.STAGE["action"] == "node":
324 | self.rewrite_table("node", keep_state=True)
325 | elif self.STAGE["action"] == "history":
326 | self.rewrite_table("history", keep_state=True)
327 | self.update_status()
328 |
329 | @handle_error
330 | def action_confirm(self):
331 | # job to delete
332 | if self.STAGE["action"] == "delete":
333 | perform_scancel(self.STAGE['job_id'])
334 |             self.info_log.write(f"Delete: {self.STAGE['job_id']} succeeded")
335 | self.selected_jobid = []
336 | self.update_table("job")
337 | self.STAGE["action"] = "job"
338 | self.refresh_bindings()
339 |
340 | @handle_error
341 | def action_sort(self):
342 | sort_column = self.tables[self.active_table].cursor_column
343 | if sort_column != self.STAGE[self.STAGE["action"]].get("sort_column"):
344 | self.STAGE[self.STAGE["action"]]["sort_ascending"] = False
345 | else:
346 | self.STAGE[self.STAGE["action"]]["sort_ascending"] = not self.STAGE[self.STAGE["action"]].get("sort_ascending", True)
347 | self.STAGE[self.STAGE["action"]]['sort_column'] = sort_column
348 |
349 | self.rewrite_table(self.active_table, keep_state=True)
350 | self.tables[self.active_table].move_cursor(row=0, column=sort_column)
351 |
352 | @handle_error
353 | def action_delete(self):
354 | if self.STAGE["action"] == "job":
355 | job_id = self._get_selected_job()
356 |             self.info_log.write(f"Delete: {job_id}? press <enter> to confirm")
357 | self.STAGE.update({"action": "delete", "job_id": job_id})
358 | elif self.STAGE["action"] == "select":
359 |             self.info_log.write(f"Delete: {' '.join(self.selected_jobid)}? press <enter> to confirm")
360 | self.STAGE.update({"action": "delete", "job_id": ' '.join(self.selected_jobid)})
361 | self.refresh_bindings()
362 |
363 | @handle_error
364 | def action_print_gpustat(self):
365 | if self.STAGE["action"] == "job":
366 | job_id = self._get_selected_job()
367 | gpustat = subprocess.check_output(f"""srun --jobid {job_id} gpustat""", shell=True, timeout=3).decode("utf-8").rstrip()
368 | self.info_log.write(gpustat)
369 |
370 | @handle_error
371 | def action_display_job_log(self):
372 | if self.STAGE["action"] in ["job", "history"]:
373 | job_id = self._get_selected_job()
374 | log_fn = self._get_log_fn(job_id)
375 | assert os.path.exists(log_fn), f"Log file not found: {log_fn}"
376 | self.STAGE.update({"action": "job_log", "log_fn": log_fn})
377 | self.update_log(log_fn)
378 | self.switch_display("job_log")
379 | self.refresh_bindings()
380 |
381 | @handle_error
382 | def action_open_with_less(self):
383 | if self.STAGE["action"] in ["job", "job_log", "history"]:
384 | if 'log_fn' not in self.STAGE:
385 | job_id = self._get_selected_job()
386 | log_fn = self._get_log_fn(job_id)
387 | else:
388 | log_fn = self.STAGE['log_fn']
389 | assert os.path.exists(log_fn), f"Log file not found: {log_fn}"
390 | with self.suspend():
391 | # Save the current SIGINT handler
392 | original_sigint = signal.signal(signal.SIGINT, signal.SIG_IGN)
393 | try:
394 | subprocess.run(['less', '+G', log_fn])
395 | finally:
396 | # Restore the original SIGINT handler
397 | signal.signal(signal.SIGINT, original_sigint)
398 | self.refresh()
399 |
400 | @handle_error
401 | def action_toggle_user_range(self):
402 | if self.STAGE["action"] == "job":
403 | self.only_my_jobs = not self.only_my_jobs
404 | self.rewrite_table("job", keep_state=True)
405 | self.refresh_bindings()
406 | self.switch_display("job")
407 |
408 | @handle_error
409 | def action_toggle_history_range(self):
410 | if self.STAGE[self.active_table]['updating']:
411 | return
412 | if self.history_range == "1 week":
413 | self.history_range = "1 month"
414 | elif self.history_range == "1 month":
415 | self.history_range = "4 months"
416 | elif self.history_range == "4 months":
417 | self.history_range = "1 year"
418 | else:
419 | self.history_range = "1 week"
420 | self.rewrite_table("history", keep_state=True)
421 | self.switch_display("history")
422 |
423 | def print_tab_prompt(self, tab_id):
424 | if not self.verbose:
425 | self.info_log.clear()
426 |
427 | if tab_id == "node":
428 | info = f"Press 'g' to toggle nodes: {'Available' if self.only_available_nodes else 'All'}"
429 | elif tab_id == "history":
430 | info = f"Press 'h' to toggle job states: {'Completed' if self.only_completed_history else 'All'}\t| " \
431 | + f"Press 'H' to toggle history range: {self.history_range}"
432 | elif tab_id == "job":
433 | info = f"Press 'j' to toggle job states: {'Running' if self.only_running_jobs else 'All'}\t|" \
434 | + f" Press 'J' to toggle user range: {'Me' if self.only_my_jobs else 'All'}"
435 | self.info_log.write(info)
436 |
437 | def switch_display(self, action):
438 | if self.verbose:
439 | self.info_log.write(f"Switch display: {action}")
440 | if action == "node":
441 | self.node_table.styles.height = "80%"
442 | self.active_table = action
443 | self.tables[self.active_table].focus()
444 | self.info_log.styles.height="20%"
445 | self.info_log.styles.border = (self.border_type, self.border_color)
446 | self.print_tab_prompt(action)
447 |
448 | self.job_table.styles.height = "0%"
449 | self.history_table.styles.height = "0%"
450 | self.job_log.styles.height="0%"
451 | self.job_log.styles.border = (self.border_type, self.border_color)
452 | self.job_log.clear()
453 | elif action == "history":
454 | self.history_table.styles.height = "80%"
455 | self.active_table = action
456 | self.tables[self.active_table].focus()
457 | self.info_log.styles.height="20%"
458 | self.info_log.styles.border = (self.border_type, self.border_color)
459 | self.print_tab_prompt(action)
460 |
461 | self.job_table.styles.height = "0%"
462 | self.node_table.styles.height = "0%"
463 | self.job_log.styles.height="0%"
464 | self.job_log.styles.border = (self.border_type, self.border_color)
465 | self.job_log.clear()
466 | elif action == "job":
467 | self.job_table.styles.height = "80%"
468 | self.active_table = action
469 | self.tables[self.active_table].focus()
470 | self.info_log.styles.border = (self.border_type, self.border_color)
471 | self.info_log.styles.height="20%"
472 | self.print_tab_prompt(action)
473 |
474 | self.history_table.styles.height = "0%"
475 | self.node_table.styles.height = "0%"
476 | self.job_log.styles.border = (self.border_type, self.border_color)
477 | self.job_log.styles.height="0%"
478 | self.job_log.clear()
479 | elif action == "job_log":
480 | self.job_log.styles.height="100%"
481 | self.job_log.styles.border = (self.border_type, self.border_color)
482 | self.job_log.focus()
483 |
484 | self.job_table.styles.height="0%"
485 | self.node_table.styles.height="0%"
486 | self.history_table.styles.height="0%"
487 | self.info_log.styles.height="0%"
488 | self.info_log.styles.border = ("none", self.border_color)
489 | else:
490 | raise ValueError(f"Invalid action: {action}")
491 |
492 | @run_in_thread
493 | @handle_error
494 | def update_status(self):
495 | self.title = f"SlurmUI (v{importlib.metadata.version('slurmui')})"
496 |
497 | njobs = self.stats.get("njobs", 0)
498 | njobs_running = self.stats.get("njobs_running", 0)
499 | self.tab_jobs.label = f"Jobs: {njobs_running}/{njobs}"
500 |
501 | ngpus_avail = self.stats.get("ngpus_avail", 0)
502 | ngpus = self.stats.get("ngpus", 0)
503 | self.tab_nodes.label = f"GPUs: {ngpus_avail}/{ngpus}"
504 |
505 | nhistory = self.stats.get("nhistory", 0)
506 | nhistory_completed = self.stats.get("nhistory_completed", 0)
507 | self.tab_history.label = f"History: {nhistory_completed}/{nhistory}"
508 |
509 | self.tab_time.label = f"{socket.gethostname()} | {datetime.now().strftime('%H:%M:%S')}"
510 |
511 | @handle_error
512 | def query_jobs(self, sort_column=None, sort_ascending=True):
513 | squeue_df = self.get_squeue(self.cluster, self.only_my_jobs, self.only_running_jobs)
514 | if sort_column is not None:
515 | squeue_df = squeue_df.sort_values(squeue_df.columns[sort_column], ascending=sort_ascending)
516 |
517 | self.stats['njobs'] = len(squeue_df)
518 | self.stats['njobs_running'] = sum(1 for row in squeue_df.iterrows() if row[1]['STATE'] == 'RUNNING')
519 | return squeue_df
520 |
521 | @run_in_thread
522 | @handle_error
523 | def rewrite_table(self, table_type, keep_state=False):
524 | if table_type not in self.STAGE:
525 | self.STAGE[table_type] = {}
526 | if self.STAGE[table_type].get('updating', False):
527 | return
528 | self.STAGE[table_type]['updating'] = True
529 |
530 | if 'sort_column' in self.STAGE[table_type]:
531 | sort_column = self.STAGE[table_type]['sort_column']
532 | else:
533 | sort_column = None
534 | if 'sort_ascending' in self.STAGE[table_type]:
535 | sort_ascending = self.STAGE[table_type]['sort_ascending']
536 | else:
537 | sort_ascending = True
538 |
539 | df = self.query_table_data(table_type, sort_column, sort_ascending)
540 | self.update_status()
541 |
542 | table = self.tables[table_type]
543 | if keep_state:
544 | cursor_column = table.cursor_column
545 | cursor_row = table.cursor_row
546 | else:
547 | cursor_column = 0
548 | cursor_row = 0
549 |
550 | table.clear(columns=True)
551 | table.add_columns(*df.columns)
552 |
553 | for _, row in df.iterrows():
554 | table_row = [str(row[col]) for col in df.columns]
555 | table.add_row(*table_row)
556 |
557 | cursor_row = min(cursor_row, len(table.rows) - 1)
558 | cursor_column = min(cursor_column, len(table.columns) - 1)
559 | table.move_cursor(row=cursor_row, column=cursor_column)
560 |
561 | time.sleep(0.3)
562 | self.STAGE[table_type]['updating'] = False
563 |
564 | @run_in_thread
565 | @handle_error
566 | def update_table(self, table_type):
567 | if self.STAGE[table_type].get('updating', False):
568 | return
569 | self.STAGE[table_type]['updating'] = True
570 | if 'sort_column' in self.STAGE[table_type]:
571 | sort_column = self.STAGE[table_type]['sort_column']
572 | else:
573 | sort_column = None
574 | if 'sort_ascending' in self.STAGE[table_type]:
575 | sort_ascending = self.STAGE[table_type]['sort_ascending']
576 | else:
577 | sort_ascending = True
578 |
579 | df = self.query_table_data(table_type, sort_column, sort_ascending)
580 | self.update_status()
581 |
582 | table = self.tables[table_type]
583 | if not table.columns:
584 | table.add_columns(*df.columns)
585 | for _, row in df.iterrows():
586 | table_row = [str(row[col]) for col in df.columns]
587 | table.add_row(*table_row)
588 | return
589 |
590 | for row_index, (_, row) in enumerate(df.iterrows()):
591 | table_row = [str(row[col]) for col in df.columns]
592 | if row_index < len(table.rows):
593 | for col_index, cell in enumerate(table_row):
594 | if table.get_cell_at((row_index, col_index)) != cell:
595 | table.update_cell_at((row_index, col_index), cell)
596 | else:
597 | table.add_row(*table_row)
598 |
599 | while len(table.rows) > len(df):
600 | row_key, _ = table.coordinate_to_cell_key((len(table.rows) - 1, 0))
601 | table.remove_row(row_key)
602 |
603 | self.STAGE[table_type]['updating'] = False
604 |
605 | @handle_error
606 | def query_table_data(self, table_type, sort_column=None, sort_ascending=True):
607 | if table_type == "job":
608 | return self.query_jobs(sort_column, sort_ascending)
609 | elif table_type == "node":
610 | return self.query_gpus(sort_column, sort_ascending)
611 | elif table_type == "history":
612 | return self.query_history(sort_column, sort_ascending)
613 | else:
614 | raise ValueError(f"Invalid table type: {table_type}")
615 |
616 | @handle_error
617 | def _get_selected_job(self):
618 | row_idx = self.tables[self.active_table].cursor_row
619 | row = self.tables[self.active_table].get_row_at(row_idx)
620 | job_id = str(row[0])
621 | return job_id
622 |
623 | @handle_error
624 | def update_log(self, log_fn):
625 | self.job_log.border_title = f"{log_fn}"
626 | current_scroll_y = self.job_log.scroll_offset[1]
627 |
628 | if not self.job_log_position:
629 | with open(log_fn, 'r') as f:
630 | self.job_log_position = max(sum(len(line) for line in f) - 2**12, 0) # read the last 4KB
631 |
632 | with open(log_fn, 'r') as log_file:
633 | log_file.seek(self.job_log_position)
634 | new_lines = log_file.readlines()[1:] # drop the first line because it can be incomplete
635 | self.job_log_position = log_file.tell()
636 | else:
637 | with open(log_fn, 'r') as log_file:
638 | log_file.seek(self.job_log_position)
639 | new_lines = log_file.readlines()
640 | self.job_log_position = log_file.tell()
641 |
642 | update_scroll = current_scroll_y == self.job_log.max_scroll_y
643 |
644 | for line in new_lines:
645 | self.job_log.write(line)
646 |
647 | if update_scroll:
648 | self.job_log.scroll_end(animate=False)
649 |
650 | @handle_error
651 | def _get_log_fn(self, job_id):
652 | if self.STAGE["action"] == "history":
653 | response_string = subprocess.check_output(f"""sacct -j {job_id} --format=StdOut -P""", shell=True).decode("utf-8")
654 | formatted_string = response_string.split("\n")[1].strip()
655 | formatted_string = formatted_string.replace("%j", job_id)
656 | elif self.STAGE["action"] in ["job", "job_log"]:
657 | response_string = subprocess.check_output(f"""scontrol show job {job_id} | grep StdOut""", shell=True).decode("utf-8")
658 | formatted_string = response_string.split("=")[-1].strip()
659 | else:
660 | raise ValueError(f"Cannot get log file for action: {self.STAGE['action']}")
661 | return formatted_string
662 |
663 | @handle_error
664 | def query_gpus(self, sort_column=None, sort_ascending=True):
665 | overview_df = self.get_sinfo(self.cluster)
666 | self.stats['ngpus'] = overview_df["GPUs (Total)"].sum()
667 | self.stats['ngpus_avail'] = overview_df["GPUs (Avail)"].sum()
668 | if self.only_available_nodes:
669 | # filter out nodes with no available GPUs
670 | overview_df = overview_df[overview_df["GPUs (Avail)"] > 0]
671 |
672 | # hide columns for simplicity
673 | overview_df = overview_df.drop(columns=["GPUs (Total)", "GPUs (Avail)"])
674 |
675 | if sort_column is not None:
676 | overview_df = overview_df.sort_values(overview_df.columns[sort_column],ascending=sort_ascending)
677 | return overview_df
678 |
679 | @handle_error
680 | def query_history(self, sort_column=None, sort_ascending=True):
681 | starttime = self.get_history_starttime()
682 | sacct_df = self.get_sacct(starttime=starttime)
683 | if sort_column is not None:
684 | sacct_df = sacct_df.sort_values(sacct_df.columns[sort_column], ascending=sort_ascending)
685 |
686 | self.stats['nhistory'] = len(sacct_df)
687 |
688 | if self.only_completed_history:
689 | sacct_df = sacct_df[sacct_df["State"] == "COMPLETED"]
690 |
691 | self.stats['nhistory_completed'] = len(sacct_df)
692 | return sacct_df
693 |
694 | @handle_error
695 | def get_history_starttime(self):
696 | if self.history_range == "1 week":
697 | return (datetime.now() - timedelta(weeks=1)).strftime('%Y-%m-%d')
698 | elif self.history_range == "1 month":
699 | return (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
700 | elif self.history_range == "4 months":
701 | return (datetime.now() - timedelta(days=120)).strftime('%Y-%m-%d')
702 | elif self.history_range == "1 year":
703 | return (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')
704 | else:
705 | return "2024-11-26"
706 |
707 | @handle_error
708 | def get_squeue(self, cluster=None, only_my_jobs=True, only_running_jobs=False):
709 | sep = "|"
710 | if DEBUG:
711 | response_string = SQUEUE_DEBUG
712 | else:
713 | args = f"{sep},".join([
714 | "JOBID:18",
715 | "USERNAME:10",
716 | "PRIORITY:8",
717 | "PARTITION:80",
718 | "NAME:200",
719 | "STATE:8",
720 | "TimeUsed:10",
721 | "StartTime:30",
722 | "TimeLimit:15",
723 | "tres-alloc:100",
724 | "ReasonList:100",
725 | ])
726 | query_string = f"""squeue --Format="{args}" -S T"""
727 | if self.verbose:
728 | self.info_log.write(query_string)
729 |
730 | if only_my_jobs:
731 | query_string += " --me"
732 | if only_running_jobs:
733 | query_string += " --state=RUNNING"
734 | response_string = subprocess.check_output(query_string, shell=True).decode("utf-8")
735 | compact_string = re.sub(' +', '', response_string)
736 | data = io.StringIO(compact_string)
737 | df = pd.read_csv(data, sep=sep)
738 |
739 | # right align time
740 | max_length = df["TIME"].str.len().max()
741 | df.loc[:, "TIME"] = df.loc[:, "TIME"].apply(lambda x: f"{x:>{max_length}}")
742 |
743 | # remove years from start time
744 | df.loc[:, "START_TIME"] = df.loc[:, "START_TIME"].apply(lambda x: simplify_start_time(x))
745 |
746 | # simplify tres
747 | # df.loc[:, "TRES_ALLOC"] = df.loc[:, "TRES_ALLOC"].apply(simplify_tres)
748 | return df
749 |
750 | @handle_error
751 | def get_sinfo(self, cluster):
752 | if DEBUG:
753 | response_string = SINFO_DEBUG
754 | else:
755 | args = f",".join([
756 | "Partition:25",
757 | "NodeHost",
758 | "Gres:500",
759 | "GresUsed:500",
760 | "StateCompact",
761 | "FreeMem",
762 | "Memory",
763 | "CPUsState",
764 | "Features:200"
765 | ])
766 | query_string = f"""sinfo -O {args}"""
767 | if self.verbose:
768 | self.info_log.write(query_string)
769 |
770 | response_string = subprocess.check_output(query_string, shell=True).decode("utf-8")
771 |
772 | formatted_string = re.sub(' +', ' ', response_string)
773 | data = io.StringIO(formatted_string)
774 | df = pd.read_csv(data, sep=" ")
775 | overview_df = []
776 | for row in df.iterrows():
777 | # overview_df = overview_df[['Partition', 'Host', "Device", "State", "Mem (GB)", "CPUs", "GPUs", "Free IDX", "Feature"]]
778 |
779 | if row[1]['GRES'] != "(null)":
780 | device, ngpus = self.parse_gres(row[1]['GRES'], cluster)
781 | else:
782 | continue
783 |
784 | node_available = row[1]["STATE"] in ["mix", "idle", "alloc"]
785 | if not node_available:
786 | gpu_avail_idx = []
787 | else:
788 | device, gpu_avail_idx = self.parse_gres_used(row[1]['GRES_USED'], ngpus, cluster)
789 | ngpus_avail = len(gpu_avail_idx)
790 |
791 | host_info = OrderedDict()
792 |
793 | host_info['Partition'] = str(row[1]["PARTITION"])
794 | host_info['Host'] = str(row[1]["HOSTNAMES"])
795 | host_info['Device'] = device
796 | host_info['State'] = str(row[1]["STATE"])
797 |
798 | try:
799 | mem_avail = int(row[1]["FREE_MEM"]) // 1024
800 | except:
801 | mem_avail = row[1]["FREE_MEM"]
802 | # host_info['Mem (Avail)'] = mem_avail
803 | try:
804 | mem_total = int(row[1]["MEMORY"]) // 1024
805 | except:
806 | mem_total = row[1]["MEMORY"]
807 | # host_info['Mem (Total)'] = mem_total
808 | host_info['Mem (GB)'] = f"{mem_avail}/{mem_total}"
809 |
810 | cpu_info = row[1]["CPUS(A/I/O/T)"].split("/")
811 | ncpus_avail = cpu_info[1]
812 | ncpus_total = cpu_info[3]
813 | # host_info['CPUs (Avail)'] = ncpus_avail
814 | # host_info['CPUs (Total)'] = ncpus_total
815 | host_info['CPUs'] = f"{ncpus_avail}/{ncpus_total}"
816 |
817 | host_info['GPUs (Total)'] = ngpus
818 | host_info['GPUs (Avail)'] = ngpus_avail
819 | host_info["GPUs"] = f"{host_info['GPUs (Avail)']}/{ngpus}"
820 | host_info['GPUs (Avail IDX)'] = f"[{','.join(str(idx) for idx in gpu_avail_idx)}]"
821 |
822 | features = row[1]["AVAIL_FEATURES"]
823 | if ',' in features:
824 | unnamed_features = []
825 | for feature in features.split(","):
826 | if ':' in feature:
827 | name, value = feature.split(':')
828 | host_info[name] = value
829 | else:
830 | unnamed_features.append(feature)
831 | else:
832 | unnamed_features = [features]
833 | host_info['Feature'] = ','.join(unnamed_features)
834 |
835 | overview_df.append(host_info)
836 | overview_df = pd.DataFrame.from_records(overview_df).drop_duplicates("Host")
837 | return overview_df
838 |
839 | @handle_error
840 | def parse_gres(self, gres_str, cluster=None):
841 | match = re.match(r"([^:]+)(?::([^:()]+))?:([^:(,]+)(?:\(S:([^)]+)\))?", gres_str)
842 |
843 | if match:
844 | groups = match.groups()
845 | if self.verbose:
846 | self.info_log.write(f"Parsed gres: {groups} from {gres_str}")
847 | _, device, ngpus, _ = groups
848 | ngpus = int(ngpus)
849 | else:
850 | error_msg = f"Error parsing gres: {gres_str}"
851 | raise ValueError(error_msg)
852 |
853 | return device, ngpus
854 |
855 | @handle_error
856 | def parse_gres_used(self, gres_used_str, num_total, cluster=None):
857 | match = re.match(r"([^:]+)(?::([^:]+))?:([^:(,]+)(?:\(IDX:([^)]+)\))?", gres_used_str)
858 | if match:
859 | groups = match.groups()
860 | if self.verbose:
861 | self.info_log.write(f"Parsed gres_used: {groups} from {gres_used_str}")
862 | _, device, ngpus_used, alloc_str = groups
863 | ngpus_used = int(ngpus_used)
864 | else:
865 | error_msg = f"Error parsing gres_used: {gres_used_str}"
866 | raise ValueError(error_msg)
867 |
868 | gpu_used_idx = []
869 | if alloc_str:
870 | for gpu_ids in alloc_str.split(","):
871 | if "-" in gpu_ids:
872 | start, end = gpu_ids.split("-")
873 | for i in range(int(start), int(end)+1):
874 | gpu_used_idx.append(i)
875 | else:
876 | if gpu_ids == "N/A":
877 | pass
878 | else:
879 | gpu_used_idx.append(int(gpu_ids))
880 | assert ngpus_used == len(gpu_used_idx), f"Number of used GPUs {ngpus_used} does not match parsed indices {gpu_used_idx} in gres_used: {gres_used_str}"
881 |
882 | gpu_avail_idx = [idx for idx in range(num_total) if idx not in gpu_used_idx]
883 | return device, gpu_avail_idx
884 |
885 | @handle_error
886 | def get_sacct(self, starttime="2024-11-26", endtime="now"):
887 | args = f",".join([
888 | "JobID",
889 | "JobName",
890 | "State",
891 | "Start",
892 | "Elapsed",
893 | "NodeList",
894 | "AllocTRES",
895 | "Partition",
896 | "StdOut"
897 | ])
898 | query_string = f"""sacct --format={args} -P -X --starttime={starttime} --endtime={endtime}"""
899 | if self.verbose:
900 | self.info_log.write(query_string)
901 |
902 | response_string = subprocess.check_output(
903 | query_string,
904 | shell=True
905 | ).decode("utf-8")
906 | data = io.StringIO(response_string)
907 | df = pd.read_csv(data, sep='|')
908 |
909 | # Strip whitespace from column names
910 | df.columns = df.columns.str.strip()
911 |
912 | # Strip whitespace from each string element in the DataFrame
913 | for col in df.select_dtypes(['object']).columns:
914 | df[col] = df[col].str.strip()
915 | return df
916 |
917 | def perform_scancel(job_id):
918 | os.system(f"""scancel {job_id}""")
919 |
920 | def remove_first_line(input_string):
921 | lines = input_string.split('\n')
922 | return '\n'.join(lines[1:])
923 |
924 | def simplify_start_time(start_time):
925 | try:
926 | if start_time != "nan":
927 | start_time = datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%S").strftime("%m-%d %H:%M")
928 | except Exception as e:
929 | pass
930 | return start_time
931 |
932 | def simplify_tres(tres):
933 | tres_ = []
934 | for x in str(tres).split(","):
935 | if 'billing=' in x:
936 | continue
937 | tres_.append(x)
938 | return ",".join(tres_)
939 |
940 | def read_log(fn, num_lines=100):
941 | with open(os.path.expanduser(fn), 'r') as f:
942 | txt_lines = list(f.readlines()[-num_lines:])
943 |
944 | return txt_lines
945 |
946 | def run_ui(verbose=False, cluster=None, interval=10, history_range="1 week"):
947 | # if debug:
948 | # # global for quick debugging
949 | # global DEBUG
950 | # DEBUG = True
951 | app = SlurmUI()
952 | app.verbose = verbose
953 | app.cluster = cluster
954 | app.interval = interval
955 | app.history_range = history_range
956 | app.run()
957 |
958 |
959 | if __name__ == "__main__":
960 | run_ui()
961 |
--------------------------------------------------------------------------------