├── .gitignore ├── LICENSE ├── scripts ├── jobqueue ├── myaccount ├── jobhist ├── jobeff ├── noderes └── jobinfo └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | notes.txt 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD Zero Clause License (0BSD) 2 | 3 | Copyright (c) 2024-2025 USC Center for Advanced Research Computing 4 | 5 | Permission to use, copy, modify, and/or distribute this software for any 6 | purpose with or without fee is hereby granted. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 9 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 10 | AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 11 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 12 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 13 | OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 14 | PERFORMANCE OF THIS SOFTWARE. 15 | -------------------------------------------------------------------------------- /scripts/jobqueue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | jobqueue 5 | 6 | View job queue information 7 | 8 | This script requires: 9 | - Python 3.5+ (for the subprocess.run command) 10 | - Slurm (any recent version should work) 11 | """ 12 | 13 | import argparse 14 | import subprocess 15 | import sys 16 | 17 | # Set up arguments and options ########################################################### 18 | 19 | parser = argparse.ArgumentParser( 20 | prog = "jobqueue", 21 | formatter_class = argparse.RawDescriptionHelpFormatter, 22 | description = "View job queue information", 23 | epilog = """\ 24 | examples: 25 | 26 | jobqueue 27 | jobqueue --me 28 | jobqueue --all 29 | jobqueue -p gpu 30 | jobqueue -p oneweek,largemem 31 | jobqueue -u ttrojan 32 | jobqueue -u ttrojan,btrojan 33 | jobqueue -p gpu -u ttrojan 34 | jobqueue -a ttrojan_123 35 | jobqueue -n simulation 36 | jobqueue -r class 37 | jobqueue -t pending 38 | jobqueue -w a04-05 39 | 40 | notes: 41 | 42 | to view completed jobs, run jobhist 43 | to view more job details, run jobinfo 44 | 45 | support: 46 | 47 | https://github.com/uschpc/slurm-tools/issues""" 48 | ) 49 | parser.add_argument( 50 | "-a", 51 | "--account", 52 | help = "filter by account (comma-separated list)" 53 | ) 54 | parser.add_argument( 55 | "--all", 56 | action = "store_true", 57 | help ="view jobs in all partitions" 58 | ) 59 | parser.add_argument( 60 | "--me", 61 | action = "store_true", 62 | help = "view own jobs" 63 | ) 64 | parser.add_argument( 65 | "-n", 66 | "--name", 67 | help = "filter by job name (comma-separated list)" 68 | ) 69 | parser.add_argument( 70 | "-w", 71 | "--nodelist", 72 | help = "filter by node name (range or comma-separated list)" 73 | ) 74 | parser.add_argument( 75 | "-p", 76 | "--partition", 77 | help = "filter by partition (comma-separated list)" 78 | ) 79 | parser.add_argument( 80 | "-r", 81 | "--reservation", 82 | help = "filter by reservation name" 83 | ) 84 | parser.add_argument( 85 | "-t", 86 | "--state", 87 | help = "filter by job state (comma-separated list)" 88 | ) 89 | parser.add_argument( 90 | "-u", 91 | "--user", 92 | help = "filter by user (comma-separated list)" 93 | ) 94 | parser.add_argument( 95 | "-V", 96 | "--version", 97 | action = "store_true", 98 | help = "print version" 99 | ) 100 | args = parser.parse_args() 101 | 102 | if args.version: 103 | print("jobqueue 1.3.1") 104 | sys.exit(0) 105 | 106 | # Run squeue command and print results ################################################### 107 | 108 | # Specify squeue fields to query and output format 109 | fmt = "%12i %.10u %.12j %.10P %.8T %.11M %.20R" 110 | 111 | # Derive squeue command based on given arguments 112 | cmd = ["squeue", "-h", "-o", fmt] 113 | if args.account: 114 | cmd = cmd + ["-A", args.account] 115 | if args.all: 116 | cmd = cmd + ["--all"] 117 | if args.me: 118 | cmd = cmd + ["--me"] 119 | if args.name: 120 | cmd = cmd + ["-n", args.name] 121 | if args.nodelist: 122 | cmd = cmd + ["-w", args.nodelist] 123 | if args.partition: 124 | cmd = cmd + ["-p", args.partition] 125 | if args.reservation: 126 | cmd = cmd + ["-R", args.reservation] 127 | if args.state: 128 | cmd = cmd + ["-t", args.state] 129 | if args.user: 130 | cmd = cmd + ["-u", args.user] 131 | 132 | # Run squeue command and print results 133 | print("-----------------------------------------------------------------------------------------") 134 | print("Job ID User Job Name Partition State Elapsed Nodelist(Reason)") 135 | print("------------ ---------- ------------ ---------- -------- ----------- --------------------") 136 | subprocess.run(cmd) 137 | -------------------------------------------------------------------------------- /scripts/myaccount: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | myaccount 5 | 6 | View account information for user 7 | 8 | This script requires: 9 | - Python 3.7+ (for the subprocess.run options) 10 | - Slurm (any recent version should work) 11 | - Slurm configured with: 12 | - PriorityType=priority/multifactor 13 | 14 | Notes: 15 | - Based on output from sacctmgr, sshare, and scontrol 16 | """ 17 | 18 | import argparse 19 | import os 20 | import re 21 | import subprocess 22 | import sys 23 | 24 | # Set up arguments and options ########################################################### 25 | 26 | parser = argparse.ArgumentParser( 27 | prog = "myaccount", 28 | formatter_class = argparse.RawDescriptionHelpFormatter, 29 | description = "View account information for user", 30 | epilog = """\ 31 | examples: 32 | 33 | myaccount 34 | myaccount ttrojan 35 | myaccount -c condo 36 | myaccount ttrojan -c condo 37 | 38 | support: 39 | 40 | https://github.com/uschpc/slurm-tools/issues""" 41 | ) 42 | parser.add_argument( 43 | "user", 44 | nargs = "?", 45 | default = os.environ["USER"], 46 | help = "user to query for account information (default = $USER)" 47 | ) 48 | parser.add_argument( 49 | "-c", 50 | "--cluster", 51 | help = "cluster to query for account information (default = local cluster)" 52 | ) 53 | parser.add_argument( 54 | "-V", 55 | "--version", 56 | action = "store_true", 57 | help = "print version" 58 | ) 59 | args = parser.parse_args() 60 | 61 | if args.version: 62 | print("myaccount 1.2.1") 63 | sys.exit(0) 64 | 65 | # Check if user exists 66 | cmdid = ["id", args.user] 67 | ident = subprocess.run(cmdid, capture_output = True, text = True) 68 | if "no such user" in ident.stderr: 69 | print("Error: user does not exist", file = sys.stderr) 70 | sys.exit(1) 71 | 72 | # Get cluster accounts ################################################################### 73 | 74 | # Get name of cluster 75 | if not args.cluster: 76 | cmd = ["scontrol", "--local", "show", "config"] 77 | proc = subprocess.run(cmd, capture_output = True, text = True) 78 | cluster = re.search(r"ClusterName\s+=\s+(\S+)", proc.stdout).group(1) 79 | else: 80 | # Check if cluster exists 81 | cmd = ["scontrol", "-M", args.cluster, "show", "config"] 82 | proc = subprocess.run(cmd, capture_output = True, text = True) 83 | if not proc.stdout: 84 | print("Error: cluster does not exist", file = sys.stderr) 85 | sys.exit(1) 86 | else: 87 | cluster = args.cluster 88 | 89 | # Run sacctmgr command and capture output 90 | fmt = "format=user%-10,account%-15,cluster%-10,defaultaccount%-15,qos%-11" 91 | cmd = ["sacctmgr", "-n", "-s", "list", "user", args.user, fmt, "cluster=" + cluster] 92 | proc = subprocess.run(cmd, capture_output = True, text = True) 93 | if not proc.stdout: 94 | print("No cluster accounts found for " + args.user) 95 | sys.exit(0) 96 | 97 | # Process output to get cluster accounts 98 | out = proc.stdout.split("\n")[:-1] 99 | if not re.search(cluster, proc.stdout): 100 | print("No cluster accounts found for " + args.user) 101 | sys.exit(0) 102 | accounts = [line.split()[1] for line in out] 103 | 104 | # Print table of cluster accounts ######################################################## 105 | 106 | print("-----------------------------------------------------------------") 107 | print("Cluster accounts ") 108 | print("-----------------------------------------------------------------") 109 | print("User Account Cluster Default QOS ") 110 | print("---------- --------------- ---------- --------------- -----------") 111 | for line in out: 112 | print(line) 113 | print("") 114 | 115 | # Print table of cluster account limits and usage ######################################## 116 | 117 | print("-----------------------------------------------------------------") 118 | print("Cluster account service units (SUs) ") 119 | print("-----------------------------------------------------------------") 120 | print("Account Limit Usage Remaining ") 121 | print("----------------- --------------- --------------- ---------------") 122 | fmt2 = "account,grptresmins,grptresraw" 123 | cmd2 = ["sshare", "-P", "-o", fmt2, "-M", cluster] 124 | proc2 = subprocess.run(cmd2, capture_output = True, text = True) 125 | if not proc2: 126 | print("No information found") 127 | out2 = proc2.stdout.split("\n") 128 | for account in accounts: 129 | sub = [i for i in out2 if re.search(account, i)] 130 | sub = sub[0].split("|")[1:3] 131 | res = [re.search(r"billing=([0-9]+)", i) for i in sub] 132 | if res[0] and res[1]: 133 | limit = res[0].group(1) 134 | usage = res[1].group(1) 135 | remaining = str(int(limit) - int(usage)) 136 | elif res[1]: 137 | limit = "n/a" 138 | usage = res[1].group(1) 139 | remaining = "n/a" 140 | else: 141 | limit = "n/a" 142 | usage = "n/a" 143 | remaining = "n/a" 144 | print(f"{account[:17]:<17}" + " " + 145 | f"{limit[:15]:<15}" + " " + 146 | f"{usage[:15]:<15}" + " " + 147 | f"{remaining[:15]:<15}") 148 | print("") 149 | 150 | # Print table of allowed cluster partitions ############################################## 151 | 152 | print("-----------------------------------------------------------------") 153 | print("Allowed cluster partitions ") 154 | print("-----------------------------------------------------------------") 155 | print("Partition Allowed accounts ") 156 | print("-------------- --------------------------------------------------") 157 | accounts2 = "|".join(accounts) 158 | cmd3 = ["scontrol", "-o", "-M", cluster, "show", "partitions"] 159 | proc3 = subprocess.run(cmd3, capture_output = True, text = True) 160 | if not proc3: 161 | print("No information found") 162 | out3 = proc3.stdout.split("\n")[:-1] 163 | sub = [i for i in out3 if re.search(accounts2 + "|AllowAccounts=ALL", i)] 164 | partitions = [re.search(r"PartitionName=(\S+)", i).group(1) for i in sub] 165 | allowed = [re.search(r"AllowAccounts=(\S+)", i).group(1) for i in sub] 166 | for i, partition in enumerate(partitions): 167 | print(f"{partition[:14]:<14}" + " " + 168 | f"{allowed[i]}") 169 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Slurm CLI tools 2 | 3 | A collection of CLI tools for various tasks and queries on Slurm clusters. 4 | 5 | ## Installation 6 | 7 | The CLI tools are Python scripts that rely only on the Python standard library. All scripts will run with Python 3.10+. The scripts should run on most Slurm clusters, but they depend on certain Slurm configuration. Check the notes in each script for more information. 8 | 9 | To install, simply clone the repo: 10 | 11 | ``` 12 | git clone --depth 1 https://github.com/uschpc/slurm-tools.git 13 | ``` 14 | 15 | The scripts will be downloaded with execute permissions. If desired, move the scripts to another directory, such as a directory on `PATH`. If needed, load a compatible version of Python or change the hashbang in the scripts to use a compatible Python executable. 16 | 17 | ## Usage 18 | 19 | Each script contains help and usage information that can be viewed with the `-h/--help` flag (e.g., `jobinfo -h`). 20 | 21 | The scripts are described and shown with example output below. 22 | 23 | ### myaccount 24 | 25 | View account information for user. 26 | 27 | ``` 28 | $ myaccount 29 | ----------------------------------------------------------------- 30 | Cluster accounts 31 | ----------------------------------------------------------------- 32 | User Account Cluster Default QOS 33 | ---------- --------------- ---------- --------------- ----------- 34 | ttrojan ttrojan_123 discovery ttrojan_123 normal 35 | ttrojan ttrojan_125 discovery ttrojan_123 normal 36 | 37 | ----------------------------------------------------------------- 38 | Cluster account service units (SUs) 39 | ----------------------------------------------------------------- 40 | Account Limit Usage Remaining 41 | ----------------- --------------- --------------- --------------- 42 | ttrojan_123 12000000 422699 11577301 43 | ttrojan_125 n/a 839856 n/a 44 | 45 | ----------------------------------------------------------------- 46 | Allowed cluster partitions 47 | ----------------------------------------------------------------- 48 | Partition Allowed accounts 49 | -------------- -------------------------------------------------- 50 | trojan ttrojan_125 51 | shared ALL 52 | ``` 53 | 54 | ### noderes 55 | 56 | View available resources (free or configured) on nodes. 57 | 58 | ``` 59 | $ noderes -p largemem 60 | ------------------------------------------------------------------- 61 | Node Partition State CPU GPU Free Free Free 62 | Model Model CPUs Memory GPUs 63 | ------ ------------ --------- ----------- -------- ---- ------ ---- 64 | a01-10 largemem mixed epyc-7513 -- 22 78G -- 65 | a02-10 largemem allocated epyc-7513 -- 0 54G -- 66 | a03-10 largemem mixed epyc-7513 -- 12 50G -- 67 | a04-10 largemem reserved epyc-7513 -- 62 38G -- 68 | b17-13 largemem allocated epyc-9354 -- 0 0G -- 69 | ``` 70 | 71 | ### jobqueue 72 | 73 | View job queue information. 74 | 75 | ``` 76 | $ jobqueue -p largemem 77 | ----------------------------------------------------------------------------------------- 78 | Job ID User Job Name Partition State Elapsed Nodelist(Reason) 79 | ------------ ---------- ------------ ---------- -------- ----------- -------------------- 80 | 7453378 jesp Run_do52bran largemem PENDING 0:00 (QOSMaxMemoryPerUser 81 | 7453379 jesp Run_do6b.job largemem PENDING 0:00 (QOSMaxMemoryPerUser 82 | 7473836 ttrojan ood/jupyter largemem RUNNING 2:42:28 a04-10 83 | 7449562 snet run_study_4x largemem RUNNING 2-23:17:10 a02-10 84 | 7453377 jesp Run_do51bran largemem RUNNING 2-17:02:07 a01-10 85 | 7470944 huy435 rfmixchr1 largemem RUNNING 21:18:51 a02-10,a04-10 86 | ``` 87 | 88 | ### jobhist 89 | 90 | View compact history of jobs. 91 | 92 | ``` 93 | $ jobhist -p largemem 94 | ---------------------------------------------------------------------------------------------------- 95 | Job ID Startdate User Job Name Partition State Elapsed Nodes CPUs Memory 96 | ------------- ---------- ---------- ------------ ---------- ---------- ----------- ----- ---- ------ 97 | 14690860 2024-07-29 ttrojan sim.sl largemem RUNNING 3-08:19:19 1 32 128G 98 | 14734145 2024-07-31 jesp sfla largemem RUNNING 2-21:46:24 1 64 998G 99 | 14738354 2024-07-31 snet interactive largemem COMPLETED 06:56:19 1 16 400G 100 | 14741823 2024-07-31 huy435 model_fit1 largemem COMPLETED 07:04:19 1 64 248G 101 | 14741846 2024-07-31 huy435 model_fit2 largemem COMPLETED 08:10:59 1 64 248G 102 | 14741918 2024-08-01 snet feature.sl largemem FAILED 00:02:16 1 8 300G 103 | ``` 104 | 105 | ### jobinfo 106 | 107 | View detailed job information. 108 | 109 | ``` 110 | $ jobinfo 483699 111 | Job ID | 483699 112 | Job name | simdebug.sl 113 | User | ttrojan 114 | Account | ttrojan_123 115 | Working directory | /project/ttrojan_123/sim 116 | Cluster | discovery 117 | Partition | main 118 | State | COMPLETED 119 | Exit code | 0:0 120 | Nodes | 2 121 | Tasks | 32 122 | CPUs | 32 123 | Memory | 120G 124 | GPUs | 0 125 | Nodelist | e05-[42,76] 126 | Submit time | 2024-01-26T14:56:23 127 | Start time | 2024-01-26T14:56:24 128 | End time | 2024-01-26T14:57:32 129 | Wait time | 00:00:01 130 | Reserved walltime | 00:10:00 131 | Elapsed walltime | 00:01:08 132 | Time efficiency | 11.33% 133 | Elapsed CPU walltime | 00:36:16 134 | Used CPU time | 00:35:18.342 135 | CPU efficiency | 97.37% 136 | User CPU time pct | 96.35% 137 | System CPU time pct | 3.65% 138 | Max memory used | 64.74G (estimate) 139 | Memory efficiency | 53.95% 140 | Max disk read | 14.42M (estimate) 141 | Max disk write | 1.04M (estimate) 142 | ``` 143 | 144 | ### jobeff 145 | 146 | View job efficiency information. 147 | 148 | ``` 149 | $ jobeff -p largemem 150 | ------------------------------------------------------------------ 151 | Job ID State CPU Efficiency Memory Efficiency 152 | ------------- ---------- -------------------- -------------------- 153 | 1131130 CANCELLED 42.02% [|||| ] 50.03% [||||| ] 154 | 1140921 COMPLETED 9.78% [| ] 4.15% [ ] 155 | 1140925 RUNNING 156 | 1189016 FAILED 87.30% [||||||||| ] 85.03% [||||||||| ] 157 | 1201010 OUT_OF_MEM 86.08% [||||||||| ] 99.69% [||||||||||] 158 | 1201035 TIMEOUT 23.00% [|| ] 73.35% [||||||| ] 159 | ``` 160 | 161 | ## License 162 | 163 | [0BSD](LICENSE) 164 | -------------------------------------------------------------------------------- /scripts/jobhist: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | jobhist 5 | 6 | View compact history of jobs 7 | 8 | This script requires: 9 | - Python 3.10+ (for the zip() strict = True option) 10 | - Slurm (any recent version should work) 11 | - Slurm configured with: 12 | - JobAcctGatherType=jobacct_gather/linux or jobacct_gather/cgroup 13 | 14 | Notes: 15 | - Based on output from sacct 16 | - Default time period is past day 17 | - State filter is based on current job state 18 | """ 19 | 20 | from datetime import datetime, timedelta 21 | import argparse 22 | import os 23 | import re 24 | import subprocess 25 | import sys 26 | 27 | # Set up arguments and options ########################################################### 28 | 29 | parser = argparse.ArgumentParser( 30 | prog = "jobhist", 31 | formatter_class = argparse.RawDescriptionHelpFormatter, 32 | description = "View compact history of jobs", 33 | epilog = """\ 34 | examples: 35 | 36 | jobhist 37 | jobhist --me 38 | jobhist -s 2024-07-01 39 | jobhist -s 2024-07-01 -e 2024-07-02 40 | jobhist -u ttrojan 41 | jobhist -p largemem 42 | jobhist -u ttrojan -p main 43 | jobhist -u ttrojan -s 2024-07-01 -p gpu 44 | jobhist -a ttrojan_123 45 | jobhist -n sim2 46 | jobhist -w a01-[01-20] 47 | jobhist -t failed 48 | 49 | notes: 50 | 51 | default time period is past day 52 | state filter is based on current job state 53 | to view more job details, run jobinfo 54 | 55 | support: 56 | 57 | https://github.com/uschpc/slurm-tools/issues""" 58 | ) 59 | parser.add_argument( 60 | "-a", 61 | "--account", 62 | help = "account to query for history (comma-separated list)" 63 | ) 64 | parser.add_argument( 65 | "-e", 66 | "--end", 67 | help = "end date to query for history (YYYY-MM-DD)" 68 | ) 69 | parser.add_argument( 70 | "--me", 71 | action = "store_true", 72 | help = "query own jobs" 73 | ) 74 | parser.add_argument( 75 | "-n", 76 | "--name", 77 | help = "job name (partial or full) to query for history (comma-separated list)" 78 | ) 79 | parser.add_argument( 80 | "-w", 81 | "--nodelist", 82 | help = "node list to query for history (range or comma-separated list)" 83 | ) 84 | parser.add_argument( 85 | "-p", 86 | "--partition", 87 | help = "partition to query for history (comma-separated list)" 88 | ) 89 | parser.add_argument( 90 | "-q", 91 | "--qos", 92 | help = "QOS to query for history (comma-separated list)" 93 | ) 94 | parser.add_argument( 95 | "-r", 96 | "--reason", 97 | help = "last pending reason to query for history (excluding resources and priority) (comma-separated list)" 98 | ) 99 | parser.add_argument( 100 | "-s", 101 | "--start", 102 | help = "start date to query for history (YYYY-MM-DD)" 103 | ) 104 | parser.add_argument( 105 | "-t", 106 | "--state", 107 | help = "current job state to query for history (comma-separated list)" 108 | ) 109 | parser.add_argument( 110 | "-u", 111 | "--user", 112 | help = "user to query for history (comma-separated list)" 113 | ) 114 | parser.add_argument( 115 | "-V", 116 | "--version", 117 | action = "store_true", 118 | help = "print version" 119 | ) 120 | args = parser.parse_args() 121 | 122 | if args.version: 123 | print("jobhist 2.4.0") 124 | sys.exit(0) 125 | 126 | # Define functions ####################################################################### 127 | 128 | def memtogib(mem): 129 | """ 130 | Convert memory value to GiB 131 | 132 | # Arguments 133 | - mem (str): memory value 134 | 135 | # Returns 136 | - str: memory value in GiB 137 | - str: "??" if unit not found 138 | """ 139 | m = float(mem[:-1]) 140 | if mem.endswith("T"): 141 | return str(int(m * 1024)) + "G" 142 | if mem.endswith("G"): 143 | return str(int(m)) + "G" 144 | if mem.endswith("M"): 145 | return str(int(m / 1024)) + "G" 146 | if mem.endswith("K"): 147 | return str(int(m / 1024**2)) + "G" 148 | if mem[-1].isdigit(): 149 | return str(int(mem / 1024**3)) + "G" 150 | return "??" 151 | 152 | def derivev(fields, values): 153 | """ 154 | Derive job info values from sacct values 155 | 156 | # Arguments 157 | - fields (str): job info fields used for sacct -o 158 | - values (list): job info values from sacct -X -p output 159 | 160 | # Returns 161 | - dict: job info key-value pairs 162 | 163 | # Notes 164 | - Assumes sacct -X -p -o options are used 165 | - Assumes sacct state,reqtres,alloctres fields are present 166 | """ 167 | # Create dict from keys and values 168 | keys = fields.split(",") 169 | d = dict(zip(keys, values, strict = True)) 170 | # Replace state CANCELLED by ... with just CANCELLED 171 | if "CANCELLED" in d["state"]: 172 | d["state"] = "CANCELLED" 173 | # Get requested nodes 174 | reqnodes = re.search(r"node=([0-9]+)", d["reqtres"]) 175 | if not reqnodes: 176 | d["reqnodes"] = "??" 177 | else: 178 | d["reqnodes"] = reqnodes.group(1) 179 | # Get requested CPUs 180 | reqcpus = re.search(r"cpu=([0-9]+)", d["reqtres"]) 181 | if not reqcpus: 182 | d["reqcpus"] = "??" 183 | else: 184 | d["reqcpus"] = reqcpus.group(1) 185 | # Get requested memory (may be in varying units) 186 | reqmem = re.search(r"mem=([0-9]+.[0-9]+[A-Z]|[0-9]+[A-Z])", d["reqtres"]) 187 | if not reqmem: 188 | d["reqmem"] = "??" 189 | else: 190 | d["reqmem"] = memtogib(reqmem.group(1)) 191 | # alloctres is empty if resources have not been allocated 192 | # If job is PENDING or CANCELLED/FAILED before job started 193 | if not d["alloctres"]: 194 | d["allocnodes"] = "--" 195 | d["alloccpus"] = "--" 196 | d["allocmem"] = "--" 197 | else: 198 | # Get allocated nodes 199 | allocnodes = re.search(r"node=([0-9]+)", d["alloctres"]) 200 | if not allocnodes: 201 | d["allocnodes"] = "??" 202 | else: 203 | d["allocnodes"] = allocnodes.group(1) 204 | # Get allocated CPUs 205 | alloccpus = re.search(r"cpu=([0-9]+)", d["alloctres"]) 206 | if not alloccpus: 207 | d["alloccpus"] = "??" 208 | else: 209 | d["alloccpus"] = alloccpus.group(1) 210 | # Get allocated memory (may be in varying units) 211 | allocmem = re.search(r"mem=([0-9]+.[0-9]+[A-Z]|[0-9]+[A-Z])", d["alloctres"]) 212 | if not allocmem: 213 | d["allocmem"] = "??" 214 | else: 215 | d["allocmem"] = memtogib(allocmem.group(1)) 216 | return d 217 | 218 | # Get sacct values ####################################################################### 219 | 220 | # Specify sacct fields to query 221 | fs = "jobid,start,user,jobname,partition,state,elapsed,reqtres,alloctres" 222 | 223 | # Derive sacct command based on given arguments 224 | cmd = ["sacct", "-a", "-X", "-n", "-p", "-o", fs] 225 | tday = datetime.today() 226 | yday = tday - timedelta(days = 1) 227 | tday = tday.strftime("%Y-%m-%dT%H:%M:%S") 228 | yday = yday.strftime("%Y-%m-%d") 229 | if args.start and args.end: 230 | cmd = cmd + ["-S", args.start, "-E", args.end] 231 | elif args.start and not args.end: 232 | cmd = cmd + ["-S", args.start, "-E", tday] 233 | elif not args.start and args.end: 234 | print("Error: no start date given", file = sys.stderr) 235 | sys.exit(1) 236 | else: 237 | cmd = cmd + ["-S", yday, "-E", tday] 238 | if args.account: 239 | cmd = cmd + ["-A", args.account] 240 | if args.me: 241 | cmd = cmd + ["-u", os.environ["USER"]] 242 | if args.nodelist: 243 | cmd = cmd + ["-N", args.nodelist] 244 | if args.partition: 245 | cmd = cmd + ["-r", args.partition] 246 | if args.qos: 247 | cmd = cmd + ["-q", args.qos] 248 | if args.reason: 249 | cmd = cmd + ["--reason", args.reason] 250 | if args.user: 251 | cmd = cmd + ["-u", args.user] 252 | 253 | # Run sacct command and capture output 254 | proc = subprocess.run(cmd, env = {"SLURM_TIME_FORMAT":"%Y-%m-%d"}, capture_output = True, text = True) 255 | if not proc.stdout: 256 | print("No jobs found") 257 | sys.exit(0) 258 | 259 | # Job names may contain \n (causing line break) 260 | # So splitting by \n does not necessarily split output by line 261 | # First remove all \n and split by |, then group line values using number of fields 262 | out = re.sub(r"\n", "", proc.stdout)[:-1] 263 | out2 = out.split("|") 264 | n = len(fs.split(",")) 265 | joblist = [out2[i:i + n] for i in range(0, len(out2), n)] 266 | 267 | # Filter by job name using partial or full match 268 | if args.name: 269 | names = args.name.replace(",", "|") 270 | joblist = [job for job in joblist if re.search(names, job[3])] 271 | 272 | # sacct filters job state over the given time period, which could be multiple states 273 | # Filter based on current job state instead 274 | if args.state: 275 | states = args.state.upper().replace(",", "|") 276 | joblist = [job for job in joblist if re.search(states, job[5])] 277 | 278 | # Exit if no jobs found after filters 279 | if not joblist: 280 | print("No jobs found") 281 | sys.exit(0) 282 | 283 | # Process sacct values and print results ################################################# 284 | 285 | print("----------------------------------------------------------------------------------------------------") 286 | print("Job ID Startdate User Job Name Partition State Elapsed Nodes CPUs Memory") 287 | print("------------- ---------- ---------- ------------ ---------- ---------- ----------- ----- ---- ------") 288 | for job in joblist: 289 | v = derivev(fs, job) 290 | # Print requested resources in some cases 291 | if (v["state"] == "PENDING" 292 | # If job is CANCELLED before resources are allocated 293 | or (v["state"] == "CANCELLED" and v["alloctres"] == "") 294 | # If job submission FAILED (e.g., requested node configuration is not available) 295 | or (v["state"] == "FAILED" and v["start"] == "None")): 296 | print(f"{v['jobid'][:13]:<13}" + " " + 297 | f"{v['start'][:10]:>10}" + " " + 298 | f"{v['user'][:10]:>10}" + " " + 299 | f"{v['jobname'][:12].strip():>12}" + " " + 300 | f"{v['partition'][:10]:>10}" + " " + 301 | f"{v['state'][:10]:>10}" + " " + 302 | f"{v['elapsed'][:11]:>11}" + " " + 303 | f"{v['reqnodes'][:5]:>5}" + " " + 304 | f"{v['reqcpus'][:4]:>4}" + " " + 305 | f"{v['reqmem'][:6]:>6}") 306 | # Print allocated resources in all other cases 307 | else: 308 | print(f"{v['jobid'][:13]:<13}" + " " + 309 | f"{v['start'][:10]:>10}" + " " + 310 | f"{v['user'][:10]:>10}" + " " + 311 | f"{v['jobname'][:12].strip():>12}" + " " + 312 | f"{v['partition'][:10]:>10}" + " " + 313 | f"{v['state'][:10].strip():>10}" + " " + 314 | f"{v['elapsed'][:11]:>11}" + " " + 315 | f"{v['allocnodes'][:5]:>5}" + " " + 316 | f"{v['alloccpus'][:4]:>4}" + " " + 317 | f"{v['allocmem'][:6]:>6}") 318 | -------------------------------------------------------------------------------- /scripts/jobeff: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | jobeff 5 | 6 | View job efficiency information 7 | 8 | This script requires: 9 | - Python 3.10+ (for the zip() strict = True option) 10 | - Slurm (any recent version should work) 11 | - Slurm configured with: 12 | - JobAcctGatherType=jobacct_gather/linux or jobacct_gather/cgroup 13 | 14 | Notes: 15 | - Based on output from sacct 16 | - Default time period is past day 17 | - State filter is based on current job state 18 | """ 19 | 20 | from datetime import datetime, timedelta 21 | from statistics import quantiles 22 | import argparse 23 | import os 24 | import re 25 | import subprocess 26 | import sys 27 | 28 | # Set up arguments and options ########################################################### 29 | 30 | parser = argparse.ArgumentParser( 31 | prog = "jobeff", 32 | formatter_class = argparse.RawDescriptionHelpFormatter, 33 | description = "View job efficiency information", 34 | epilog = """\ 35 | examples: 36 | 37 | jobeff 123456 38 | jobeff 123456,123457,123567 39 | jobeff --me 40 | jobeff -u ttrojan 41 | jobeff -a ttrojan_123 42 | jobeff -n sim 43 | jobeff -w a01-01 44 | jobeff -p largemem 45 | jobeff -s 2025-10-01 46 | jobeff -s 2025-09-01 -e 2025-09-30 47 | jobeff -t completed,failed,timeout 48 | jobeff --me -lc -70 49 | jobeff --me -lm 50 50 | jobeff --me -z 51 | 52 | notes: 53 | 54 | to view all jobs in a job array, use the base array job ID 55 | to view more job details, run jobinfo 56 | 57 | support: 58 | 59 | https://github.com/uschpc/slurm-tools/issues""" 60 | ) 61 | parser.add_argument( 62 | "jobids", 63 | nargs = "?", 64 | help = "job IDs to query (comma-separated list)" 65 | ) 66 | parser.add_argument( 67 | "-a", 68 | "--account", 69 | help = "account to query for history (comma-separated list)" 70 | ) 71 | parser.add_argument( 72 | "-e", 73 | "--end", 74 | help = "end date to query for history (YYYY-MM-DD)" 75 | ) 76 | parser.add_argument( 77 | "-lc", 78 | "--levelcpu", 79 | type = int, 80 | help = "only report jobs below(-)/above(+) given CPU efficiency level" 81 | ) 82 | parser.add_argument( 83 | "-lm", 84 | "--levelmem", 85 | type = int, 86 | help = "only report jobs below(-)/above(+) given memory efficiency level" 87 | ) 88 | parser.add_argument( 89 | "--me", 90 | action = "store_true", 91 | help = "query own jobs" 92 | ) 93 | parser.add_argument( 94 | "-n", 95 | "--name", 96 | help = "job name (partial or full) to query for history (comma-separated list)" 97 | ) 98 | parser.add_argument( 99 | "-w", 100 | "--nodelist", 101 | help = "node list to query for history (range or comma-separated list)" 102 | ) 103 | parser.add_argument( 104 | "-p", 105 | "--partition", 106 | help = "partition to query for history (comma-separated list)" 107 | ) 108 | parser.add_argument( 109 | "-q", 110 | "--qos", 111 | help = "QOS to query for history (comma-separated list)" 112 | ) 113 | parser.add_argument( 114 | "-s", 115 | "--start", 116 | help = "start date to query for history (YYYY-MM-DD)" 117 | ) 118 | parser.add_argument( 119 | "-t", 120 | "--state", 121 | help = "current job state to query for history (comma-separated list)" 122 | ) 123 | parser.add_argument( 124 | "-z", 125 | "--summary", 126 | action = "store_true", 127 | help = "print five-number summaries instead of table" 128 | ) 129 | parser.add_argument( 130 | "-u", 131 | "--user", 132 | help = "user to query for history (comma-separated list)" 133 | ) 134 | parser.add_argument( 135 | "-V", 136 | "--version", 137 | action = "store_true", 138 | help = "print version" 139 | ) 140 | args = parser.parse_args() 141 | 142 | # Require either job IDs or filter options 143 | if not any(vars(args).values()): 144 | print("Error: no job IDs or filter options given", file = sys.stderr) 145 | print("See jobeff -h for usage", file = sys.stderr) 146 | sys.exit(1) 147 | 148 | if args.version: 149 | print("jobeff 1.0.0") 150 | sys.exit(0) 151 | 152 | # Define utility functions ############################################################### 153 | 154 | def parsetime(x): 155 | """ 156 | Parse time elements from string 157 | 158 | # Arguments 159 | - x (str): sacct value for time 160 | 161 | # Returns 162 | - tuple: days, hours, minutes, seconds 163 | 164 | # Notes 165 | - Slurm time format is dd-hh:mm:ss.sss 166 | - Value will at least include mm:ss 167 | """ 168 | m = re.match(r"(((?P\d+)-)?(?P\d\d):)?" 169 | + r"(?P\d\d):(?P\d\d(\.\d+)?)", x) 170 | if not m: 171 | return 0, 0, 0, 0.0 172 | dd = int(m.group("days") or "0") 173 | hh = int(m.group("hours") or "0") 174 | mm = int(m.group("minutes")) 175 | ss = float(m.group("seconds")) 176 | return dd, hh, mm, ss 177 | 178 | def toseconds(x): 179 | """ 180 | Convert time to seconds 181 | 182 | # Arguments 183 | - x (str): sacct value for time 184 | 185 | # Returns 186 | - float: time in seconds 187 | """ 188 | dd, hh, mm, ss = parsetime(x) 189 | return dd*24*60*60 + hh*60*60 + mm*60 + ss 190 | 191 | def tobytes(x): 192 | """ 193 | Convert string to bytes 194 | 195 | # Arguments 196 | - x (str): sacct value for size 197 | 198 | # Returns 199 | - float: -1.0 (if x is empty) 200 | - float: size in bytes 201 | """ 202 | # Some values may be empty if job is RUNNING 203 | # Return -1.0 for comparison purposes 204 | if not x: 205 | return -1.0 206 | units = {"K": 10, "M": 20, "G": 30, "T": 40, "P": 50, "E": 60} 207 | scale = 2**units.get(x[-1], 0) 208 | if scale != 1: 209 | x = x[:-1] 210 | return float(x) * scale 211 | 212 | # Define selector functions ############################################################## 213 | 214 | def firstv(x, y): 215 | """ 216 | Select first non-empty value in multi-line output 217 | 218 | # Arguments 219 | - x (str): first value to compare 220 | - y (str): second value to compare 221 | 222 | # Returns 223 | - str: first non-empty value 224 | """ 225 | return x == "" and y or x 226 | 227 | def allv(x, y): 228 | """ 229 | Select all values in multi-line output 230 | 231 | # Arguments 232 | - x (str): first value to compare 233 | - y (str): second value to compare 234 | 235 | # Returns 236 | - str: all values (separated by |) 237 | """ 238 | if y == "": 239 | return x 240 | return x + "|" + y 241 | 242 | def maxtime(x, y): 243 | """ 244 | Select max time value in multi-line output 245 | 246 | # Arguments 247 | - x (str): first value to compare 248 | - y (str): second value to compare 249 | 250 | # Returns 251 | - str: max time value 252 | """ 253 | if "UNLIMITED" in [x, y]: 254 | return "UNLIMITED" 255 | if x in ["", "INVALID"]: 256 | return y 257 | if y in ["", "INVALID"]: 258 | return x 259 | return max(x, y) 260 | 261 | def selectv(lines, selectors): 262 | """ 263 | Select sacct values from multi-line output 264 | 265 | # Arguments 266 | - lines (list): one list of parsed values per line of output 267 | - selectors (list): selector functions 268 | 269 | # Returns 270 | - list: selected sacct values 271 | """ 272 | # Extract first line of values 273 | vals = lines[0] 274 | # If more lines, run selector functions 275 | for line in lines[1:]: 276 | for i, selector in enumerate(selectors): 277 | vals[i] = selector(vals[i], line[i]) 278 | return vals 279 | 280 | # Define calculation functions ########################################################### 281 | 282 | def cpueff(x, y): 283 | """ 284 | Calculate CPU efficiency 285 | 286 | # Arguments 287 | - x (str): sacct value for totalcpu 288 | - y (str): sacct value for cputime 289 | 290 | # Returns 291 | - str: "--" (if x or y is 0) 292 | - str: formatted % 293 | """ 294 | usedcpu = toseconds(x) 295 | elapsedcpu = toseconds(y) 296 | # Total CPU time is 0 if job not COMPLETED 297 | # Elapsed CPU time is 0 if job not started 298 | if usedcpu == 0 or elapsedcpu == 0: 299 | return "--" 300 | return f"{usedcpu / elapsedcpu * 100:5.2f}%" 301 | 302 | def memeff(x, y): 303 | """ 304 | Calculate memory efficiency 305 | 306 | # Arguments 307 | - x (str): estimated value for max memory used 308 | - y (str): sacct value for allocated memory 309 | 310 | # Returns 311 | - str: "--" (if x or y is not available) 312 | - str: formatted % 313 | """ 314 | # Value set to -1.0 if job not started or is RUNNING and not user or root 315 | # Allocated mem set to ?? if not found 316 | if x == -1.0 or y == "??": 317 | return "--" 318 | allocmem = tobytes(y) 319 | return f"{x / allocmem * 100:5.2f}%" 320 | 321 | # Define functions to get and derive job info ############################################ 322 | 323 | def getsacctv(scmd, fields): 324 | """ 325 | Get sacct values and modify as needed 326 | 327 | # Arguments 328 | - scmd (str): sacct command stem 329 | - fields (list): sacct fields to query and associated selector functions 330 | 331 | # Returns 332 | - list: list of dicts (one dict per job) 333 | 334 | # Notes 335 | - Minimize sacct calls to reduce stress on Slurm 336 | - Assumes sacct -n -p options are used 337 | - Assumes saact jobid,state fields are present 338 | """ 339 | # Derive sacct command and run 340 | keys = [item[0] for item in fields] 341 | fmt = ",".join(keys) 342 | scmd = scmd + ["-o", fmt] 343 | proc = subprocess.run(scmd, env = {"SLURM_TIME_FORMAT":"%Y-%m-%d"}, capture_output = True, text = True) 344 | if not proc.stdout: 345 | print("No jobs found") 346 | sys.exit(0) 347 | # Job names may contain \n (causing line break) 348 | # So splitting by \n does not necessarily split output by line 349 | # First remove all \n and split by |, then group line values using number of fields 350 | out = re.sub(r"\n", "", proc.stdout)[:-1] 351 | out2 = out.split("|") 352 | n = len(fields) 353 | out3 = [out2[i:i + n] for i in range(0, len(out2), n)] 354 | # Get unique job IDs 355 | jobids = [] 356 | for line in out3: 357 | jobid = line[0].split(".")[0] 358 | jobids.append(jobid) 359 | ujobids = list(set(jobids)) 360 | # Create dict to store lines grouped by job ID 361 | d = {jobid: [] for jobid in ujobids} 362 | # Match line to job ID and append to corresponding list in dict 363 | for line in out3: 364 | jobid = line[0].split(".")[0] 365 | d[jobid].append(line) 366 | # Output returned from sacct may have multiple lines for each job 367 | # Jobs that are PENDING will have one line 368 | # Jobs that are CANCELLED or FAILED before starting will have one line 369 | # Otherwise, jobs that start will have multiple lines 370 | # Main job line, .batch or .interactive line, and .extern line 371 | # If srun is used, one line for each srun call 372 | # Select one value or combine multiple values depending on field 373 | selectors = [item[1] for item in fields] 374 | out4 = [] 375 | for jobout in d.values(): 376 | vals = selectv(jobout, selectors) 377 | v = dict(zip(keys, vals, strict = True)) 378 | # Replace state CANCELLED by ... with just CANCELLED 379 | if "CANCELLED" in v["state"]: 380 | v["state"] = "CANCELLED" 381 | out4.append(v) 382 | # Sort by job ID 383 | out5 = sorted(out4, key = lambda x: x["jobid"]) 384 | return out5 385 | 386 | def derivev(vals): 387 | """ 388 | Derive other values from sacct values 389 | 390 | # Arguments 391 | - vals (dict): key-value pairs for sacct fields 392 | 393 | # Returns 394 | - dict: key-value pairs for original and derived fields 395 | """ 396 | d = vals.copy() 397 | # alloctres is empty if resources have not been allocated 398 | # If job is PENDING or CANCELLED/FAILED before job started 399 | if d["alloctres"] == "": 400 | d["allocmem"] = "--" 401 | else: 402 | # Get allocated memory (may be in varying units) 403 | allocmem = re.search(r"mem=([0-9]+.[0-9]+[A-Z]|[0-9]+[A-Z])", d["alloctres"]) 404 | if allocmem: 405 | d["allocmem"] = allocmem.group(1) 406 | else: 407 | d["allocmem"] = "??" 408 | # Calculate CPU efficiency 409 | d["cpueff"] = cpueff(d["totalcpu"], d["cputime"]) 410 | # tresusageintot values may be empty 411 | # If job is PENDING or CANCELLED/FAILED before job started 412 | # If job is RUNNING but not user or root 413 | # Derive max memory used from tresusageintot 414 | allmem = re.findall(r",mem=([0-9]+.[0-9]+[A-Z]|[0-9]+[A-Z])", d["tresusageintot"]) 415 | if not allmem: 416 | d["maxmem"] = -1.0 417 | else: 418 | allmembytes = [tobytes(mem) for mem in allmem] 419 | d["maxmem"] = max(allmembytes) 420 | # Calculate memory efficiency 421 | d["memeff"] = memeff(d["maxmem"], d["allocmem"]) 422 | return d 423 | 424 | # Define main function to get job efficiency values ###################################### 425 | 426 | def getjobeff(scmd, fields): 427 | """ 428 | Get job efficiency values 429 | 430 | # Arguments 431 | - scmd (str): sacct command stem 432 | - fields (list): sacct fields to query and associated selector functions 433 | 434 | # Returns 435 | - list: list of dicts (one dict per job) 436 | """ 437 | sacctv = getsacctv(scmd, fields) 438 | eff = [] 439 | for jobout in sacctv: 440 | out = derivev(jobout) 441 | eff.append(out) 442 | return eff 443 | 444 | # Get job efficiency values ############################################################## 445 | 446 | # Specify sacct fields to query and associated selector functions 447 | fs = [["jobid", firstv], 448 | ["state", firstv], 449 | ["user", firstv], 450 | ["jobname", firstv], 451 | ["alloctres", firstv], 452 | ["cputime", firstv], 453 | ["totalcpu", firstv], 454 | ["tresusageintot", allv]] 455 | 456 | # Derive sacct command stem based on given arguments 457 | if args.jobids: 458 | cmd = ["sacct", "-n", "-p", "-j", args.jobids] 459 | else: 460 | cmd = ["sacct", "-n", "-p", "-a"] 461 | 462 | tday = datetime.today() 463 | yday = tday - timedelta(days = 1) 464 | tday = tday.strftime("%Y-%m-%dT%H:%M:%S") 465 | yday = yday.strftime("%Y-%m-%d") 466 | 467 | if args.start and args.end: 468 | cmd = cmd + ["-S", args.start, "-E", args.end] 469 | elif args.start and not args.end: 470 | cmd = cmd + ["-S", args.start, "-E", tday] 471 | elif not args.start and args.end: 472 | print("Error: no start date given", file = sys.stderr) 473 | sys.exit(1) 474 | else: 475 | cmd = cmd + ["-S", yday, "-E", tday] 476 | if args.account: 477 | cmd = cmd + ["-A", args.account] 478 | if args.me: 479 | cmd = cmd + ["-u", os.environ["USER"]] 480 | if args.nodelist: 481 | cmd = cmd + ["-N", args.nodelist] 482 | if args.partition: 483 | cmd = cmd + ["-r", args.partition] 484 | if args.qos: 485 | cmd = cmd + ["-q", args.qos] 486 | if args.user: 487 | cmd = cmd + ["-u", args.user] 488 | 489 | # Run main function to get job efficiency values 490 | res = getjobeff(cmd, fs) 491 | 492 | # Filter by job name using partial or full match 493 | if args.name: 494 | names = args.name.replace(",", "|") 495 | res = [job for job in res if re.search(names, job["jobname"])] 496 | 497 | # sacct filters job state over the given time period, which could be multiple states 498 | # Filter based on current job state instead 499 | if args.state: 500 | states = args.state.upper().replace(",", "|") 501 | res = [job for job in res if re.search(states, job["state"])] 502 | 503 | # Filter by efficiency levels 504 | if args.levelcpu: 505 | res = [job for job in res if job["cpueff"] != "--"] 506 | if args.levelcpu < 0: 507 | res = [job for job in res if float(job["cpueff"][:-1]) <= float(args.levelcpu * -1)] 508 | else: 509 | res = [job for job in res if float(job["cpueff"][:-1]) >= float(args.levelcpu)] 510 | if args.levelmem: 511 | res = [job for job in res if job["memeff"] != "--"] 512 | if args.levelmem < 0: 513 | res = [job for job in res if float(job["memeff"][:-1]) <= float(args.levelmem * -1)] 514 | else: 515 | res = [job for job in res if float(job["memeff"][:-1]) >= float(args.levelmem)] 516 | 517 | # Exit if no jobs found after filters 518 | if not res: 519 | print("No jobs found") 520 | sys.exit(0) 521 | 522 | # Print results ########################################################################## 523 | 524 | if not args.summary: 525 | # Print table of job efficiency values 526 | print("------------------------------------------------------------------") 527 | print("Job ID State CPU Efficiency Memory Efficiency") 528 | print("------------- ---------- -------------------- --------------------") 529 | for job in res: 530 | # Create percentage visuals using line of | up to length 10 531 | if job["cpueff"] == "--": 532 | vcpueff = "" 533 | else: 534 | n1 = round(float(job["cpueff"][:-1]) / 10) 535 | vcpueff = job["cpueff"] + " [" + f"{('|' * n1)[:10]:<10}" + "]" 536 | if job["memeff"] == "--": 537 | vmemeff = "" 538 | else: 539 | n2 = round(float(job["memeff"][:-1]) / 10) 540 | vmemeff = job["memeff"] + " [" + f"{('|' * n2)[:10]:<10}" + "]" 541 | print(f"{job['jobid'][:13]:<13}" + " " + 542 | f"{job['state'][:10]:>10}" + " " + 543 | f"{vcpueff[:20]:>20}" + " " + 544 | f"{vmemeff[:20]:>20}") 545 | else: 546 | # Print five-number summaries for job efficiency values 547 | # Exclude jobs without efficiency values 548 | res = [job for job in res if job["cpueff"] != "--"] 549 | res = [job for job in res if job["memeff"] != "--"] 550 | if not res or len(res) == 1: 551 | print("No jobs found to summarize") 552 | sys.exit(0) 553 | allcpueff = [float(job["cpueff"][:-1]) for job in res] 554 | allmemeff = [float(job["memeff"][:-1]) for job in res] 555 | qcpu = [round(q, 2) for q in quantiles(allcpueff, n = 4)] 556 | qmem = [round(q, 2) for q in quantiles(allmemeff, n = 4)] 557 | print("----------------------------------------------------") 558 | print("Five-number summaries for job efficiencies ") 559 | print("----------------------------------------------------") 560 | print("Number of jobs: " + str(len(res))) 561 | print("") 562 | print(" Min Q1 Med Q3 Max") 563 | print("CPU: " + " " + 564 | f"{min(allcpueff):>6}" + " " + 565 | f"{qcpu[0]:>6}" + " " + 566 | f"{qcpu[1]:>6}" + " " + 567 | f"{qcpu[2]:>6}" + " " + 568 | f"{max(allcpueff):>6}") 569 | print("Memory:" + " " + 570 | f"{min(allmemeff):>6}" + " " + 571 | f"{qmem[0]:>6}" + " " + 572 | f"{qmem[1]:>6}" + " " + 573 | f"{qmem[2]:>6}" + " " + 574 | f"{max(allmemeff):>6}") 575 | -------------------------------------------------------------------------------- /scripts/noderes: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | noderes 5 | 6 | View available resources on nodes 7 | 8 | This script requires: 9 | - Python 3.7+ (for the subprocess.run options) 10 | - Slurm 21.08+ (for parsing node state from the scontrol show node output) 11 | - Slurm configured with: 12 | - GresTypes=gpu 13 | 14 | Notes: 15 | - Based on output from scontrol 16 | - Resources include node's CfgTRES (CPUs, memory, and GPUs) 17 | """ 18 | 19 | from itertools import groupby 20 | import argparse 21 | import re 22 | import subprocess 23 | import sys 24 | 25 | # Set up arguments and options ########################################################### 26 | 27 | parser = argparse.ArgumentParser( 28 | prog = "noderes", 29 | formatter_class = argparse.RawDescriptionHelpFormatter, 30 | description = "View available resources on nodes", 31 | epilog = """\ 32 | examples: 33 | 34 | noderes 35 | noderes -a 36 | noderes -c 37 | noderes -p gpu 38 | noderes -p main,oneweek 39 | noderes -s idle 40 | noderes -s idle,mixed 41 | noderes -p gpu -s idle 42 | noderes -f 43 | noderes -f -p gpu 44 | noderes -g 45 | noderes -f -g 46 | noderes -f -g -p gpu 47 | noderes -m epyc-7513 48 | noderes -m a100,a40 49 | noderes -fl 50 | noderes -ft ndr200 51 | noderes -z 52 | noderes -z -g 53 | 54 | support: 55 | 56 | https://github.com/uschpc/slurm-tools/issues""" 57 | ) 58 | parser.add_argument( 59 | "-a", 60 | "--allocated", 61 | action = "store_true", 62 | help = "print allocated resources" 63 | ) 64 | parser.add_argument( 65 | "-c", 66 | "--configured", 67 | action = "store_true", 68 | help = "print configured resources" 69 | ) 70 | parser.add_argument( 71 | "-ft", 72 | "--feature", 73 | help = "filter by node feature (comma-separated list)" 74 | ) 75 | parser.add_argument( 76 | "-fl", 77 | "--feature-list", 78 | action = "store_true", 79 | help = "list active node features" 80 | ) 81 | parser.add_argument( 82 | "-f", 83 | "--free", 84 | action = "store_true", 85 | help = "only include nodes with free resources" 86 | ) 87 | parser.add_argument( 88 | "-g", 89 | "--gpus", 90 | action = "store_true", 91 | help = "only include nodes with GPUs" 92 | ) 93 | parser.add_argument( 94 | "-m", 95 | "--model", 96 | help = "filter by CPU or GPU model (comma-separated list)" 97 | ) 98 | parser.add_argument( 99 | "-p", 100 | "--partition", 101 | help = "filter by partition (comma-separated list)" 102 | ) 103 | parser.add_argument( 104 | "-s", 105 | "--state", 106 | help = "filter by node state (comma-separated list)" 107 | ) 108 | parser.add_argument( 109 | "-z", 110 | "--summary", 111 | action = "store_true", 112 | help = "summarize free resources by partition" 113 | ) 114 | parser.add_argument( 115 | "-V", 116 | "--version", 117 | action = "store_true", 118 | help = "print version" 119 | ) 120 | args = parser.parse_args() 121 | 122 | if args.version: 123 | print("noderes 1.2.0") 124 | sys.exit(0) 125 | 126 | # Define functions ####################################################################### 127 | 128 | def memtogib(mem): 129 | """ 130 | Convert memory value to GiB 131 | 132 | # Arguments 133 | - mem (str): memory value 134 | 135 | # Returns 136 | - int: memory value in GiB 137 | - str: "??" if unit not found 138 | """ 139 | m = float(mem[:-1]) 140 | if mem.endswith("T"): 141 | return int(m * 1024) 142 | if mem.endswith("G"): 143 | return int(m) 144 | if mem.endswith("M"): 145 | return int(m / 1024) 146 | if mem.endswith("K"): 147 | return int(m / 1024**2) 148 | if mem[-1].isdigit(): 149 | return int(mem / 1024**3) 150 | return "??" 151 | 152 | def getnoderes(node): 153 | """ 154 | Get information about node resources 155 | 156 | # Arguments 157 | - node (str): single line of node information 158 | from "scontrol --oneliner show node" output 159 | 160 | # Returns 161 | - dict: resource information for node 162 | """ 163 | # Initialize dict to store results 164 | d = { 165 | "nodename":"", 166 | "partition":"", 167 | "nodestate":"", 168 | "nodestateshort":"", 169 | "cpumodel":"", 170 | "gpumodel":"", 171 | "cfgtrescpu":0, 172 | "cfgtresmem":0, 173 | "cfgtresgpu":0, 174 | "alloctrescpu":0, 175 | "alloctresmem":0, 176 | "alloctresgpu":0, 177 | "freecpus":0, 178 | "freemem":0, 179 | "freegpus":0 180 | } 181 | 182 | # Get partition 183 | partition = re.search(r"Partitions=(\S+)", node) 184 | # Some nodes may not be assigned partitions yet 185 | if partition is None: 186 | d["partition"] = "none" 187 | else: 188 | d["partition"] = partition.group(1) 189 | 190 | # Get node name 191 | d["nodename"] = re.search(r"NodeName=(\S+)", node).group(1) 192 | 193 | # Get node state 194 | d["nodestate"] = re.search(r"State=(\S+)", node).group(1).lower() 195 | # Starting with Slurm 21.08, format is base_state+flag 196 | # For ease of printing, only use base_state with some exceptions 197 | d["nodestateshort"] = d["nodestate"].split("+", maxsplit = 1)[0] 198 | if re.search(r"drain", d["nodestate"]): 199 | # Special type of idle 200 | d["nodestateshort"] = "drain" 201 | if re.search(r"maintenance", d["nodestate"]): 202 | # Can be idle, mixed, or allocated 203 | d["nodestateshort"] = "maint" 204 | if re.search(r"reserved", d["nodestate"]): 205 | # Can be idle, mixed, or allocated 206 | d["nodestateshort"] = "reserved" 207 | 208 | # Get active node features 209 | features = re.search(r"ActiveFeatures=(\S+)", node) 210 | if not features: 211 | d["features"] = "" 212 | else: 213 | d["features"] = features.group(1) 214 | 215 | # Get CPU model 216 | if not d["features"]: 217 | d["cpumodel"] = "n/a" 218 | else: 219 | cpumodel = re.search(r"((epyc|xeon)[-_a-zA-Z0-9]+)", d["features"]) 220 | if not cpumodel: 221 | d["cpumodel"] = "n/a" 222 | else: 223 | d["cpumodel"] = cpumodel.group(1) 224 | 225 | # Get configured resources for node 226 | if re.search(r"future|unknown", d["nodestate"]): 227 | d["cfgtrescpu"] = 0 228 | d["cfgtresmem"] = 0 229 | d["cfgtresgpu"] = 0 230 | d["gpumodel"] = "n/a" 231 | else: 232 | cfgtres = re.search(r"CfgTRES=(\S+)", node).group(1) 233 | # Get CPUs 234 | cfgtrescpu = re.search(r"cpu=([0-9]+)", cfgtres) 235 | if not cfgtrescpu: 236 | d["cfgtrescpu"] = "??" 237 | else: 238 | d["cfgtrescpu"] = int(cfgtrescpu.group(1)) 239 | # Get memory (may be in varying units) 240 | cfgtresmem = re.search(r"mem=([0-9]+.[0-9]+[A-Z]|[0-9]+[A-Z])", cfgtres) 241 | if not cfgtresmem: 242 | d["cfgtresmem"] = "??" 243 | else: 244 | d["cfgtresmem"] = memtogib(cfgtresmem.group(1)) 245 | if d["cfgtresmem"] != "??": 246 | # Get memory reserved for Slurm daemons 247 | memspeclimit = re.search(r"MemSpecLimit=([0-9]+)", node) 248 | if not memspeclimit: 249 | memspeclimit = 0 250 | else: 251 | # Always configured in MiB, convert to GiB 252 | memspeclimit = int(float(memspeclimit.group(1)) / 1024) 253 | # Subtract reserved memory from cfgtresmem 254 | d["cfgtresmem"] = d["cfgtresmem"] - memspeclimit 255 | # Get GPUs 256 | cfgtresgpu = re.search(r"gres/gpu=([0-9]+)", cfgtres) 257 | if not cfgtresgpu: 258 | d["cfgtresgpu"] = 0 259 | else: 260 | d["cfgtresgpu"] = int(cfgtresgpu.group(1)) 261 | # Get GPU model 262 | gpumodel = re.search(r"gres/gpu:([a-zA-Z0-9]+)", cfgtres) 263 | if d["cfgtresgpu"] == 0: 264 | d["gpumodel"] = "--" 265 | elif not gpumodel: 266 | d["gpumodel"] = "n/a" 267 | else: 268 | d["gpumodel"] = gpumodel.group(1) 269 | 270 | # Get allocated resources for node and calculate free resources 271 | if re.search(r"down|error|future|unknown", d["nodestate"]): 272 | # Set to 0 in order to sum later for args.summary option 273 | d["alloctrescpu"] = 0 274 | d["alloctresmem"] = 0 275 | d["alloctresgpu"] = 0 276 | d["freecpus"] = 0 277 | d["freemem"] = 0 278 | d["freegpus"] = 0 279 | elif re.search(r"idle", d["nodestate"]): 280 | d["alloctrescpu"] = 0 281 | d["alloctresmem"] = 0 282 | d["alloctresgpu"] = 0 283 | d["freecpus"] = d["cfgtrescpu"] 284 | d["freemem"] = d["cfgtresmem"] 285 | d["freegpus"] = d["cfgtresgpu"] 286 | else: 287 | # For node in mixed or allocated state, get allocated values 288 | # A node is in allocated state only if all CPUs are allocated 289 | alloctres = re.search(r"AllocTRES=(\S+)", node).group(1) 290 | # Get CPUs 291 | alloctrescpu = re.search(r"cpu=([0-9]+)", alloctres) 292 | if not alloctrescpu: 293 | d["alloctrescpu"] = "??" 294 | else: 295 | d["alloctrescpu"] = int(alloctrescpu.group(1)) 296 | # Get memory (may be in varying units) 297 | alloctresmem = re.search(r"mem=([0-9]+.[0-9]+[A-Z]|[0-9]+[A-Z])", alloctres) 298 | if not alloctresmem: 299 | d["alloctresmem"] = "??" 300 | else: 301 | d["alloctresmem"] = memtogib(alloctresmem.group(1)) 302 | # Get GPUs 303 | alloctresgpu = re.search(r"gres/gpu=([0-9]+)", alloctres) 304 | if not alloctresgpu: 305 | d["alloctresgpu"] = 0 306 | else: 307 | d["alloctresgpu"] = int(alloctresgpu.group(1)) 308 | # Calculate free CPUs 309 | if d["cfgtrescpu"] == "??" or d["alloctrescpu"] == "??": 310 | d["freecpus"] = "??" 311 | else: 312 | d["freecpus"] = d["cfgtrescpu"] - d["alloctrescpu"] 313 | # Calculate free memory 314 | if d["cfgtresmem"] == "??" or d["alloctresmem"] == "??": 315 | d["freemem"] = "??" 316 | else: 317 | d["freemem"] = d["cfgtresmem"] - d["alloctresmem"] 318 | # Calculate free GPUs 319 | if d["cfgtresgpu"] == "??" or d["alloctresgpu"] == "??": 320 | d["freegpus"] = "??" 321 | else: 322 | d["freegpus"] = d["cfgtresgpu"] - d["alloctresgpu"] 323 | 324 | return d 325 | 326 | # Get node resource information ########################################################## 327 | 328 | # Get scontrol information for all nodes in cluster 329 | cmd = ["scontrol", "--oneliner", "show", "node"] 330 | proc = subprocess.run(cmd, capture_output = True, text = True) 331 | if not proc.stdout: 332 | print("No nodes found") 333 | sys.exit(0) 334 | nodelist = proc.stdout.split("\n")[:-1] 335 | 336 | # Print active node features 337 | if args.feature_list: 338 | feats = [re.findall(r"ActiveFeatures=(\S+)", node) for node in nodelist] 339 | if not feats: 340 | print("No node features found") 341 | else: 342 | # Get sorted list of unique active node features 343 | feats2 = [feat[0].split(",") for feat in feats] 344 | feats3 = [allfeat for node in feats2 for allfeat in node] 345 | ufeats = sorted(list(set(feats3))) 346 | print("------------------------------") 347 | print("Active node features") 348 | print("------------------------------") 349 | for ufeat in ufeats: 350 | print(ufeat) 351 | sys.exit(0) 352 | 353 | # Get resource information for each node 354 | reslist = [getnoderes(n) for n in nodelist] 355 | 356 | # Filter nodes by node feature 357 | if args.feature: 358 | nf = args.feature.split(",") 359 | reslist = [n for n in reslist if any(x in (n["features"]) for x in nf)] 360 | 361 | # Only include nodes with free resources 362 | if args.free: 363 | reslist = [n for n in reslist if n["freecpus"] > 0 and n["freemem"] > 0] 364 | 365 | # Only include nodes with GPUs 366 | if args.gpus: 367 | reslist = [n for n in reslist if n["gpumodel"] != "--"] 368 | # Only include nodes with accessible GPUs 369 | if args.free: 370 | reslist = [n for n in reslist if n["freegpus"] > 0] 371 | 372 | # Filter nodes by CPU or GPU model 373 | if args.model: 374 | fm = args.model.split(",") 375 | reslist = [n for n in reslist if any(x in (n["cpumodel"] + "," + n["gpumodel"]) for x in fm)] 376 | 377 | # Filter nodes by partition 378 | if args.partition: 379 | fp = args.partition.split(",") 380 | reslist = [n for n in reslist if any(x in n["partition"] for x in fp)] 381 | 382 | # Filter nodes by state 383 | if args.state: 384 | fs = args.state.split(",") 385 | reslist = [n for n in reslist if any(x in n["nodestateshort"] for x in fs)] 386 | 387 | # Print results ########################################################################## 388 | 389 | if not args.summary: 390 | if not args.configured and not args.allocated: 391 | # Print free resources for nodes (default) 392 | print("-------------------------------------------------------------------") 393 | print("Node Partition State CPU GPU Free Free Free") 394 | print(" Model Model CPUs Memory GPUs") 395 | print("------ ------------ --------- ----------- -------- ---- ------ ----") 396 | for n in reslist: 397 | # For unavailable nodes, replace free values with -- 398 | if re.search(r"down|error|future|unknown", n["nodestate"]): 399 | n["freecpus"] = "--" 400 | n["freemem"] = "--" 401 | n["freegpus"] = "--" 402 | # For nodes without GPUs, replace GPUs value 0 with -- 403 | if n["gpumodel"] == "--": 404 | n["freegpus"] = "--" 405 | print(f"{n['nodename'][:6]:<6}" + " " + 406 | f"{n['partition'][:12]:>12}" + " " + 407 | f"{n['nodestateshort'][:9]:>9}" + " " + 408 | f"{n['cpumodel'][:11]:>11}" + " " + 409 | f"{n['gpumodel'][:8]:>8}" + " " + 410 | f"{n['freecpus']:>4}" + " " + 411 | f"{n['freemem']:>5}" + "G " + 412 | f"{n['freegpus']:>4}") 413 | elif args.configured and not args.allocated: 414 | # Print configured resources for nodes 415 | print("-------------------------------------------------------------------") 416 | print("Node Partition State CPU GPU Cfg Cfg Cfg") 417 | print(" Model Model CPUs Memory GPUs") 418 | print("------ ------------ --------- ----------- -------- ---- ------ ----") 419 | for n in reslist: 420 | # For nodes in future or unknown state, replace configured values with -- 421 | if re.search(r"future|unknown", n["nodestate"]): 422 | n["cfgtrescpu"] = "--" 423 | n["cfgtresmem"] = "--" 424 | n["cfgtresgpu"] = "--" 425 | # For nodes without GPUs, replace GPUs value 0 with -- 426 | if n["gpumodel"] == "--": 427 | n["cfgtresgpu"] = "--" 428 | print(f"{n['nodename'][:6]:<6}" + " " + 429 | f"{n['partition'][:12]:>12}" + " " + 430 | f"{n['nodestateshort'][:9]:>9}" + " " + 431 | f"{n['cpumodel'][:11]:>11}" + " " + 432 | f"{n['gpumodel'][:8]:>8}" + " " + 433 | f"{n['cfgtrescpu']:>4}" + " " + 434 | f"{n['cfgtresmem']:>5}" + "G " + 435 | f"{n['cfgtresgpu']:>4}") 436 | else: 437 | # Print allocated resources for nodes 438 | print("-------------------------------------------------------------------") 439 | print("Node Partition State CPU GPU Alct Alct Alct") 440 | print(" Model Model CPUs Memory GPUs") 441 | print("------ ------------ --------- ----------- -------- ---- ------ ----") 442 | for n in reslist: 443 | # For nodes in future or unknown state, replace allocated values with -- 444 | if re.search(r"future|unknown", n["nodestate"]): 445 | n["alloctrescpu"] = "--" 446 | n["alloctresmem"] = "--" 447 | n["alloctresgpu"] = "--" 448 | # For nodes without GPUs, replace GPUs value 0 with -- 449 | if n["gpumodel"] == "--": 450 | n["alloctresgpu"] = "--" 451 | print(f"{n['nodename'][:6]:<6}" + " " + 452 | f"{n['partition'][:12]:>12}" + " " + 453 | f"{n['nodestateshort'][:9]:>9}" + " " + 454 | f"{n['cpumodel'][:11]:>11}" + " " + 455 | f"{n['gpumodel'][:8]:>8}" + " " + 456 | f"{n['alloctrescpu']:>4}" + " " + 457 | f"{n['alloctresmem']:>5}" + "G " + 458 | f"{n['alloctresgpu']:>4}") 459 | else: 460 | if not args.gpus: 461 | # Summarize information by partition 462 | print("-----------------------------") 463 | print("Partition Free Free Free") 464 | print(" CPUs Memory GPUs") 465 | print("------------ ---- ------ ----") 466 | # Create sublists by partition 467 | partlist = [] 468 | sortedreslist = sorted(reslist, key=lambda x: x["partition"]) 469 | for k, v in groupby(sortedreslist, key=lambda x: x["partition"]): 470 | partlist.append(list(v)) 471 | # Sum and print information by partition 472 | for p in partlist: 473 | s = { 474 | "partition":p[0]["partition"], 475 | "cfgtresgpu":0, 476 | "freecpus":0, 477 | "freemem":0, 478 | "freegpus":0 479 | } 480 | for n in p: 481 | s["cfgtresgpu"] += n["cfgtresgpu"] 482 | s["freecpus"] += n["freecpus"] 483 | s["freemem"] += n["freemem"] 484 | s["freegpus"] += n["freegpus"] 485 | # For partitions without GPUs replace 0 with -- 486 | if s["cfgtresgpu"] == 0: 487 | s["freegpus"] = "--" 488 | print(f"{s['partition'][:12]:<12}" + " " + 489 | f"{s['freecpus']:>4}" + " " + 490 | f"{s['freemem']:>5}" + "G " + 491 | f"{s['freegpus']:>4}") 492 | else: 493 | # Summarize information by partition and GPU model 494 | print("--------------------------------------") 495 | print("Partition GPU Free Free Free") 496 | print(" Model CPUs Memory GPUs") 497 | print("------------ -------- ---- ------ ----") 498 | # Create sublists by partition and GPU model 499 | partgpulist = [] 500 | sortedreslist = sorted(reslist, key=lambda x: (x["partition"], x["gpumodel"])) 501 | for k, v in groupby(sortedreslist, key=lambda x: (x["partition"], x["gpumodel"])): 502 | partgpulist.append(list(v)) 503 | # Sum and print information by partition and GPU model 504 | for p in partgpulist: 505 | s = { 506 | "partition":p[0]["partition"], 507 | "gpumodel":p[0]["gpumodel"], 508 | "freecpus":0, 509 | "freemem":0, 510 | "freegpus":0 511 | } 512 | for n in p: 513 | s["freecpus"] += n["freecpus"] 514 | s["freemem"] += n["freemem"] 515 | s["freegpus"] += n["freegpus"] 516 | print(f"{s['partition'][:12]:<12}" + " " + 517 | f"{s['gpumodel']:>8}" + " " + 518 | f"{s['freecpus']:>4}" + " " + 519 | f"{s['freemem']:>5}" + "G " + 520 | f"{s['freegpus']:>4}") 521 | -------------------------------------------------------------------------------- /scripts/jobinfo: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | jobinfo 5 | 6 | View detailed job information 7 | 8 | This script requires: 9 | - Python 3.10+ (for the zip() strict = True option) 10 | - Slurm 23.02+ (for the sacct field "planned") 11 | - Slurm configured with: 12 | - Job accounting (required): JobAcctGatherType=jobacct_gather/linux or jobacct_gather/cgroup 13 | - GPU gres (optional): GresTypes=gpu 14 | - GPU accounting (optional): AccountingStorageTRES=gres/gpu 15 | 16 | Notes: 17 | - Based on output from sacct, squeue, and sstat 18 | - Starting with Slurm 23.02, the sacct field "reserved" was renamed to "planned" 19 | """ 20 | 21 | import argparse 22 | import math 23 | import os 24 | import re 25 | import subprocess 26 | import sys 27 | 28 | # Set up arguments and options ########################################################### 29 | 30 | parser = argparse.ArgumentParser( 31 | prog = "jobinfo", 32 | formatter_class = argparse.RawDescriptionHelpFormatter, 33 | description = "View detailed job information", 34 | epilog = """\ 35 | examples: 36 | 37 | jobinfo 123456 38 | jobinfo $SLURM_JOB_ID 39 | 40 | notes: 41 | 42 | to find job IDs, run jobhist 43 | 44 | support: 45 | 46 | https://github.com/uschpc/slurm-tools/issues""" 47 | ) 48 | parser.add_argument( 49 | "jobid", 50 | nargs = "?", 51 | help = "job ID to query" 52 | ) 53 | parser.add_argument( 54 | "-V", 55 | "--version", 56 | action = "store_true", 57 | help = "print version" 58 | ) 59 | args = parser.parse_args() 60 | 61 | if args.version: 62 | print("jobinfo 2.3.0") 63 | sys.exit(0) 64 | 65 | if not args.jobid: 66 | print("Error: no job ID provided", file = sys.stderr) 67 | print("Usage: jobinfo [-h] [-V] [jobid]", file = sys.stderr) 68 | print("To find job IDs, run jobhist", file = sys.stderr) 69 | sys.exit(1) 70 | 71 | if re.search(r",", args.jobid): 72 | print("Error: only one job ID is accepted", file = sys.stderr) 73 | print("Usage: jobinfo [-h] [-V] [jobid]", file = sys.stderr) 74 | sys.exit(1) 75 | 76 | # Define utility functions ############################################################### 77 | 78 | def parsetime(x): 79 | """ 80 | Parse time elements from string 81 | 82 | # Arguments 83 | - x (str): sacct value for time 84 | 85 | # Returns 86 | - tuple: days, hours, minutes, seconds 87 | 88 | # Notes 89 | - Slurm time format is dd-hh:mm:ss.sss 90 | - Value will at least include mm:ss 91 | """ 92 | m = re.match(r"(((?P\d+)-)?(?P\d\d):)?" 93 | + r"(?P\d\d):(?P\d\d(\.\d+)?)", x) 94 | if not m: 95 | return 0, 0, 0, 0.0 96 | dd = int(m.group("days") or "0") 97 | hh = int(m.group("hours") or "0") 98 | mm = int(m.group("minutes")) 99 | ss = float(m.group("seconds")) 100 | return dd, hh, mm, ss 101 | 102 | def toseconds(x): 103 | """ 104 | Convert time to seconds 105 | 106 | # Arguments 107 | - x (str): sacct value for time 108 | 109 | # Returns 110 | - float: time in seconds 111 | """ 112 | dd, hh, mm, ss = parsetime(x) 113 | return dd*24*60*60 + hh*60*60 + mm*60 + ss 114 | 115 | def tobytes(x): 116 | """ 117 | Convert string to bytes 118 | 119 | # Arguments 120 | - x (str): sacct or sstat value for size 121 | 122 | # Returns 123 | - float: -1.0 (if x is empty) 124 | - float: size in bytes 125 | """ 126 | # Some values may be empty if job is RUNNING 127 | # Return -1.0 for comparison purposes 128 | if not x: 129 | return -1.0 130 | units = {"K": 10, "M": 20, "G": 30, "T": 40, "P": 50, "E": 60} 131 | scale = 2**units.get(x[-1], 0) 132 | if scale != 1: 133 | x = x[:-1] 134 | return float(x) * scale 135 | 136 | # Define selector functions ############################################################## 137 | 138 | def firstv(x, y): 139 | """ 140 | Select first non-empty value in multi-line output 141 | 142 | # Arguments 143 | - x (str): first value to compare 144 | - y (str): second value to compare 145 | 146 | # Returns 147 | - str: first non-empty value 148 | """ 149 | return x == "" and y or x 150 | 151 | def uniquev(x, y): 152 | """ 153 | Select all unique values in multi-line output 154 | 155 | # Arguments 156 | - x (str): first value to compare 157 | - y (str): second value to compare 158 | 159 | # Returns 160 | - str: unique values (separated by /) 161 | """ 162 | if y in x: 163 | return x 164 | return x + "/" + y 165 | 166 | def allv(x, y): 167 | """ 168 | Select all values in multi-line output 169 | 170 | # Arguments 171 | - x (str): first value to compare 172 | - y (str): second value to compare 173 | 174 | # Returns 175 | - str: all values (separated by |) 176 | """ 177 | if y == "": 178 | return x 179 | return x + "|" + y 180 | 181 | def maxint(x, y): 182 | """ 183 | Select max integer value in multi-line output 184 | 185 | # Arguments 186 | - x (str): first value to compare 187 | - y (str): second value to compare 188 | 189 | # Returns 190 | - str: max integer value 191 | """ 192 | if not x: 193 | xint = 0 194 | else: 195 | xint = int(x) 196 | if not y: 197 | yint = 0 198 | else: 199 | yint = int(y) 200 | return str(max(xint, yint)) 201 | 202 | def maxtime(x, y): 203 | """ 204 | Select max time value in multi-line output 205 | 206 | # Arguments 207 | - x (str): first value to compare 208 | - y (str): second value to compare 209 | 210 | # Returns 211 | - str: max time value 212 | """ 213 | if "UNLIMITED" in [x, y]: 214 | return "UNLIMITED" 215 | if x in ["", "INVALID"]: 216 | return y 217 | if y in ["", "INVALID"]: 218 | return x 219 | return max(x, y) 220 | 221 | def selectv(lines, selectors): 222 | """ 223 | Select sacct or sstat values from multi-line output 224 | 225 | # Arguments 226 | - lines (list): one list of parsed values per line of output 227 | - selectors (list): selector functions 228 | 229 | # Returns 230 | - list: selected sacct or sstat values 231 | """ 232 | # Extract first line of values 233 | vals = lines[0] 234 | # If more lines, run selector functions 235 | for line in lines[1:]: 236 | for i, selector in enumerate(selectors): 237 | vals[i] = selector(vals[i], line[i]) 238 | return vals 239 | 240 | # Define formatting functions ############################################################ 241 | 242 | def fmtbytes(x): 243 | """ 244 | Format bytes 245 | 246 | # Arguments 247 | - x (float): size in bytes 248 | 249 | # Returns 250 | - str: formatted size with units 251 | """ 252 | units = " KMGTPE" 253 | e = int(math.log(x + 1, 2) / 10) 254 | return f"{x / 2**(e * 10):.2f}" + units[e].strip() 255 | 256 | def fmttasks(x): 257 | """ 258 | Format tasks 259 | 260 | # Arguments 261 | - x (str): sacct value for ntasks 262 | 263 | # Returns 264 | - str: "--" (if x is empty) 265 | - str: x 266 | """ 267 | # Value is empty if job is PENDING 268 | if not x: 269 | return "--" 270 | return x 271 | 272 | def fmttime(x): 273 | """ 274 | Format time 275 | 276 | # Arguments 277 | - x (str): sacct value for time 278 | 279 | # Returns 280 | - str: "--" (if 0) 281 | - str: time formatted as (dd-)hh:mm:ss(.sss) 282 | """ 283 | # Value may be 0 if job is not COMPLETED 284 | if x == "00:00:00": 285 | return "--" 286 | dd, hh, mm, ss = parsetime(x) 287 | if dd > 0: 288 | dd = f"{dd}-" 289 | else: 290 | dd = "" 291 | if str(ss).endswith(".0"): 292 | return f"{dd}{hh:02d}:{mm:02d}:{int(ss):02d}" 293 | return f"{dd}{hh:02d}:{mm:02d}:{ss:06.3f}" 294 | 295 | def fmtdatetime(x): 296 | """ 297 | Format datetime 298 | 299 | # Arguments 300 | - x (str): sacct value for datetime 301 | 302 | # Returns 303 | - str: "--" (if Unknown) 304 | - str: x 305 | """ 306 | # Value may be Unknown if job is PENDING or RUNNING 307 | if x == "Unknown": 308 | return "--" 309 | return x 310 | 311 | def fmtmaxbytes(x): 312 | """ 313 | Format estimated max bytes 314 | 315 | # Arguments 316 | - x (float): estimated max bytes 317 | 318 | # Returns 319 | - str: "--" (if value is not available) 320 | - str: formatted bytes 321 | """ 322 | # Value set to -1.0 if job not started or is RUNNING and not user or root or not found 323 | if x == -1.0: 324 | return "--" 325 | return f"{fmtbytes(x)} (estimate)" 326 | 327 | # Define calculation functions ########################################################### 328 | 329 | def cpueff(x, y): 330 | """ 331 | Calculate CPU efficiency 332 | 333 | # Arguments 334 | - x (str): sacct value for totalcpu 335 | - y (str): sacct value for cputime 336 | 337 | # Returns 338 | - str: "--" (if x or y is 0) 339 | - str: formatted % 340 | """ 341 | usedcpu = toseconds(x) 342 | elapsedcpu = toseconds(y) 343 | # Total CPU time is 0 if job not COMPLETED 344 | # Elapsed CPU time is 0 if job not started 345 | if usedcpu == 0 or elapsedcpu == 0: 346 | return "--" 347 | return f"{usedcpu / elapsedcpu * 100:5.2f}%" 348 | 349 | def cpupct(x, y): 350 | """ 351 | Calculate % of total CPU used 352 | 353 | # Arguments 354 | - x (str): sacct value for usercpu or systemcpu 355 | - y (str): sacct value for totalcpu 356 | 357 | # Returns 358 | - str: "--" (if y is 0) 359 | - str: formatted % 360 | """ 361 | usedcpu = toseconds(y) 362 | # Total CPU time is 0 if job not COMPLETED 363 | if usedcpu == 0: 364 | return "--" 365 | usedsub = toseconds(x) 366 | return f"{usedsub / usedcpu * 100:5.2f}%" 367 | 368 | def memeff(x, y): 369 | """ 370 | Calculate memory efficiency 371 | 372 | # Arguments 373 | - x (str): estimated value for max memory used 374 | - y (str): sacct value for allocated memory 375 | 376 | # Returns 377 | - str: "--" (if x or y is not available) 378 | - str: formatted % 379 | """ 380 | # Max mem set to -1.0 if job not started or is RUNNING and not user or root 381 | # Allocated mem set to ?? if not found 382 | if x == -1.0 or y == "??": 383 | return "--" 384 | allocmem = tobytes(y) 385 | return f"{x / allocmem * 100:5.2f}%" 386 | 387 | def timeeff(x, y): 388 | """ 389 | Calculate time efficiency 390 | 391 | # Arguments 392 | - x (str): sacct value for timelimit 393 | - y (str): sacct value for elapsed 394 | 395 | # Returns 396 | - str: "--" (if y is 0) 397 | - str: formatted % 398 | """ 399 | timelimit = toseconds(x) 400 | elapsed = toseconds(y) 401 | # Elapsed time is 0 if job not started 402 | if elapsed == 0: 403 | return "--" 404 | return f"{elapsed / timelimit * 100:5.2f}%" 405 | 406 | # Define functions to get and derive job info ############################################ 407 | 408 | def getsacctv(jobid, fields): 409 | """ 410 | Get sacct values and modify as needed 411 | 412 | # Arguments 413 | - jobid (str): job ID 414 | - fields (list): sacct fields to query and associated selector functions 415 | 416 | # Returns 417 | - dict: key-value pairs for sacct fields 418 | """ 419 | keys = [item[0] for item in fields] 420 | fmt = ",".join(keys) 421 | cmd = ["sacct", "-n", "-p", "-o", fmt, "-j", jobid] 422 | proc = subprocess.run(cmd, capture_output = True, text = True) 423 | if not proc.stdout: 424 | print("Error: no such job", file = sys.stderr) 425 | sys.exit(1) 426 | # Job names may contain \n (causing line break) 427 | # So splitting by \n does not necessarily split output by line 428 | # First remove all \n and split by |, then group line values using number of fields 429 | out = re.sub(r"\n", "", proc.stdout)[:-1] 430 | out2 = out.split("|") 431 | n = len(fields) 432 | out3 = [out2[i:i + n] for i in range(0, len(out2), n)] 433 | # Output returned from sacct may have multiple lines 434 | # Jobs that are PENDING will have one line 435 | # Jobs that are CANCELLED or FAILED before starting will have one line 436 | # Otherwise, jobs that start will have multiple lines 437 | # Main job line, .batch or .interactive line, and .extern line 438 | # If srun is used, one line for each srun call 439 | # Select one value or combine multiple values depending on field 440 | selectors = [item[1] for item in fields] 441 | vals = selectv(out3, selectors) 442 | # Store values in dict 443 | d = dict(zip(keys, vals, strict = True)) 444 | return d 445 | 446 | def getsqueuev(jobid): 447 | """ 448 | Get squeue values and modify as needed 449 | 450 | # Arguments 451 | - jobid (str): job ID 452 | 453 | # Returns 454 | - dict: key-value pairs for squeue fields 455 | 456 | # Notes 457 | - For PENDING job 458 | - For job pending reason and dependencies 459 | """ 460 | cmd = ["squeue", "-h", "-a", "-o", "%R|%E", "-j", jobid] 461 | proc = subprocess.run(cmd, capture_output = True, text = True) 462 | out = proc.stdout.strip().split("|") 463 | if out[1] == "(null)": 464 | out[1] = "" 465 | # Store values in dict 466 | keys = ["pdreason", "pddeps"] 467 | d = dict(zip(keys, out, strict = True)) 468 | return d 469 | 470 | def getsstatv(jobid): 471 | """ 472 | Get sstat values and modify as needed 473 | 474 | # Arguments 475 | - jobid (str): job ID 476 | 477 | # Returns 478 | - dict: key-value pairs for sstat fields 479 | 480 | # Notes 481 | - For RUNNING job if user or root 482 | - For deriving maxmem, maxdiskread, and maxdiskwrite 483 | """ 484 | fmt = "jobid,tresusageintot,tresusageouttot" 485 | cmd = ["sstat", "-n", "-P", "-a", "-o", fmt, "-j", jobid] 486 | proc = subprocess.run(cmd, capture_output = True, text = True) 487 | out = proc.stdout.split("\n")[:-1] 488 | # Output returned from sstat will have multiple lines 489 | # .extern line and .batch or .interactive line 490 | # If srun is used, one line for each srun call 491 | # Select one value or combine multiple values depending on field 492 | # Parse values from each line into list and store in list of lists 493 | lines = [] 494 | for line in out: 495 | spl = line.strip().split("|") 496 | lines.append(spl) 497 | # Apply selector functions to parsed output (and remove job ID) 498 | selectors = [firstv, allv, allv] 499 | vals = selectv(lines, selectors)[1:] 500 | # Store values in dict 501 | keys = ["tresusageintot", "tresusageouttot"] 502 | d = dict(zip(keys, vals, strict = True)) 503 | return d 504 | 505 | def derivev(vals): 506 | """ 507 | Derive other values from sacct values 508 | 509 | # Arguments 510 | - vals (dict): key-value pairs for sacct fields 511 | 512 | # Returns 513 | - dict: key-value pairs for original and derived fields 514 | """ 515 | d = vals.copy() 516 | # Add formatted values where needed 517 | d["ntasksfmt"] = fmttasks(d["ntasks"]) 518 | d["startfmt"] = fmtdatetime(d["start"]) 519 | d["endfmt"] = fmtdatetime(d["end"]) 520 | d["elapsedfmt"] = fmttime(d["elapsed"]) 521 | d["cputimefmt"] = fmttime(d["cputime"]) 522 | d["totalcpufmt"] = fmttime(d["totalcpu"]) 523 | # Format state depending on job state 524 | if d["state"] == "PENDING": 525 | squeuev = getsqueuev(d["jobid"]) 526 | d.update(squeuev) 527 | if not d["pddeps"]: 528 | d["statefmt"] = d["state"] + " " + d["pdreason"] 529 | else: 530 | d["statefmt"] = d["state"] + " " + d["pdreason"] + " (" + d["pddeps"] + ")" 531 | else: 532 | d["statefmt"] = d["state"] 533 | # Get requested nodes 534 | reqnodes = re.search(r"node=([0-9]+)", d["reqtres"]) 535 | if not reqnodes: 536 | d["reqnodes"] = "??" 537 | else: 538 | d["reqnodes"] = reqnodes.group(1) 539 | # Get requested CPUs 540 | reqcpus = re.search(r"cpu=([0-9]+)", d["reqtres"]) 541 | if not reqcpus: 542 | d["reqcpus"] = "??" 543 | else: 544 | d["reqcpus"] = reqcpus.group(1) 545 | # Get requested memory (may be in varying units) 546 | reqmem = re.search(r"mem=([0-9]+.[0-9]+[A-Z]|[0-9]+[A-Z])", d["reqtres"]) 547 | if not reqmem: 548 | d["reqmem"] = "??" 549 | else: 550 | d["reqmem"] = reqmem.group(1) 551 | # Get requested GPUs 552 | reqgpus = re.search(r"gres/gpu=([0-9]+)", d["reqtres"]) 553 | if not reqgpus: 554 | d["reqgpus"] = "0" 555 | d["reqgpumodels"] = "--" 556 | d["reqgpustr"] = d["reqgpus"] 557 | else: 558 | d["reqgpus"] = reqgpus.group(1) 559 | # Get requested GPU models (typically one but could be multiple) 560 | reqgpumodels = re.findall(r"gres/gpu:([a-zA-Z0-9]+)", d["reqtres"]) 561 | if not reqgpumodels: 562 | d["reqgpumodels"] = "--" 563 | d["reqgpustr"] = d["reqgpus"] 564 | else: 565 | d["reqgpumodels"] = ",".join(reqgpumodels) 566 | d["reqgpustr"] = d["reqgpus"] + " (" + d["reqgpumodels"] + ")" 567 | # alloctres is empty if resources have not been allocated 568 | # If job is PENDING or CANCELLED/FAILED before job started 569 | if not d["alloctres"]: 570 | d["allocnodes"] = "--" 571 | d["alloccpus"] = "--" 572 | d["allocmem"] = "--" 573 | d["allocgpus"] = "--" 574 | d["allocgpumodels"] = "--" 575 | d["allocgpustr"] = "--" 576 | else: 577 | # Get allocated nodes 578 | allocnodes = re.search(r"node=([0-9]+)", d["alloctres"]) 579 | if not allocnodes: 580 | d["allocnodes"] = "??" 581 | else: 582 | d["allocnodes"] = allocnodes.group(1) 583 | # Get allocated CPUs 584 | alloccpus = re.search(r"cpu=([0-9]+)", d["alloctres"]) 585 | if not alloccpus: 586 | d["alloccpus"] = "??" 587 | else: 588 | d["alloccpus"] = alloccpus.group(1) 589 | # Get allocated memory (may be in varying units) 590 | allocmem = re.search(r"mem=([0-9]+.[0-9]+[A-Z]|[0-9]+[A-Z])", d["alloctres"]) 591 | if not allocmem: 592 | d["allocmem"] = "??" 593 | else: 594 | d["allocmem"] = allocmem.group(1) 595 | # Get allocated GPUs 596 | allocgpus = re.search(r"gres/gpu=([0-9]+)", d["alloctres"]) 597 | if not allocgpus: 598 | d["allocgpus"] = "0" 599 | d["allocgpumodels"] = "--" 600 | d["allocgpustr"] = d["allocgpus"] 601 | else: 602 | d["allocgpus"] = allocgpus.group(1) 603 | # Get allocated GPU models (typically one but could be multiple) 604 | allocgpumodels = re.findall(r"gres/gpu:([a-zA-Z0-9]+)", d["alloctres"]) 605 | if not allocgpumodels: 606 | d["allocgpumodels"] = "--" 607 | d["allocgpustr"] = d["allocgpus"] 608 | else: 609 | d["allocgpumodels"] = ",".join(allocgpumodels) 610 | d["allocgpustr"] = d["allocgpus"] + " (" + d["allocgpumodels"] + ")" 611 | # Calculate CPU efficiency 612 | d["cpueff"] = cpueff(d["totalcpu"], d["cputime"]) 613 | # Calculate user CPU time pct 614 | d["usercpupct"] = cpupct(d["usercpu"], d["totalcpu"]) 615 | # Calculate system CPU time pct 616 | d["systemcpupct"] = cpupct(d["systemcpu"], d["totalcpu"]) 617 | # Derive maxmem, maxdiskread, and maxdiskwrite depending on job state 618 | # If job is RUNNING and user or root, use sstat values 619 | if "RUNNING" in d["state"] and (d["user"] == os.environ["USER"] or os.getuid() == 0): 620 | sstatv = getsstatv(d["jobid"]) 621 | d["tresusageintot"] = sstatv["tresusageintot"] 622 | d["tresusageouttot"] = sstatv["tresusageouttot"] 623 | # tresusage values may be empty 624 | # If job is PENDING or CANCELLED/FAILED before job started 625 | # If job is RUNNING but not user or root 626 | # Derive max memory used from tresusageintot 627 | allmem = re.findall(r",mem=([0-9]+.[0-9]+[A-Z]|[0-9]+[A-Z])", d["tresusageintot"]) 628 | if not allmem: 629 | d["maxmem"] = -1.0 630 | else: 631 | allmembytes = [tobytes(mem) for mem in allmem] 632 | d["maxmem"] = max(allmembytes) 633 | # Derive max disk read from tresusageintot (already in bytes) 634 | alldiskread = re.findall(r"disk=([0-9]+)", d["tresusageintot"]) 635 | if not alldiskread: 636 | d["maxdiskread"] = -1.0 637 | else: 638 | alldiskread2 = [int(dr) for dr in alldiskread] 639 | d["maxdiskread"] = max(alldiskread2) 640 | # Derive max disk write from tresusageouttot (already in bytes) 641 | alldiskwrite = re.findall(r"disk=([0-9]+)", d["tresusageouttot"]) 642 | if not alldiskwrite: 643 | d["maxdiskwrite"] = -1.0 644 | else: 645 | alldiskwrite2 = [int(dw) for dw in alldiskwrite] 646 | d["maxdiskwrite"] = max(alldiskwrite2) 647 | d["maxmemfmt"] = fmtmaxbytes(d["maxmem"]) 648 | d["maxdiskreadfmt"] = fmtmaxbytes(d["maxdiskread"]) 649 | d["maxdiskwritefmt"] = fmtmaxbytes(d["maxdiskwrite"]) 650 | # Calculate memory efficiency 651 | d["memeff"] = memeff(d["maxmem"], d["allocmem"]) 652 | # Calculate time efficiency if job not RUNNING 653 | if "RUNNING" in d["state"]: 654 | d["timeeff"] = "--" 655 | else: 656 | d["timeeff"] = timeeff(d["timelimit"], d["elapsed"]) 657 | return d 658 | 659 | # Define main function to get job info ################################################### 660 | 661 | def getjobinfo(jobid, fields): 662 | """ 663 | Get job info values 664 | 665 | # Arguments 666 | - jobid (str): job ID 667 | - fields (list): sacct fields to query and associated selector functions 668 | 669 | # Returns 670 | - dict: key-value pairs for job info 671 | """ 672 | vals = getsacctv(jobid, fields) 673 | vals2 = derivev(vals) 674 | return vals2 675 | 676 | # Get job info ########################################################################### 677 | 678 | # Specify sacct fields to query and associated selector functions 679 | fs = [["jobid", firstv], 680 | ["jobname", firstv], 681 | ["user", firstv], 682 | ["account", firstv], 683 | ["workdir", firstv], 684 | ["cluster", firstv], 685 | ["partition", firstv], 686 | ["nodelist", firstv], 687 | ["ntasks", maxint], 688 | ["reqtres", firstv], 689 | ["alloctres", firstv], 690 | ["state", uniquev], 691 | ["exitcode", firstv], 692 | ["submit", firstv], 693 | ["start", min], 694 | ["end", maxtime], 695 | ["planned", firstv], 696 | ["timelimit", maxtime], 697 | ["elapsed", firstv], 698 | ["cputime", firstv], 699 | ["totalcpu", firstv], 700 | ["usercpu", firstv], 701 | ["systemcpu", firstv], 702 | ["tresusageintot", allv], 703 | ["tresusageouttot", allv]] 704 | 705 | # Run main function to get job info 706 | v = getjobinfo(args.jobid, fs) 707 | 708 | # Print selected key-value pairs ######################################################### 709 | 710 | print("Job ID | " + v["jobid"]) 711 | print("Job Name | " + v["jobname"]) 712 | print("User | " + v["user"]) 713 | print("Account | " + v["account"]) 714 | print("Working directory | " + v["workdir"]) 715 | print("Cluster | " + v["cluster"]) 716 | print("Partition | " + v["partition"]) 717 | print("State | " + v["statefmt"]) 718 | print("Exit code | " + v["exitcode"]) 719 | if v["statefmt"] == "PENDING" or not v["alloctres"]: 720 | print("Nodes | " + v["reqnodes"]) 721 | print("Tasks | " + v["ntasksfmt"]) 722 | print("CPUs | " + v["reqcpus"]) 723 | print("Memory | " + v["reqmem"]) 724 | print("GPUs | " + v["reqgpustr"]) 725 | else: 726 | print("Nodes | " + v["allocnodes"]) 727 | print("Tasks | " + v["ntasksfmt"]) 728 | print("CPUs | " + v["alloccpus"]) 729 | print("Memory | " + v["allocmem"]) 730 | print("GPUs | " + v["allocgpustr"]) 731 | print("Nodelist | " + v["nodelist"]) 732 | print("Submit time | " + v["submit"]) 733 | print("Start time | " + v["startfmt"]) 734 | print("End time | " + v["endfmt"]) 735 | print("Wait time | " + v["planned"]) 736 | print("Reserved walltime | " + v["timelimit"]) 737 | print("Elapsed walltime | " + v["elapsedfmt"]) 738 | print("Time efficiency | " + v["timeeff"]) 739 | print("Elapsed CPU walltime | " + v["cputimefmt"]) 740 | print("Used CPU time | " + v["totalcpufmt"]) 741 | print("CPU efficiency | " + v["cpueff"]) 742 | print("User CPU time pct | " + v["usercpupct"]) 743 | print("System CPU time pct | " + v["systemcpupct"]) 744 | print("Max memory used | " + v["maxmemfmt"]) 745 | print("Memory efficiency | " + v["memeff"]) 746 | print("Max disk read | " + v["maxdiskreadfmt"]) 747 | print("Max disk write | " + v["maxdiskwritefmt"]) 748 | --------------------------------------------------------------------------------