├── srunning
│   ├── README.md
│   └── srunning
├── helpers.R
├── sacct_stats_queue_dist.R
├── README.md
└── sacct_stats.R
--------------------------------------------------------------------------------
/srunning/README.md:
--------------------------------------------------------------------------------
## srunning

srunning shows the currently running jobs in the selected partitions (by default
all accelerator partitions), together with the most CPU-consuming binary the
user is running on the job's first node.

### Usage
```
./srunning
```

The -p flag selects the partitions (as in squeue):

```
./srunning -p parallel
```
--------------------------------------------------------------------------------
/helpers.R:
--------------------------------------------------------------------------------
# Convert sacct-style byte values ("512K", "1.5G", ...) to plain bytes
convb <- function(x){
  ptn <- "(\\d*(\\.\\d+)*)(.*)"
  num <- as.numeric(sub(ptn, "\\1", x))
  unit <- sub(ptn, "\\3", x)
  unit[unit==""] <- "1"

  mult <- c("1"=1, "K"=1024, "M"=1024^2, "G"=1024^3)
  num * unname(mult[unit])
}

# Convert sacct elapsed-time strings (D-HH:MM:SS or HH:MM:SS) into seconds
getelapsed <- function(x){
  str <- unlist(strsplit(as.character(x),'[-:]'))
  secs <- 0
  if (length(str)==4) {
    secs <- as.numeric(str[1])*86400
    secs <- secs + as.numeric(str[2])*3600
    secs <- secs + as.numeric(str[3])*60
    secs <- secs + as.numeric(str[4])
  } else if (length(str)==3) {
    secs <- as.numeric(str[1])*3600
    secs <- secs + as.numeric(str[2])*60
    secs <- secs + as.numeric(str[3])
  }
  return(secs)
}

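# Illustrative usage (not part of the original script), showing the expected
# conversions for typical sacct-formatted values:
#   convb("512K")             # 524288 bytes
#   convb("1.5G")             # 1610612736 bytes
#   getelapsed("1-02:03:04")  # 93784 seconds (1 day, 2 h, 3 min, 4 s)
#   getelapsed("02:03:04")    # 7384 seconds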
--------------------------------------------------------------------------------
/sacct_stats_queue_dist.R:
--------------------------------------------------------------------------------
#
# Simple SLURM queue time statistics
# Olli-Pekka Lehto / CSC - IT Center for Science Ltd.
#
# ------
# Step 1:
# Create the accounting file by running:
# sacct --format Submit,Start,Partition,User -s BF,CA,CD,F,NF,PR,TO -P -a -S (startdate) -X > queue_stats
#
# where (startdate) is the start timestamp.
# Feel free to use other sacct parameters (such as -E) as well.
# -------
# Step 2:
# The script needs the data.table package.
# To install it, run install.packages("data.table")
# -------
# Step 3:
# Run this script and give the input file as a parameter. For example:
# Rscript --vanilla sacct_stats_queue_dist.R queue_stats
# After the run you should have queue_stats_out.csv in your directory.

require(data.table)

args <- commandArgs(trailingOnly = TRUE)
filename <- args[1]

# Load some helper functions
source('./helpers.R')

# Read the input file containing the raw data
dt=fread(filename,header=T,sep="|")

# Clean up the batch step lines (they have an empty User field)
dt <- dt[-which(dt$User == ""), ]

# Convert timestamps to POSIXct format
dt$Submit=as.POSIXct(strptime(dt$Submit,"%Y-%m-%dT%H:%M:%S"))
dt$Start=as.POSIXct(strptime(dt$Start,"%Y-%m-%dT%H:%M:%S"))

# Some helper columns to speed things up (I hope)
dt$Year=year(dt$Start)
dt$Month=month(dt$Start)

# Calculate the queuing time (in seconds) for each job
dt$QueueTime=as.numeric(difftime(dt$Start, dt$Submit, units="secs"))

# Fraction of jobs per partition that fall into each queue-time bin
out=dt[,list("<1min"=sum(QueueTime<60)/.N,
             "<15min"=sum(QueueTime>=60 & QueueTime<900)/.N,
             "<1h"=sum(QueueTime>=900 & QueueTime<3600)/.N,
             "<5h"=sum(QueueTime>=3600 & QueueTime<18000)/.N,
             "Longer"=sum(QueueTime>=18000)/.N),
       by=Partition]

write.csv(out,paste(filename,"_out.csv",sep=""),row.names=FALSE, na="")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# slurm-stats
**SLURM Stats** is a collection of scripts for gathering SLURM statistics.

Currently the scripts are
- **sacct_stats.R** which generates simple per-user and per-month statistics from sacct output
- **sacct_stats_queue_dist.R** which generates a CSV with a binned distribution of how long jobs in each partition have queued
- **helpers.R** helper functions to convert timestamps etc.

## sacct_stats.R ##
### Fetching data
To generate the data, use the following type of **sacct** command. You probably want to vary the start (and end) dates (-S and -E flags).

    sacct --format JobID,JobIDRaw,JobName,User,Group,Partition,MaxRSS,MaxPages,AveCPU,MaxDiskWrite,MaxDiskRead,MaxVMSize,NTasks,AllocCPUS,Submit,Start,Elapsed,End,State,ExitCode,ReqMem,Timelimit -s BF,CA,CD,F,NF,PR,TO -P -a -S 08/15 > sisu

The example contains some extra fields which the script does not process yet but which will likely be useful.

### Processing the data
- Ensure that you have the [data.table](https://github.com/Rdatatable/data.table) library installed in R
```
install.packages("data.table")
library(data.table)
```
- Run the command and give the input file as an argument
```
R --no-save --args "sisu" < sacct_stats.R
```
- After the script completes you should have CSVs containing per-user and per-month aggregations
```
sisu_stats_per_user.csv
sisu_stats_per_month.csv
```
- There are also some commented-out lines at the end of the script that generate other plots and statistics and can be used as a basis for playing around with the data interactively

### Interpreting the data

The resulting CSV files contain the following fields
- **User** name or date (Month/Year)
- **Count** Number of jobs for the user or during the time period

For the following statistics, the minimum, mean, maximum and standard deviation (min, mean, max, stddev) are calculated
- **AllocCPUS** Allocated CPUs
- **QueueTime** Time spent queued (in seconds)
- **Elapsed** Time spent running (in seconds)
- **TimelimitAccuracy** Ratio of actual runtime to the requested time limit (Elapsed/Timelimit)
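
For a quick look at the results you can read the per-user CSV back into R. The snippet below is illustrative only (it is not part of the scripts) and assumes the `sisu` input file from the example above:
```
library(data.table)
per_user <- fread("sisu_stats_per_user.csv")
# Users with the most jobs, with their mean runtime and queue time (in seconds)
head(per_user[order(-count), .(User, count, Elapsed_mean, QueueTime_mean)])
```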

## sacct_stats_queue_dist.R ##
### Fetching data
You can use the same sacct output file as above, but if you don't have it, the minimal set of fields is User, Partition, Submit and Start.

An example of the very minimum feasible sacct command:
```
sacct --format User,Partition,Submit,Start -s BF,CA,CD,F,NF,PR,TO -P -a -S 08/15 > sisu
```

### Processing the data
- Ensure that you have the [data.table](https://github.com/Rdatatable/data.table) library installed in R
```
install.packages("data.table")
library(data.table)
```
- Run the command and give the input file as an argument
```
R --no-save --args "sisu" < sacct_stats_queue_dist.R
```
- After the script completes you should have a CSV with the suffix _out containing the queue time distribution
```
sisu_out.csv
```
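
One quick way to inspect the result is to read the CSV back into R and plot it as stacked bars, one per partition. This snippet is illustrative only (not part of the scripts) and assumes the `sisu_out.csv` file from the example above:
```
library(data.table)
dist <- fread("sisu_out.csv")
# One row per partition; the remaining columns are fractions of jobs in each queue-time bin
barplot(t(as.matrix(dist[, -1, with=FALSE])),
        names.arg = dist$Partition,
        legend.text = names(dist)[-1],
        main = "Queue time distribution by partition")
```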
--------------------------------------------------------------------------------
/srunning/srunning:
--------------------------------------------------------------------------------
#!/usr/bin/python
import multiprocessing
import subprocess
import os
import sys

def get_job_details(job):
    """Return the squeue fields for a job plus the name of the binary the user is running on the job's first node."""
    slurmNodes=job[0]
    nodes=list(get_nodes(slurmNodes))
    slurmId=job[1]
    user=job[2]
    slurmName=job[3]
    slurmPartition=job[4]
    nNodes=job[5]
    nCores=job[6]
    slurmResources=job[7]
    # Ask top on the first node for the user's most CPU-consuming process
    getApplicationNameCommand = "ssh %s top -bn 1 -u %s |grep %s |head -1 |gawk '{print $12}'|tr '\n' ' '" % (nodes[0], user, user)
    pGetApplicationName = subprocess.Popen(getApplicationNameCommand, shell=True, stdout=subprocess.PIPE, stderr=DEVNULL)
    name = pGetApplicationName.stdout.read()
    return (slurmId, user, slurmName, name, slurmPartition, nNodes, nCores, slurmResources, slurmNodes)


def get_nodes(s):
    """ Converts a slurm node list into a generator yielding each node separately.

    >>> list(get_nodes('c[1-2,4]'))
    ['c1', 'c2', 'c4']

    """
    s=s.replace("["," ").replace("]","")
    nodeprefix=s[0]
    nodeids=s[1:]
    for x in nodeids.split(','):
        elem = x.split('-')
        if len(elem) == 1: # a single number
            yield nodeprefix + elem[0].strip()
        elif len(elem) == 2: # an inclusive range
            start, end = map(int, elem)
            for i in xrange(start, end+1):
                yield nodeprefix + str(i)
        else: # more than one hyphen
            raise ValueError('format error in %s' % x)

if __name__ == '__main__':
    total = len(sys.argv)
    # Get the arguments list
    cmdargs = " ".join(sys.argv[1:])
    # Add the default partitions if none were given
    if total == 1:
        cmdargs += "-p \"gpu,gpulong,gputest,mic\" "

    # Get the list of currently running jobs
    slurmJobList=[]
    squeueCommand = " squeue -h " + cmdargs + " -t \"RUNNING\" -S \"D\" -o \"%N %A %.10u %.50j %.9P %.3D %.4C %.8b \""
    p = subprocess.Popen(squeueCommand, shell=True, stdout=subprocess.PIPE)
    for line in p.stdout.readlines():
        slurmJobList.append(line.split())
    p.wait()

    # Nothing to report if no jobs are running in the selected partitions
    if len(slurmJobList) == 0:
        sys.exit(0)

    # Get the application names in parallel
    # devnull to hide ssh errors (no key)
    DEVNULL = open(os.devnull, 'wb')
    # Limit parallelism to some sane value
    nJobs = min(len(slurmJobList), 96)
    pool = multiprocessing.Pool(nJobs)
    slurmId, user, slurmName, name, slurmPartition, nNodes, nCores, slurmResources, slurmNodes = zip(*pool.map(get_job_details, slurmJobList))
    DEVNULL.close()

    # Print the results
    print "%10s %10s %40s %20s %10s %4s %5s %10s %s" % ("# slurmId ", "User ", "jobName ", "Application ", "Partition ", "nNodes ", "nCores ", "resources ", "nodes")
    for i in range(0, len(slurmId)):
        print "%10s %10s %40s %20s %10s %4s %5s %10s %s" % (slurmId[i], user[i], slurmName[i], name[i], slurmPartition[i], nNodes[i], nCores[i], slurmResources[i], slurmNodes[i])
--------------------------------------------------------------------------------
/sacct_stats.R:
--------------------------------------------------------------------------------
# We need the data.table package
# To install it, run
# install.packages("data.table")

require(data.table)

args <- commandArgs(trailingOnly = TRUE)
filename <- args[1]
print(filename)

# Load some helper functions
source('./helpers.R')

# Read the input file containing the raw data
dt=fread(filename,header=T,sep="|")

# Clean up the batch step lines (they have an empty User field)
dt <- dt[-which(dt$User == ""), ]

# Convert the elapsed times (run time and time limit) into seconds
dt$Timelimit=as.numeric(unlist(lapply(dt$Timelimit,getelapsed)))
dt$Elapsed=as.numeric(unlist(lapply(dt$Elapsed,getelapsed)))

# Convert timestamps to POSIXct format
dt$Submit=as.POSIXct(strptime(dt$Submit,"%Y-%m-%dT%H:%M:%S"))
dt$Start=as.POSIXct(strptime(dt$Start,"%Y-%m-%dT%H:%M:%S"))
dt$End=as.POSIXct(strptime(dt$End,"%Y-%m-%dT%H:%M:%S"))

# Some helper columns to speed things up (I hope)
dt$Year=year(dt$Start)
dt$Month=month(dt$Start)

# Calculate the queuing time (in seconds) for each job
dt$QueueTime=as.numeric(difftime(dt$Start, dt$Submit, units="secs"))

# Calculate the total time (core seconds) for each job
dt$TotalTime=dt$Elapsed * dt$AllocCPUS

# Calculate the accuracy of the time limit estimate for each job
dt$TimelimitAccuracy = dt$Elapsed / dt$Timelimit

# Convert the formatted byte values to plain bytes
dt$MaxVMSize=convb(dt$MaxVMSize)
dt$MaxRSS=convb(dt$MaxRSS)
dt$MaxDiskRead=convb(dt$MaxDiskRead)
dt$MaxDiskWrite=convb(dt$MaxDiskWrite)


# User-based statistics as CSV

setkey(dt,User)
dt2 <- dt[, list(count=.N, AllocCPUS_min=min(AllocCPUS), AllocCPUS_mean=mean(AllocCPUS), AllocCPUS_stddev=sd(AllocCPUS), AllocCPUS_max=max(AllocCPUS),
                 QueueTime_min=min(QueueTime), QueueTime_mean=mean(QueueTime), QueueTime_stddev=sd(QueueTime), QueueTime_max=max(QueueTime),
                 Elapsed_min=min(Elapsed), Elapsed_mean=mean(Elapsed), Elapsed_stddev=sd(Elapsed), Elapsed_max=max(Elapsed),
                 TimelimitAccuracy_min=min(TimelimitAccuracy), TimelimitAccuracy_mean=mean(TimelimitAccuracy), TimelimitAccuracy_stddev=sd(TimelimitAccuracy), TimelimitAccuracy_max=max(TimelimitAccuracy)
                 ), by=list(User)]
write.csv(dt2, file=paste(filename,"_stats_per_user.csv",sep=""))

setkey(dt,Month,Year)

# Monthly statistics as CSV

dt3 <- dt[, list(count=.N, AllocCPUS_min=min(AllocCPUS), AllocCPUS_mean=mean(AllocCPUS), AllocCPUS_stddev=sd(AllocCPUS), AllocCPUS_max=max(AllocCPUS),
                 QueueTime_min=min(QueueTime), QueueTime_mean=mean(QueueTime), QueueTime_stddev=sd(QueueTime), QueueTime_max=max(QueueTime),
                 Elapsed_min=min(Elapsed), Elapsed_mean=mean(Elapsed), Elapsed_stddev=sd(Elapsed), Elapsed_max=max(Elapsed),
                 TimelimitAccuracy_min=min(TimelimitAccuracy,na.rm=TRUE), TimelimitAccuracy_mean=mean(TimelimitAccuracy,na.rm=TRUE), TimelimitAccuracy_stddev=sd(TimelimitAccuracy,na.rm=TRUE), TimelimitAccuracy_max=max(TimelimitAccuracy,na.rm=TRUE)
                 ), by=list(Year,Month)]
write.csv(dt3, file=paste(filename,"_stats_per_month.csv",sep=""))

# Some random things that I've commented out for now

#save.image("Tables.RData")

# Plot runtime vs allocated CPUs
#plot(dt$Elapsed,dt$AllocCPUS)

# Cumulative core counts
#CoreCounts <- table(dt$AllocCPUS)
#CumulativeCoreCounts <- cumsum(CoreCounts)

# Cumulative total time (core seconds)
#TotalTimes <- table(dt$TotalTime)
#CumTotalTimes <- cumsum(TotalTimes)
#plot(as.numeric(names(CumTotalTimes)),CumTotalTimes)

# Plot job size distribution
#plot(as.numeric(names(CumulativeCoreCounts)),CumulativeCoreCounts,
#     main="Job size distribution",
#     xlab="Core count",
#     ylab="Job count")
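
# Illustrative extra aggregation (not in the original script): total core
# seconds consumed per user, using the TotalTime column computed above.
#TotalTimePerUser <- dt[, list(CoreSeconds=sum(TotalTime, na.rm=TRUE)), by=User]
#write.csv(TotalTimePerUser, file=paste(filename,"_core_seconds_per_user.csv",sep=""))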
--------------------------------------------------------------------------------