├── Example.gif
├── README.md
├── find-best-partition
└── submitCPU.sh

/Example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fasrc/best_slurm_partition/d6c7e6693090ee773a72ec81dd4730beb1265299/Example.gif
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## BestSlurmPartition

Find the Slurm partition with the minimum delay to start your job on the cluster.

- Check for the Slurm partition with the minimum delay (the idea is sketched below): `sh find-best-partition -f submit.sh -o check`
- [Optional] Find the Slurm partitions your groups may use and save the list in the `tmpwdir` directory: `sh find-best-partition -f submit.sh -o set`
- Help: `sh find-best-partition -o help`

You may omit the `sh` command if the file has execute permission.
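### How the check works (sketch)

Under the hood the tool relies on `sbatch --test-only`, which asks Slurm for an estimated start time without actually submitting the job. The snippet below is a minimal sketch of that idea for a single partition; the partition name and `submit.sh` are placeholders, and the parsing assumes the usual `sbatch: Job <id> to start at YYYY-MM-DDTHH:MM:SS ...` message, so treat it as an illustration rather than a replacement for the script.

```bash
#!/bin/bash
# Estimate the queue delay for one partition with a dry-run submission.
partition=shared            # placeholder: any partition you are allowed to use
out=$(sbatch --test-only -p "$partition" submit.sh 2>&1)

# Pull the YYYY-MM-DDTHH:MM:SS timestamp out of the test-only message.
start=$(echo "$out" | grep -o '[0-9-]\{10\}T[0-9:]\{8\}')
if [ -z "$start" ]; then
    echo "$out"             # no estimate available: show the sbatch message and stop
    exit 1
fi

# Waiting time = estimated start time minus the current time, in seconds.
delay=$(( $(date -d "${start/T/ }" +%s) - $(date +%s) ))
echo "Estimated wait on $partition: ${delay}s"
```

`find-best-partition -o check` repeats this test for every partition your groups are allowed to use and prints the results sorted by waiting time.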
### Sample output:

    --- Error using partition: fas_gpu
    --- Error using partition: bigmem
    --- Error using partition: gpu
    --- Error using partition: gpu_requeue
    --- Error using partition: test
    --- Check tmpwdir/error.log for error log

    --- Waiting time to run this job on SLURM partitions sorted by time (sec)
    --- 0: knl_centos7
    --- 0: olveczky
    --- 0: remotedesktop
    --- 0: serial_requeue
    --- 1: shared
    --- 2: olveczkygpu
    --- 360: general
    --- 310497045: unrestricted

    --- Find SLURM submission scripts inside tmpwdir/ folder

![alt text](Example.gif?raw=true "Example Run")
--------------------------------------------------------------------------------
/find-best-partition:
--------------------------------------------------------------------------------
#!/bin/bash

excludeGrp=rc_admin

# Make sure Slurm commands are available on this node.
if ! hash sinfo 2>/dev/null; then
    echo " --- Error: Slurm is not available on this node."
    exit 1
fi

while getopts o:f: option
do
    case "${option}" in
        o) optU=${OPTARG};;
        f) optF=${OPTARG};;
    esac
done

if [[ $optU = help ]]; then
    echo " --- Find and save the allowed Slurm partitions: sh find-best-partition -f submit.sh -o set"
    echo " --- Check for the best Slurm partition: sh find-best-partition -f submit.sh -o check"
    exit 0
fi

submissionScript=$optF

if [[ $optU = set ]]; then

    if [[ -z $optF ]]; then
        echo " --- Specify the Slurm submission script with the -f option."
        exit 1
    fi

    rm -rf tmpwdir
    mkdir -p tmpwdir

    # Dump Slurm partitions and their allowed groups to a file
    scontrol show partition | grep "PartitionName\|AllowGroups" > tmpwdir/slurmPartInfo.txt

    # Get the total number of Slurm partitions.
    numPar=$(scontrol show partition | grep "PartitionName" | wc -l)

    # Re-count the number of Slurm partitions from the dumped file
    nL=$(cat tmpwdir/slurmPartInfo.txt | wc -l)

    # Warn if there is a mismatch in the number of Slurm partitions
    if [ $((nL/2)) -ne $numPar ]; then
        echo " --- There is an error with the number of SLURM partitions."
        echo " --- Number of actual partitions: $numPar"
        echo " --- Number of partitions in the file: $((nL/2))"
    fi

    # Groups the current user belongs to
    grps=$(groups)
    nGrp=$(echo $grps | wc -w)
    rm -f tmpwdir/allowedParts.txt

    # Loop over SLURM partitions (two lines per partition in slurmPartInfo.txt)
    for i in $(seq 2 2 $nL); do
        sw=$(head -$i tmpwdir/slurmPartInfo.txt | tail -2)
        parName=$(echo $sw | cut -d'=' -f2 | cut -d' ' -f1)

        for j in $(seq 1 $nGrp); do
            grpN=$(echo $grps | cut -d' ' -f$j)
            if [ "$grpN" != "$excludeGrp" ]; then
                if [[ $sw = *"$grpN"* ]] || [[ $sw = *"AllowGroups=ALL"* ]]; then
                    #echo " --- Group: $grpN, Partition: $parName"
                    echo $parName >> tmpwdir/allowedParts.txt
                fi
            fi
        done
    done

    rm -f tmpwdir/slurmPartInfo.txt

fi


if [[ $optU = check ]]; then

    if [[ -z $optF ]]; then
        echo " --- Specify the Slurm submission script with the -f option."
        exit 1
    fi

    # Generate the allowed-partition list first if it does not exist yet
    if [ ! -e tmpwdir/allowedParts.txt ]; then
        echo " --- Running first: sh find-best-partition -f $submissionScript -o set"
        sh find-best-partition -f $submissionScript -o set
    fi

    # Find the line of the submission script that sets the partition name
    lineN=$(grep -n " -p " $submissionScript | cut -d':' -f1)

    if [ -z "$lineN" ]; then
        lineN=$(grep -n " --partition " $submissionScript | cut -d':' -f1)
        if [ -z "$lineN" ]; then
            echo " --- Error: Specify a default partition in the Slurm submission script."
            exit 1
        fi
    fi

    rm -f tmpwdir/result.txt
    rm -f tmpwdir/error.log

    echo " "

    errorSt=0

    # Loop over each allowed Slurm partition listed in the allowedParts.txt file
    for i in $(cat tmpwdir/allowedParts.txt); do
        # Replace the partition line with the current partition
        subSName=tmpwdir/slurm_$i.sh
        sed "${lineN}s/.*/#SBATCH -p $i/" $submissionScript > $subSName

        tmpF=${subSName/.sh}_tmp.txt

        timCurrent=$(date +%s)

        # Run sbatch with --test-only to get the estimated start time
        sbatch --test-only $subSName > $tmpF 2>&1

        if [[ $(cat $tmpF) = *"error"* ]] || [[ $(cat $tmpF) = *"failure"* ]]; then
            echo " --- Error using partition: $i"
            echo " " >> tmpwdir/error.log
            echo " --- Error using partition: $i" >> tmpwdir/error.log
            cat $tmpF >> tmpwdir/error.log
            errorSt=1
            continue
        fi

        # Parse the estimated start time and convert it to a waiting time in seconds
        swT=$(cat $tmpF | cut -d' ' -f7)
        swT=${swT/T/" "}
        timEp=$(date -d "$swT" +%s 2>/dev/null)
        timeDiff=$(($timEp-$timCurrent))

        if [[ $timeDiff -lt 0 ]]; then
            echo " --- Error using partition: $i"
            echo " " >> tmpwdir/error.log
            echo " --- Error using partition: $i" >> tmpwdir/error.log
            cat $tmpF >> tmpwdir/error.log
            echo "Current Time: $timCurrent, Partition Run Time: $timEp, Diff: $timeDiff" >> tmpwdir/error.log
            errorSt=1
            continue
        fi

        echo " --- $timeDiff: $i" >> tmpwdir/result.txt
    done

    if [ "$errorSt" = 1 ]; then
        echo " --- Check tmpwdir/error.log for error log"
    fi

    echo " "
    echo " --- Waiting time to run this job on SLURM partitions sorted by time (sec)"
    sort -k2 -n tmpwdir/result.txt

    # Remove temp files
    rm -f tmpwdir/slurm_*.txt tmpwdir/result.txt

    echo " "
    echo " --- Find SLURM submission scripts inside tmpwdir/ folder"
fi
--------------------------------------------------------------------------------
/submitCPU.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#SBATCH -J TensorFlow
#SBATCH -p serial_requeue   # partition (queue)
#SBATCH -N 1                # number of nodes
#SBATCH -n 1                # number of cores
#SBATCH --mem 8000          # memory pool for all cores
#SBATCH -t 0-06:00          # time (D-HH:MM)
#SBATCH --export=ALL
#SBATCH -o Job.%N.%j.out    # STDOUT
#SBATCH -e Job.%N.%j.err    # STDERR

module load Anaconda3/5.0.1-fasrc01

python -c "import datetime; print(\"Date and time is: \" + str(datetime.datetime.now()))"
--------------------------------------------------------------------------------