├── bash
│   ├── archive_dir.sh
│   ├── PBS_cluster_usage.sh
│   └── slurm_free_nodes.sh
├── LICENSE
├── python
│   ├── archive_subdirs.py
│   └── queue_cc.py
└── README.md

/bash/archive_dir.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# A simple script to archive directories

echo "Archiving $1"

tar -zvcf "$1.tar.gz" "$1"

echo "Done"

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Kwang Moo Yi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/bash/PBS_cluster_usage.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Run qstat on the gpu queue and collect the job IDs
output=$(qstat -f gpu | grep "Job Id" | awk '{print $3}')

# Initialize the total number of GPUs
total_gpus=0

# Read the output line by line and extract the number of GPUs
while read -r line; do
    # Store qstat output for the current job in a variable
    job_output=$(qstat -f "$line")

    # Extract the job state for the current job
    state=$(echo "$job_output" | grep "job_state" | awk '{print $3}')

    echo "Job $line is in state $state"

    # If the state is R or B, the job is running or has begun, so add its GPUs to the total
    if [ "$state" == "R" ] || [ "$state" == "B" ]; then
        gpus=$(echo "$job_output" | grep "Resource_List.ngpus" | awk '{print $3}')

        if [ "$state" == "B" ]; then
            # Get the number of active runs from the "array_state_count" field
            active_runs=$(echo "$job_output" | grep "array_state_count" | awk '{print $4}' | awk -F: '{print $2}')

            # Multiply the number of GPUs by the number of active runs
            gpus=$((gpus * active_runs))
        fi

        echo "Job $line is using $gpus GPUs"
        total_gpus=$((total_gpus + gpus))

    fi

    echo "Running total for GPUs: $total_gpus"
done <<< "$output"

# Print the total number of GPUs
echo "Total number of GPUs: $total_gpus"

--------------------------------------------------------------------------------
/bash/slurm_free_nodes.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Loop through each unique node that has GPUs
while IFS= read -r node; do
    # Fetch the detailed information for each node
    node_info=$(scontrol show node "$node")

    # Extract relevant information
    node_name=$(echo "$node_info" | grep -oP "(?<=NodeName=)\S+")
    total_cpus=$(echo "$node_info" | grep -oP "(?<=CPUTot=)\S+")
    alloc_cpus=$(echo "$node_info" | grep -oP "(?<=CPUAlloc=)\S+" || echo "0")
    total_memory=$(echo "$node_info" | grep -oP "(?<=RealMemory=)\S+")
    alloc_memory=$(echo "$node_info" | grep -oP "(?<=AllocMem=)\S+" || echo "0")

    # Calculate free memory and convert memory from MB to GB
    free_memory=$((total_memory - alloc_memory))
    free_memory_gb=$(echo "scale=2; $free_memory/1024" | bc)
    total_memory_gb=$(echo "scale=2; $total_memory/1024" | bc)

    total_gpus=$(echo "$node_info" | grep "Gres=" | grep -oP "gpu:[^:]+:\K[0-9]+")
    alloc_gpus=$(echo "$node_info" | grep "AllocTRES=" | grep -oP "gres/gpu=\K[0-9]+" || echo "0")
    gpu_type=$(echo "$node_info" | grep -oP "(?<=AvailableFeatures=)\S+")
    users=$(squeue --nodes="$node_name" --noheader --format="%u" | sort | uniq | paste -sd, -)

    # Calculate available resources
    free_cpus=$((total_cpus - alloc_cpus))
    free_gpus=$((total_gpus - alloc_gpus))

    # Print the information with memory in GB
    echo "$node_name available resources: $free_gpus/$total_gpus GPUs, $free_cpus/$total_cpus CPUs, ${free_memory_gb}GB/${total_memory_gb}GB memory, $gpu_type, users:$users"
done < <(sinfo -N -o "%N %G" --noheader | grep "gpu:" | awk '{print $1}' | sort -u)

--------------------------------------------------------------------------------
/python/archive_subdirs.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# archive_subdirs.py ---
#
# Filename: archive_subdirs.py
# Description:
# Author: Kwang Moo Yi
# Maintainer:
# Created: Mon Feb 26 18:51:27 2018 (-0800)
# Version:
# Package-Requires: ()
# URL:
# Doc URL:
# Keywords:
# Compatibility:
#
#

# Commentary:
#
#
#
#

# Change Log:
#
#
#

# Code:

import argparse
import os
import socket
import subprocess

# ----------------------------------------
# Global variables within this script
arg_lists = []
parser = argparse.ArgumentParser()


def add_argument_group(name):
    arg = parser.add_argument_group(name)
    arg_lists.append(arg)
    return arg


# ----------------------------------------
# Arguments for global settings
global_arg = add_argument_group("Global")


global_arg.add_argument(
    "--account", type=str,
    default="def-kyi",
    help="Slurm account to use. "
    "Please change this to your Compute Canada account.")

# ----------------------------------------
# Arguments for the job
job_arg = add_argument_group("Job")

job_arg.add_argument(
    "--archive_dir", type=str,
    default=None,
    help="Path to the parent directory whose subdirectories will each be "
    "archived into an individual tar.gz file.")
job_arg.add_argument(
    "--num_gpu", type=int,
    default=0,
    help="Number of GPUs to use. Set zero to not use the gpu node.")
job_arg.add_argument(
    "--num_cpu", type=str,
    default="auto",
    help="Number of CPU cores to use. Can be inferred from the GPU. "
    "Set 'auto' to do that.")
job_arg.add_argument(
    "--mem", type=str,
    default="auto",
    help="Amount of memory to use. See the Compute Canada wiki for details "
    "on large memory nodes. Typically, you don't want to go over 8G per "
    "CPU core.")
job_arg.add_argument(
    "--time_limit", type=str,
    default="0-12:00",
    help="Time limit on the jobs. If you can, 3 hours gives you the best "
    "turnaround.")


def get_config():
    config, unparsed = parser.parse_known_args()

    return config, unparsed


def print_usage():
    parser.print_usage()


def main(config):
    """Main Function"""

    # Get hostname to identify the cluster
    hostname = socket.gethostname()

    # Identify cluster
    if hostname.startswith("gra"):
        cluster = "graham"
    elif hostname.startswith("cedar") or hostname.startswith("cdr"):
        cluster = "cedar"
    else:
        raise ValueError("Unknown cluster {}".format(hostname))

    # # Get gpu usage statistics
    # num_gpu = config.num_gpu

    # For this operation we will consume a full node
    num_cpu = config.num_cpu
    if num_cpu.lower() == "auto":
        if cluster == "cedar":
            num_cpu = 32
        elif cluster == "graham":
            num_cpu = 32
    mem = config.mem
    if mem.lower() == "auto":
        if cluster == "cedar":
            mem = "128000M"
        elif cluster == "graham":
            mem = "128000M"

    # Set time limit
    time_limit = config.time_limit

    if config.archive_dir is None:
        print_usage()
        exit(1)

    # For each entry in the archive directory
    for _f in os.listdir(config.archive_dir):
        cur_dir = os.path.join(config.archive_dir, _f)
        # If not a dir, continue to the next one
        if not os.path.isdir(cur_dir):
            continue
        # If the output file already exists, then simply skip
        if os.path.exists(cur_dir + ".tar.gz"):
            continue
        # If it is a directory, now queue the archive job
        com = ["sbatch"]
        com += ["--cpus-per-task={}".format(num_cpu)]
        com += ["--mem={}".format(mem)]
        com += ["--time={}".format(time_limit)]
        com += ["--account={}".format(config.account)]
        com += ["--output={}".format(cur_dir + ".tar.gz.out")]
        com += ["--export=ALL"]
        com += ["./bash/archive_dir.sh"]
        com += ["{}".format(cur_dir)]
        slurm_res = subprocess.run(com, stdout=subprocess.PIPE)
        print(slurm_res.stdout.decode())
        # Check that the job was queued successfully
        if slurm_res.returncode != 0:
            raise RuntimeError("Slurm error!")


if __name__ == "__main__":

    # ----------------------------------------
    # Parse configuration
    config, unparsed = get_config()
    # If we have unparsed arguments, print usage and exit
    if len(unparsed) > 0:
        print_usage()
        exit(1)

    main(config)


#
# archive_subdirs.py ends here

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This repository has moved to our group's repository!
[https://github.com/vcg-uvic/compute-canada-goodies](https://github.com/vcg-uvic/compute-canada-goodies)

# compute-canada-goodies
Automation scripts for Compute Canada

This script simplifies queueing jobs on Compute Canada.
Simply run `./queue_cc.py` and it will queue the job scripts found in the `jobs/todo` folder.
Each job should be a shell script with a proper shebang.
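
For reference, a job script can be as small as the sketch below. The module names, the `$SCRATCH/my_project` path, and `train.py` are placeholders for illustration only; adapt them to your own project:
```
#!/bin/bash
# jobs/todo/example_job.sh -- a hypothetical job script to be queued by queue_cc.py
module load cuda cudnn python/3
cd "$SCRATCH/my_project"
python train.py --config configs/example.yaml
```
You would then queue it from the repository root, inside your virtual environment, with something like `./queue_cc.py --num_jobs 1 --num_gpu 1 --time_limit 03:00:00`. With the defaults, each script is queued with 1 GPU, a 3-hour limit, and 5 chained runs.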

Also try `./queue_cc.py --help`, or see the argument definitions in `python/queue_cc.py` for more details.

# Tutorial on how to set up Compute Canada for TensorFlow

This tutorial assumes that you have set up a Compute Canada account, either through your advisor as your sponsor, or through me. If you do not have one, simply create one by visiting the [CCDB website](https://ccdb.computecanada.ca). I strongly suggest asking your advisor to create the CCDB account and to act as your supervisor. My CCDB account name has either been given to you personally or is on the lecture slides.

## IMPORTANT NOTES BEFORE WE DIVE IN

Compute Canada is shared across researchers in Canada, and the login nodes are not meant to run **any computationally demanding** job. This includes **compiling** and launching **TensorBoard**. Do **NOT** do either on a login node. They may crash the login node, thus crashing the **ENTIRE SYSTEM**. This will get recorded and may cause you to get banned. So for these tasks, make sure that you

- Compile on compute nodes
- Run TensorBoard locally, after downloading the logs from the server.

## About the file system

Another thing to know is that the file systems on Compute Canada are networked, each with a dedicated purpose. `project` is for shared projects; you will **NOT** need it unless you are saving results for **shared projects**. `home` is where you store your personal files, but **NOT** your code. `scratch` is where you want to have everything: use `scratch/` to store your code and your results, and then copy only the **IMPORTANT** ones to home or project. `scratch` has the **LOWEST** access latency, so you really want to do everything there. For more details, visit [here](https://docs.computecanada.ca/wiki/Storage_and_file_management).

## Resources (pr-kmyi group only)
1. 187 core-years on the cedar-compute system
2. 8.0 RGU-years on the cedar-gpu system
3. 24.0 RGU-years on the narval-gpu system
4. 100 TB of /project storage on the narval-storage system
5. 40 TB of /nearline storage on the narval-storage system
6. 200 TB of /project storage on the cedar-storage system
7. 8.0 million /project inodes on the cedar-storage system

## The module system

One very convenient thing about the Compute Canada setup is that you can simply load modules on demand. These modules could be `CUDA`, `CUDNN`, `GCC`, `OpenCV`, or whatever else you need, and you can also specify their versions. To see which modules are currently loaded, do
```
$ module list
```
To load the modules we need for our case, simply do the following:
```
$ module load cudnn cuda eigen python/3
```
In my case, I also unload some of the modules that I won't be using, so I have the following lines in my `.bashrc` -- **NOTE THAT THIS IS STRONGLY DISCOURAGED. IT WILL CAUSE ISSUES.**
```
module unload icc gcccore ifort intel imkl openmpi
module load gcc/5.4.0 cuda cudnn eigen python/3
module unload icc gcccore ifort intel imkl openmpi jasper java
```

## Installing TensorFlow
One very important thing about Compute Canada is that some libraries, including TensorFlow, do not come from the standard `pip` index; they come from a custom `wheelhouse`. So you need to **specifically use their virtualenv**. Please carefully follow the instructions at [their wiki page](https://docs.computecanada.ca/wiki/Tensorflow). If you are using `virtualenvwrapper`, you **must** point it to the correct `virtualenv` binary, or it will not work. I suggest you stick to the Compute Canada instructions to keep your life simple.
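
For orientation only, the wiki's steps look roughly like the sketch below at the time of writing. The module name (`python/3`), the package name (`tensorflow_gpu`), and the `~/tf_env` path are assumptions that may have changed, so treat the wiki as the authority:
```
# Build a virtualenv on top of a Compute Canada Python module, then install
# TensorFlow from their wheelhouse (--no-index keeps pip off PyPI).
module load python/3
virtualenv --no-download ~/tf_env
source ~/tf_env/bin/activate
pip install --no-index --upgrade pip
pip install --no-index tensorflow_gpu
```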

## Using salloc
The following command will give you access to one of the interactive nodes, where you can debug your code. Notice that I set the account to my own account, `def-kyi`; you probably want to replace that with your advisor's.
```
salloc --time=03:00:00 --gres=gpu:1 --cpus-per-task=6 --account=def-kyi --mem=31000M
```
Note the 3-hour limit, 1 GPU, 6 cores, and roughly 32G of memory. This is for Cedar, where each GPU node has 4 GPUs, 24 cores, and 128G of memory: you want a *virtual* node with 1 GPU, so we divide the resources equally by four. For Graham, the settings will be a bit different. See [here for Cedar](https://docs.computecanada.ca/wiki/Cedar) and [here for Graham](https://docs.computecanada.ca/wiki/Graham).

Use these interactive nodes for debugging, and launch your batch jobs only once you are sure everything works. The current `queue_cc.py` script expects your jobs to be located in `jobs/todo/`, so you can simply emulate what will happen with your batch job by running
```
./jobs/todo/<your_script>.sh
```

## Launching scripts
Once you are sure that your script runs, run `queue_cc.py` **IN YOUR VIRTUAL ENVIRONMENT** to queue your jobs as batch jobs. Note that your current `shell` environment will get carried over to your batch jobs, so you want to be in the virtual environment that you want your jobs to run in.

## Monitoring the job queue
Once your jobs are in the queue, you can type
```
squeue -u $USER
```
to see how they are doing. **BE PREPARED TO WAIT**. Your jobs may sit in the queue for days before they run; if a job has been waiting for more than 5 days, contact me. The scheduling policy favours 3-hour jobs, as they can fit in between longer jobs. Since we will be using TensorFlow, saving and restoring is trivial: make your jobs save roughly every 15 minutes, so that your job **dies** at the three-hour time limit and restarts through the dependency chain -- `queue_cc.py` will set that up for you. To make this even more efficient, whenever you save your model, check (roughly) how much time you have left and quit your training when the next save interval will not fit within the 3-hour limit. This way, you maximize your turnaround on the cluster. [Here's more info on the job scheduling policy](https://docs.computecanada.ca/wiki/Job_scheduling_policies)

### Switching back and forth between Cedar and Graham

Cedar and Graham do not share queues; they are independent. In fact, you can set things up so that the two clusters are configured identically and bounce back and forth between them. Job scheduling is based on priority, which is computed from your usage over the latest two-week window, so if you have not been running on one cluster, your priority there is probably quite good and it is worth switching. However, this is per CCDB account, so you might want to check who else is running on your advisor's account by doing
```
squeue -A def-kyi_gpu
```
Here, `def-kyi_gpu` is my CCDB account for the GPU machines. Your advisor's account would be different.
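
If you also want a rough sense of how much the whole account has been charged recently (which is what drives its priority), Slurm's `sshare` can report the account's fair-share usage. A minimal sketch, again assuming the `def-kyi_gpu` account; the exact columns vary by site:
```
# Fair-share usage for every user charging to this account
sshare -A def-kyi_gpu -a
```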

## Monitoring your job outputs
Be aware that your shell outputs will be written to `jobs/output/`, but they **are not guaranteed to be up to date**, so do not rely on them too much. You probably want to look at the TensorBoard logs instead, which will be up to date. **AGAIN, DO NOT RUN TENSORBOARD ON LOGIN NODES**.



--------------------------------------------------------------------------------
/python/queue_cc.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# queue_cc.py ---
#
# Filename: queue_cc.py
# Description:
# Author: Kwang Moo Yi
# Maintainer:
# Created: Mon Jan 29 17:56:38 2018 (-0800)
# Version:
# Package-Requires: ()
# URL:
# Doc URL:
# Keywords:
# Compatibility:
#
#

# Commentary:
#
#
#
#

# Change Log:
#
#
#

# Code:

import argparse
import getpass
import os
import shutil
import socket
import subprocess
import datetime

# ----------------------------------------
# Global variables within this script
arg_lists = []
parser = argparse.ArgumentParser()
cluster_config = {
    "cedar":
    {
        "gpu_model": "p100",
        "gpus_per_node": 4,
        "cpu_cores_per_node": 24,
        "threads_per_node": 48,
        "cpu_cores_per_gpu": 6,
        "threads_per_gpu": 12,
        "ram_per_node": 128000,
        "ram_per_gpu": 31500,
        "job_system": "slurm",
        "default_account": "rrg-kyi",
    },
    "graham":
    {
        "gpu_model": "p100",
        "gpus_per_node": 2,
        "cpu_cores_per_node": 32,
        "threads_per_node": 64,
        "cpu_cores_per_gpu": 16,
        "threads_per_gpu": 32,
        "ram_per_node": 127518,
        "ram_per_gpu": 63500,
        "job_system": "slurm",
        "default_account": "def-kyi-ab",
    },
    "beluga":
    {
        "gpu_model": "v100",
        "gpus_per_node": 4,
        "cpu_cores_per_node": 40,
        "threads_per_node": 80,
        "cpu_cores_per_gpu": 10,
        "threads_per_gpu": 20,
        "ram_per_node": 191000,
        "ram_per_gpu": 47500,
        "job_system": "slurm",
        "default_account": "def-kyi-ab",
    },
    "moo":
    {
        "gpu_model": "v100",
        "gpus_per_node": 8,
        "cpu_cores_per_node": 28,
        "threads_per_node": 56,
        "cpu_cores_per_gpu": 3,
        "threads_per_gpu": 7,
        "ram_per_node": 191000,
        "ram_per_gpu": 23875,
        "job_system": "slurm",
        "default_account": "def-kyi",
    },
    "sockeye":
    {
        "gpu_model": "v100",
        "gpus_per_node": 4,
        "cpu_cores_per_node": 24,
        "threads_per_node": None,
        "cpu_cores_per_gpu": 6,
        "threads_per_gpu": None,
        "ram_per_node": 191000,
        "ram_per_gpu": 47750,
        "job_system": "PBS",
        "default_account": "pr-kmyi-1",
        "default_gpu_account": "pr-kmyi-1-gpu",
    },
    "narval":
    {
        "gpu_model": "a100",
        "gpus_per_node": 4,
        "cpu_cores_per_node": 48,
        "threads_per_node": 96,
        "cpu_cores_per_gpu": 12,
        "threads_per_gpu": 24,
        "ram_per_node": 510000,
        "ram_per_gpu": 127500,
        "job_system": "slurm",
        "default_account": "def-kyi-ab",
    },
    "snubfin":
    {
        "cpu_cores_per_gpu": 10,
        "ram_per_gpu": 23785,
        "job_system": "slurm",
        "partition": "snubfin",
    },
}


def slurm_command(num_cpu, num_gpu, mem, time_limit, dep_str, account, output_dir, job, partition, nodelist):
    com = ["sbatch"]
    com += ["--cpus-per-task={}".format(num_cpu)]
    if num_gpu > 0:
        com += ["--gres=gpu:{}".format(num_gpu)]
    com += ["--mem={}".format(mem)]
    com += ["--time={}".format(time_limit)]
    if len(dep_str) > 0:
        com += ["--dependency=afterany:{}".format(dep_str)]
    if len(account) > 0:
        com += ["--account={}".format(account)]
    if partition and len(partition) > 0:
        com += ["--partition={}".format(partition)]
    if nodelist and len(nodelist) > 0:
        com += ["--nodelist={}".format(nodelist)]
    com += ["--output={}/%x-%j.out".format(output_dir)]
    com += ["--export=ALL"]
    com += [job]
    return com


def PBS_command(num_cpu, num_gpu, mem, time_limit, dep_str, account, output_dir, job, partition, nodelist):
    com = ["qsub"]
    if num_gpu > 0:
        com += ["-l", "walltime={0},select=1:ncpus={1}:mem={2}:ngpus={3}".format(time_limit, num_cpu, mem, num_gpu)]
    else:
        com += ["-l", "walltime={0},select=1:ncpus={1}:mem={2}".format(time_limit, num_cpu, mem)]
    if len(dep_str) > 0:
        com += ["-W", "depend=afterany:{}".format(dep_str)]
    com += ["-A", "{}".format(account)]
    com += ["-o", "{0}/{1}_{2}.out".format(output_dir,
                                           os.path.basename(job),
                                           str(datetime.datetime.now()).replace(" ", "_").replace(":", "_"),
                                           )]
    com += ["-e", "{0}/{1}_{2}.err".format(output_dir,
                                           os.path.basename(job),
                                           str(datetime.datetime.now()).replace(" ", "_").replace(":", "_"),
                                           )]
    com += [job]
    return com


def add_argument_group(name):
    arg = parser.add_argument_group(name)
    arg_lists.append(arg)
    return arg


# ----------------------------------------
# Arguments for global settings
global_arg = add_argument_group("Global")

global_arg.add_argument(
    "--account", type=str,
    default=None,
    help="Slurm account to use. "
    "Please change this to your Compute Canada account.")


global_arg.add_argument(
    "--cluster", type=str,
    default=None,
    help="Name of the cluster.")


global_arg.add_argument(
    "--todo_dir", type=str,
    default="./jobs/todo",
    help="Path to the directory containing shell scripts to run.")

global_arg.add_argument(
    "--done_dir", type=str,
    default="./jobs/done",
    help="Path to the directory to which the program will move queued scripts.")

global_arg.add_argument(
    "--output_dir", type=str,
    default="./jobs/output",
    help="Directory that will contain job outputs.")

# ----------------------------------------
# Arguments for the job
job_arg = add_argument_group("Job")

job_arg.add_argument(
    "--num_jobs", type=int,
    default=1,
    help="Number of shell scripts to queue from the TODO_DIR.")
job_arg.add_argument(
    "--num_runs", type=int,
    default=5,
    help="Number of times each shell script will be executed. "
    "This is useful when chaining 3-hour jobs that run multiple times.")
job_arg.add_argument(
    "--num_gpu", type=int,
    default=1,
    help="Number of GPUs to use. Set zero to not use the gpu node.")
job_arg.add_argument(
    "--num_cpu", type=str,
    default="auto",
    help="Number of CPU cores to use. Can be inferred from the GPU. "
    "Set 'auto' to do that.")
234 | "Set 'auto' to do that.") 235 | job_arg.add_argument( 236 | "--mem", type=str, 237 | default="auto", 238 | help="Amount of memory to use. See compute canada wiki for details " 239 | "on large memory nodes. Typically, you don't want to go over 8G per " 240 | "CPU core") 241 | job_arg.add_argument( 242 | "--time_limit", type=str, 243 | default="03:00:00", 244 | help="Time limit on the jobs. If you can, 3 hours give you the best " 245 | "turn around. Hours:Minutes:Seconds") 246 | job_arg.add_argument( 247 | "--depends_key", type=str, 248 | default="none", 249 | help="In case you want to schedule your jobs depending on something. " 250 | "Set to 'none' if not wanted.") 251 | job_arg.add_argument( 252 | "--partition", type=str, 253 | default=None, 254 | help="Partition to be used.") 255 | job_arg.add_argument( 256 | "--nodelist", type=str, 257 | default=None, 258 | help="List of nodes to be used.") 259 | 260 | 261 | def get_config(): 262 | config, unparsed = parser.parse_known_args() 263 | 264 | return config, unparsed 265 | 266 | 267 | def print_usage(): 268 | parser.print_usage() 269 | 270 | 271 | def main(config): 272 | """Main Function""" 273 | 274 | # Check if directories exist and create them if necessary 275 | if not os.path.exists(config.todo_dir): 276 | os.makedirs(config.todo_dir) 277 | if not os.path.exists(config.done_dir): 278 | os.makedirs(config.done_dir) 279 | if not os.path.exists(config.output_dir): 280 | os.makedirs(config.output_dir) 281 | 282 | # Get hostname and user name 283 | username = getpass.getuser() 284 | hostname = socket.gethostname() 285 | 286 | # Identify cluster 287 | if config.cluster is None: 288 | if hostname.startswith("gra"): 289 | cluster = "graham" 290 | elif hostname.startswith("cedar") or hostname.startswith("cdr"): 291 | cluster = "cedar" 292 | elif hostname.startswith("beluga") or hostname.startswith("blg"): 293 | cluster = "beluga" 294 | elif hostname.startswith("stirk"): 295 | cluster = "moo" 296 | elif hostname.startswith("se"): 297 | cluster = "sockeye" 298 | elif hostname.startswith("narval"): 299 | cluster = "narval" 300 | elif hostname.startswith("borg"): 301 | cluster = "snubfin" 302 | else: 303 | raise ValueError("Unknown cluster {}".format(hostname)) 304 | else: 305 | cluster = config.cluster 306 | 307 | # Get gpu usage statistics 308 | num_gpu = config.num_gpu 309 | 310 | # Apply default account if not specified 311 | if config.account is None: 312 | if "default_gpu_account" not in cluster_config[cluster] and "default_account" not in cluster_config[cluster]: 313 | config.account = "" 314 | elif num_gpu > 0 and "default_gpu_account" in cluster_config[cluster]: 315 | config.account = cluster_config[cluster]["default_gpu_account"] 316 | else: 317 | config.account = cluster_config[cluster]["default_account"] 318 | if config.partition is None and "partition" in cluster_config[cluster]: 319 | config.partition = cluster_config[cluster]["partition"] 320 | if config.nodelist is None and "nodelist" in cluster_config[cluster]: 321 | config.nodelist = cluster_config[cluster]["nodelist"] 322 | 323 | # Set options or automatically infer CPU and MEM 324 | num_cpu = config.num_cpu 325 | if num_cpu.lower() == "auto": 326 | if num_gpu > 0: 327 | num_cores_per_gpu = cluster_config[cluster]["cpu_cores_per_gpu"] 328 | num_cpu = str(num_cores_per_gpu * num_gpu) 329 | mem = config.mem 330 | if mem.lower() == "auto": 331 | if num_gpu > 0: 332 | ram_per_gpu = cluster_config[cluster]["ram_per_gpu"] 333 | mem = str(ram_per_gpu * num_gpu) + "M" 334 | 335 | # 
    # Set time limit
    time_limit = config.time_limit

    # Get jobs that this new job should depend on.
    job_depends = []
    if config.depends_key != "none":
        assert cluster_config[cluster]["job_system"] == "slurm"
        squeue_res = subprocess.run(
            ["squeue", "-u", username],
            stdout=subprocess.PIPE
        )
        job_details = squeue_res.stdout.decode().split("\n")[1:]
        # For each job create a list of IDs
        for _str in job_details:
            # Look for job dependency keys in string
            if config.depends_key in _str:
                # Add to the list of dependent jobs
                job_depends += [str(int(_str.split()[0]))]

    # Run jobs
    for idx_job in range(config.num_jobs):
        # Grab a job from the list of jobs
        found_job = False
        # Sort -- Just in case
        list_files = os.listdir(config.todo_dir)
        list_files.sort()
        for _f in list_files:
            if _f.endswith(".sh"):
                job_script = _f
                print("Queueing script {}".format(
                    os.path.join(config.todo_dir, job_script)
                ))
                found_job = True
                break
        if not found_job:
            raise RuntimeError("No job found in {}".format(config.todo_dir))
        # Move that job to the done folder
        shutil.move(
            os.path.join(config.todo_dir, job_script),
            os.path.join(config.done_dir, job_script),
        )
        # Build Initial dependency (from the job_depends)
        dep_str = ":".join(job_depends)
        # Run job N times
        for idx_run in range(config.num_runs):
            if cluster_config[cluster]["job_system"] == "slurm":
                com = slurm_command(num_cpu,
                                    num_gpu,
                                    mem,
                                    time_limit,
                                    dep_str,
                                    config.account,
                                    config.output_dir,
                                    os.path.join(config.done_dir, job_script),
                                    config.partition,
                                    config.nodelist)
            elif cluster_config[cluster]["job_system"] == "PBS":
                com = PBS_command(num_cpu,
                                  num_gpu,
                                  mem,
                                  time_limit,
                                  dep_str,
                                  config.account,
                                  config.output_dir,
                                  os.path.join(config.done_dir, job_script),
                                  config.partition,
                                  config.nodelist)
            slurm_res = subprocess.run(com, stdout=subprocess.PIPE)
            print(slurm_res.stdout.decode())
            # Get job ID
            if slurm_res.returncode != 0:
                raise RuntimeError("Slurm/PBS error!")
            job_id = slurm_res.stdout.decode().split()[-1]
            dep_str = str(job_id)


if __name__ == "__main__":

    # ----------------------------------------
    # Parse configuration
    config, unparsed = get_config()
    # If we have unparsed arguments, print usage and exit
    if len(unparsed) > 0:
        print_usage()
        exit(1)

    main(config)

#
# queue_cc.py ends here

--------------------------------------------------------------------------------