├── bash
│   ├── archive_dir.sh
│   ├── PBS_cluster_usage.sh
│   └── slurm_free_nodes.sh
├── LICENSE
├── python
│   ├── archive_subdirs.py
│   └── queue_cc.py
└── README.md

/bash/archive_dir.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# A simple script to archive directories

echo "Archiving $1"

tar -zvcf "$1.tar.gz" "$1"

echo "Done"

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Kwang Moo Yi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/bash/PBS_cluster_usage.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Run qstat on the gpu queue and collect the job IDs
output=$(qstat -f gpu | grep "Job Id" | awk '{print $3}')

# Initialize the total number of GPUs
total_gpus=0

# Read the output line by line and extract the number of GPUs
while read -r line; do
    # Store qstat output for the current job in a variable
    job_output=$(qstat -f "$line")

    # Extract the job state for the current job
    state=$(echo "$job_output" | grep "job_state" | awk '{print $3}')

    echo "Job $line is in state $state"

    # If the state is R or B, the job is running or has begun, so add its GPUs to the total
    if [ "$state" == "R" ] || [ "$state" == "B" ]; then
        gpus=$(echo "$job_output" | grep "Resource_List.ngpus" | awk '{print $3}')

        if [ "$state" == "B" ]; then
            # Get the number of active runs from the "array_state_count" field
            active_runs=$(echo "$job_output" | grep "array_state_count" | awk '{print $4}' | awk -F: '{print $2}')

            # Multiply the number of GPUs by the number of active runs
            gpus=$((gpus * active_runs))
        fi

        echo "Job $line is using $gpus GPUs"
        total_gpus=$((total_gpus + gpus))

    fi

    echo "Running total for GPUs: $total_gpus"
done <<< "$output"

# Print the total number of GPUs
echo "Total number of GPUs: $total_gpus"

--------------------------------------------------------------------------------
/bash/slurm_free_nodes.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Loop through each unique node that has GPUs
while IFS= read -r node; do
    # Fetch the detailed information for each node
    node_info=$(scontrol show node "$node")

    # Extract relevant information
    node_name=$(echo "$node_info" | grep -oP "(?<=NodeName=)\S+")
    total_cpus=$(echo "$node_info" | grep -oP "(?<=CPUTot=)\S+")
    alloc_cpus=$(echo "$node_info" | grep -oP "(?<=CPUAlloc=)\S+" || echo "0")
    total_memory=$(echo "$node_info" | grep -oP "(?<=RealMemory=)\S+")
    alloc_memory=$(echo "$node_info" | grep -oP "(?<=AllocMem=)\S+" || echo "0")

    # Calculate free memory and convert memory from MB to GB
    free_memory=$((total_memory - alloc_memory))
    free_memory_gb=$(echo "scale=2; $free_memory/1024" | bc)
    total_memory_gb=$(echo "scale=2; $total_memory/1024" | bc)

    total_gpus=$(echo "$node_info" | grep "Gres=" | grep -oP "gpu:[^:]+:\K[0-9]+")
    alloc_gpus=$(echo "$node_info" | grep "AllocTRES=" | grep -oP "gres/gpu=\K[0-9]+" || echo "0")
    gpu_type=$(echo "$node_info" | grep -oP "(?<=AvailableFeatures=)\S+")
    users=$(squeue --nodes="$node_name" --noheader --format="%u" | sort | uniq | paste -sd, -)

    # Calculate available resources
    free_cpus=$((total_cpus - alloc_cpus))
    free_gpus=$((total_gpus - alloc_gpus))

    # Print the information with memory in GB
    echo "$node_name available resources: $free_gpus/$total_gpus GPUs, $free_cpus/$total_cpus CPUs, ${free_memory_gb}GB/${total_memory_gb}GB memory, $gpu_type, users:$users"
done < <(sinfo -N -o "%N %G" --noheader | grep "gpu:" | awk '{print $1}' | sort -u)

--------------------------------------------------------------------------------
/python/archive_subdirs.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# archive_subdirs.py ---
#
# Filename: archive_subdirs.py
# Description:
# Author: Kwang Moo Yi
# Maintainer:
# Created: Mon Feb 26 18:51:27 2018 (-0800)
# Version:
# Package-Requires: ()
# URL:
# Doc URL:
# Keywords:
# Compatibility:
#
#

# Commentary:
#
#
#
#

# Change Log:
#
#
#

# Code:

import argparse
import os
import socket
import subprocess

# ----------------------------------------
# Global variables within this script
arg_lists = []
parser = argparse.ArgumentParser()


def add_argument_group(name):
    arg = parser.add_argument_group(name)
    arg_lists.append(arg)
    return arg


# ----------------------------------------
# Arguments for global settings
global_arg = add_argument_group("Global")


global_arg.add_argument(
    "--account", type=str,
    default="def-kyi",
    help="Slurm account to use. "
    "Please change this to your Compute Canada account.")

# ----------------------------------------
# Arguments for the job
job_arg = add_argument_group("Job")

job_arg.add_argument(
    "--archive_dir", type=str,
    default=None,
    help="Path to the parent directory whose subdirectories will each be "
    "archived into an individual tar.gz file.")
job_arg.add_argument(
    "--num_gpu", type=int,
    default=0,
    help="Number of GPUs to use. Set zero to not use the gpu node.")
job_arg.add_argument(
    "--num_cpu", type=str,
    default="auto",
    help="Number of CPU cores to use. Can be inferred from the GPU. "
    "Set 'auto' to do that.")
job_arg.add_argument(
    "--mem", type=str,
    default="auto",
    help="Amount of memory to use. See the Compute Canada wiki for details "
    "on large memory nodes. Typically, you don't want to go over 8G per "
    "CPU core.")
job_arg.add_argument(
    "--time_limit", type=str,
    default="0-12:00",
    help="Time limit on the jobs. If you can, 3 hours gives you the best "
    "turnaround.")


def get_config():
    config, unparsed = parser.parse_known_args()

    return config, unparsed


def print_usage():
    parser.print_usage()


def main(config):
    """Main Function"""

    # Get hostname to identify the cluster
    hostname = socket.gethostname()

    # Identify cluster
    if hostname.startswith("gra"):
        cluster = "graham"
    elif hostname.startswith("cedar") or hostname.startswith("cdr"):
        cluster = "cedar"
    else:
        raise ValueError("Unknown cluster {}".format(hostname))

    # # Get gpu usage statistics
    # num_gpu = config.num_gpu

    # For this operation we will consume a full node
    num_cpu = config.num_cpu
    if num_cpu.lower() == "auto":
        if cluster == "cedar":
            num_cpu = 32
        elif cluster == "graham":
            num_cpu = 32
    mem = config.mem
    if mem.lower() == "auto":
        if cluster == "cedar":
            mem = "128000M"
        elif cluster == "graham":
            mem = "128000M"

    # Set time limit
    time_limit = config.time_limit

    if config.archive_dir is None:
        print_usage()
        exit(1)

    # For each entry in the archive directory
    for _f in os.listdir(config.archive_dir):
        cur_dir = os.path.join(config.archive_dir, _f)
        # If not a dir, continue to the next one
        if not os.path.isdir(cur_dir):
            continue
        # If the output file already exists, then simply skip
        if os.path.exists(cur_dir + ".tar.gz"):
            continue
        # If it is a directory, now queue the archive job
        com = ["sbatch"]
        com += ["--cpus-per-task={}".format(num_cpu)]
        com += ["--mem={}".format(mem)]
        com += ["--time={}".format(time_limit)]
        com += ["--account={}".format(config.account)]
        com += ["--output={}".format(cur_dir + ".tar.gz.out")]
        com += ["--export=ALL"]
        com += ["./bash/archive_dir.sh"]
        com += ["{}".format(cur_dir)]
        slurm_res = subprocess.run(com, stdout=subprocess.PIPE)
        print(slurm_res.stdout.decode())
        # Check that the job was queued successfully
        if slurm_res.returncode != 0:
            raise RuntimeError("Slurm error!")


if __name__ == "__main__":

    # ----------------------------------------
    # Parse configuration
    config, unparsed = get_config()
    # If we have unparsed arguments, print usage and exit
    if len(unparsed) > 0:
        print_usage()
        exit(1)

    main(config)


#
# archive_subdirs.py ends here

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This repository has moved to our group's repository!
[https://github.com/vcg-uvic/compute-canada-goodies](https://github.com/vcg-uvic/compute-canada-goodies)

# compute-canada-goodies
Automation scripts for Compute Canada

This script simplifies queueing jobs on Compute Canada.
Simply run `./queue_cc.py` and it will queue the job scripts found in the `jobs/todo` folder.
Each job should be a shell script with a proper shebang.
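
For reference, a job script can be as small as the sketch below. The module names, the `$SCRATCH/my_project` path, and `train.py` are placeholders for illustration only; adapt them to your own project:
```
#!/bin/bash
# jobs/todo/example_job.sh -- a hypothetical job script to be queued by queue_cc.py
module load cuda cudnn python/3
cd "$SCRATCH/my_project"
python train.py --config configs/example.yaml
```
You would then queue it from the repository root, inside your virtual environment, with something like `./queue_cc.py --num_jobs 1 --num_gpu 1 --time_limit 03:00:00`. With the defaults, each script is queued with 1 GPU, a 3-hour limit, and 5 chained runs.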

Also try `./queue_cc.py --help`, or see the argument definitions in `python/queue_cc.py` for more details.

# Tutorial on how to set up Compute Canada for TensorFlow

This tutorial assumes that you have set up a Compute Canada account, either through your advisor as your sponsor, or through me. If you do not have one, simply create one by visiting the [CCDB website](https://ccdb.computecanada.ca). I strongly suggest asking your advisor to create the CCDB account and to act as your supervisor. My CCDB account name has either been given to you personally or is on the lecture slides.

## IMPORTANT NOTES BEFORE WE DIVE IN

Compute Canada is shared across researchers in Canada, and the login nodes are not meant to run **any computationally demanding** job. This includes **compiling** and launching **TensorBoard**. Do **NOT** do either on a login node. They may crash the login node, thus crashing the **ENTIRE SYSTEM**. This will get recorded and may cause you to get banned. So for these tasks, make sure that you

- Compile on compute nodes
- Run TensorBoard locally, after downloading the logs from the server.

## About the file system

Another thing to know is that the file systems on Compute Canada are networked, each with a dedicated purpose. `project` is for shared projects; you will **NOT** need it unless you are saving results for **shared projects**. `home` is where you store your personal files, but **NOT** your code. `scratch` is where you want to have everything: use `scratch/` to store your code and your results, and then copy only the **IMPORTANT** ones to home or project. `scratch` has the **LOWEST** access latency, so you really want to do everything there. For more details, visit [here](https://docs.computecanada.ca/wiki/Storage_and_file_management).

## Resources (pr-kmyi group only)
1. 187 core-years on the cedar-compute system
2. 8.0 RGU-years on the cedar-gpu system
3. 24.0 RGU-years on the narval-gpu system
4. 100 TB of /project storage on the narval-storage system
5. 40 TB of /nearline storage on the narval-storage system
6. 200 TB of /project storage on the cedar-storage system
7. 8.0 million /project inodes on the cedar-storage system

## The module system

One very convenient thing about the Compute Canada setup is that you can simply load modules on demand. These modules could be `CUDA`, `CUDNN`, `GCC`, `OpenCV`, or whatever else you need, and you can also specify their versions. To see which modules are currently loaded, do
```
$ module list
```
To load the modules we need for our case, simply do the following:
```
$ module load cudnn cuda eigen python/3
```
In my case, I also unload some of the modules that I won't be using, so I have the following lines in my `.bashrc` -- **NOTE THAT THIS IS STRONGLY DISCOURAGED. IT WILL CAUSE ISSUES.**
```
module unload icc gcccore ifort intel imkl openmpi
module load gcc/5.4.0 cuda cudnn eigen python/3
module unload icc gcccore ifort intel imkl openmpi jasper java
```

## Installing TensorFlow
One very important thing about Compute Canada is that some libraries, including TensorFlow, do not come from the standard `pip` index; they come from a custom `wheelhouse`. So you need to **specifically use their virtualenv**. Please carefully follow the instructions at [their wiki page](https://docs.computecanada.ca/wiki/Tensorflow). If you are using `virtualenvwrapper`, you **must** point it to the correct `virtualenv` binary, or it will not work. I suggest you stick to the Compute Canada instructions to keep your life simple.
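
For orientation only, the wiki's steps look roughly like the sketch below at the time of writing. The module name (`python/3`), the package name (`tensorflow_gpu`), and the `~/tf_env` path are assumptions that may have changed, so treat the wiki as the authority:
```
# Build a virtualenv on top of a Compute Canada Python module, then install
# TensorFlow from their wheelhouse (--no-index keeps pip off PyPI).
module load python/3
virtualenv --no-download ~/tf_env
source ~/tf_env/bin/activate
pip install --no-index --upgrade pip
pip install --no-index tensorflow_gpu
```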

## Using salloc
The following command will give you access to one of the interactive nodes, where you can debug your code. Notice that I set the account to my own account, `def-kyi`; you probably want to replace that with your advisor's.
```
salloc --time=03:00:00 --gres=gpu:1 --cpus-per-task=6 --account=def-kyi --mem=31000M
```
Note the 3-hour limit, 1 GPU, 6 cores, and roughly 32G of memory. This is for Cedar, where each GPU node has 4 GPUs, 24 cores, and 128G of memory: you want a *virtual* node with 1 GPU, so we divide the resources equally by four. For Graham, the settings will be a bit different. See [here for Cedar](https://docs.computecanada.ca/wiki/Cedar) and [here for Graham](https://docs.computecanada.ca/wiki/Graham).

Use these interactive nodes for debugging, and launch your batch jobs only once you are sure everything works. The current `queue_cc.py` script expects your jobs to be located in `jobs/todo/`, so you can simply emulate what will happen with your batch job by running
```
./jobs/todo/<your_script>.sh
```

## Launching scripts
Once you are sure that your script runs, run `queue_cc.py` **IN YOUR VIRTUAL ENVIRONMENT** to queue your jobs as batch jobs. Note that your current `shell` environment will get carried over to your batch jobs, so you want to be in the virtual environment that you want your jobs to run in.

## Monitoring the job queue
Once your jobs are in the queue, you can type
```
squeue -u $USER
```
to see how they are doing. **BE PREPARED TO WAIT**. Your jobs may sit in the queue for days before they run; if a job has been waiting for more than 5 days, contact me. The scheduling policy favours 3-hour jobs, as they can fit in between longer jobs. Since we will be using TensorFlow, saving and restoring is trivial: make your jobs save roughly every 15 minutes, so that your job **dies** at the three-hour time limit and restarts through the dependency chain -- `queue_cc.py` will set that up for you. To make this even more efficient, whenever you save your model, check (roughly) how much time you have left and quit your training when the next save interval will not fit within the 3-hour limit. This way, you maximize your turnaround on the cluster. [Here's more info on the job scheduling policy](https://docs.computecanada.ca/wiki/Job_scheduling_policies)

### Switching back and forth between Cedar and Graham

Cedar and Graham do not share queues; they are independent. In fact, you can set things up so that the two clusters are configured identically and bounce back and forth between them. Job scheduling is based on priority, which is computed from your usage over the latest two-week window, so if you have not been running on one cluster, your priority there is probably quite good and it is worth switching. However, this is per CCDB account, so you might want to check who else is running on your advisor's account by doing
```
squeue -A def-kyi_gpu
```
Here, `def-kyi_gpu` is my CCDB account for the GPU machines. Your advisor's account would be different.
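
If you also want a rough sense of how much the whole account has been charged recently (which is what drives its priority), Slurm's `sshare` can report the account's fair-share usage. A minimal sketch, again assuming the `def-kyi_gpu` account; the exact columns vary by site:
```
# Fair-share usage for every user charging to this account
sshare -A def-kyi_gpu -a
```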

## Monitoring your job outputs
Be aware that your shell outputs will be written to `jobs/output/`, but they **are not guaranteed to be up to date**, so do not rely on them too much. You probably want to look at the TensorBoard logs instead, which will be up to date. **AGAIN, DO NOT RUN TENSORBOARD ON LOGIN NODES**.



--------------------------------------------------------------------------------
/python/queue_cc.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# queue_cc.py ---
#
# Filename: queue_cc.py
# Description:
# Author: Kwang Moo Yi
# Maintainer:
# Created: Mon Jan 29 17:56:38 2018 (-0800)
# Version:
# Package-Requires: ()
# URL:
# Doc URL:
# Keywords:
# Compatibility:
#
#

# Commentary:
#
#
#
#

# Change Log:
#
#
#

# Code:

import argparse
import getpass
import os
import shutil
import socket
import subprocess
import datetime

# ----------------------------------------
# Global variables within this script
arg_lists = []
parser = argparse.ArgumentParser()
cluster_config = {
    "cedar":
    {
        "gpu_model": "p100",
        "gpus_per_node": 4,
        "cpu_cores_per_node": 24,
        "threads_per_node": 48,
        "cpu_cores_per_gpu": 6,
        "threads_per_gpu": 12,
        "ram_per_node": 128000,
        "ram_per_gpu": 31500,
        "job_system": "slurm",
        "default_account": "rrg-kyi",
    },
    "graham":
    {
        "gpu_model": "p100",
        "gpus_per_node": 2,
        "cpu_cores_per_node": 32,
        "threads_per_node": 64,
        "cpu_cores_per_gpu": 16,
        "threads_per_gpu": 32,
        "ram_per_node": 127518,
        "ram_per_gpu": 63500,
        "job_system": "slurm",
        "default_account": "def-kyi-ab",
    },
    "beluga":
    {
        "gpu_model": "v100",
        "gpus_per_node": 4,
        "cpu_cores_per_node": 40,
        "threads_per_node": 80,
        "cpu_cores_per_gpu": 10,
        "threads_per_gpu": 20,
        "ram_per_node": 191000,
        "ram_per_gpu": 47500,
        "job_system": "slurm",
        "default_account": "def-kyi-ab",
    },
    "moo":
    {
        "gpu_model": "v100",
        "gpus_per_node": 8,
        "cpu_cores_per_node": 28,
        "threads_per_node": 56,
        "cpu_cores_per_gpu": 3,
        "threads_per_gpu": 7,
        "ram_per_node": 191000,
        "ram_per_gpu": 23875,
        "job_system": "slurm",
        "default_account": "def-kyi",
    },
    "sockeye":
    {
        "gpu_model": "v100",
        "gpus_per_node": 4,
        "cpu_cores_per_node": 24,
        "threads_per_node": None,
        "cpu_cores_per_gpu": 6,
        "threads_per_gpu": None,
        "ram_per_node": 191000,
        "ram_per_gpu": 47750,
        "job_system": "PBS",
        "default_account": "pr-kmyi-1",
        "default_gpu_account": "pr-kmyi-1-gpu",
    },
    "narval":
    {
        "gpu_model": "a100",
        "gpus_per_node": 4,
        "cpu_cores_per_node": 48,
        "threads_per_node": 96,
        "cpu_cores_per_gpu": 12,
        "threads_per_gpu": 24,
        "ram_per_node": 510000,
        "ram_per_gpu": 127500,
        "job_system": "slurm",
        "default_account": "def-kyi-ab",
    },
    "snubfin":
    {
        "cpu_cores_per_gpu": 10,
        "ram_per_gpu": 23785,
        "job_system": "slurm",
        "partition": "snubfin",
    },
}


def slurm_command(num_cpu, num_gpu, mem, time_limit, dep_str, account, output_dir, job, partition, nodelist):
    com = ["sbatch"]
    com += ["--cpus-per-task={}".format(num_cpu)]
    if num_gpu > 0:
        com += ["--gres=gpu:{}".format(num_gpu)]
    com += ["--mem={}".format(mem)]
    com += ["--time={}".format(time_limit)]
    if len(dep_str) > 0:
        com += ["--dependency=afterany:{}".format(dep_str)]
    if len(account) > 0:
        com += ["--account={}".format(account)]
    if partition and len(partition) > 0:
        com += ["--partition={}".format(partition)]
    if nodelist and len(nodelist) > 0:
        com += ["--nodelist={}".format(nodelist)]
    com += ["--output={}/%x-%j.out".format(output_dir)]
    com += ["--export=ALL"]
    com += [job]
    return com


def PBS_command(num_cpu, num_gpu, mem, time_limit, dep_str, account, output_dir, job, partition, nodelist):
    com = ["qsub"]
    if num_gpu > 0:
        com += ["-l", "walltime={0},select=1:ncpus={1}:mem={2}:ngpus={3}".format(time_limit, num_cpu, mem, num_gpu)]
    else:
        com += ["-l", "walltime={0},select=1:ncpus={1}:mem={2}".format(time_limit, num_cpu, mem)]
    if len(dep_str) > 0:
        com += ["-W", "depend=afterany:{}".format(dep_str)]
    com += ["-A", "{}".format(account)]
    com += ["-o", "{0}/{1}_{2}.out".format(output_dir,
                                           os.path.basename(job),
                                           str(datetime.datetime.now()).replace(" ", "_").replace(":", "_"),
                                           )]
    com += ["-e", "{0}/{1}_{2}.err".format(output_dir,
                                           os.path.basename(job),
                                           str(datetime.datetime.now()).replace(" ", "_").replace(":", "_"),
                                           )]
    com += [job]
    return com


def add_argument_group(name):
    arg = parser.add_argument_group(name)
    arg_lists.append(arg)
    return arg


# ----------------------------------------
# Arguments for global settings
global_arg = add_argument_group("Global")

global_arg.add_argument(
    "--account", type=str,
    default=None,
    help="Slurm account to use. "
    "Please change this to your Compute Canada account.")


global_arg.add_argument(
    "--cluster", type=str,
    default=None,
    help="Name of the cluster.")


global_arg.add_argument(
    "--todo_dir", type=str,
    default="./jobs/todo",
    help="Path to the directory containing shell scripts to run.")

global_arg.add_argument(
    "--done_dir", type=str,
    default="./jobs/done",
    help="Path to the directory to which the program will move queued scripts.")

global_arg.add_argument(
    "--output_dir", type=str,
    default="./jobs/output",
    help="Directory that will contain job outputs.")

# ----------------------------------------
# Arguments for the job
job_arg = add_argument_group("Job")

job_arg.add_argument(
    "--num_jobs", type=int,
    default=1,
    help="Number of shell scripts to queue from the TODO_DIR.")
job_arg.add_argument(
    "--num_runs", type=int,
    default=5,
    help="Number of times each shell script will be executed. "
    "This is useful when chaining 3-hour jobs that run multiple times.")
job_arg.add_argument(
    "--num_gpu", type=int,
    default=1,
    help="Number of GPUs to use. Set zero to not use the gpu node.")
job_arg.add_argument(
    "--num_cpu", type=str,
    default="auto",
    help="Number of CPU cores to use. Can be inferred from the GPU. "
    "Set 'auto' to do that.")
234 | "Set 'auto' to do that.") 235 | job_arg.add_argument( 236 | "--mem", type=str, 237 | default="auto", 238 | help="Amount of memory to use. See compute canada wiki for details " 239 | "on large memory nodes. Typically, you don't want to go over 8G per " 240 | "CPU core") 241 | job_arg.add_argument( 242 | "--time_limit", type=str, 243 | default="03:00:00", 244 | help="Time limit on the jobs. If you can, 3 hours give you the best " 245 | "turn around. Hours:Minutes:Seconds") 246 | job_arg.add_argument( 247 | "--depends_key", type=str, 248 | default="none", 249 | help="In case you want to schedule your jobs depending on something. " 250 | "Set to 'none' if not wanted.") 251 | job_arg.add_argument( 252 | "--partition", type=str, 253 | default=None, 254 | help="Partition to be used.") 255 | job_arg.add_argument( 256 | "--nodelist", type=str, 257 | default=None, 258 | help="List of nodes to be used.") 259 | 260 | 261 | def get_config(): 262 | config, unparsed = parser.parse_known_args() 263 | 264 | return config, unparsed 265 | 266 | 267 | def print_usage(): 268 | parser.print_usage() 269 | 270 | 271 | def main(config): 272 | """Main Function""" 273 | 274 | # Check if directories exist and create them if necessary 275 | if not os.path.exists(config.todo_dir): 276 | os.makedirs(config.todo_dir) 277 | if not os.path.exists(config.done_dir): 278 | os.makedirs(config.done_dir) 279 | if not os.path.exists(config.output_dir): 280 | os.makedirs(config.output_dir) 281 | 282 | # Get hostname and user name 283 | username = getpass.getuser() 284 | hostname = socket.gethostname() 285 | 286 | # Identify cluster 287 | if config.cluster is None: 288 | if hostname.startswith("gra"): 289 | cluster = "graham" 290 | elif hostname.startswith("cedar") or hostname.startswith("cdr"): 291 | cluster = "cedar" 292 | elif hostname.startswith("beluga") or hostname.startswith("blg"): 293 | cluster = "beluga" 294 | elif hostname.startswith("stirk"): 295 | cluster = "moo" 296 | elif hostname.startswith("se"): 297 | cluster = "sockeye" 298 | elif hostname.startswith("narval"): 299 | cluster = "narval" 300 | elif hostname.startswith("borg"): 301 | cluster = "snubfin" 302 | else: 303 | raise ValueError("Unknown cluster {}".format(hostname)) 304 | else: 305 | cluster = config.cluster 306 | 307 | # Get gpu usage statistics 308 | num_gpu = config.num_gpu 309 | 310 | # Apply default account if not specified 311 | if config.account is None: 312 | if "default_gpu_account" not in cluster_config[cluster] and "default_account" not in cluster_config[cluster]: 313 | config.account = "" 314 | elif num_gpu > 0 and "default_gpu_account" in cluster_config[cluster]: 315 | config.account = cluster_config[cluster]["default_gpu_account"] 316 | else: 317 | config.account = cluster_config[cluster]["default_account"] 318 | if config.partition is None and "partition" in cluster_config[cluster]: 319 | config.partition = cluster_config[cluster]["partition"] 320 | if config.nodelist is None and "nodelist" in cluster_config[cluster]: 321 | config.nodelist = cluster_config[cluster]["nodelist"] 322 | 323 | # Set options or automatically infer CPU and MEM 324 | num_cpu = config.num_cpu 325 | if num_cpu.lower() == "auto": 326 | if num_gpu > 0: 327 | num_cores_per_gpu = cluster_config[cluster]["cpu_cores_per_gpu"] 328 | num_cpu = str(num_cores_per_gpu * num_gpu) 329 | mem = config.mem 330 | if mem.lower() == "auto": 331 | if num_gpu > 0: 332 | ram_per_gpu = cluster_config[cluster]["ram_per_gpu"] 333 | mem = str(ram_per_gpu * num_gpu) + "M" 334 | 335 | # 
    # Set time limit
    time_limit = config.time_limit

    # Get jobs that this new job should depend on.
    job_depends = []
    if config.depends_key != "none":
        assert cluster_config[cluster]["job_system"] == "slurm"
        squeue_res = subprocess.run(
            ["squeue", "-u", username],
            stdout=subprocess.PIPE
        )
        job_details = squeue_res.stdout.decode().split("\n")[1:]
        # For each job create a list of IDs
        for _str in job_details:
            # Look for job dependency keys in string
            if config.depends_key in _str:
                # Add to the list of dependent jobs
                job_depends += [str(int(_str.split()[0]))]

    # Run jobs
    for idx_job in range(config.num_jobs):
        # Grab a job from the list of jobs
        found_job = False
        # Sort -- Just in case
        list_files = os.listdir(config.todo_dir)
        list_files.sort()
        for _f in list_files:
            if _f.endswith(".sh"):
                job_script = _f
                print("Queueing script {}".format(
                    os.path.join(config.todo_dir, job_script)
                ))
                found_job = True
                break
        if not found_job:
            raise RuntimeError("No job found in {}".format(config.todo_dir))
        # Move that job to the done folder
        shutil.move(
            os.path.join(config.todo_dir, job_script),
            os.path.join(config.done_dir, job_script),
        )
        # Build Initial dependency (from the job_depends)
        dep_str = ":".join(job_depends)
        # Run job N times
        for idx_run in range(config.num_runs):
            if cluster_config[cluster]["job_system"] == "slurm":
                com = slurm_command(num_cpu,
                                    num_gpu,
                                    mem,
                                    time_limit,
                                    dep_str,
                                    config.account,
                                    config.output_dir,
                                    os.path.join(config.done_dir, job_script),
                                    config.partition,
                                    config.nodelist)
            elif cluster_config[cluster]["job_system"] == "PBS":
                com = PBS_command(num_cpu,
                                  num_gpu,
                                  mem,
                                  time_limit,
                                  dep_str,
                                  config.account,
                                  config.output_dir,
                                  os.path.join(config.done_dir, job_script),
                                  config.partition,
                                  config.nodelist)
            slurm_res = subprocess.run(com, stdout=subprocess.PIPE)
            print(slurm_res.stdout.decode())
            # Get job ID
            if slurm_res.returncode != 0:
                raise RuntimeError("Slurm/PBS error!")
            job_id = slurm_res.stdout.decode().split()[-1]
            dep_str = str(job_id)


if __name__ == "__main__":

    # ----------------------------------------
    # Parse configuration
    config, unparsed = get_config()
    # If we have unparsed arguments, print usage and exit
    if len(unparsed) > 0:
        print_usage()
        exit(1)

    main(config)

#
# queue_cc.py ends here

--------------------------------------------------------------------------------