├── .gitignore ├── LICENSE ├── README.md ├── helper └── benchmark_layernorm.py ├── main.py ├── scripts ├── demo_cc.sh ├── demo_cc_apptainer.sh ├── demo_sockeye.sh ├── demo_sockeye_pbs.sh └── demo_vector.sh ├── setup ├── requirements_cc.txt └── requirements_sockeye.txt └── utils ├── arg_parser.py ├── dist_training.py └── learning_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | mnist_data 2 | logger* 3 | venv* 4 | runs/ 5 | .idea 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Qi Yan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HPC_helper 2 | 3 | This repository showcases a minimal example of using `PyTorch` distributed training on computing clusters, enabling you to run your training tasks on `N` nodes, each with `M` GPUs. It includes common use cases such as [DataParallel (DP)](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) or [DistributedDataParallel (DDP)](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and offers support for [PBS](https://2020.help.altair.com/2020.1/PBSProfessional/PBSUserGuide2020.1.1.pdf) and [SLURM](https://slurm.schedmd.com/documentation.html) systems. Below, you'll find runnable code and scripts for UBC Sockeye, Vector Vaughan cluster, and Digital Research Alliance of Canada (formerly ComputeCanada) HPCs. 4 | 5 | Last updated: Jun 23, 2024. 
Contact: Qi Yan, qi.yan@ece.ubc.ca
6 |
7 | ## Get started
8 |
9 | ### Set up the Python environment
10 | ```bash
11 | # load the Python and CUDA modules on your HPC (uncomment the line for your cluster)
12 | # module load gcc/9.4.0 python/3.8.10 cuda/11.3.1 nccl/2.9.9-1-cuda11-3 # Sockeye
13 | # module load python/3.8 cuda-11.7 # Vector
14 | # module load python/3.10.13 StdEnv/2023 # CC
15 |
16 | # python virtual environment
17 | python -m venv venvhpc
18 | source venvhpc/bin/activate
19 | pip install -U pip
20 | pip install -r setup/requirements_sockeye.txt # if at Sockeye
21 | pip install -r setup/requirements_cc.txt # if at Vector or CC
22 |
23 | # sanity check at Sockeye or CC
24 | # you must enter an interactive session on Vector to run this
25 | python -c "import torch; print('Things are done.')"
26 |
27 | # download MNIST dataset
28 | mkdir -p ./mnist_data/MNIST/raw
29 | wget https://raw.githubusercontent.com/fgnt/mnist/master/train-images-idx3-ubyte.gz -P ./mnist_data/MNIST/raw
30 | wget https://raw.githubusercontent.com/fgnt/mnist/master/train-labels-idx1-ubyte.gz -P ./mnist_data/MNIST/raw
31 | wget https://raw.githubusercontent.com/fgnt/mnist/master/t10k-images-idx3-ubyte.gz -P ./mnist_data/MNIST/raw
32 | wget https://raw.githubusercontent.com/fgnt/mnist/master/t10k-labels-idx1-ubyte.gz -P ./mnist_data/MNIST/raw
33 | ```
34 | On Alliance/CC clusters, you can only `pip install` Python packages that are available on the system, and `conda` is forbidden.
35 | If you need to install additional packages, you can use the [`apptainer` container environment](https://docs.alliancecan.ca/wiki/Apptainer/en).
36 | See the section below for details.
37 |
38 |
39 | apptainer instructions on Alliance/CC clusters 40 | The following instructions have been tested on the `narval` cluster. Similar steps work on other clusters like `cedar`, while the storage path may vary. 41 | 42 | ```bash 43 | ## pull image and create sandbox; recommended to do so at /scratch space for faster runtime 44 | module load apptainer-suid/1.1 45 | mkdir -p /lustre07/scratch/${USER}/venv && cd /lustre07/scratch/${USER}/venv 46 | apptainer pull --name pytorch220_official.sif docker://pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel 47 | apptainer build --sandbox venvhpc.sandbox pytorch220_official.sif 48 | 49 | ## get ready to enter the sandbox in an interactive shell 50 | export TMPDIR=/tmp/${USER}tmp 51 | mkdir -p ${TMPDIR} 52 | export APPTAINER_CACHEDIR=${TMPDIR} 53 | export APPTAINER_TMPDIR=${TMPDIR} 54 | 55 | ## bind the project, scratch, home directory to the sandbox; run `apptainer help run` to see meaning of each flag 56 | apptainer shell -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox 57 | 58 | ## inside apptainer create conda env or use python venv; recommended to create conda env at /scratch space 59 | bash 60 | export USER=YOUR_USER_NAME # change to your username 61 | conda create -p /lustre07/scratch/${USER}/venv/condaenvs/venvhpc python=3.8 -y 62 | conda activate /lustre07/scratch/${USER}/venv/condaenvs/venvhpc 63 | 64 | mkdir -p /lustre07/scratch/${USER}/venv/condaenvs/condacache 65 | conda config --add pkgs_dirs /lustre07/scratch/${USER}/venv/condaenvs/condacache 66 | 67 | ## pip install within the conda env 68 | pip install -U pip 69 | pip install -r setup/requirements_cc.txt 70 | 71 | ## sanity check 72 | python -c "import torch; print('Things are done.')" 73 | 74 | ## follow the above "download MNIST dataset" section to load the dataset 75 | ``` 76 | 77 | The apptainer sandbox is a containerized environment that allows you to install custom packages without root access. The `--bind` or `-B` flag is used to bind directories to the container. The sandbox itself contains only the necessary system libraries and the user's home directory. We still store the code and datasets on normal storage space. 78 | 79 |
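As a quick sanity check that the GPUs are actually visible inside the container, you can run a one-off command through `apptainer exec` from within a GPU allocation. This is a minimal sketch that reuses the sandbox and conda environment paths created above (narval paths; adjust the cluster path and username for your own setup):

```bash
## run from a GPU node or interactive GPU job; --nv exposes the host GPUs to the container
apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c '
source /opt/conda/etc/profile.d/conda.sh
conda activate /lustre07/scratch/YOUR_USER_NAME/venv/condaenvs/venvhpc
python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"
'
```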
80 |
81 |
82 | ### Go training
83 | We showcase the use of distributed learning for a simple training task, using ResNet50 as the backbone.
84 |
85 | **IMPORTANT**:
86 | * please change the account and notification email address in the bash scripts before running.
87 | * the old Sockeye script is intended for the OpenPBS system, which is no longer in use and is kept only for completeness.
88 | * the Sockeye, Vector, and CC scripts are intended for SLURM systems, but we don't provide `preemption` support for the Vector script.
89 |
90 | ```bash
91 | # at Sockeye
92 | sbatch scripts/demo_sockeye.sh
93 |
94 | # at Vector
95 | sbatch scripts/demo_vector.sh
96 |
97 | # at CC
98 | sbatch scripts/demo_cc.sh
99 |
100 | # at CC with apptainer
101 | ## note: please change the paths in the script accordingly
102 | sbatch scripts/demo_cc_apptainer.sh
103 | ```
104 | Please check the training logs under `runs` for runtime comparisons. Here are the five-epoch training times from my runs:
105 |
106 | | #Nodes | #GPUs per node | PyTorch Distributed Method | Sockeye runtime | CC runtime | Vector runtime |
107 | | ------ | -------------- | -------------------------- | --------------- | ---------------------------- | --------------------------------- |
108 | | N=1 | M=1 | N/A | 363.4s | 309.7s | 425.0s |
109 | | N=1 | M=4 | DP | 103.5s | 114.2s | 133.9s |
110 | | N=1 | M=4 | DDP | 93.7s | 85.2s | 113.4s |
111 | | N=2 | M=4 | DDP | 55.7s | 47.0s (mpirun); 47.4s (srun) | 60.9s (mpirun); 60.6s (srun) |
112 |
113 | In the demo scripts, we use Tesla V100-SXM2-32GB GPUs at Sockeye and CC, and RTX6000-24GB GPUs at Vector.
114 | The single-precision performance is 15.7 TFLOPS for the V100-SXM2-32GB and 16.3 TFLOPS for the RTX6000-24GB.
115 | Since the raw throughput is similar, the runtime difference is mainly due to GPU memory size, which limits the per-GPU batch size on the RTX6000 (hence the smaller batch sizes in the Vector script).
116 |
117 | ## Distributed training rule of thumb
118 |
119 | Generally, we can use either the [DataParallel (DP)](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) or the [DistributedDataParallel (DDP)](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) protocol to run distributed training. DP is straightforward and only requires changing a few lines of code. However, it is less efficient than DDP; please see [this page](https://pytorch.org/docs/stable/notes/cuda.html#use-nn-parallel-distributeddataparallel-instead-of-multiprocessing-or-nn-dataparallel) for why. Moreover, DP doesn't support multi-node distributed training. Therefore, it's better to start with DDP despite its relatively higher complexity. The table below shows the possible ways to launch distributed training jobs.
120 |
121 |
122 | | #Nodes | #GPUs per node | PyTorch Distributed Method | Launch Method at Sockeye | Launch Method at CC |
123 | |--------|----------------|----------------------------|---------------------------|----------------------|
124 | | N=1 | M=1 | N/A | N/A | N/A |
125 | | N=1 | M>1 | DDP or DP | torchrun | torchrun |
126 | | N>1 | M>1 | DDP | mpirun + python | mpirun + python or srun + torchrun |
127 |
128 |
129 | ### Difference between PBS (old Sockeye) and SLURM (Vector and CC) systems
130 | On the PBS system (old Sockeye), `mpirun + python` appears to be the only viable way to launch multi-node training. On SLURM systems (Vector and CC), we can use either `srun + torchrun` or `mpirun + python`.
Essentially, both `mpirun` and `srun` launch parallel jobs across different nodes *with a single command*, and these two mechanisms are the key to scalable multi-node DDP training. We use the following examples to highlight the details that matter for error-free runs.
131 |
132 | **`mpirun + python` method explained**
133 |
134 | Sample commands:
135 | ```bash
136 | mpirun -np 8 \
137 |     --hostfile $PBS_NODEFILE --oversubscribe \
138 |     -x MASTER_ADDR=$(hostname) \
139 |     -x MASTER_PORT=$MASTER_PORT \
140 |     -x CUDA_VISIBLE_DEVICES=0,1,2,3 \
141 |     -x PATH \
142 |     -bind-to none -map-by :OVERSUBSCRIBE \
143 |     -mca pml ob1 -mca btl ^openib \
144 |     python main.py --batch_size=6144 --ddp -m=sockeye_demo_multiple_node_mpi_ddp
145 | ```
146 | `mpirun` is executed once; it then launches the parallel jobs, whose communication is handled by PyTorch and `mpirun` together. The key point is that we only need to **run `mpirun + python` once, on the master node**.
147 |
148 | `mpirun` comes with an option `-np`, which specifies the total number of processes. In our demo script, each process amounts to one trainer (i.e., one GPU), so we use `-np 8` for 2 nodes with 8 GPUs in total. This must be used along with `--oversubscribe`, for the following reasons.
149 |
150 | `mpirun` assigns job processes to nodes using [`slot`](https://www.open-mpi.org/doc/v4.0/man1/mpirun.1.php#sect3) scheduling, which was historically intended for CPU-only tasks (one process per CPU core). Such slot assignment can go wrong for GPU training, where one GPU corresponds to one process. For example, old Sockeye's PBS would not distribute the 8 tasks evenly across the 2 nodes and instead raised an error saying the number of available slots was insufficient. Therefore, we use the `--oversubscribe` option so that `mpirun` distributes the tasks evenly across the nodes and ignores such false-alarm errors.
151 |
152 | **`srun + torchrun` method explained**
153 |
154 | Sample commands:
155 |
156 | ```bash
157 | srun --ntasks-per-node=1 --ntasks=2 torchrun --nnodes=2 --nproc_per_node=4 \
158 |     --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$MASTER_PORT \
159 |     main.py --batch_size=6144 --ddp -m=cc_demo_multiple_node_srun_ddp
160 | ```
161 |
162 | The `SLURM_NTASKS` variable tells the script how many tasks are available for this execution; `srun` executes the given command `SLURM_NTASKS` times. For the `torchrun` launch method, we only need to **run it once per node**: in our example, the `torchrun` command runs twice, once on each of the two nodes. Note that this differs from `mpirun + python`, where we *run it once for all nodes*.
163 |
164 | For error-free `srun` execution, we need to either pay attention to the `#SBATCH` options set at the very beginning or enforce the parameters explicitly with `--ntasks=2 --ntasks-per-node=1`. The nuance is that `--ntasks=8 --ntasks-per-node=4` works for the `mpirun + python` method, while `--ntasks=2 --ntasks-per-node=1` works for `srun + torchrun`.
165 |
166 | ## Adapt your code to distributed training
167 | If you are okay with PyTorch's built-in distributed training utilities, the helper at `utils/dist_training.py` could be useful. To adapt your code with minimal changes, refer to the lines in `main.py` where `dist_helper` is called; a minimal sketch of this kind of adaptation is shown below.
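The sketch below outlines the standard PyTorch DDP changes that `dist_helper` encapsulates: process-group initialization, model wrapping, data sharding, and rank-0 checkpointing. It is a minimal, self-contained example with a dummy model and dataset, not a drop-in copy of this repo's helper:

```python
import os
import torch
import torch.distributed as dist
from torch import nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset


def main():
    # torchrun sets LOCAL_RANK/RANK/WORLD_SIZE; for mpirun, map the OMPI_COMM_WORLD_*
    # variables instead (see get_ddp_status in utils/dist_training.py)
    local_rank = int(os.environ["LOCAL_RANK"])
    dist.init_process_group(backend="nccl", init_method="env://")
    torch.cuda.set_device(local_rank)

    # 1) build the model on this process's GPU and wrap it with DDP
    model = nn.Linear(10, 2).cuda()
    model = DDP(model, device_ids=[local_rank])

    # 2) shard the dataset across processes with DistributedSampler
    dataset = TensorDataset(torch.randn(1024, 10), torch.randint(0, 2, (1024,)))
    sampler = DistributedSampler(dataset, shuffle=True)
    loader = DataLoader(dataset, sampler=sampler, batch_size=64)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    for epoch in range(2):
        sampler.set_epoch(epoch)  # 3) reshuffle with a different permutation every epoch
        for x, y in loader:
            optimizer.zero_grad()
            loss = criterion(model(x.cuda()), y.cuda())
            loss.backward()  # DDP all-reduces (averages) gradients across processes here
            optimizer.step()

    # 4) only rank 0 writes checkpoints/logs (cf. get_ddp_save_flag in utils/dist_training.py)
    if dist.get_rank() == 0:
        torch.save(model.state_dict(), "ckpt.pt")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

On a single node this can be launched with, e.g., `torchrun --nproc_per_node=4 ddp_sketch.py` (saving the sketch as `ddp_sketch.py`); `main.py` follows the same pattern but routes these steps through `dist_helper`.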
168 | 169 | Other third-party plugins like [horovod](https://horovod.ai/) and [pytorch lightning](https://www.pytorchlightning.ai/) can also possibly do the same things. 170 | 171 | 172 | 173 | ## Reference 174 | #### Tutorial 175 | * [Multi Node PyTorch Distributed Training Guide For People In A Hurry](https://lambdalabs.com/blog/multi-node-pytorch-distributed-training-guide) 176 | * [PyTorch with Multiple GPUs](https://docs.alliancecan.ca/wiki/PyTorch#PyTorch_with_Multiple_GPUs) 177 | * [Multi-node-training on slurm with PyTorch](https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904) 178 | 179 | #### Helpful documentations 180 | * [pytorch torchrun](https://pytorch.org/docs/stable/elastic/run.html) 181 | * [mpirun man page](https://www.open-mpi.org/doc/v4.0/man1/mpirun.1.php) 182 | * [SLURM srun page](https://slurm.schedmd.com/srun.html) 183 | * [SLURM sbatch environment variables](https://slurm.schedmd.com/sbatch.html#SECTION_OUTPUT-ENVIRONMENT-VARIABLES) 184 | * [PBS qsub environment variables](https://opus.nci.org.au/display/Help/Useful+PBS+Environment+Variables) 185 | 186 | #### Wiki 187 | * [UBC Sockeye](https://confluence.it.ubc.ca/display/UARC/About+Sockeye) 188 | * [Vector](https://support.vectorinstitute.ai/FrontPage) 189 | * [CC](https://docs.alliancecan.ca/wiki/Technical_documentation) -------------------------------------------------------------------------------- /helper/benchmark_layernorm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import torch.nn as nn 4 | from torch.profiler import tensorboard_trace_handler 5 | 6 | 7 | class TestNet(nn.Module): 8 | def __init__(self): 9 | super().__init__() 10 | self.fc_in = nn.Linear(3, 128) 11 | self.layers = nn.ModuleList() 12 | for i in range(0, 10): 13 | self.layers.append(nn.Linear(128, 128)) 14 | self.layers.append(nn.LayerNorm(128)) 15 | # self.layers.append(nn.BatchNorm1d(262144)) 16 | self.fc_out = nn.Linear(128, 3) 17 | 18 | def forward(self, x: torch.Tensor) -> torch.Tensor: 19 | x = self.fc_in(x) 20 | for layer in self.layers: 21 | x = layer(x) 22 | # print(x.shape) 23 | x = self.fc_out(x) 24 | return x 25 | 26 | 27 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 28 | net = TestNet().to(device).train() 29 | in_data = torch.zeros([1, 512 * 512, 3]).to(device) 30 | criterion = nn.MSELoss().to(device) 31 | optimizer = torch.optim.SGD(net.parameters(), 0.01) 32 | 33 | with torch.profiler.profile( 34 | schedule=torch.profiler.schedule(wait=2, warmup=2, active=6, repeat=1, skip_first=2), 35 | on_trace_ready=tensorboard_trace_handler("tmp/profile"), 36 | with_stack=True, with_flops=True, with_modules=True, profile_memory=True) as profiler: 37 | 38 | for i in range(0, 20): 39 | t0 = time.time() 40 | out_data = net(in_data) 41 | loss = criterion(out_data, in_data) 42 | 43 | optimizer.zero_grad() 44 | loss.backward() 45 | optimizer.step() 46 | profiler.step() 47 | print(f"step: {i:,d} {time.time() - t0:.3f}") 48 | print("Done! 
") -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # Reference: 2 | # https://github.com/olehb/pytorch_ddp_tutorial/blob/main/ddp_tutorial_multi_gpu.py 3 | 4 | 5 | import os 6 | import logging 7 | import pdb 8 | import time 9 | import numpy as np 10 | from tqdm import tqdm 11 | from datetime import datetime 12 | 13 | import torch 14 | from torch import nn, optim 15 | from torch import distributed as dist 16 | from torch.utils.data import DataLoader, DistributedSampler 17 | import torchvision 18 | from torchvision import datasets, transforms 19 | 20 | from utils.arg_parser import parse_arguments, set_seed_and_logger, backup_code 21 | from utils.dist_training import DistributedHelper, get_ddp_save_flag 22 | from utils.learning_utils import count_model_params 23 | 24 | 25 | def init_basics(): 26 | """ 27 | Initialization 28 | """ 29 | args = parse_arguments() 30 | dist_helper = DistributedHelper(args.dp, args.ddp, args.ddp_gpu_ids, args.ddp_init_method) 31 | writer = set_seed_and_logger(args.seed, args.logdir, args.log_level, args.comment, dist_helper) 32 | backup_code(args.logdir) 33 | return args, dist_helper, writer 34 | 35 | 36 | def init_model(dist_helper): 37 | """ 38 | Initialize model and training necessities. 39 | """ 40 | # model, we use an unnecessarily heavy model to showcase the GPU profiling 41 | model = getattr(torchvision.models, 'resnet50')(weights=None) 42 | model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False) 43 | model.fc = nn.Linear(model.fc.in_features, 10) # 10 classes to predict 44 | model = model.to(dist_helper.device) 45 | 46 | param_string, total_params, total_trainable_params = count_model_params(model) 47 | logging.info(f"Parameters: \n{param_string}") 48 | logging.info(f"Parameters Count: {total_params:,}, Trainable: {total_trainable_params:,}.") 49 | 50 | # adapt to distributed training 51 | model = dist_helper.dist_adapt_model(model) 52 | 53 | # optimizer and loss function 54 | optimizer = optim.Adam(model.parameters(), lr=1e-4) 55 | criterion = nn.CrossEntropyLoss() 56 | return model, optimizer, criterion 57 | 58 | 59 | def init_dataloader(batch_size, dist_helper): 60 | """ 61 | Get dataloader 62 | """ 63 | transform = transforms.Compose([ 64 | transforms.ToTensor(), 65 | transforms.Normalize((0.1307,), (0.3081,)), 66 | transforms.Resize(128) # resize to larger image to showcase the use of GPU profiling 67 | ]) 68 | dataset_loc = './mnist_data' 69 | 70 | train_dataset = datasets.MNIST(dataset_loc, download=True, train=True, transform=transform) 71 | 72 | # For final evaluation, it is advised not to use distributed sampler due to possibly incorrect results. 73 | # But we are using it now to accelerate evaluation during training. 
74 | # Ref: https://github.com/pytorch/pytorch/issues/25162 75 | test_dataset = datasets.MNIST(dataset_loc, download=True, train=False, transform=transform) 76 | 77 | logging.info("Training set size: {:d}, testing set size: {:d}".format(len(train_dataset), len(test_dataset))) 78 | 79 | if dist_helper.is_ddp: 80 | batch_size_per_gpu = max(1, batch_size // dist.get_world_size()) 81 | sampler = DistributedSampler(train_dataset, shuffle=True) 82 | train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=batch_size_per_gpu, 83 | pin_memory=True, num_workers=min(6, os.cpu_count())) 84 | 85 | sampler = DistributedSampler(test_dataset, shuffle=False) 86 | test_loader = DataLoader(dataset=test_dataset, sampler=sampler, batch_size=batch_size_per_gpu, 87 | pin_memory=True, num_workers=min(6, os.cpu_count())) 88 | else: 89 | train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, 90 | shuffle=True, pin_memory=True, num_workers=min(6, os.cpu_count())) 91 | test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, 92 | shuffle=False, pin_memory=True, num_workers=min(6, os.cpu_count())) 93 | 94 | return train_loader, test_loader 95 | 96 | 97 | def go_training(epochs, model, optimizer, criterion, dist_helper, train_loader, test_loader, writer, logdir): 98 | """ 99 | Training loop. 100 | """ 101 | 102 | # init 103 | time_train_ls, time_val_ls = [], [] 104 | epoch_when_snapshot = list(range(0, epochs, epochs // 5)) 105 | 106 | # epoch-wise training 107 | for i_epoch in range(epochs): 108 | # train the model for one epoch 109 | if dist_helper.is_ddp: 110 | train_loader.sampler.set_epoch(i_epoch) 111 | 112 | time_epoch = time.time() 113 | train_loss = 0 114 | pbar = tqdm(train_loader) 115 | model.train() 116 | for x, y in pbar: 117 | x = x.to(dist_helper.device, non_blocking=True) 118 | y = y.to(dist_helper.device, non_blocking=True) 119 | optimizer.zero_grad() 120 | y_hat = model(x) 121 | batch_loss = criterion(y_hat, y) 122 | batch_loss.backward() 123 | optimizer.step() 124 | batch_loss_scalar = batch_loss.item() 125 | train_loss += batch_loss_scalar / x.shape[0] 126 | pbar.set_description(f'training batch_loss={batch_loss_scalar:.4f}') 127 | time_training = time.time() - time_epoch 128 | 129 | # calculate validation loss 130 | time_val = time.time() 131 | val_loss = 0.0 132 | pbar = tqdm(test_loader) 133 | model.eval() 134 | with torch.no_grad(): 135 | for x, y in pbar: 136 | x = x.to(dist_helper.device, non_blocking=True) 137 | y = y.to(dist_helper.device, non_blocking=True) 138 | y_hat = model(x) 139 | batch_loss = criterion(y_hat, y) 140 | batch_loss_scalar = batch_loss.item() 141 | val_loss += batch_loss_scalar / x.shape[0] 142 | pbar.set_description(f'validation batch_loss={batch_loss_scalar:.4f}') 143 | time_val = time.time() - time_val 144 | 145 | logging.info(f"Epoch={i_epoch}, train_loss={train_loss:.4f}, val_loss={val_loss:.4f}") 146 | logging.info("Training time: {:.3f}s, Validation time: {:.3f}s".format(time_training, time_val)) 147 | time_train_ls.append(time_training) 148 | time_val_ls.append(time_val) 149 | 150 | if get_ddp_save_flag(): 151 | writer.add_scalar("train/loss", train_loss, i_epoch) 152 | writer.add_scalar("test/loss", val_loss, i_epoch) 153 | writer.flush() 154 | 155 | if i_epoch in epoch_when_snapshot and get_ddp_save_flag(): 156 | model_path = os.path.join(logdir, 'model_epoch_{:03d}_{:s}_{:d}.pt'.format( 157 | i_epoch, datetime.now().strftime("%Y%m%d-%H%M%S"), os.getpid())) 158 | torch.save(model.state_dict(), model_path) 159 | 
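# note: if the model is wrapped in DDP/DP, the saved state_dict keys carry a 'module.' prefix;
# load this checkpoint back into a wrapped model, or strip the prefix before loading it into a bare model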
logging.info("Saving model to {:s}".format(model_path)) 160 | dist_helper.ddp_sync() 161 | 162 | # Count overall training efficiency 163 | logging.info("{:s} Overall timing results {:s}".format('-' * 10, '-' * 10)) 164 | logging.info("Total training time: {:.3f}s, total validation time: {:.3f}s".format( 165 | np.sum(time_train_ls), np.sum(time_val_ls))) 166 | for i_epoch, time_training, time_val in zip(range(epochs), time_train_ls, time_val_ls): 167 | logging.info("Epoch: {:d}, Training time: {:.3f}s, Validation time: {:.3f}s.".format( 168 | i_epoch, time_training, time_val)) 169 | 170 | 171 | def main(): 172 | """ 173 | Main training loop 174 | """ 175 | 176 | """Initialization basics""" 177 | args, dist_helper, writer = init_basics() 178 | 179 | """Get network""" 180 | model, optimizer, criterion = init_model(dist_helper) 181 | 182 | """Get dataloader""" 183 | train_loader, test_loader = init_dataloader(args.batch_size, dist_helper) 184 | 185 | """Go training""" 186 | go_training(args.epoch, model, optimizer, criterion, dist_helper, train_loader, test_loader, writer, args.logdir) 187 | 188 | """Distributed training cleanup""" 189 | dist_helper.clean_up() 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | -------------------------------------------------------------------------------- /scripts/demo_cc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=def-rjliao 3 | #SBATCH --gres=gpu:v100l:4 # Number of GPUs per node (specifying v100l gpu) 4 | #SBATCH --nodes=2 # Number of nodes 5 | #SBATCH --ntasks=8 # Number of MPI process 6 | #SBATCH --ntasks-per-node=4 # Number of distributed process per compute node 7 | #SBATCH --cpus-per-task=8 # CPU cores per MPI process 8 | #SBATCH --mem=64G # memory per node 9 | #SBATCH --time=00-00:20 # time (DD-HH:MM) 10 | #SBATCH --mail-user=yanq@student.ubc.ca # send email regarding task status 11 | #SBATCH --mail-type=ALL 12 | 13 | ################################################################################ 14 | 15 | # in this demo, we take 2 nodes and each node has 4 V100-32GB GPUs 16 | MASTER_PORT=29400 17 | 18 | module load gcc 19 | module load cuda 20 | module load nccl 21 | module load openmpi 22 | 23 | # you should submit job from the cloned repo's directory 24 | cd ${SLURM_SUBMIT_DIR} 25 | source venvhpc/bin/activate 26 | export OMP_NUM_THREADS=6 27 | 28 | # single GPU: use 1 GPU on 1 node 29 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=cc_demo_single_gpu 30 | 31 | # DP: use multiple GPUs on 1 node 32 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=cc_demo_single_node_dp 33 | 34 | # DDP: use multiple GPUs on 1 node 35 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=cc_demo_single_node_ddp 36 | 37 | # DDP: use multiple GPUs on multiple nodes 38 | 39 | # mpirun method 40 | mpirun -np 8 \ 41 | -x MASTER_ADDR=$(hostname) \ 42 | -x MASTER_PORT=$MASTER_PORT \ 43 | -x PATH \ 44 | -bind-to none -map-by :OVERSUBSCRIBE \ 45 | -mca pml ob1 -mca btl ^openib \ 46 | python main.py --batch_size=6144 --ddp -m=cc_demo_multiple_node_mpi_ddp 47 | 48 | # srun method 49 | # The SLURM_NTASKS variable tells the script how many processes are available for this execution. 
50 | # “srun” executes the script times 51 | 52 | # Therefore, for error-free srun execution, we need to overwrite the SBATCH options set in the very beginning 53 | # by using --ntasks=2 --ntasks-per-node=1 explicitly. 54 | # Note: the nuance is --ntasks=8 --ntasks-per-node=4 works for mpirun + python main.py --args, 55 | # while --ntasks=2 --ntasks-per-node=1 works for srun + torchrun. 56 | 57 | srun --nodes=2 --ntasks-per-node=1 --ntasks=2 torchrun --nnodes=2 --nproc_per_node=4 \ 58 | --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$MASTER_PORT \ 59 | main.py --batch_size=6144 --ddp -m=cc_demo_multiple_node_srun_ddp 60 | 61 | ##### DEBUG info ##### 62 | #echo $SLURM_JOB_NODELIST 63 | # 64 | #echo $(hostname) 65 | # 66 | #echo $(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 67 | # 68 | ## extract the first string component before the first dot 69 | ## cdr2639.int.cedar.computecanada.ca -> cdr2639 70 | #echo $(echo $(hostname) | cut -d '.' -f 1) 71 | -------------------------------------------------------------------------------- /scripts/demo_cc_apptainer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=def-rjliao 3 | #SBATCH --gres=gpu:v100l:4 # Number of GPUs per node (specifying v100l gpu) 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks=8 # Number of MPI process 6 | #SBATCH --ntasks-per-node=4 # Number of distributed process per compute node 7 | #SBATCH --cpus-per-task=8 # CPU cores per MPI process 8 | #SBATCH --mem=64G # memory per node 9 | #SBATCH --time=00-00:20 # time (DD-HH:MM) 10 | #SBATCH --mail-user=yanq@student.ubc.ca # send email regarding task status 11 | #SBATCH --mail-type=ALL 12 | 13 | ################################################################################ 14 | 15 | # in this demo, we take 1 nodes and each node has 4 V100-32GB GPUs 16 | 17 | ## set up environment variables for apptainer 18 | ## this script is intended for the narval cluster, please adjust the path accordingly for other clusters 19 | module load apptainer-suid/1.1 20 | cd /lustre07/scratch/${USER}/venv 21 | export TMPDIR=/tmp/${USER}tmp 22 | mkdir -p ${TMPDIR} 23 | export APPTAINER_CACHEDIR=${TMPDIR} 24 | export APPTAINER_TMPDIR=${TMPDIR} 25 | 26 | # !!!please change the USER_NAME to your own username before running the script!!! 
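# each block below re-enters the sandbox with `apptainer exec`: -C isolates the container environment,
# -B bind-mounts /project, /scratch and /home from the host, -W points the container's temporary
# directories at ${TMPDIR}, and --nv makes the host NVIDIA GPUs and driver visible inside the container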
27 | 28 | # single GPU: use 1 GPU on 1 node 29 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c ' 30 | export USER_NAME='YOUR_USER_NAME' 31 | source /opt/conda/etc/profile.d/conda.sh 32 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc 33 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper 34 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=cc_demo_single_gpu 35 | ' 36 | 37 | # single GPU: use 1 GPU on 1 node 38 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c ' 39 | export USER_NAME='YOUR_USER_NAME' 40 | source /opt/conda/etc/profile.d/conda.sh 41 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc 42 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper 43 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=cc_apptainer_demo_single_gpu 44 | ' 45 | 46 | # DP: use multiple GPUs on 1 node 47 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c ' 48 | export USER_NAME='YOUR_USER_NAME' 49 | source /opt/conda/etc/profile.d/conda.sh 50 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc 51 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper 52 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=cc_apptainer_demo_single_node_dp 53 | ' 54 | 55 | # DDP: use multiple GPUs on 1 node 56 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c ' 57 | export USER_NAME='YOUR_USER_NAME' 58 | export MASTER_PORT=29400 59 | source /opt/conda/etc/profile.d/conda.sh 60 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc 61 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper 62 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=cc_apptainer_demo_single_node_ddp 63 | ' 64 | 65 | # note: we haven't tested the multi-node DDP with apptainer yet, but the MPI option may work 66 | -------------------------------------------------------------------------------- /scripts/demo_sockeye.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=demo_sockeye 3 | #SBATCH --account=st-rjliao-1-gpu 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=24 7 | #SBATCH --mem=32G 8 | #SBATCH --time=00:20:00 9 | #SBATCH --gpus-per-node=1 10 | #SBATCH --output=slurm-%j_out.txt 11 | #SBATCH --error=slurm-%j_err.txt 12 | #SBATCH --mail-user=yanq@student.ubc.ca 13 | #SBATCH --mail-type=ALL 14 | ################################################################################ 15 | 16 | # in this demo, we take 2 nodes and each node has 4 V100-32GB GPUs 17 | MASTER_PORT=29400 18 | 19 | module load gcc 20 | module load cuda 21 | module load nccl 22 | module load openmpi 23 | 24 | # you should submit job from the cloned repo's directory 25 | cd ${PBS_O_WORKDIR} 26 | source venvhpc/bin/activate 27 | export OMP_NUM_THREADS=6 28 | 29 | # note: at Sockeye, it's better to specify CUDA_VISIBLE_DEVICES explicitly for distributed training, 30 | # otherwise methods in torch.cuda may lead to an error, e.g., torch.cuda.device_count() 31 | 32 | # single GPU: use 1 GPU on 1 node 33 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=sockeye_demo_single_gpu 34 | 35 | # DP: use multiple GPUs on 1 node 36 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp 
-m=sockeye_demo_single_node_dp 37 | 38 | # DDP: use multiple GPUs on 1 node 39 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=sockeye_demo_single_node_ddp 40 | 41 | # DDP: use multiple GPUs on multiple nodes 42 | mpirun -np 8 \ 43 | --hostfile $PBS_NODEFILE --oversubscribe \ 44 | -x MASTER_ADDR=$(hostname) \ 45 | -x MASTER_PORT=$MASTER_PORT \ 46 | -x CUDA_VISIBLE_DEVICES=0,1,2,3 \ 47 | -x PATH \ 48 | -bind-to none -map-by :OVERSUBSCRIBE \ 49 | -mca pml ob1 -mca btl ^openib \ 50 | python main.py --batch_size=6144 --ddp -m=sockeye_demo_multiple_node_mpi_ddp 51 | 52 | -------------------------------------------------------------------------------- /scripts/demo_sockeye_pbs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l walltime=00:20:00,select=2:ncpus=8:ngpus=4:mem=64gb:gpu_mem=32gb 3 | #PBS -N sockeye_demo 4 | #PBS -A st-rjliao-1-gpu 5 | #PBS -m abe 6 | #PBS -M yanq@student.ubc.ca 7 | 8 | ################################################################################ 9 | 10 | # in this demo, we take 2 nodes and each node has 4 V100-32GB GPUs 11 | MASTER_PORT=29400 12 | 13 | module load gcc 14 | module load cuda 15 | module load nccl 16 | module load openmpi 17 | 18 | # you should submit job from the cloned repo's directory 19 | cd ${PBS_O_WORKDIR} 20 | source venvhpc/bin/activate 21 | export OMP_NUM_THREADS=6 22 | 23 | # note: at Sockeye, it's better to specify CUDA_VISIBLE_DEVICES explicitly for distributed training, 24 | # otherwise methods in torch.cuda may lead to an error, e.g., torch.cuda.device_count() 25 | 26 | # single GPU: use 1 GPU on 1 node 27 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=sockeye_demo_single_gpu 28 | 29 | # DP: use multiple GPUs on 1 node 30 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=sockeye_demo_single_node_dp 31 | 32 | # DDP: use multiple GPUs on 1 node 33 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=sockeye_demo_single_node_ddp 34 | 35 | # DDP: use multiple GPUs on multiple nodes 36 | mpirun -np 8 \ 37 | --hostfile $PBS_NODEFILE --oversubscribe \ 38 | -x MASTER_ADDR=$(hostname) \ 39 | -x MASTER_PORT=$MASTER_PORT \ 40 | -x CUDA_VISIBLE_DEVICES=0,1,2,3 \ 41 | -x PATH \ 42 | -bind-to none -map-by :OVERSUBSCRIBE \ 43 | -mca pml ob1 -mca btl ^openib \ 44 | python main.py --batch_size=6144 --ddp -m=sockeye_demo_multiple_node_mpi_ddp 45 | 46 | -------------------------------------------------------------------------------- /scripts/demo_vector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=demo_vector 3 | #SBATCH --partition=rtx6000 # Type of GPUs 4 | #SBATCH --gres=gpu:4 # Number of GPUs per node 5 | #SBATCH --nodes=2 # Number of nodes 6 | #SBATCH --ntasks=8 # Number of MPI process 7 | #SBATCH --ntasks-per-node=4 # Number of distributed process per compute node 8 | #SBATCH --cpus-per-task=8 # CPU cores per MPI process 9 | #SBATCH --mem=64G # memory per node 10 | #SBATCH --time=00-00:20 # time (DD-HH:MM) 11 | #SBATCH --qos=normal # QoS type 12 | #SBATCH --mail-user=yanq@student.ubc.ca # send email regarding task status 13 | #SBATCH --mail-type=ALL 14 | #SBATCH --output=slurm-%j_out.txt 15 | #SBATCH --error=slurm-%j_err.txt 16 | 17 | ################################################################################ 18 
| 19 | # in this demo, we take 2 nodes and each node has 4 RTX6000-24GB GPUs 20 | MASTER_PORT=29400 21 | 22 | module use /pkgs/environment-modules/ 23 | module load python/3.8 24 | module load cuda-11.7 25 | source /scratch/ssd004/scratch/qiyan/venvmtr/bin/activate 26 | cd /fs01/home/qiyan/DSL-MTR/tools 27 | 28 | 29 | # you should submit job from the cloned repo's directory 30 | cd ${SLURM_SUBMIT_DIR} 31 | source venvhpc/bin/activate 32 | export OMP_NUM_THREADS=6 33 | 34 | # single GPU: use 1 GPU on 1 node 35 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 36 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=512 -m=vector_demo_single_gpu 37 | 38 | # DP: use multiple GPUs on 1 node 39 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 40 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=2048 --dp -m=vector_demo_single_node_dp 41 | 42 | # DDP: use multiple GPUs on 1 node 43 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 44 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=2048 --ddp -m=vector_demo_single_node_ddp 45 | 46 | # DDP: use multiple GPUs on multiple nodes 47 | 48 | # mpirun method 49 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 50 | mpirun -np 8 \ 51 | -x MASTER_ADDR=$(hostname) \ 52 | -x MASTER_PORT=$MASTER_PORT \ 53 | -x PATH \ 54 | -bind-to none -map-by :OVERSUBSCRIBE \ 55 | -mca pml ob1 -mca btl ^openib \ 56 | python main.py --batch_size=2048 --ddp -m=vector_demo_multiple_node_mpi_ddp 57 | 58 | # srun method 59 | # The SLURM_NTASKS variable tells the script how many processes are available for this execution. 60 | # “srun” executes the script times 61 | 62 | # Therefore, for error-free srun execution, we need to overwrite the SBATCH options set in the very beginning 63 | # by using --ntasks=2 --ntasks-per-node=1 explicitly. 64 | # Note: the nuance is --ntasks=8 --ntasks-per-node=4 works for mpirun + python main.py --args, 65 | # while --ntasks=2 --ntasks-per-node=1 works for srun + torchrun. 
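# note: the sbatch script body runs on the first node of the allocation, so $(hostname) in the
# srun + torchrun command below resolves to a host that the second node can reach for the c10d rendezvous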
66 | 67 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 68 | srun --nodes=2 --ntasks-per-node=1 --ntasks=2 torchrun --nnodes=2 --nproc_per_node=4 \ 69 | --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$MASTER_PORT \ 70 | main.py --batch_size=2048 --ddp -m=vector_demo_multiple_node_srun_ddp 71 | 72 | ##### DEBUG info ##### 73 | #echo $SLURM_JOB_NODELIST 74 | # 75 | #echo $(hostname) 76 | # 77 | #echo $(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 78 | # 79 | -------------------------------------------------------------------------------- /setup/requirements_cc.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | torchaudio 4 | tqdm 5 | numpy 6 | ml_collections 7 | torch_tb_profiler 8 | -------------------------------------------------------------------------------- /setup/requirements_sockeye.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu116 2 | torch 3 | torchvision 4 | torchaudio 5 | tqdm 6 | numpy 7 | ml_collections 8 | torch_tb_profiler 9 | -------------------------------------------------------------------------------- /utils/arg_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import pdb 5 | import shutil 6 | import random 7 | import sys 8 | from datetime import datetime 9 | import numpy as np 10 | import torch 11 | import torch.distributed as dist 12 | from torch.utils.tensorboard import SummaryWriter 13 | 14 | from utils.dist_training import get_ddp_save_flag 15 | 16 | 17 | def parse_arguments(): 18 | """ 19 | Argument parser. 20 | """ 21 | parser = argparse.ArgumentParser(description="Running Experiments") 22 | parser.add_argument('-l', '--log_level', type=str, 23 | default='DEBUG', help="Logging Level, one of: DEBUG, INFO, WARNING, ERROR, CRITICAL") 24 | parser.add_argument('-m', '--comment', type=str, 25 | default="", help="A single line comment for the experiment") 26 | parser.add_argument('--dp', default=False, action='store_true', 27 | help='To use DataParallel distributed learning.') 28 | parser.add_argument('--ddp', default=False, action='store_true', 29 | help='To use DDP distributed learning') 30 | parser.add_argument('--ddp_gpu_ids', nargs='+', default=None, 31 | help="A list of GPU IDs to run distributed learning") 32 | parser.add_argument('--batch_size', default=256, type=int, 33 | help='Training batch size.') 34 | parser.add_argument('--epoch', default=5, type=int, 35 | help='Training epochs.') 36 | parser.add_argument('--seed', default=1234, type=int, 37 | help='Random seed.') 38 | parser.add_argument('--ddp_init_method', default='env://', type=str, 39 | help='torch.distributed.init_process_group options.') 40 | 41 | args = parser.parse_args() 42 | 43 | # add log directory 44 | if args.dp: 45 | dist_status = 'dp' 46 | elif args.ddp: 47 | dist_status = 'ddp' 48 | else: 49 | dist_status = 'single_gpu' 50 | 51 | logdir_nm = dist_status + "_" + datetime.now().strftime("%Y%m%d_%H%M%S") 52 | if len(args.comment): 53 | logdir_nm += '_' + args.comment 54 | 55 | logdir = os.path.join('runs', logdir_nm) 56 | os.makedirs(logdir, exist_ok=True) 57 | 58 | args.logdir = logdir 59 | print('Args: \n', args) 60 | return args 61 | 62 | 63 | def set_seed_and_logger(seed, logdir, log_level, comment, dist_helper): 64 | """ 65 | Set up random seed number and global logger. 
66 | """ 67 | # Setup random seed 68 | if dist_helper.is_ddp: 69 | seed += dist.get_rank() 70 | else: 71 | pass 72 | random.seed(seed) 73 | np.random.seed(seed) 74 | torch.manual_seed(seed) 75 | torch.cuda.manual_seed_all(seed) 76 | 77 | # torch numerical accuracy flags 78 | # reference: https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices 79 | # The flag below controls whether to allow TF32 on matmul. This flag defaults to True. 80 | torch.backends.cuda.matmul.allow_tf32 = False 81 | # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True. 82 | torch.backends.cudnn.allow_tf32 = False 83 | 84 | # Setup logger 85 | if dist_helper.is_ddp: 86 | log_file = os.path.join(logdir, "ddp_rank_{:02d}_".format(dist.get_rank()) + log_level.lower() + ".log") 87 | else: 88 | log_file = os.path.join(logdir, log_level.lower() + ".log") 89 | logger_format = comment + '| %(asctime)s %(message)s' 90 | fh = logging.FileHandler(log_file) 91 | fh.setLevel(log_level) 92 | for handler in logging.root.handlers[:]: 93 | logging.root.removeHandler(handler) 94 | logging.basicConfig(level=logging.DEBUG, format=logger_format, 95 | datefmt='%m-%d %H:%M:%S', 96 | handlers=[ 97 | fh, 98 | logging.StreamHandler(sys.stdout) 99 | ]) 100 | logging.getLogger('matplotlib.font_manager').setLevel(logging.INFO) # remove excessive matplotlib messages 101 | logging.getLogger('matplotlib').setLevel(logging.INFO) # remove excessive matplotlib messages 102 | logging.info('EXPERIMENT BEGIN: ' + comment) 103 | logging.info('logging into %s', log_file) 104 | 105 | # Setup tensorboard logger 106 | if get_ddp_save_flag(): 107 | writer = SummaryWriter(log_dir=logdir) 108 | else: 109 | writer = None 110 | return writer 111 | 112 | 113 | def backup_code(logdir): 114 | if get_ddp_save_flag(): 115 | code_path = os.path.join(logdir, 'code') 116 | dirs_to_save = ['utils'] 117 | os.makedirs(code_path, exist_ok=True) 118 | 119 | # save_name = os.path.join(code_path, 'config.yaml') 120 | # yaml.dump(dict(config), open(save_name, 'w'), default_flow_style=False) 121 | 122 | os.system('cp ./*py ' + code_path) 123 | [shutil.copytree(os.path.join('./', this_dir), os.path.join(code_path, this_dir)) for this_dir in dirs_to_save] 124 | -------------------------------------------------------------------------------- /utils/dist_training.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pdb 4 | 5 | import torch 6 | from torch import distributed as dist, nn as nn 7 | from torch.nn.parallel import DistributedDataParallel as DDP 8 | 9 | 10 | class DistributedHelper(object): 11 | def __init__(self, flag_dp, flag_ddp, ddp_gpu_ids, init_method): 12 | self.flag_dp = flag_dp 13 | self.flag_ddp = flag_ddp 14 | self.ddp_gpu_ids = ddp_gpu_ids 15 | self.init_method = init_method 16 | 17 | if (self.flag_dp or self.flag_ddp) and ddp_gpu_ids is None: 18 | assert torch.cuda.device_count() > 1, "Number of GPU must be more than one to use distributed learning!" 19 | assert not all((flag_dp, flag_ddp)), \ 20 | "Flag DP ({:}) and flag DDP ({:}) cannot be both true!".format(flag_dp, flag_ddp) 21 | 22 | self.gpu_name = 'dummy' 23 | self.init_ddp() 24 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 25 | 26 | def init_ddp(self): 27 | """ 28 | Initialize DDP distributed training if necessary. 
29 | Note: we have to initialize DDP mode before initialize the logging file, otherwise the multiple DDP 30 | processes' loggings will interfere with each other. 31 | """ 32 | print("Number of available GPU to use: {}".format(torch.cuda.device_count())) 33 | if self.flag_ddp: 34 | self.init_ddp_backend() 35 | self.gpu_name = torch.cuda.get_device_name() 36 | print("Setup DDP for process {:d} using GPUs {} (ID) with NCCL backend. GPU for this process: {:s}".format( 37 | os.getpid(), self.ddp_gpu_ids, self.gpu_name)) 38 | elif self.flag_dp: 39 | gpu_specs = [torch.cuda.get_device_name(i_gpu) for i_gpu in range(torch.cuda.device_count())] 40 | self.gpu_name = ','.join(gpu_specs) 41 | print("Setup DP using {:d} GPUs, specs: {:s}.".format(torch.cuda.device_count(), self.gpu_name)) 42 | else: 43 | self.gpu_name = torch.cuda.get_device_name() 44 | print("Single GPU mode, specs: {:s}.".format(self.gpu_name)) 45 | 46 | def init_ddp_backend(self): 47 | """ 48 | Start DDP engine using NCCL backend. 49 | """ 50 | ddp_status, env_dict = self.get_ddp_status() 51 | local_rank = env_dict['LOCAL_RANK'] 52 | 53 | if self.ddp_gpu_ids is not None: 54 | assert isinstance(self.ddp_gpu_ids, list) 55 | num_gpus = len(self.ddp_gpu_ids) 56 | gpu_id = int(self.ddp_gpu_ids[local_rank % num_gpus]) 57 | torch.cuda.set_device(gpu_id) # set single gpu device per process 58 | else: 59 | torch.cuda.set_device(local_rank) # set single gpu device per process 60 | dist.init_process_group(backend="nccl", init_method=self.init_method, rank=env_dict['WORLD_RANK'], world_size=env_dict['WORLD_SIZE']) 61 | 62 | def dist_adapt_model(self, model): 63 | """ 64 | Setup distributed learning for network. 65 | """ 66 | logging.info("Adapt the model for distributed training...") 67 | if self.flag_ddp: 68 | # DDP 69 | model = DDP(model.cuda(), device_ids=[torch.cuda.current_device()]) # single CUDA device per process 70 | logging.info("Distributed ON. Mode: DDP. Backend: {:s}, Rank: {:d} / World size: {:d}. " 71 | "Current device: {}, spec: {}".format( 72 | dist.get_backend(), dist.get_rank(), dist.get_world_size(), 73 | torch.cuda.current_device(), self.gpu_name)) 74 | elif self.flag_dp: 75 | # DP 76 | model = nn.DataParallel(model) 77 | model.to(torch.device("cuda")) # multiple devices per process, controlled by CUDA_VISIBLE_DEVICES 78 | logging.info("Distributed ON. Mode: DP. Number of available GPU to use: {}, specs: {}".format( 79 | torch.cuda.device_count(), self.gpu_name)) 80 | else: 81 | # single GPU 82 | logging.info("Distributed OFF. Single-GPU training, specs: {}.".format(self.gpu_name)) 83 | 84 | return model 85 | 86 | def ddp_sync(self): 87 | if self.flag_ddp and dist.is_initialized(): 88 | dist.barrier() 89 | else: 90 | pass 91 | 92 | def clean_up(self): 93 | self.ddp_sync() 94 | if self.flag_ddp and dist.is_initialized(): 95 | dist.destroy_process_group() 96 | else: 97 | pass 98 | 99 | @staticmethod 100 | def get_ddp_status(): 101 | """ 102 | Get DDP-related env. parameters. 
103 | """ 104 | if 'LOCAL_RANK' in os.environ: 105 | # Environment variables set by torch.distributed.launch or torchrun 106 | local_rank = int(os.environ['LOCAL_RANK']) 107 | world_size = int(os.environ['WORLD_SIZE']) 108 | world_rank = int(os.environ['RANK']) 109 | elif 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ: 110 | # Environment variables set by mpirun 111 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) 112 | world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) 113 | world_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) 114 | else: 115 | raise NotImplementedError 116 | 117 | env_dict = { 118 | 'MASTER_ADDR': os.environ['MASTER_ADDR'], 119 | 'MASTER_PORT': os.environ['MASTER_PORT'], 120 | 'LOCAL_RANK': local_rank, 121 | 'WORLD_SIZE': world_size, 122 | 'WORLD_RANK': world_rank, 123 | } 124 | ddp_status = "Process PID: {}. DDP setup: {} ".format(os.getpid(), env_dict) 125 | return ddp_status, env_dict 126 | 127 | @property 128 | def is_ddp(self): 129 | """ 130 | DDP flag. 131 | """ 132 | return self.flag_ddp 133 | 134 | @property 135 | def is_dp(self): 136 | """ 137 | DP flag. 138 | """ 139 | return self.flag_dp 140 | 141 | @property 142 | def is_distributed(self): 143 | """ 144 | Distributed learning flag. 145 | """ 146 | return self.flag_dp or self.flag_ddp 147 | 148 | 149 | # Independent function helpers 150 | def get_ddp_save_flag(): 151 | """ 152 | Return saving flag for DDP mode, only rank 0 process makes the output. 153 | """ 154 | flag_save = True 155 | if dist.is_initialized(): 156 | if dist.get_rank() != 0: 157 | flag_save = False 158 | return flag_save 159 | 160 | 161 | def dist_save_model(data_to_save, to_save_path): 162 | """ 163 | Wrapper to save based on DDP status (for main process only). 164 | """ 165 | if get_ddp_save_flag(): 166 | torch.save(data_to_save, to_save_path) 167 | -------------------------------------------------------------------------------- /utils/learning_utils.py: -------------------------------------------------------------------------------- 1 | def count_model_params(model): 2 | """ 3 | Go through the model parameters 4 | """ 5 | param_strings = [] 6 | max_string_len = 126 7 | for name, param in model.named_parameters(): 8 | if param.requires_grad: 9 | line = '.' * max(0, max_string_len - len(name) - len(str(param.size()))) 10 | param_strings.append(f"{name} {line} {param.size()}") 11 | param_string = '\n'.join(param_strings) 12 | 13 | total_params = sum(p.numel() for p in model.parameters()) 14 | total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 15 | return param_string, total_params, total_trainable_params 16 | 17 | 18 | def _print_and_log(in_str, log_file): 19 | assert isinstance(in_str, str) 20 | print(in_str, flush=True) 21 | log_file.write(in_str + '\n') 22 | log_file.flush() 23 | --------------------------------------------------------------------------------