├── .gitignore ├── LICENSE ├── README.md ├── helper └── benchmark_layernorm.py ├── main.py ├── scripts ├── demo_cc.sh ├── demo_cc_apptainer.sh ├── demo_sockeye.sh ├── demo_sockeye_pbs.sh └── demo_vector.sh ├── setup ├── requirements_cc.txt └── requirements_sockeye.txt └── utils ├── arg_parser.py ├── dist_training.py └── learning_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | mnist_data 2 | logger* 3 | venv* 4 | runs/ 5 | .idea 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Qi Yan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HPC_helper 2 | 3 | This repository showcases a minimal example of using `PyTorch` distributed training on computing clusters, enabling you to run your training tasks on `N` nodes, each with `M` GPUs. It includes common use cases such as [DataParallel (DP)](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) or [DistributedDataParallel (DDP)](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and offers support for [PBS](https://2020.help.altair.com/2020.1/PBSProfessional/PBSUserGuide2020.1.1.pdf) and [SLURM](https://slurm.schedmd.com/documentation.html) systems. Below, you'll find runnable code and scripts for UBC Sockeye, Vector Vaughan cluster, and Digital Research Alliance of Canada (formerly ComputeCanada) HPCs. 4 | 5 | Last updated: Jun 23, 2024. 
Contact: Qi Yan, qi.yan@ece.ubc.ca
6 |
7 | ## Get started
8 |
9 | ### Set up the Python environment
10 | ```bash
11 | # load the Python and CUDA modules on your HPC (uncomment the line for your cluster)
12 | # module load gcc/9.4.0 python/3.8.10 cuda/11.3.1 nccl/2.9.9-1-cuda11-3 # Sockeye
13 | # module load python/3.8 cuda-11.7 # Vector
14 | # module load python/3.10.13 StdEnv/2023 # CC
15 |
16 | # python virtual environment
17 | python -m venv venvhpc
18 | source venvhpc/bin/activate
19 | pip install -U pip
20 | pip install -r setup/requirements_sockeye.txt # if at Sockeye
21 | pip install -r setup/requirements_cc.txt # if at Vector or CC
22 |
23 | # sanity check at Sockeye or CC
24 | # you must enter an interactive session on Vector to run this
25 | python -c "import torch; print('Things are done.')"
26 |
27 | # download MNIST dataset
28 | mkdir -p ./mnist_data/MNIST/raw
29 | wget https://raw.githubusercontent.com/fgnt/mnist/master/train-images-idx3-ubyte.gz -P ./mnist_data/MNIST/raw
30 | wget https://raw.githubusercontent.com/fgnt/mnist/master/train-labels-idx1-ubyte.gz -P ./mnist_data/MNIST/raw
31 | wget https://raw.githubusercontent.com/fgnt/mnist/master/t10k-images-idx3-ubyte.gz -P ./mnist_data/MNIST/raw
32 | wget https://raw.githubusercontent.com/fgnt/mnist/master/t10k-labels-idx1-ubyte.gz -P ./mnist_data/MNIST/raw
33 | ```
34 | On Alliance/CC clusters, you can only `pip install` Python packages that are available on the system, and `conda` is forbidden.
35 | If you need to install additional packages, you can use the [`apptainer` container environment](https://docs.alliancecan.ca/wiki/Apptainer/en).
36 | See the section below for details.
37 |
38 |
39 | apptainer instructions on Alliance/CC clusters 40 | The following instructions have been tested on the `narval` cluster. Similar steps work on other clusters like `cedar`, while the storage path may vary. 41 | 42 | ```bash 43 | ## pull image and create sandbox; recommended to do so at /scratch space for faster runtime 44 | module load apptainer-suid/1.1 45 | mkdir -p /lustre07/scratch/${USER}/venv && cd /lustre07/scratch/${USER}/venv 46 | apptainer pull --name pytorch220_official.sif docker://pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel 47 | apptainer build --sandbox venvhpc.sandbox pytorch220_official.sif 48 | 49 | ## get ready to enter the sandbox in an interactive shell 50 | export TMPDIR=/tmp/${USER}tmp 51 | mkdir -p ${TMPDIR} 52 | export APPTAINER_CACHEDIR=${TMPDIR} 53 | export APPTAINER_TMPDIR=${TMPDIR} 54 | 55 | ## bind the project, scratch, home directory to the sandbox; run `apptainer help run` to see meaning of each flag 56 | apptainer shell -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox 57 | 58 | ## inside apptainer create conda env or use python venv; recommended to create conda env at /scratch space 59 | bash 60 | export USER=YOUR_USER_NAME # change to your username 61 | conda create -p /lustre07/scratch/${USER}/venv/condaenvs/venvhpc python=3.8 -y 62 | conda activate /lustre07/scratch/${USER}/venv/condaenvs/venvhpc 63 | 64 | mkdir -p /lustre07/scratch/${USER}/venv/condaenvs/condacache 65 | conda config --add pkgs_dirs /lustre07/scratch/${USER}/venv/condaenvs/condacache 66 | 67 | ## pip install within the conda env 68 | pip install -U pip 69 | pip install -r setup/requirements_cc.txt 70 | 71 | ## sanity check 72 | python -c "import torch; print('Things are done.')" 73 | 74 | ## follow the above "download MNIST dataset" section to load the dataset 75 | ``` 76 | 77 | The apptainer sandbox is a containerized environment that allows you to install custom packages without root access. The `--bind` or `-B` flag is used to bind directories to the container. The sandbox itself contains only the necessary system libraries and the user's home directory. We still store the code and datasets on normal storage space. 78 | 79 |
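As a quick sanity check that the GPUs are actually visible inside the container, you can run a one-off command through `apptainer exec` from within a GPU allocation. This is a minimal sketch that reuses the sandbox and conda environment paths created above (narval paths; adjust the cluster path and username for your own setup):

```bash
## run from a GPU node or interactive GPU job; --nv exposes the host GPUs to the container
apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c '
source /opt/conda/etc/profile.d/conda.sh
conda activate /lustre07/scratch/YOUR_USER_NAME/venv/condaenvs/venvhpc
python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"
'
```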
80 |
81 |
82 | ### Go training
83 | We showcase the use of distributed learning for a simple training task, using ResNet50 as the backbone.
84 |
85 | **IMPORTANT**:
86 | * please change the account and notification email address in the bash scripts before running.
87 | * the old Sockeye script is intended for the OpenPBS system, which is no longer in use and is kept only for completeness.
88 | * the Sockeye, Vector, and CC scripts are intended for SLURM systems, but we don't provide `preemption` support for the Vector script.
89 |
90 | ```bash
91 | # at Sockeye
92 | sbatch scripts/demo_sockeye.sh
93 |
94 | # at Vector
95 | sbatch scripts/demo_vector.sh
96 |
97 | # at CC
98 | sbatch scripts/demo_cc.sh
99 |
100 | # at CC with apptainer
101 | ## note: please change the paths in the script accordingly
102 | sbatch scripts/demo_cc_apptainer.sh
103 | ```
104 | Please check the training logs under `runs` for runtime comparisons. Here are the five-epoch training times from my runs:
105 |
106 | | #Nodes | #GPUs per node | PyTorch Distributed Method | Sockeye runtime | CC runtime | Vector runtime |
107 | | ------ | -------------- | -------------------------- | --------------- | ---------------------------- | --------------------------------- |
108 | | N=1 | M=1 | N/A | 363.4s | 309.7s | 425.0s |
109 | | N=1 | M=4 | DP | 103.5s | 114.2s | 133.9s |
110 | | N=1 | M=4 | DDP | 93.7s | 85.2s | 113.4s |
111 | | N=2 | M=4 | DDP | 55.7s | 47.0s (mpirun); 47.4s (srun) | 60.9s (mpirun); 60.6s (srun) |
112 |
113 | In the demo scripts, we use Tesla V100-SXM2-32GB GPUs at Sockeye and CC, and RTX6000-24GB GPUs at Vector.
114 | The single-precision performance is 15.7 TFLOPS for the V100-SXM2-32GB and 16.3 TFLOPS for the RTX6000-24GB.
115 | Since the raw throughput is similar, the runtime difference is mainly due to GPU memory size, which limits the per-GPU batch size on the RTX6000 (hence the smaller batch sizes in the Vector script).
116 |
117 | ## Distributed training rule of thumb
118 |
119 | Generally, we can use either the [DataParallel (DP)](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) or the [DistributedDataParallel (DDP)](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) protocol to run distributed training. DP is straightforward and only requires changing a few lines of code. However, it is less efficient than DDP; please see [this page](https://pytorch.org/docs/stable/notes/cuda.html#use-nn-parallel-distributeddataparallel-instead-of-multiprocessing-or-nn-dataparallel) for why. Moreover, DP doesn't support multi-node distributed training. Therefore, it's better to start with DDP despite its relatively higher complexity. The table below shows the possible ways to launch distributed training jobs.
120 |
121 |
122 | | #Nodes | #GPUs per node | PyTorch Distributed Method | Launch Method at Sockeye | Launch Method at CC |
123 | |--------|----------------|----------------------------|---------------------------|----------------------|
124 | | N=1 | M=1 | N/A | N/A | N/A |
125 | | N=1 | M>1 | DDP or DP | torchrun | torchrun |
126 | | N>1 | M>1 | DDP | mpirun + python | mpirun + python or srun + torchrun |
127 |
128 |
129 | ### Difference between PBS (old Sockeye) and SLURM (Vector and CC) systems
130 | On the PBS system (old Sockeye), `mpirun + python` appears to be the only viable way to launch multi-node training. On SLURM systems (Vector and CC), we can use either `srun + torchrun` or `mpirun + python`.
Essentially, both `mpirun` and `srun` launch parallel jobs across different nodes *with a single command*, and these two mechanisms are the key to scalable multi-node DDP training. We use the following examples to highlight the details that matter for error-free runs.
131 |
132 | **`mpirun + python` method explained**
133 |
134 | Sample commands:
135 | ```bash
136 | mpirun -np 8 \
137 |     --hostfile $PBS_NODEFILE --oversubscribe \
138 |     -x MASTER_ADDR=$(hostname) \
139 |     -x MASTER_PORT=$MASTER_PORT \
140 |     -x CUDA_VISIBLE_DEVICES=0,1,2,3 \
141 |     -x PATH \
142 |     -bind-to none -map-by :OVERSUBSCRIBE \
143 |     -mca pml ob1 -mca btl ^openib \
144 |     python main.py --batch_size=6144 --ddp -m=sockeye_demo_multiple_node_mpi_ddp
145 | ```
146 | `mpirun` is executed once; it then launches the parallel jobs, whose communication is handled by PyTorch and `mpirun` together. The key point is that we only need to **run `mpirun + python` once, on the master node**.
147 |
148 | `mpirun` comes with an option `-np`, which specifies the total number of processes. In our demo script, each process amounts to one trainer (i.e., one GPU), so we use `-np 8` for 2 nodes with 8 GPUs in total. This must be used along with `--oversubscribe`, for the following reasons.
149 |
150 | `mpirun` assigns job processes to nodes using [`slot`](https://www.open-mpi.org/doc/v4.0/man1/mpirun.1.php#sect3) scheduling, which was historically intended for CPU-only tasks (one process per CPU core). Such slot assignment can go wrong for GPU training, where one GPU corresponds to one process. For example, old Sockeye's PBS would not distribute the 8 tasks evenly across the 2 nodes and instead raised an error saying the number of available slots was insufficient. Therefore, we use the `--oversubscribe` option so that `mpirun` distributes the tasks evenly across the nodes and ignores such false-alarm errors.
151 |
152 | **`srun + torchrun` method explained**
153 |
154 | Sample commands:
155 |
156 | ```bash
157 | srun --ntasks-per-node=1 --ntasks=2 torchrun --nnodes=2 --nproc_per_node=4 \
158 |     --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$MASTER_PORT \
159 |     main.py --batch_size=6144 --ddp -m=cc_demo_multiple_node_srun_ddp
160 | ```
161 |
162 | The `SLURM_NTASKS` variable tells the script how many tasks are available for this execution; `srun` executes the given command `SLURM_NTASKS` times. For the `torchrun` launch method, we only need to **run it once per node**: in our example, the `torchrun` command runs twice, once on each of the two nodes. Note that this differs from `mpirun + python`, where we *run it once for all nodes*.
163 |
164 | For error-free `srun` execution, we need to either pay attention to the `#SBATCH` options set at the very beginning or enforce the parameters explicitly with `--ntasks=2 --ntasks-per-node=1`. The nuance is that `--ntasks=8 --ntasks-per-node=4` works for the `mpirun + python` method, while `--ntasks=2 --ntasks-per-node=1` works for `srun + torchrun`.
165 |
166 | ## Adapt your code to distributed training
167 | If you are okay with PyTorch's built-in distributed training utilities, the helper at `utils/dist_training.py` could be useful. To adapt your code with minimal changes, refer to the lines in `main.py` where `dist_helper` is called; a minimal sketch of this kind of adaptation is shown below.
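The sketch below outlines the standard PyTorch DDP changes that `dist_helper` encapsulates: process-group initialization, model wrapping, data sharding, and rank-0 checkpointing. It is a minimal, self-contained example with a dummy model and dataset, not a drop-in copy of this repo's helper:

```python
import os
import torch
import torch.distributed as dist
from torch import nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset


def main():
    # torchrun sets LOCAL_RANK/RANK/WORLD_SIZE; for mpirun, map the OMPI_COMM_WORLD_*
    # variables instead (see get_ddp_status in utils/dist_training.py)
    local_rank = int(os.environ["LOCAL_RANK"])
    dist.init_process_group(backend="nccl", init_method="env://")
    torch.cuda.set_device(local_rank)

    # 1) build the model on this process's GPU and wrap it with DDP
    model = nn.Linear(10, 2).cuda()
    model = DDP(model, device_ids=[local_rank])

    # 2) shard the dataset across processes with DistributedSampler
    dataset = TensorDataset(torch.randn(1024, 10), torch.randint(0, 2, (1024,)))
    sampler = DistributedSampler(dataset, shuffle=True)
    loader = DataLoader(dataset, sampler=sampler, batch_size=64)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    for epoch in range(2):
        sampler.set_epoch(epoch)  # 3) reshuffle with a different permutation every epoch
        for x, y in loader:
            optimizer.zero_grad()
            loss = criterion(model(x.cuda()), y.cuda())
            loss.backward()  # DDP all-reduces (averages) gradients across processes here
            optimizer.step()

    # 4) only rank 0 writes checkpoints/logs (cf. get_ddp_save_flag in utils/dist_training.py)
    if dist.get_rank() == 0:
        torch.save(model.state_dict(), "ckpt.pt")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```

On a single node this can be launched with, e.g., `torchrun --nproc_per_node=4 ddp_sketch.py` (saving the sketch as `ddp_sketch.py`); `main.py` follows the same pattern but routes these steps through `dist_helper`.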
168 | 169 | Other third-party plugins like [horovod](https://horovod.ai/) and [pytorch lightning](https://www.pytorchlightning.ai/) can also possibly do the same things. 170 | 171 | 172 | 173 | ## Reference 174 | #### Tutorial 175 | * [Multi Node PyTorch Distributed Training Guide For People In A Hurry](https://lambdalabs.com/blog/multi-node-pytorch-distributed-training-guide) 176 | * [PyTorch with Multiple GPUs](https://docs.alliancecan.ca/wiki/PyTorch#PyTorch_with_Multiple_GPUs) 177 | * [Multi-node-training on slurm with PyTorch](https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904) 178 | 179 | #### Helpful documentations 180 | * [pytorch torchrun](https://pytorch.org/docs/stable/elastic/run.html) 181 | * [mpirun man page](https://www.open-mpi.org/doc/v4.0/man1/mpirun.1.php) 182 | * [SLURM srun page](https://slurm.schedmd.com/srun.html) 183 | * [SLURM sbatch environment variables](https://slurm.schedmd.com/sbatch.html#SECTION_OUTPUT-ENVIRONMENT-VARIABLES) 184 | * [PBS qsub environment variables](https://opus.nci.org.au/display/Help/Useful+PBS+Environment+Variables) 185 | 186 | #### Wiki 187 | * [UBC Sockeye](https://confluence.it.ubc.ca/display/UARC/About+Sockeye) 188 | * [Vector](https://support.vectorinstitute.ai/FrontPage) 189 | * [CC](https://docs.alliancecan.ca/wiki/Technical_documentation) -------------------------------------------------------------------------------- /helper/benchmark_layernorm.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import torch.nn as nn 4 | from torch.profiler import tensorboard_trace_handler 5 | 6 | 7 | class TestNet(nn.Module): 8 | def __init__(self): 9 | super().__init__() 10 | self.fc_in = nn.Linear(3, 128) 11 | self.layers = nn.ModuleList() 12 | for i in range(0, 10): 13 | self.layers.append(nn.Linear(128, 128)) 14 | self.layers.append(nn.LayerNorm(128)) 15 | # self.layers.append(nn.BatchNorm1d(262144)) 16 | self.fc_out = nn.Linear(128, 3) 17 | 18 | def forward(self, x: torch.Tensor) -> torch.Tensor: 19 | x = self.fc_in(x) 20 | for layer in self.layers: 21 | x = layer(x) 22 | # print(x.shape) 23 | x = self.fc_out(x) 24 | return x 25 | 26 | 27 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 28 | net = TestNet().to(device).train() 29 | in_data = torch.zeros([1, 512 * 512, 3]).to(device) 30 | criterion = nn.MSELoss().to(device) 31 | optimizer = torch.optim.SGD(net.parameters(), 0.01) 32 | 33 | with torch.profiler.profile( 34 | schedule=torch.profiler.schedule(wait=2, warmup=2, active=6, repeat=1, skip_first=2), 35 | on_trace_ready=tensorboard_trace_handler("tmp/profile"), 36 | with_stack=True, with_flops=True, with_modules=True, profile_memory=True) as profiler: 37 | 38 | for i in range(0, 20): 39 | t0 = time.time() 40 | out_data = net(in_data) 41 | loss = criterion(out_data, in_data) 42 | 43 | optimizer.zero_grad() 44 | loss.backward() 45 | optimizer.step() 46 | profiler.step() 47 | print(f"step: {i:,d} {time.time() - t0:.3f}") 48 | print("Done! 
") -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # Reference: 2 | # https://github.com/olehb/pytorch_ddp_tutorial/blob/main/ddp_tutorial_multi_gpu.py 3 | 4 | 5 | import os 6 | import logging 7 | import pdb 8 | import time 9 | import numpy as np 10 | from tqdm import tqdm 11 | from datetime import datetime 12 | 13 | import torch 14 | from torch import nn, optim 15 | from torch import distributed as dist 16 | from torch.utils.data import DataLoader, DistributedSampler 17 | import torchvision 18 | from torchvision import datasets, transforms 19 | 20 | from utils.arg_parser import parse_arguments, set_seed_and_logger, backup_code 21 | from utils.dist_training import DistributedHelper, get_ddp_save_flag 22 | from utils.learning_utils import count_model_params 23 | 24 | 25 | def init_basics(): 26 | """ 27 | Initialization 28 | """ 29 | args = parse_arguments() 30 | dist_helper = DistributedHelper(args.dp, args.ddp, args.ddp_gpu_ids, args.ddp_init_method) 31 | writer = set_seed_and_logger(args.seed, args.logdir, args.log_level, args.comment, dist_helper) 32 | backup_code(args.logdir) 33 | return args, dist_helper, writer 34 | 35 | 36 | def init_model(dist_helper): 37 | """ 38 | Initialize model and training necessities. 39 | """ 40 | # model, we use an unnecessarily heavy model to showcase the GPU profiling 41 | model = getattr(torchvision.models, 'resnet50')(weights=None) 42 | model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False) 43 | model.fc = nn.Linear(model.fc.in_features, 10) # 10 classes to predict 44 | model = model.to(dist_helper.device) 45 | 46 | param_string, total_params, total_trainable_params = count_model_params(model) 47 | logging.info(f"Parameters: \n{param_string}") 48 | logging.info(f"Parameters Count: {total_params:,}, Trainable: {total_trainable_params:,}.") 49 | 50 | # adapt to distributed training 51 | model = dist_helper.dist_adapt_model(model) 52 | 53 | # optimizer and loss function 54 | optimizer = optim.Adam(model.parameters(), lr=1e-4) 55 | criterion = nn.CrossEntropyLoss() 56 | return model, optimizer, criterion 57 | 58 | 59 | def init_dataloader(batch_size, dist_helper): 60 | """ 61 | Get dataloader 62 | """ 63 | transform = transforms.Compose([ 64 | transforms.ToTensor(), 65 | transforms.Normalize((0.1307,), (0.3081,)), 66 | transforms.Resize(128) # resize to larger image to showcase the use of GPU profiling 67 | ]) 68 | dataset_loc = './mnist_data' 69 | 70 | train_dataset = datasets.MNIST(dataset_loc, download=True, train=True, transform=transform) 71 | 72 | # For final evaluation, it is advised not to use distributed sampler due to possibly incorrect results. 73 | # But we are using it now to accelerate evaluation during training. 
74 | # Ref: https://github.com/pytorch/pytorch/issues/25162 75 | test_dataset = datasets.MNIST(dataset_loc, download=True, train=False, transform=transform) 76 | 77 | logging.info("Training set size: {:d}, testing set size: {:d}".format(len(train_dataset), len(test_dataset))) 78 | 79 | if dist_helper.is_ddp: 80 | batch_size_per_gpu = max(1, batch_size // dist.get_world_size()) 81 | sampler = DistributedSampler(train_dataset, shuffle=True) 82 | train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=batch_size_per_gpu, 83 | pin_memory=True, num_workers=min(6, os.cpu_count())) 84 | 85 | sampler = DistributedSampler(test_dataset, shuffle=False) 86 | test_loader = DataLoader(dataset=test_dataset, sampler=sampler, batch_size=batch_size_per_gpu, 87 | pin_memory=True, num_workers=min(6, os.cpu_count())) 88 | else: 89 | train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, 90 | shuffle=True, pin_memory=True, num_workers=min(6, os.cpu_count())) 91 | test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, 92 | shuffle=False, pin_memory=True, num_workers=min(6, os.cpu_count())) 93 | 94 | return train_loader, test_loader 95 | 96 | 97 | def go_training(epochs, model, optimizer, criterion, dist_helper, train_loader, test_loader, writer, logdir): 98 | """ 99 | Training loop. 100 | """ 101 | 102 | # init 103 | time_train_ls, time_val_ls = [], [] 104 | epoch_when_snapshot = list(range(0, epochs, epochs // 5)) 105 | 106 | # epoch-wise training 107 | for i_epoch in range(epochs): 108 | # train the model for one epoch 109 | if dist_helper.is_ddp: 110 | train_loader.sampler.set_epoch(i_epoch) 111 | 112 | time_epoch = time.time() 113 | train_loss = 0 114 | pbar = tqdm(train_loader) 115 | model.train() 116 | for x, y in pbar: 117 | x = x.to(dist_helper.device, non_blocking=True) 118 | y = y.to(dist_helper.device, non_blocking=True) 119 | optimizer.zero_grad() 120 | y_hat = model(x) 121 | batch_loss = criterion(y_hat, y) 122 | batch_loss.backward() 123 | optimizer.step() 124 | batch_loss_scalar = batch_loss.item() 125 | train_loss += batch_loss_scalar / x.shape[0] 126 | pbar.set_description(f'training batch_loss={batch_loss_scalar:.4f}') 127 | time_training = time.time() - time_epoch 128 | 129 | # calculate validation loss 130 | time_val = time.time() 131 | val_loss = 0.0 132 | pbar = tqdm(test_loader) 133 | model.eval() 134 | with torch.no_grad(): 135 | for x, y in pbar: 136 | x = x.to(dist_helper.device, non_blocking=True) 137 | y = y.to(dist_helper.device, non_blocking=True) 138 | y_hat = model(x) 139 | batch_loss = criterion(y_hat, y) 140 | batch_loss_scalar = batch_loss.item() 141 | val_loss += batch_loss_scalar / x.shape[0] 142 | pbar.set_description(f'validation batch_loss={batch_loss_scalar:.4f}') 143 | time_val = time.time() - time_val 144 | 145 | logging.info(f"Epoch={i_epoch}, train_loss={train_loss:.4f}, val_loss={val_loss:.4f}") 146 | logging.info("Training time: {:.3f}s, Validation time: {:.3f}s".format(time_training, time_val)) 147 | time_train_ls.append(time_training) 148 | time_val_ls.append(time_val) 149 | 150 | if get_ddp_save_flag(): 151 | writer.add_scalar("train/loss", train_loss, i_epoch) 152 | writer.add_scalar("test/loss", val_loss, i_epoch) 153 | writer.flush() 154 | 155 | if i_epoch in epoch_when_snapshot and get_ddp_save_flag(): 156 | model_path = os.path.join(logdir, 'model_epoch_{:03d}_{:s}_{:d}.pt'.format( 157 | i_epoch, datetime.now().strftime("%Y%m%d-%H%M%S"), os.getpid())) 158 | torch.save(model.state_dict(), model_path) 159 | 
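# note: if the model is wrapped in DDP/DP, the saved state_dict keys carry a 'module.' prefix;
# load this checkpoint back into a wrapped model, or strip the prefix before loading it into a bare model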
logging.info("Saving model to {:s}".format(model_path)) 160 | dist_helper.ddp_sync() 161 | 162 | # Count overall training efficiency 163 | logging.info("{:s} Overall timing results {:s}".format('-' * 10, '-' * 10)) 164 | logging.info("Total training time: {:.3f}s, total validation time: {:.3f}s".format( 165 | np.sum(time_train_ls), np.sum(time_val_ls))) 166 | for i_epoch, time_training, time_val in zip(range(epochs), time_train_ls, time_val_ls): 167 | logging.info("Epoch: {:d}, Training time: {:.3f}s, Validation time: {:.3f}s.".format( 168 | i_epoch, time_training, time_val)) 169 | 170 | 171 | def main(): 172 | """ 173 | Main training loop 174 | """ 175 | 176 | """Initialization basics""" 177 | args, dist_helper, writer = init_basics() 178 | 179 | """Get network""" 180 | model, optimizer, criterion = init_model(dist_helper) 181 | 182 | """Get dataloader""" 183 | train_loader, test_loader = init_dataloader(args.batch_size, dist_helper) 184 | 185 | """Go training""" 186 | go_training(args.epoch, model, optimizer, criterion, dist_helper, train_loader, test_loader, writer, args.logdir) 187 | 188 | """Distributed training cleanup""" 189 | dist_helper.clean_up() 190 | 191 | 192 | if __name__ == '__main__': 193 | main() 194 | -------------------------------------------------------------------------------- /scripts/demo_cc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=def-rjliao 3 | #SBATCH --gres=gpu:v100l:4 # Number of GPUs per node (specifying v100l gpu) 4 | #SBATCH --nodes=2 # Number of nodes 5 | #SBATCH --ntasks=8 # Number of MPI process 6 | #SBATCH --ntasks-per-node=4 # Number of distributed process per compute node 7 | #SBATCH --cpus-per-task=8 # CPU cores per MPI process 8 | #SBATCH --mem=64G # memory per node 9 | #SBATCH --time=00-00:20 # time (DD-HH:MM) 10 | #SBATCH --mail-user=yanq@student.ubc.ca # send email regarding task status 11 | #SBATCH --mail-type=ALL 12 | 13 | ################################################################################ 14 | 15 | # in this demo, we take 2 nodes and each node has 4 V100-32GB GPUs 16 | MASTER_PORT=29400 17 | 18 | module load gcc 19 | module load cuda 20 | module load nccl 21 | module load openmpi 22 | 23 | # you should submit job from the cloned repo's directory 24 | cd ${SLURM_SUBMIT_DIR} 25 | source venvhpc/bin/activate 26 | export OMP_NUM_THREADS=6 27 | 28 | # single GPU: use 1 GPU on 1 node 29 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=cc_demo_single_gpu 30 | 31 | # DP: use multiple GPUs on 1 node 32 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=cc_demo_single_node_dp 33 | 34 | # DDP: use multiple GPUs on 1 node 35 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=cc_demo_single_node_ddp 36 | 37 | # DDP: use multiple GPUs on multiple nodes 38 | 39 | # mpirun method 40 | mpirun -np 8 \ 41 | -x MASTER_ADDR=$(hostname) \ 42 | -x MASTER_PORT=$MASTER_PORT \ 43 | -x PATH \ 44 | -bind-to none -map-by :OVERSUBSCRIBE \ 45 | -mca pml ob1 -mca btl ^openib \ 46 | python main.py --batch_size=6144 --ddp -m=cc_demo_multiple_node_mpi_ddp 47 | 48 | # srun method 49 | # The SLURM_NTASKS variable tells the script how many processes are available for this execution. 
50 | # “srun” executes the script times 51 | 52 | # Therefore, for error-free srun execution, we need to overwrite the SBATCH options set in the very beginning 53 | # by using --ntasks=2 --ntasks-per-node=1 explicitly. 54 | # Note: the nuance is --ntasks=8 --ntasks-per-node=4 works for mpirun + python main.py --args, 55 | # while --ntasks=2 --ntasks-per-node=1 works for srun + torchrun. 56 | 57 | srun --nodes=2 --ntasks-per-node=1 --ntasks=2 torchrun --nnodes=2 --nproc_per_node=4 \ 58 | --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$MASTER_PORT \ 59 | main.py --batch_size=6144 --ddp -m=cc_demo_multiple_node_srun_ddp 60 | 61 | ##### DEBUG info ##### 62 | #echo $SLURM_JOB_NODELIST 63 | # 64 | #echo $(hostname) 65 | # 66 | #echo $(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 67 | # 68 | ## extract the first string component before the first dot 69 | ## cdr2639.int.cedar.computecanada.ca -> cdr2639 70 | #echo $(echo $(hostname) | cut -d '.' -f 1) 71 | -------------------------------------------------------------------------------- /scripts/demo_cc_apptainer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=def-rjliao 3 | #SBATCH --gres=gpu:v100l:4 # Number of GPUs per node (specifying v100l gpu) 4 | #SBATCH --nodes=1 # Number of nodes 5 | #SBATCH --ntasks=8 # Number of MPI process 6 | #SBATCH --ntasks-per-node=4 # Number of distributed process per compute node 7 | #SBATCH --cpus-per-task=8 # CPU cores per MPI process 8 | #SBATCH --mem=64G # memory per node 9 | #SBATCH --time=00-00:20 # time (DD-HH:MM) 10 | #SBATCH --mail-user=yanq@student.ubc.ca # send email regarding task status 11 | #SBATCH --mail-type=ALL 12 | 13 | ################################################################################ 14 | 15 | # in this demo, we take 1 nodes and each node has 4 V100-32GB GPUs 16 | 17 | ## set up environment variables for apptainer 18 | ## this script is intended for the narval cluster, please adjust the path accordingly for other clusters 19 | module load apptainer-suid/1.1 20 | cd /lustre07/scratch/${USER}/venv 21 | export TMPDIR=/tmp/${USER}tmp 22 | mkdir -p ${TMPDIR} 23 | export APPTAINER_CACHEDIR=${TMPDIR} 24 | export APPTAINER_TMPDIR=${TMPDIR} 25 | 26 | # !!!please change the USER_NAME to your own username before running the script!!! 
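# each block below re-enters the sandbox with `apptainer exec`: -C isolates the container environment,
# -B bind-mounts /project, /scratch and /home from the host, -W points the container's temporary
# directories at ${TMPDIR}, and --nv makes the host NVIDIA GPUs and driver visible inside the container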
27 | 28 | # single GPU: use 1 GPU on 1 node 29 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c ' 30 | export USER_NAME='YOUR_USER_NAME' 31 | source /opt/conda/etc/profile.d/conda.sh 32 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc 33 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper 34 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=cc_demo_single_gpu 35 | ' 36 | 37 | # single GPU: use 1 GPU on 1 node 38 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c ' 39 | export USER_NAME='YOUR_USER_NAME' 40 | source /opt/conda/etc/profile.d/conda.sh 41 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc 42 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper 43 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=cc_apptainer_demo_single_gpu 44 | ' 45 | 46 | # DP: use multiple GPUs on 1 node 47 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c ' 48 | export USER_NAME='YOUR_USER_NAME' 49 | source /opt/conda/etc/profile.d/conda.sh 50 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc 51 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper 52 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=cc_apptainer_demo_single_node_dp 53 | ' 54 | 55 | # DDP: use multiple GPUs on 1 node 56 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c ' 57 | export USER_NAME='YOUR_USER_NAME' 58 | export MASTER_PORT=29400 59 | source /opt/conda/etc/profile.d/conda.sh 60 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc 61 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper 62 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=cc_apptainer_demo_single_node_ddp 63 | ' 64 | 65 | # note: we haven't tested the multi-node DDP with apptainer yet, but the MPI option may work 66 | -------------------------------------------------------------------------------- /scripts/demo_sockeye.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=demo_sockeye 3 | #SBATCH --account=st-rjliao-1-gpu 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=24 7 | #SBATCH --mem=32G 8 | #SBATCH --time=00:20:00 9 | #SBATCH --gpus-per-node=1 10 | #SBATCH --output=slurm-%j_out.txt 11 | #SBATCH --error=slurm-%j_err.txt 12 | #SBATCH --mail-user=yanq@student.ubc.ca 13 | #SBATCH --mail-type=ALL 14 | ################################################################################ 15 | 16 | # in this demo, we take 2 nodes and each node has 4 V100-32GB GPUs 17 | MASTER_PORT=29400 18 | 19 | module load gcc 20 | module load cuda 21 | module load nccl 22 | module load openmpi 23 | 24 | # you should submit job from the cloned repo's directory 25 | cd ${PBS_O_WORKDIR} 26 | source venvhpc/bin/activate 27 | export OMP_NUM_THREADS=6 28 | 29 | # note: at Sockeye, it's better to specify CUDA_VISIBLE_DEVICES explicitly for distributed training, 30 | # otherwise methods in torch.cuda may lead to an error, e.g., torch.cuda.device_count() 31 | 32 | # single GPU: use 1 GPU on 1 node 33 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=sockeye_demo_single_gpu 34 | 35 | # DP: use multiple GPUs on 1 node 36 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp 
-m=sockeye_demo_single_node_dp 37 | 38 | # DDP: use multiple GPUs on 1 node 39 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=sockeye_demo_single_node_ddp 40 | 41 | # DDP: use multiple GPUs on multiple nodes 42 | mpirun -np 8 \ 43 | --hostfile $PBS_NODEFILE --oversubscribe \ 44 | -x MASTER_ADDR=$(hostname) \ 45 | -x MASTER_PORT=$MASTER_PORT \ 46 | -x CUDA_VISIBLE_DEVICES=0,1,2,3 \ 47 | -x PATH \ 48 | -bind-to none -map-by :OVERSUBSCRIBE \ 49 | -mca pml ob1 -mca btl ^openib \ 50 | python main.py --batch_size=6144 --ddp -m=sockeye_demo_multiple_node_mpi_ddp 51 | 52 | -------------------------------------------------------------------------------- /scripts/demo_sockeye_pbs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -l walltime=00:20:00,select=2:ncpus=8:ngpus=4:mem=64gb:gpu_mem=32gb 3 | #PBS -N sockeye_demo 4 | #PBS -A st-rjliao-1-gpu 5 | #PBS -m abe 6 | #PBS -M yanq@student.ubc.ca 7 | 8 | ################################################################################ 9 | 10 | # in this demo, we take 2 nodes and each node has 4 V100-32GB GPUs 11 | MASTER_PORT=29400 12 | 13 | module load gcc 14 | module load cuda 15 | module load nccl 16 | module load openmpi 17 | 18 | # you should submit job from the cloned repo's directory 19 | cd ${PBS_O_WORKDIR} 20 | source venvhpc/bin/activate 21 | export OMP_NUM_THREADS=6 22 | 23 | # note: at Sockeye, it's better to specify CUDA_VISIBLE_DEVICES explicitly for distributed training, 24 | # otherwise methods in torch.cuda may lead to an error, e.g., torch.cuda.device_count() 25 | 26 | # single GPU: use 1 GPU on 1 node 27 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=sockeye_demo_single_gpu 28 | 29 | # DP: use multiple GPUs on 1 node 30 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=sockeye_demo_single_node_dp 31 | 32 | # DDP: use multiple GPUs on 1 node 33 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=sockeye_demo_single_node_ddp 34 | 35 | # DDP: use multiple GPUs on multiple nodes 36 | mpirun -np 8 \ 37 | --hostfile $PBS_NODEFILE --oversubscribe \ 38 | -x MASTER_ADDR=$(hostname) \ 39 | -x MASTER_PORT=$MASTER_PORT \ 40 | -x CUDA_VISIBLE_DEVICES=0,1,2,3 \ 41 | -x PATH \ 42 | -bind-to none -map-by :OVERSUBSCRIBE \ 43 | -mca pml ob1 -mca btl ^openib \ 44 | python main.py --batch_size=6144 --ddp -m=sockeye_demo_multiple_node_mpi_ddp 45 | 46 | -------------------------------------------------------------------------------- /scripts/demo_vector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=demo_vector 3 | #SBATCH --partition=rtx6000 # Type of GPUs 4 | #SBATCH --gres=gpu:4 # Number of GPUs per node 5 | #SBATCH --nodes=2 # Number of nodes 6 | #SBATCH --ntasks=8 # Number of MPI process 7 | #SBATCH --ntasks-per-node=4 # Number of distributed process per compute node 8 | #SBATCH --cpus-per-task=8 # CPU cores per MPI process 9 | #SBATCH --mem=64G # memory per node 10 | #SBATCH --time=00-00:20 # time (DD-HH:MM) 11 | #SBATCH --qos=normal # QoS type 12 | #SBATCH --mail-user=yanq@student.ubc.ca # send email regarding task status 13 | #SBATCH --mail-type=ALL 14 | #SBATCH --output=slurm-%j_out.txt 15 | #SBATCH --error=slurm-%j_err.txt 16 | 17 | ################################################################################ 18 
| 19 | # in this demo, we take 2 nodes and each node has 4 RTX6000-24GB GPUs 20 | MASTER_PORT=29400 21 | 22 | module use /pkgs/environment-modules/ 23 | module load python/3.8 24 | module load cuda-11.7 25 | source /scratch/ssd004/scratch/qiyan/venvmtr/bin/activate 26 | cd /fs01/home/qiyan/DSL-MTR/tools 27 | 28 | 29 | # you should submit job from the cloned repo's directory 30 | cd ${SLURM_SUBMIT_DIR} 31 | source venvhpc/bin/activate 32 | export OMP_NUM_THREADS=6 33 | 34 | # single GPU: use 1 GPU on 1 node 35 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 36 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=512 -m=vector_demo_single_gpu 37 | 38 | # DP: use multiple GPUs on 1 node 39 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 40 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=2048 --dp -m=vector_demo_single_node_dp 41 | 42 | # DDP: use multiple GPUs on 1 node 43 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 44 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=2048 --ddp -m=vector_demo_single_node_ddp 45 | 46 | # DDP: use multiple GPUs on multiple nodes 47 | 48 | # mpirun method 49 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 50 | mpirun -np 8 \ 51 | -x MASTER_ADDR=$(hostname) \ 52 | -x MASTER_PORT=$MASTER_PORT \ 53 | -x PATH \ 54 | -bind-to none -map-by :OVERSUBSCRIBE \ 55 | -mca pml ob1 -mca btl ^openib \ 56 | python main.py --batch_size=2048 --ddp -m=vector_demo_multiple_node_mpi_ddp 57 | 58 | # srun method 59 | # The SLURM_NTASKS variable tells the script how many processes are available for this execution. 60 | # “srun” executes the script times 61 | 62 | # Therefore, for error-free srun execution, we need to overwrite the SBATCH options set in the very beginning 63 | # by using --ntasks=2 --ntasks-per-node=1 explicitly. 64 | # Note: the nuance is --ntasks=8 --ntasks-per-node=4 works for mpirun + python main.py --args, 65 | # while --ntasks=2 --ntasks-per-node=1 works for srun + torchrun. 
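# note: the sbatch script body runs on the first node of the allocation, so $(hostname) in the
# srun + torchrun command below resolves to a host that the second node can reach for the c10d rendezvous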
66 | 67 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) & 68 | srun --nodes=2 --ntasks-per-node=1 --ntasks=2 torchrun --nnodes=2 --nproc_per_node=4 \ 69 | --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$MASTER_PORT \ 70 | main.py --batch_size=2048 --ddp -m=vector_demo_multiple_node_srun_ddp 71 | 72 | ##### DEBUG info ##### 73 | #echo $SLURM_JOB_NODELIST 74 | # 75 | #echo $(hostname) 76 | # 77 | #echo $(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) 78 | # 79 | -------------------------------------------------------------------------------- /setup/requirements_cc.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | torchaudio 4 | tqdm 5 | numpy 6 | ml_collections 7 | torch_tb_profiler 8 | -------------------------------------------------------------------------------- /setup/requirements_sockeye.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu116 2 | torch 3 | torchvision 4 | torchaudio 5 | tqdm 6 | numpy 7 | ml_collections 8 | torch_tb_profiler 9 | -------------------------------------------------------------------------------- /utils/arg_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import pdb 5 | import shutil 6 | import random 7 | import sys 8 | from datetime import datetime 9 | import numpy as np 10 | import torch 11 | import torch.distributed as dist 12 | from torch.utils.tensorboard import SummaryWriter 13 | 14 | from utils.dist_training import get_ddp_save_flag 15 | 16 | 17 | def parse_arguments(): 18 | """ 19 | Argument parser. 20 | """ 21 | parser = argparse.ArgumentParser(description="Running Experiments") 22 | parser.add_argument('-l', '--log_level', type=str, 23 | default='DEBUG', help="Logging Level, one of: DEBUG, INFO, WARNING, ERROR, CRITICAL") 24 | parser.add_argument('-m', '--comment', type=str, 25 | default="", help="A single line comment for the experiment") 26 | parser.add_argument('--dp', default=False, action='store_true', 27 | help='To use DataParallel distributed learning.') 28 | parser.add_argument('--ddp', default=False, action='store_true', 29 | help='To use DDP distributed learning') 30 | parser.add_argument('--ddp_gpu_ids', nargs='+', default=None, 31 | help="A list of GPU IDs to run distributed learning") 32 | parser.add_argument('--batch_size', default=256, type=int, 33 | help='Training batch size.') 34 | parser.add_argument('--epoch', default=5, type=int, 35 | help='Training epochs.') 36 | parser.add_argument('--seed', default=1234, type=int, 37 | help='Random seed.') 38 | parser.add_argument('--ddp_init_method', default='env://', type=str, 39 | help='torch.distributed.init_process_group options.') 40 | 41 | args = parser.parse_args() 42 | 43 | # add log directory 44 | if args.dp: 45 | dist_status = 'dp' 46 | elif args.ddp: 47 | dist_status = 'ddp' 48 | else: 49 | dist_status = 'single_gpu' 50 | 51 | logdir_nm = dist_status + "_" + datetime.now().strftime("%Y%m%d_%H%M%S") 52 | if len(args.comment): 53 | logdir_nm += '_' + args.comment 54 | 55 | logdir = os.path.join('runs', logdir_nm) 56 | os.makedirs(logdir, exist_ok=True) 57 | 58 | args.logdir = logdir 59 | print('Args: \n', args) 60 | return args 61 | 62 | 63 | def set_seed_and_logger(seed, logdir, log_level, comment, dist_helper): 64 | """ 65 | Set up random seed number and global logger. 
66 | """ 67 | # Setup random seed 68 | if dist_helper.is_ddp: 69 | seed += dist.get_rank() 70 | else: 71 | pass 72 | random.seed(seed) 73 | np.random.seed(seed) 74 | torch.manual_seed(seed) 75 | torch.cuda.manual_seed_all(seed) 76 | 77 | # torch numerical accuracy flags 78 | # reference: https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices 79 | # The flag below controls whether to allow TF32 on matmul. This flag defaults to True. 80 | torch.backends.cuda.matmul.allow_tf32 = False 81 | # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True. 82 | torch.backends.cudnn.allow_tf32 = False 83 | 84 | # Setup logger 85 | if dist_helper.is_ddp: 86 | log_file = os.path.join(logdir, "ddp_rank_{:02d}_".format(dist.get_rank()) + log_level.lower() + ".log") 87 | else: 88 | log_file = os.path.join(logdir, log_level.lower() + ".log") 89 | logger_format = comment + '| %(asctime)s %(message)s' 90 | fh = logging.FileHandler(log_file) 91 | fh.setLevel(log_level) 92 | for handler in logging.root.handlers[:]: 93 | logging.root.removeHandler(handler) 94 | logging.basicConfig(level=logging.DEBUG, format=logger_format, 95 | datefmt='%m-%d %H:%M:%S', 96 | handlers=[ 97 | fh, 98 | logging.StreamHandler(sys.stdout) 99 | ]) 100 | logging.getLogger('matplotlib.font_manager').setLevel(logging.INFO) # remove excessive matplotlib messages 101 | logging.getLogger('matplotlib').setLevel(logging.INFO) # remove excessive matplotlib messages 102 | logging.info('EXPERIMENT BEGIN: ' + comment) 103 | logging.info('logging into %s', log_file) 104 | 105 | # Setup tensorboard logger 106 | if get_ddp_save_flag(): 107 | writer = SummaryWriter(log_dir=logdir) 108 | else: 109 | writer = None 110 | return writer 111 | 112 | 113 | def backup_code(logdir): 114 | if get_ddp_save_flag(): 115 | code_path = os.path.join(logdir, 'code') 116 | dirs_to_save = ['utils'] 117 | os.makedirs(code_path, exist_ok=True) 118 | 119 | # save_name = os.path.join(code_path, 'config.yaml') 120 | # yaml.dump(dict(config), open(save_name, 'w'), default_flow_style=False) 121 | 122 | os.system('cp ./*py ' + code_path) 123 | [shutil.copytree(os.path.join('./', this_dir), os.path.join(code_path, this_dir)) for this_dir in dirs_to_save] 124 | -------------------------------------------------------------------------------- /utils/dist_training.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pdb 4 | 5 | import torch 6 | from torch import distributed as dist, nn as nn 7 | from torch.nn.parallel import DistributedDataParallel as DDP 8 | 9 | 10 | class DistributedHelper(object): 11 | def __init__(self, flag_dp, flag_ddp, ddp_gpu_ids, init_method): 12 | self.flag_dp = flag_dp 13 | self.flag_ddp = flag_ddp 14 | self.ddp_gpu_ids = ddp_gpu_ids 15 | self.init_method = init_method 16 | 17 | if (self.flag_dp or self.flag_ddp) and ddp_gpu_ids is None: 18 | assert torch.cuda.device_count() > 1, "Number of GPU must be more than one to use distributed learning!" 19 | assert not all((flag_dp, flag_ddp)), \ 20 | "Flag DP ({:}) and flag DDP ({:}) cannot be both true!".format(flag_dp, flag_ddp) 21 | 22 | self.gpu_name = 'dummy' 23 | self.init_ddp() 24 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 25 | 26 | def init_ddp(self): 27 | """ 28 | Initialize DDP distributed training if necessary. 
29 | Note: we have to initialize DDP mode before initialize the logging file, otherwise the multiple DDP 30 | processes' loggings will interfere with each other. 31 | """ 32 | print("Number of available GPU to use: {}".format(torch.cuda.device_count())) 33 | if self.flag_ddp: 34 | self.init_ddp_backend() 35 | self.gpu_name = torch.cuda.get_device_name() 36 | print("Setup DDP for process {:d} using GPUs {} (ID) with NCCL backend. GPU for this process: {:s}".format( 37 | os.getpid(), self.ddp_gpu_ids, self.gpu_name)) 38 | elif self.flag_dp: 39 | gpu_specs = [torch.cuda.get_device_name(i_gpu) for i_gpu in range(torch.cuda.device_count())] 40 | self.gpu_name = ','.join(gpu_specs) 41 | print("Setup DP using {:d} GPUs, specs: {:s}.".format(torch.cuda.device_count(), self.gpu_name)) 42 | else: 43 | self.gpu_name = torch.cuda.get_device_name() 44 | print("Single GPU mode, specs: {:s}.".format(self.gpu_name)) 45 | 46 | def init_ddp_backend(self): 47 | """ 48 | Start DDP engine using NCCL backend. 49 | """ 50 | ddp_status, env_dict = self.get_ddp_status() 51 | local_rank = env_dict['LOCAL_RANK'] 52 | 53 | if self.ddp_gpu_ids is not None: 54 | assert isinstance(self.ddp_gpu_ids, list) 55 | num_gpus = len(self.ddp_gpu_ids) 56 | gpu_id = int(self.ddp_gpu_ids[local_rank % num_gpus]) 57 | torch.cuda.set_device(gpu_id) # set single gpu device per process 58 | else: 59 | torch.cuda.set_device(local_rank) # set single gpu device per process 60 | dist.init_process_group(backend="nccl", init_method=self.init_method, rank=env_dict['WORLD_RANK'], world_size=env_dict['WORLD_SIZE']) 61 | 62 | def dist_adapt_model(self, model): 63 | """ 64 | Setup distributed learning for network. 65 | """ 66 | logging.info("Adapt the model for distributed training...") 67 | if self.flag_ddp: 68 | # DDP 69 | model = DDP(model.cuda(), device_ids=[torch.cuda.current_device()]) # single CUDA device per process 70 | logging.info("Distributed ON. Mode: DDP. Backend: {:s}, Rank: {:d} / World size: {:d}. " 71 | "Current device: {}, spec: {}".format( 72 | dist.get_backend(), dist.get_rank(), dist.get_world_size(), 73 | torch.cuda.current_device(), self.gpu_name)) 74 | elif self.flag_dp: 75 | # DP 76 | model = nn.DataParallel(model) 77 | model.to(torch.device("cuda")) # multiple devices per process, controlled by CUDA_VISIBLE_DEVICES 78 | logging.info("Distributed ON. Mode: DP. Number of available GPU to use: {}, specs: {}".format( 79 | torch.cuda.device_count(), self.gpu_name)) 80 | else: 81 | # single GPU 82 | logging.info("Distributed OFF. Single-GPU training, specs: {}.".format(self.gpu_name)) 83 | 84 | return model 85 | 86 | def ddp_sync(self): 87 | if self.flag_ddp and dist.is_initialized(): 88 | dist.barrier() 89 | else: 90 | pass 91 | 92 | def clean_up(self): 93 | self.ddp_sync() 94 | if self.flag_ddp and dist.is_initialized(): 95 | dist.destroy_process_group() 96 | else: 97 | pass 98 | 99 | @staticmethod 100 | def get_ddp_status(): 101 | """ 102 | Get DDP-related env. parameters. 
103 | """ 104 | if 'LOCAL_RANK' in os.environ: 105 | # Environment variables set by torch.distributed.launch or torchrun 106 | local_rank = int(os.environ['LOCAL_RANK']) 107 | world_size = int(os.environ['WORLD_SIZE']) 108 | world_rank = int(os.environ['RANK']) 109 | elif 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ: 110 | # Environment variables set by mpirun 111 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) 112 | world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) 113 | world_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) 114 | else: 115 | raise NotImplementedError 116 | 117 | env_dict = { 118 | 'MASTER_ADDR': os.environ['MASTER_ADDR'], 119 | 'MASTER_PORT': os.environ['MASTER_PORT'], 120 | 'LOCAL_RANK': local_rank, 121 | 'WORLD_SIZE': world_size, 122 | 'WORLD_RANK': world_rank, 123 | } 124 | ddp_status = "Process PID: {}. DDP setup: {} ".format(os.getpid(), env_dict) 125 | return ddp_status, env_dict 126 | 127 | @property 128 | def is_ddp(self): 129 | """ 130 | DDP flag. 131 | """ 132 | return self.flag_ddp 133 | 134 | @property 135 | def is_dp(self): 136 | """ 137 | DP flag. 138 | """ 139 | return self.flag_dp 140 | 141 | @property 142 | def is_distributed(self): 143 | """ 144 | Distributed learning flag. 145 | """ 146 | return self.flag_dp or self.flag_ddp 147 | 148 | 149 | # Independent function helpers 150 | def get_ddp_save_flag(): 151 | """ 152 | Return saving flag for DDP mode, only rank 0 process makes the output. 153 | """ 154 | flag_save = True 155 | if dist.is_initialized(): 156 | if dist.get_rank() != 0: 157 | flag_save = False 158 | return flag_save 159 | 160 | 161 | def dist_save_model(data_to_save, to_save_path): 162 | """ 163 | Wrapper to save based on DDP status (for main process only). 164 | """ 165 | if get_ddp_save_flag(): 166 | torch.save(data_to_save, to_save_path) 167 | -------------------------------------------------------------------------------- /utils/learning_utils.py: -------------------------------------------------------------------------------- 1 | def count_model_params(model): 2 | """ 3 | Go through the model parameters 4 | """ 5 | param_strings = [] 6 | max_string_len = 126 7 | for name, param in model.named_parameters(): 8 | if param.requires_grad: 9 | line = '.' * max(0, max_string_len - len(name) - len(str(param.size()))) 10 | param_strings.append(f"{name} {line} {param.size()}") 11 | param_string = '\n'.join(param_strings) 12 | 13 | total_params = sum(p.numel() for p in model.parameters()) 14 | total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 15 | return param_string, total_params, total_trainable_params 16 | 17 | 18 | def _print_and_log(in_str, log_file): 19 | assert isinstance(in_str, str) 20 | print(in_str, flush=True) 21 | log_file.write(in_str + '\n') 22 | log_file.flush() 23 | --------------------------------------------------------------------------------