├── .gitignore
├── LICENSE
├── README.md
├── helper
│   └── benchmark_layernorm.py
├── main.py
├── scripts
│   ├── demo_cc.sh
│   ├── demo_cc_apptainer.sh
│   ├── demo_sockeye.sh
│   ├── demo_sockeye_pbs.sh
│   └── demo_vector.sh
├── setup
│   ├── requirements_cc.txt
│   └── requirements_sockeye.txt
└── utils
    ├── arg_parser.py
    ├── dist_training.py
    └── learning_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | mnist_data
2 | logger*
3 | venv*
4 | runs/
5 | .idea
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | pip-wheel-metadata/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .nox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | *.py,cover
57 | .hypothesis/
58 | .pytest_cache/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 | db.sqlite3-journal
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | target/
82 |
83 | # Jupyter Notebook
84 | .ipynb_checkpoints
85 |
86 | # IPython
87 | profile_default/
88 | ipython_config.py
89 |
90 | # pyenv
91 | .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 |
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 |
107 | # SageMath parsed files
108 | *.sage.py
109 |
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 |
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 |
123 | # Rope project settings
124 | .ropeproject
125 |
126 | # mkdocs documentation
127 | /site
128 |
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 |
134 | # Pyre type checker
135 | .pyre/
136 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Qi Yan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HPC_helper
2 |
3 | This repository showcases a minimal example of using `PyTorch` distributed training on computing clusters, enabling you to run your training tasks on `N` nodes, each with `M` GPUs. It includes common use cases such as [DataParallel (DP)](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) or [DistributedDataParallel (DDP)](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and offers support for [PBS](https://2020.help.altair.com/2020.1/PBSProfessional/PBSUserGuide2020.1.1.pdf) and [SLURM](https://slurm.schedmd.com/documentation.html) systems. Below, you'll find runnable code and scripts for UBC Sockeye, Vector Vaughan cluster, and Digital Research Alliance of Canada (formerly ComputeCanada) HPCs.
4 |
5 | Last updated: Jun 23, 2024. Contact: Qi Yan, qi.yan@ece.ubc.ca
6 |
7 | ## Get started
8 |
9 | ### Setup python environment
10 | ```bash
11 | # load python 3.8 at HPC
12 | # module load gcc/9.4.0 python/3.8.10 cuda/11.3.1 nccl/2.9.9-1-cuda11-3 # Sockeye
13 | # module load python/3.8 cuda-11.7 # Vector
14 | # module load python/3.10.13 StdEnv/2023 # CC
15 |
16 | # python virtual environment
17 | python -m venv venvhpc
18 | source venvhpc/bin/activate
19 | pip install -U pip
20 | pip install -r setup/requirements_sockeye.txt # if at Sockeye
21 | pip install -r setup/requirements_cc.txt # if at Vector or CC
22 |
23 | # sanity check at Sockeye or CC
24 | # you must enter an interactive session on Vector to run this
25 | python -c "import torch; print('Things are done.')"
26 |
27 | # download MNIST dataset
28 | mkdir -p ./mnist_data/MNIST/raw
29 | wget https://raw.githubusercontent.com/fgnt/mnist/master/train-images-idx3-ubyte.gz -P ./mnist_data/MNIST/raw
30 | wget https://raw.githubusercontent.com/fgnt/mnist/master/train-labels-idx1-ubyte.gz -P ./mnist_data/MNIST/raw
31 | wget https://raw.githubusercontent.com/fgnt/mnist/master/t10k-images-idx3-ubyte.gz -P ./mnist_data/MNIST/raw
32 | wget https://raw.githubusercontent.com/fgnt/mnist/master/t10k-labels-idx1-ubyte.gz -P ./mnist_data/MNIST/raw
33 | ```
34 | On Alliance/CC clusters, you can only `pip install` python packages available on the system and `conda` is forbidden.
35 | If you need to install additional packages, you can use the [`apptainer` container environment](https://docs.alliancecan.ca/wiki/Apptainer/en).
36 | See the section below for details.
37 |
38 |
39 | #### Apptainer instructions on Alliance/CC clusters
40 | The following instructions have been tested on the `narval` cluster. Similar steps work on other clusters such as `cedar`, though the storage paths may vary.
41 |
42 | ```bash
43 | ## pull image and create sandbox; recommended to do so at /scratch space for faster runtime
44 | module load apptainer-suid/1.1
45 | mkdir -p /lustre07/scratch/${USER}/venv && cd /lustre07/scratch/${USER}/venv
46 | apptainer pull --name pytorch220_official.sif docker://pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel
47 | apptainer build --sandbox venvhpc.sandbox pytorch220_official.sif
48 |
49 | ## get ready to enter the sandbox in an interactive shell
50 | export TMPDIR=/tmp/${USER}tmp
51 | mkdir -p ${TMPDIR}
52 | export APPTAINER_CACHEDIR=${TMPDIR}
53 | export APPTAINER_TMPDIR=${TMPDIR}
54 |
55 | ## bind the project, scratch, home directory to the sandbox; run `apptainer help run` to see meaning of each flag
56 | apptainer shell -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox
57 |
58 | ## inside apptainer create conda env or use python venv; recommended to create conda env at /scratch space
59 | bash
60 | export USER=YOUR_USER_NAME # change to your username
61 | conda create -p /lustre07/scratch/${USER}/venv/condaenvs/venvhpc python=3.8 -y
62 | conda activate /lustre07/scratch/${USER}/venv/condaenvs/venvhpc
63 |
64 | mkdir -p /lustre07/scratch/${USER}/venv/condaenvs/condacache
65 | conda config --add pkgs_dirs /lustre07/scratch/${USER}/venv/condaenvs/condacache
66 |
67 | ## pip install within the conda env
68 | pip install -U pip
69 | pip install -r setup/requirements_cc.txt
70 |
71 | ## sanity check
72 | python -c "import torch; print('Things are done.')"
73 |
74 | ## follow the above "download MNIST dataset" section to load the dataset
75 | ```
76 |
77 | The apptainer sandbox is a containerized environment that allows you to install custom packages without root access. The `--bind` or `-B` flag is used to bind directories to the container. The sandbox itself contains only the necessary system libraries and the user's home directory. We still store the code and datasets on normal storage space.
78 |
79 |
80 |
81 |
82 | ### Go training
83 | We showcase the use of distributed learning for a simple training task using ResNet50 as backbone.
84 |
85 | **IMPORTANT**:
86 | * please change the account and notification email address in the bash scripts before running.
87 | * the old Sockeye script (`scripts/demo_sockeye_pbs.sh`) targets the OpenPBS system, which Sockeye no longer uses; it is kept just for the sake of completeness.
88 | * the Sockeye, Vector, and CC scripts are intended for SLURM systems, but we don't provide `preemption` support in the Vector script.
89 |
90 | ```bash
91 | # at Sockeye
92 | sbatch scripts/demo_sockeye.sh
93 |
94 | # at Vector
95 | sbatch scripts/demo_vector.sh
96 |
97 | # at CC
98 | sbatch scripts/demo_cc.sh
99 |
100 | # at CC with apptainer
101 | ## note: please change the paths in the script accordingly
102 | sbatch scripts/demo_cc_apptainer.sh
103 | ```
104 | Please check the training logs under `runs` for the runtime comparison. Here are the five-epoch training times from my runs:
105 |
106 | | #Nodes | #GPUs per node | PyTorch Distributed Method | Sockeye runtime | CC runtime | Vector runtime |
107 | | ------ | -------------- | -------------------------- | --------------- | ---------------------------- | --------------------------------- |
108 | | N=1 | M=1 | N/A | 363.4s | 309.7s | 425.0s |
109 | | N=1 | M=4 | DP | 103.5s | 114.2s | 133.9s |
110 | | N=1 | M=4 | DDP | 93.7s | 85.2s | 113.4s |
111 | | N=2 | M=4 | DDP | 55.7s | 47.0s (mpirun); 47.4s (srun) | 60.9s (mpirun); 60.6s (srun) |
112 |
113 | In the demo scripts, we use Tesla V100-SXM2-32GB GPUs at Sockeye and CC, and RTX6000-24GB GPUs at Vector.
114 | Their single-precision performance is comparable: 15.7 TFLOPS for the V100-SXM2-32GB and 16.3 TFLOPS for the RTX6000-24GB.
115 | The runtime difference is therefore mainly due to the GPU memory size, which limits the per-GPU batch size (note the smaller batch sizes in the Vector script).
116 |
117 | ## Distributed training rule of thumb
118 |
119 | Generally, we can use either the [DataParallel (DP)](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) or the [DistributedDataParallel (DDP)](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) protocol to start distributed training. DP is straightforward and only involves changes to a few lines of code. However, it is less efficient than DDP (please see [this page](https://pytorch.org/docs/stable/notes/cuda.html#use-nn-parallel-distributeddataparallel-instead-of-multiprocessing-or-nn-dataparallel) for why), and it doesn't support multi-node training. Therefore, it's better to always start with DDP despite its relatively higher complexity. The table below shows the possible ways to launch your distributed training jobs; a minimal code sketch of the two wrapping styles follows the table.
120 |
121 |
122 | | #Nodes | #GPUs per node | PyTorch Distributed Method | Launch Method at Sockeye | Launch Method at CC |
123 | |--------|----------------|----------------------------|---------------------------|----------------------|
124 | | N=1 | M=1 | N/A | N/A | N/A |
125 | | N=1 | M>1 | DDP or DP | torchrun | torchrun |
126 | | N>1 | M>1 | DDP | mpirun + python | mpirun + python or srun + torchrun |
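
For reference, below is a minimal sketch of how the two wrapping styles differ on a CUDA machine. It is simplified and not the exact code used in this repo; the full version lives in `utils/dist_training.py` and `main.py`.

```python
# Minimal sketch of DP vs. DDP model wrapping (simplified; assumes a CUDA machine).
import os
import torch
from torch import nn
from torch import distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

model = nn.Linear(10, 10)  # stand-in for your real network

if "LOCAL_RANK" in os.environ:
    # DDP: one process per GPU (launched by torchrun); each process owns exactly one device.
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")  # rendezvous info comes from the launcher's env vars
    model = DDP(model.cuda(), device_ids=[local_rank])
else:
    # DP: a single process drives all GPUs visible via CUDA_VISIBLE_DEVICES (single node only).
    model = nn.DataParallel(model).to(torch.device("cuda"))
```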
127 |
128 |
129 | ### Difference between PBS (old Sockeye) and SLURM (Vector and CC) systems
130 | On the PBS system (old Sockeye), `mpirun + python` seems to be the only viable way to launch multi-node training. On SLURM systems (Vector and CC), we can use either `srun + torchrun` or `mpirun + python`. Essentially, both `mpirun` and `srun` launch parallel jobs across different nodes *in one line of code*, and these two mechanisms are the key to scalable multi-node DDP training. We use the following example to show the crucial details that avoid errors.
131 |
132 | **`mpirun + python` method explained**
133 |
134 | Sample commands:
135 | ```bash
136 | mpirun -np 8 \
137 | --hostfile $PBS_NODEFILE --oversubscribe \
138 | -x MASTER_ADDR=$(hostname) \
139 | -x MASTER_PORT=$MASTER_PORT \
140 | -x CUDA_VISIBLE_DEVICES=0,1,2,3 \
141 | -x PATH \
142 | -bind-to none -map-by :OVERSUBSCRIBE \
143 | -mca pml ob1 -mca btl ^openib \
144 | python main.py --batch_size=6144 --ddp -m=sockeye_demo_multiple_node_mpi_ddp
145 | ```
146 | `mpirun` is executed once; it then launches the parallel processes, and their communication is handled by PyTorch and `mpirun` together. The key is that we only need to **run `mpirun + python` once, on the master node**.
147 |
148 | `mpirun` comes with an option `-np` that specifies the total number of processes. In our demo script, each process amounts to one trainer (i.e., one GPU), so we use `-np 8` for 2 nodes with 8 GPUs in total. This must be used along with `--oversubscribe`, for the reasons below.
149 |
150 | `mpirun` assigns job processes to nodes using [`slot`](https://www.open-mpi.org/doc/v4.0/man1/mpirun.1.php#sect3) scheduling, which was historically intended for CPU-only tasks (one process per CPU core). Such slot assignment can go wrong in the age of GPU training, where one GPU now corresponds to one process. For example, old Sockeye's PBS would not distribute the 8 tasks equally across the 2 nodes and would instead raise an error saying that the number of available slots is insufficient. Therefore, we use the `--oversubscribe` option to force `mpirun` to distribute tasks equally across the nodes and to ignore these false-alarm errors.
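
For context, every process spawned by `mpirun` figures out its own rank from the `OMPI_COMM_WORLD_*` environment variables before joining the process group. The snippet below is a condensed sketch of what `utils/dist_training.py` does in this case (simplified, not the verbatim repo code):

```python
# Condensed sketch of the mpirun branch in utils/dist_training.py (simplified).
import os
import torch
from torch import distributed as dist

# Open MPI exports these variables for every process it launches.
local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])  # rank within this node
world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"])        # global rank across all nodes
world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])        # total number of processes

torch.cuda.set_device(local_rank)  # one GPU per process
# MASTER_ADDR / MASTER_PORT are forwarded to every process via the `-x` flags above.
dist.init_process_group(backend="nccl", init_method="env://",
                        rank=world_rank, world_size=world_size)
```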
151 |
152 | **`srun + torchrun` method explained**
153 |
154 | Sample commands:
155 |
156 | ```bash
157 | srun --ntasks-per-node=1 --ntasks=2 torchrun --nnodes=2 --nproc_per_node=4 \
158 | --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$MASTER_PORT \
159 | main.py --batch_size=6144 --ddp -m=cc_demo_multiple_node_srun_ddp
160 | ```
161 |
162 | The `SLURM_NTASKS` variable tells the script how many processes are available for this execution, and `srun` executes the given command `$SLURM_NTASKS` times. For the `torchrun` launch method, we only need to **run it once per node**; in our example, `srun` therefore runs the `torchrun` command twice, once on each of the two nodes. Note that this is different from `mpirun + python`, where we *run it once for all nodes*.
163 |
164 | For error-free `srun` execution, we need to pay attention to the `#SBATCH` options set at the very beginning, or override them by passing `--ntasks=2 --ntasks-per-node=1` to `srun` explicitly. The nuance is that `--ntasks=8 --ntasks-per-node=4` works for the `mpirun + python` method, while `--ntasks=2 --ntasks-per-node=1` works for `srun + torchrun`.
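
If you want to double-check the launch topology, a tiny sanity-check script like the one below (a hypothetical `check_ranks.py`, not part of this repo) can be substituted for `main.py` in the `srun ... torchrun ...` command above. With 2 nodes and 4 GPUs each, it should print eight distinct ranks and the same reduced sum on every rank:

```python
# check_ranks.py -- hypothetical sanity check, launched via srun + torchrun.
import os
import torch
from torch import distributed as dist

dist.init_process_group(backend="nccl")   # torchrun supplies MASTER_ADDR/PORT, RANK, WORLD_SIZE
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

t = torch.ones(1, device="cuda") * dist.get_rank()
dist.all_reduce(t)                        # default op is SUM, so 8 ranks give 0 + 1 + ... + 7 = 28
print(f"rank {dist.get_rank()}/{dist.get_world_size()} "
      f"on {os.uname().nodename}: reduced sum = {t.item():.0f}")
dist.destroy_process_group()
```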
165 |
166 | ## Adapt your code to distributed training
167 | If you are okay with PyTorch's built-in distributed training utilities, the helper at `utils/dist_training.py` could be useful. To adapt your code with minimal changes, please refer to the lines in `main.py` where `dist_helper` is called; a condensed outline follows.
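
Below is that outline as a function skeleton, condensed from `main.py` rather than a drop-in replacement for it; the `run_one_epoch` callback and `model_path` argument are hypothetical stand-ins for your own training loop and checkpoint path:

```python
# Outline condensed from main.py: where dist_helper hooks into an ordinary training script.
import torch
from torch.utils.data import DataLoader, DistributedSampler

from utils.dist_training import DistributedHelper, get_ddp_save_flag


def train(args, model, train_dataset, epochs, run_one_epoch, model_path):
    """`run_one_epoch` and `model_path` are hypothetical stand-ins for your loop and checkpoint path."""
    dist_helper = DistributedHelper(args.dp, args.ddp, args.ddp_gpu_ids, args.ddp_init_method)
    model = dist_helper.dist_adapt_model(model)          # wrap with DP / DDP as requested

    if dist_helper.is_ddp:                               # shard the data across processes
        sampler = DistributedSampler(train_dataset, shuffle=True)
        train_loader = DataLoader(train_dataset, sampler=sampler, batch_size=args.batch_size)
    else:
        train_loader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size)

    for i_epoch in range(epochs):
        if dist_helper.is_ddp:
            train_loader.sampler.set_epoch(i_epoch)      # reshuffle differently every epoch
        run_one_epoch(model, train_loader)
        if get_ddp_save_flag():                          # only rank 0 writes checkpoints
            torch.save(model.state_dict(), model_path)
        dist_helper.ddp_sync()                           # barrier so all ranks stay in step

    dist_helper.clean_up()                               # destroy the process group
```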
168 |
169 | Other third-party plugins such as [horovod](https://horovod.ai/) and [pytorch lightning](https://www.pytorchlightning.ai/) can also serve the same purpose.
170 |
171 |
172 |
173 | ## Reference
174 | #### Tutorial
175 | * [Multi Node PyTorch Distributed Training Guide For People In A Hurry](https://lambdalabs.com/blog/multi-node-pytorch-distributed-training-guide)
176 | * [PyTorch with Multiple GPUs](https://docs.alliancecan.ca/wiki/PyTorch#PyTorch_with_Multiple_GPUs)
177 | * [Multi-node-training on slurm with PyTorch](https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904)
178 |
179 | #### Helpful documentations
180 | * [pytorch torchrun](https://pytorch.org/docs/stable/elastic/run.html)
181 | * [mpirun man page](https://www.open-mpi.org/doc/v4.0/man1/mpirun.1.php)
182 | * [SLURM srun page](https://slurm.schedmd.com/srun.html)
183 | * [SLURM sbatch environment variables](https://slurm.schedmd.com/sbatch.html#SECTION_OUTPUT-ENVIRONMENT-VARIABLES)
184 | * [PBS qsub environment variables](https://opus.nci.org.au/display/Help/Useful+PBS+Environment+Variables)
185 |
186 | #### Wiki
187 | * [UBC Sockeye](https://confluence.it.ubc.ca/display/UARC/About+Sockeye)
188 | * [Vector](https://support.vectorinstitute.ai/FrontPage)
189 | * [CC](https://docs.alliancecan.ca/wiki/Technical_documentation)
--------------------------------------------------------------------------------
/helper/benchmark_layernorm.py:
--------------------------------------------------------------------------------
1 | import time
2 | import torch
3 | import torch.nn as nn
4 | from torch.profiler import tensorboard_trace_handler
5 |
6 |
7 | class TestNet(nn.Module):
8 | def __init__(self):
9 | super().__init__()
10 | self.fc_in = nn.Linear(3, 128)
11 | self.layers = nn.ModuleList()
12 | for i in range(0, 10):
13 | self.layers.append(nn.Linear(128, 128))
14 | self.layers.append(nn.LayerNorm(128))
15 | # self.layers.append(nn.BatchNorm1d(262144))
16 | self.fc_out = nn.Linear(128, 3)
17 |
18 | def forward(self, x: torch.Tensor) -> torch.Tensor:
19 | x = self.fc_in(x)
20 | for layer in self.layers:
21 | x = layer(x)
22 | # print(x.shape)
23 | x = self.fc_out(x)
24 | return x
25 |
26 |
27 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
28 | net = TestNet().to(device).train()
29 | in_data = torch.zeros([1, 512 * 512, 3]).to(device)
30 | criterion = nn.MSELoss().to(device)
31 | optimizer = torch.optim.SGD(net.parameters(), 0.01)
32 |
33 | with torch.profiler.profile(
34 | schedule=torch.profiler.schedule(wait=2, warmup=2, active=6, repeat=1, skip_first=2),
35 |     on_trace_ready=tensorboard_trace_handler("tmp/profile"),  # view the trace via: tensorboard --logdir tmp/profile
36 | with_stack=True, with_flops=True, with_modules=True, profile_memory=True) as profiler:
37 |
38 | for i in range(0, 20):
39 | t0 = time.time()
40 | out_data = net(in_data)
41 | loss = criterion(out_data, in_data)
42 |
43 | optimizer.zero_grad()
44 | loss.backward()
45 | optimizer.step()
46 | profiler.step()
47 | print(f"step: {i:,d} {time.time() - t0:.3f}")
48 | print("Done! ")
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # Reference:
2 | # https://github.com/olehb/pytorch_ddp_tutorial/blob/main/ddp_tutorial_multi_gpu.py
3 |
4 |
5 | import os
6 | import logging
7 | import pdb
8 | import time
9 | import numpy as np
10 | from tqdm import tqdm
11 | from datetime import datetime
12 |
13 | import torch
14 | from torch import nn, optim
15 | from torch import distributed as dist
16 | from torch.utils.data import DataLoader, DistributedSampler
17 | import torchvision
18 | from torchvision import datasets, transforms
19 |
20 | from utils.arg_parser import parse_arguments, set_seed_and_logger, backup_code
21 | from utils.dist_training import DistributedHelper, get_ddp_save_flag
22 | from utils.learning_utils import count_model_params
23 |
24 |
25 | def init_basics():
26 | """
27 | Initialization
28 | """
29 | args = parse_arguments()
30 | dist_helper = DistributedHelper(args.dp, args.ddp, args.ddp_gpu_ids, args.ddp_init_method)
31 | writer = set_seed_and_logger(args.seed, args.logdir, args.log_level, args.comment, dist_helper)
32 | backup_code(args.logdir)
33 | return args, dist_helper, writer
34 |
35 |
36 | def init_model(dist_helper):
37 | """
38 | Initialize model and training necessities.
39 | """
40 | # model, we use an unnecessarily heavy model to showcase the GPU profiling
41 | model = getattr(torchvision.models, 'resnet50')(weights=None)
42 | model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
43 | model.fc = nn.Linear(model.fc.in_features, 10) # 10 classes to predict
44 | model = model.to(dist_helper.device)
45 |
46 | param_string, total_params, total_trainable_params = count_model_params(model)
47 | logging.info(f"Parameters: \n{param_string}")
48 | logging.info(f"Parameters Count: {total_params:,}, Trainable: {total_trainable_params:,}.")
49 |
50 | # adapt to distributed training
51 | model = dist_helper.dist_adapt_model(model)
52 |
53 | # optimizer and loss function
54 | optimizer = optim.Adam(model.parameters(), lr=1e-4)
55 | criterion = nn.CrossEntropyLoss()
56 | return model, optimizer, criterion
57 |
58 |
59 | def init_dataloader(batch_size, dist_helper):
60 | """
61 | Get dataloader
62 | """
63 | transform = transforms.Compose([
64 | transforms.ToTensor(),
65 | transforms.Normalize((0.1307,), (0.3081,)),
66 | transforms.Resize(128) # resize to larger image to showcase the use of GPU profiling
67 | ])
68 | dataset_loc = './mnist_data'
69 |
70 | train_dataset = datasets.MNIST(dataset_loc, download=True, train=True, transform=transform)
71 |
72 | # For final evaluation, it is advised not to use distributed sampler due to possibly incorrect results.
73 | # But we are using it now to accelerate evaluation during training.
74 | # Ref: https://github.com/pytorch/pytorch/issues/25162
75 | test_dataset = datasets.MNIST(dataset_loc, download=True, train=False, transform=transform)
76 |
77 | logging.info("Training set size: {:d}, testing set size: {:d}".format(len(train_dataset), len(test_dataset)))
78 |
79 | if dist_helper.is_ddp:
80 | batch_size_per_gpu = max(1, batch_size // dist.get_world_size())
81 | sampler = DistributedSampler(train_dataset, shuffle=True)
82 | train_loader = DataLoader(dataset=train_dataset, sampler=sampler, batch_size=batch_size_per_gpu,
83 | pin_memory=True, num_workers=min(6, os.cpu_count()))
84 |
85 | sampler = DistributedSampler(test_dataset, shuffle=False)
86 | test_loader = DataLoader(dataset=test_dataset, sampler=sampler, batch_size=batch_size_per_gpu,
87 | pin_memory=True, num_workers=min(6, os.cpu_count()))
88 | else:
89 | train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,
90 | shuffle=True, pin_memory=True, num_workers=min(6, os.cpu_count()))
91 | test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size,
92 | shuffle=False, pin_memory=True, num_workers=min(6, os.cpu_count()))
93 |
94 | return train_loader, test_loader
95 |
96 |
97 | def go_training(epochs, model, optimizer, criterion, dist_helper, train_loader, test_loader, writer, logdir):
98 | """
99 | Training loop.
100 | """
101 |
102 | # init
103 | time_train_ls, time_val_ls = [], []
104 |     epoch_when_snapshot = list(range(0, epochs, max(1, epochs // 5)))  # guard against a zero step when epochs < 5
105 |
106 | # epoch-wise training
107 | for i_epoch in range(epochs):
108 | # train the model for one epoch
109 | if dist_helper.is_ddp:
110 | train_loader.sampler.set_epoch(i_epoch)
111 |
112 | time_epoch = time.time()
113 | train_loss = 0
114 | pbar = tqdm(train_loader)
115 | model.train()
116 | for x, y in pbar:
117 | x = x.to(dist_helper.device, non_blocking=True)
118 | y = y.to(dist_helper.device, non_blocking=True)
119 | optimizer.zero_grad()
120 | y_hat = model(x)
121 | batch_loss = criterion(y_hat, y)
122 | batch_loss.backward()
123 | optimizer.step()
124 | batch_loss_scalar = batch_loss.item()
125 | train_loss += batch_loss_scalar / x.shape[0]
126 | pbar.set_description(f'training batch_loss={batch_loss_scalar:.4f}')
127 | time_training = time.time() - time_epoch
128 |
129 | # calculate validation loss
130 | time_val = time.time()
131 | val_loss = 0.0
132 | pbar = tqdm(test_loader)
133 | model.eval()
134 | with torch.no_grad():
135 | for x, y in pbar:
136 | x = x.to(dist_helper.device, non_blocking=True)
137 | y = y.to(dist_helper.device, non_blocking=True)
138 | y_hat = model(x)
139 | batch_loss = criterion(y_hat, y)
140 | batch_loss_scalar = batch_loss.item()
141 | val_loss += batch_loss_scalar / x.shape[0]
142 | pbar.set_description(f'validation batch_loss={batch_loss_scalar:.4f}')
143 | time_val = time.time() - time_val
144 |
145 | logging.info(f"Epoch={i_epoch}, train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")
146 | logging.info("Training time: {:.3f}s, Validation time: {:.3f}s".format(time_training, time_val))
147 | time_train_ls.append(time_training)
148 | time_val_ls.append(time_val)
149 |
150 | if get_ddp_save_flag():
151 | writer.add_scalar("train/loss", train_loss, i_epoch)
152 | writer.add_scalar("test/loss", val_loss, i_epoch)
153 | writer.flush()
154 |
155 | if i_epoch in epoch_when_snapshot and get_ddp_save_flag():
156 | model_path = os.path.join(logdir, 'model_epoch_{:03d}_{:s}_{:d}.pt'.format(
157 | i_epoch, datetime.now().strftime("%Y%m%d-%H%M%S"), os.getpid()))
158 | torch.save(model.state_dict(), model_path)
159 | logging.info("Saving model to {:s}".format(model_path))
160 | dist_helper.ddp_sync()
161 |
162 | # Count overall training efficiency
163 | logging.info("{:s} Overall timing results {:s}".format('-' * 10, '-' * 10))
164 | logging.info("Total training time: {:.3f}s, total validation time: {:.3f}s".format(
165 | np.sum(time_train_ls), np.sum(time_val_ls)))
166 | for i_epoch, time_training, time_val in zip(range(epochs), time_train_ls, time_val_ls):
167 | logging.info("Epoch: {:d}, Training time: {:.3f}s, Validation time: {:.3f}s.".format(
168 | i_epoch, time_training, time_val))
169 |
170 |
171 | def main():
172 | """
173 | Main training loop
174 | """
175 |
176 | """Initialization basics"""
177 | args, dist_helper, writer = init_basics()
178 |
179 | """Get network"""
180 | model, optimizer, criterion = init_model(dist_helper)
181 |
182 | """Get dataloader"""
183 | train_loader, test_loader = init_dataloader(args.batch_size, dist_helper)
184 |
185 | """Go training"""
186 | go_training(args.epoch, model, optimizer, criterion, dist_helper, train_loader, test_loader, writer, args.logdir)
187 |
188 | """Distributed training cleanup"""
189 | dist_helper.clean_up()
190 |
191 |
192 | if __name__ == '__main__':
193 | main()
194 |
--------------------------------------------------------------------------------
/scripts/demo_cc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --account=def-rjliao
3 | #SBATCH --gres=gpu:v100l:4 # Number of GPUs per node (specifying v100l gpu)
4 | #SBATCH --nodes=2 # Number of nodes
5 | #SBATCH --ntasks=8 # Number of MPI process
6 | #SBATCH --ntasks-per-node=4 # Number of distributed process per compute node
7 | #SBATCH --cpus-per-task=8 # CPU cores per MPI process
8 | #SBATCH --mem=64G # memory per node
9 | #SBATCH --time=00-00:20 # time (DD-HH:MM)
10 | #SBATCH --mail-user=yanq@student.ubc.ca # send email regarding task status
11 | #SBATCH --mail-type=ALL
12 |
13 | ################################################################################
14 |
15 | # in this demo, we take 2 nodes and each node has 4 V100-32GB GPUs
16 | MASTER_PORT=29400
17 |
18 | module load gcc
19 | module load cuda
20 | module load nccl
21 | module load openmpi
22 |
23 | # you should submit job from the cloned repo's directory
24 | cd ${SLURM_SUBMIT_DIR}
25 | source venvhpc/bin/activate
26 | export OMP_NUM_THREADS=6
27 |
28 | # single GPU: use 1 GPU on 1 node
29 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=cc_demo_single_gpu
30 |
31 | # DP: use multiple GPUs on 1 node
32 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=cc_demo_single_node_dp
33 |
34 | # DDP: use multiple GPUs on 1 node
35 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=cc_demo_single_node_ddp
36 |
37 | # DDP: use multiple GPUs on multiple nodes
38 |
39 | # mpirun method
40 | mpirun -np 8 \
41 | -x MASTER_ADDR=$(hostname) \
42 | -x MASTER_PORT=$MASTER_PORT \
43 | -x PATH \
44 | -bind-to none -map-by :OVERSUBSCRIBE \
45 | -mca pml ob1 -mca btl ^openib \
46 | python main.py --batch_size=6144 --ddp -m=cc_demo_multiple_node_mpi_ddp
47 |
48 | # srun method
49 | # The SLURM_NTASKS variable tells the script how many processes are available for this execution.
50 | # “srun” executes the script $SLURM_NTASKS times
51 |
52 | # Therefore, for error-free srun execution, we need to overwrite the SBATCH options set in the very beginning
53 | # by using --ntasks=2 --ntasks-per-node=1 explicitly.
54 | # Note: the nuance is --ntasks=8 --ntasks-per-node=4 works for mpirun + python main.py --args,
55 | # while --ntasks=2 --ntasks-per-node=1 works for srun + torchrun.
56 |
57 | srun --nodes=2 --ntasks-per-node=1 --ntasks=2 torchrun --nnodes=2 --nproc_per_node=4 \
58 | --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$MASTER_PORT \
59 | main.py --batch_size=6144 --ddp -m=cc_demo_multiple_node_srun_ddp
60 |
61 | ##### DEBUG info #####
62 | #echo $SLURM_JOB_NODELIST
63 | #
64 | #echo $(hostname)
65 | #
66 | #echo $(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
67 | #
68 | ## extract the first string component before the first dot
69 | ## cdr2639.int.cedar.computecanada.ca -> cdr2639
70 | #echo $(echo $(hostname) | cut -d '.' -f 1)
71 |
--------------------------------------------------------------------------------
/scripts/demo_cc_apptainer.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --account=def-rjliao
3 | #SBATCH --gres=gpu:v100l:4 # Number of GPUs per node (specifying v100l gpu)
4 | #SBATCH --nodes=1 # Number of nodes
5 | #SBATCH --ntasks=4                      # Number of MPI process
6 | #SBATCH --ntasks-per-node=4 # Number of distributed process per compute node
7 | #SBATCH --cpus-per-task=8 # CPU cores per MPI process
8 | #SBATCH --mem=64G # memory per node
9 | #SBATCH --time=00-00:20 # time (DD-HH:MM)
10 | #SBATCH --mail-user=yanq@student.ubc.ca # send email regarding task status
11 | #SBATCH --mail-type=ALL
12 |
13 | ################################################################################
14 |
15 | # in this demo, we take 1 node, which has 4 V100-32GB GPUs
16 |
17 | ## set up environment variables for apptainer
18 | ## this script is intended for the narval cluster, please adjust the path accordingly for other clusters
19 | module load apptainer-suid/1.1
20 | cd /lustre07/scratch/${USER}/venv
21 | export TMPDIR=/tmp/${USER}tmp
22 | mkdir -p ${TMPDIR}
23 | export APPTAINER_CACHEDIR=${TMPDIR}
24 | export APPTAINER_TMPDIR=${TMPDIR}
25 |
26 | # !!!please change the USER_NAME to your own username before running the script!!!
27 |
37 | # single GPU: use 1 GPU on 1 node
38 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c '
39 | export USER_NAME='YOUR_USER_NAME'
40 | source /opt/conda/etc/profile.d/conda.sh
41 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc
42 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper
43 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=cc_apptainer_demo_single_gpu
44 | '
45 |
46 | # DP: use multiple GPUs on 1 node
47 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c '
48 | export USER_NAME='YOUR_USER_NAME'
49 | source /opt/conda/etc/profile.d/conda.sh
50 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc
51 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper
52 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=cc_apptainer_demo_single_node_dp
53 | '
54 |
55 | # DDP: use multiple GPUs on 1 node
56 | apptainer exec -C -B /project -B /scratch -B /home -W ${TMPDIR} --nv venvhpc.sandbox bash -c '
57 | export USER_NAME='YOUR_USER_NAME'
58 | export MASTER_PORT=29400
59 | source /opt/conda/etc/profile.d/conda.sh
60 | conda activate /lustre07/scratch/${USER_NAME}/venv/condaenvs/venvhpc
61 | cd /lustre06/project/6068146/${USER_NAME}/HPC_helper
62 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=cc_apptainer_demo_single_node_ddp
63 | '
64 |
65 | # note: we haven't tested the multi-node DDP with apptainer yet, but the MPI option may work
66 |
--------------------------------------------------------------------------------
/scripts/demo_sockeye.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=demo_sockeye
3 | #SBATCH --account=st-rjliao-1-gpu
4 | #SBATCH --nodes=2                       # 2 nodes x 4 GPUs each, matching the demo commands below
5 | #SBATCH --ntasks=8
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --mem=64G
8 | #SBATCH --time=00:20:00
9 | #SBATCH --gpus-per-node=4
10 | #SBATCH --output=slurm-%j_out.txt
11 | #SBATCH --error=slurm-%j_err.txt
12 | #SBATCH --mail-user=yanq@student.ubc.ca
13 | #SBATCH --mail-type=ALL
14 | ################################################################################
15 |
16 | # in this demo, we take 2 nodes and each node has 4 V100-32GB GPUs
17 | MASTER_PORT=29400
18 |
19 | module load gcc
20 | module load cuda
21 | module load nccl
22 | module load openmpi
23 |
24 | # you should submit job from the cloned repo's directory
25 | cd ${SLURM_SUBMIT_DIR}
26 | source venvhpc/bin/activate
27 | export OMP_NUM_THREADS=6
28 |
29 | # note: at Sockeye, it's better to specify CUDA_VISIBLE_DEVICES explicitly for distributed training,
30 | # otherwise methods in torch.cuda may lead to an error, e.g., torch.cuda.device_count()
31 |
32 | # single GPU: use 1 GPU on 1 node
33 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=sockeye_demo_single_gpu
34 |
35 | # DP: use multiple GPUs on 1 node
36 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=sockeye_demo_single_node_dp
37 |
38 | # DDP: use multiple GPUs on 1 node
39 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=sockeye_demo_single_node_ddp
40 |
41 | # DDP: use multiple GPUs on multiple nodes
42 | mpirun -np 8 \
43 |     --oversubscribe \
44 | -x MASTER_ADDR=$(hostname) \
45 | -x MASTER_PORT=$MASTER_PORT \
46 | -x CUDA_VISIBLE_DEVICES=0,1,2,3 \
47 | -x PATH \
48 | -bind-to none -map-by :OVERSUBSCRIBE \
49 | -mca pml ob1 -mca btl ^openib \
50 | python main.py --batch_size=6144 --ddp -m=sockeye_demo_multiple_node_mpi_ddp
51 |
52 |
--------------------------------------------------------------------------------
/scripts/demo_sockeye_pbs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #PBS -l walltime=00:20:00,select=2:ncpus=8:ngpus=4:mem=64gb:gpu_mem=32gb
3 | #PBS -N sockeye_demo
4 | #PBS -A st-rjliao-1-gpu
5 | #PBS -m abe
6 | #PBS -M yanq@student.ubc.ca
7 |
8 | ################################################################################
9 |
10 | # in this demo, we take 2 nodes and each node has 4 V100-32GB GPUs
11 | MASTER_PORT=29400
12 |
13 | module load gcc
14 | module load cuda
15 | module load nccl
16 | module load openmpi
17 |
18 | # you should submit job from the cloned repo's directory
19 | cd ${PBS_O_WORKDIR}
20 | source venvhpc/bin/activate
21 | export OMP_NUM_THREADS=6
22 |
23 | # note: at Sockeye, it's better to specify CUDA_VISIBLE_DEVICES explicitly for distributed training,
24 | # otherwise methods in torch.cuda may lead to an error, e.g., torch.cuda.device_count()
25 |
26 | # single GPU: use 1 GPU on 1 node
27 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=768 -m=sockeye_demo_single_gpu
28 |
29 | # DP: use multiple GPUs on 1 node
30 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=3072 --dp -m=sockeye_demo_single_node_dp
31 |
32 | # DDP: use multiple GPUs on 1 node
33 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=3072 --ddp -m=sockeye_demo_single_node_ddp
34 |
35 | # DDP: use multiple GPUs on multiple nodes
36 | mpirun -np 8 \
37 | --hostfile $PBS_NODEFILE --oversubscribe \
38 | -x MASTER_ADDR=$(hostname) \
39 | -x MASTER_PORT=$MASTER_PORT \
40 | -x CUDA_VISIBLE_DEVICES=0,1,2,3 \
41 | -x PATH \
42 | -bind-to none -map-by :OVERSUBSCRIBE \
43 | -mca pml ob1 -mca btl ^openib \
44 | python main.py --batch_size=6144 --ddp -m=sockeye_demo_multiple_node_mpi_ddp
45 |
46 |
--------------------------------------------------------------------------------
/scripts/demo_vector.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=demo_vector
3 | #SBATCH --partition=rtx6000 # Type of GPUs
4 | #SBATCH --gres=gpu:4 # Number of GPUs per node
5 | #SBATCH --nodes=2 # Number of nodes
6 | #SBATCH --ntasks=8 # Number of MPI process
7 | #SBATCH --ntasks-per-node=4 # Number of distributed process per compute node
8 | #SBATCH --cpus-per-task=8 # CPU cores per MPI process
9 | #SBATCH --mem=64G # memory per node
10 | #SBATCH --time=00-00:20 # time (DD-HH:MM)
11 | #SBATCH --qos=normal # QoS type
12 | #SBATCH --mail-user=yanq@student.ubc.ca # send email regarding task status
13 | #SBATCH --mail-type=ALL
14 | #SBATCH --output=slurm-%j_out.txt
15 | #SBATCH --error=slurm-%j_err.txt
16 |
17 | ################################################################################
18 |
19 | # in this demo, we take 2 nodes and each node has 4 RTX6000-24GB GPUs
20 | MASTER_PORT=29400
21 |
22 | module use /pkgs/environment-modules/
23 | module load python/3.8
24 | module load cuda-11.7
27 |
28 |
29 | # you should submit job from the cloned repo's directory
30 | cd ${SLURM_SUBMIT_DIR}
31 | source venvhpc/bin/activate
32 | export OMP_NUM_THREADS=6
33 |
34 | # single GPU: use 1 GPU on 1 node
35 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) &
36 | CUDA_VISIBLE_DEVICES=0 python main.py --batch_size=512 -m=vector_demo_single_gpu
37 |
38 | # DP: use multiple GPUs on 1 node
39 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) &
40 | CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py --batch_size=2048 --dp -m=vector_demo_single_node_dp
41 |
42 | # DDP: use multiple GPUs on 1 node
43 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) &
44 | CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nnodes=1 --nproc_per_node=4 --master_port=$MASTER_PORT main.py --batch_size=2048 --ddp -m=vector_demo_single_node_ddp
45 |
46 | # DDP: use multiple GPUs on multiple nodes
47 |
48 | # mpirun method
49 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) &
50 | mpirun -np 8 \
51 | -x MASTER_ADDR=$(hostname) \
52 | -x MASTER_PORT=$MASTER_PORT \
53 | -x PATH \
54 | -bind-to none -map-by :OVERSUBSCRIBE \
55 | -mca pml ob1 -mca btl ^openib \
56 | python main.py --batch_size=2048 --ddp -m=vector_demo_multiple_node_mpi_ddp
57 |
58 | # srun method
59 | # The SLURM_NTASKS variable tells the script how many processes are available for this execution.
60 | # “srun” executes the script $SLURM_NTASKS times
61 |
62 | # Therefore, for error-free srun execution, we need to overwrite the SBATCH options set in the very beginning
63 | # by using --ntasks=2 --ntasks-per-node=1 explicitly.
64 | # Note: the nuance is --ntasks=8 --ntasks-per-node=4 works for mpirun + python main.py --args,
65 | # while --ntasks=2 --ntasks-per-node=1 works for srun + torchrun.
66 |
67 | (while true; do nvidia-smi; top -b -n 1 | head -20; sleep 10; done) &
68 | srun --nodes=2 --ntasks-per-node=1 --ntasks=2 torchrun --nnodes=2 --nproc_per_node=4 \
69 | --rdzv_id=$SLURM_JOB_ID --rdzv_backend=c10d --rdzv_endpoint=$(hostname):$MASTER_PORT \
70 | main.py --batch_size=2048 --ddp -m=vector_demo_multiple_node_srun_ddp
71 |
72 | ##### DEBUG info #####
73 | #echo $SLURM_JOB_NODELIST
74 | #
75 | #echo $(hostname)
76 | #
77 | #echo $(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
78 | #
79 |
--------------------------------------------------------------------------------
/setup/requirements_cc.txt:
--------------------------------------------------------------------------------
1 | torch
2 | torchvision
3 | torchaudio
4 | tqdm
5 | numpy
6 | ml_collections
7 | torch_tb_profiler
8 |
--------------------------------------------------------------------------------
/setup/requirements_sockeye.txt:
--------------------------------------------------------------------------------
1 | --extra-index-url https://download.pytorch.org/whl/cu116
2 | torch
3 | torchvision
4 | torchaudio
5 | tqdm
6 | numpy
7 | ml_collections
8 | torch_tb_profiler
9 |
--------------------------------------------------------------------------------
/utils/arg_parser.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import pdb
5 | import shutil
6 | import random
7 | import sys
8 | from datetime import datetime
9 | import numpy as np
10 | import torch
11 | import torch.distributed as dist
12 | from torch.utils.tensorboard import SummaryWriter
13 |
14 | from utils.dist_training import get_ddp_save_flag
15 |
16 |
17 | def parse_arguments():
18 | """
19 | Argument parser.
20 | """
21 | parser = argparse.ArgumentParser(description="Running Experiments")
22 | parser.add_argument('-l', '--log_level', type=str,
23 | default='DEBUG', help="Logging Level, one of: DEBUG, INFO, WARNING, ERROR, CRITICAL")
24 | parser.add_argument('-m', '--comment', type=str,
25 | default="", help="A single line comment for the experiment")
26 | parser.add_argument('--dp', default=False, action='store_true',
27 | help='To use DataParallel distributed learning.')
28 | parser.add_argument('--ddp', default=False, action='store_true',
29 | help='To use DDP distributed learning')
30 | parser.add_argument('--ddp_gpu_ids', nargs='+', default=None,
31 | help="A list of GPU IDs to run distributed learning")
32 | parser.add_argument('--batch_size', default=256, type=int,
33 | help='Training batch size.')
34 | parser.add_argument('--epoch', default=5, type=int,
35 | help='Training epochs.')
36 | parser.add_argument('--seed', default=1234, type=int,
37 | help='Random seed.')
38 | parser.add_argument('--ddp_init_method', default='env://', type=str,
39 | help='torch.distributed.init_process_group options.')
40 |
41 | args = parser.parse_args()
42 |
43 | # add log directory
44 | if args.dp:
45 | dist_status = 'dp'
46 | elif args.ddp:
47 | dist_status = 'ddp'
48 | else:
49 | dist_status = 'single_gpu'
50 |
51 | logdir_nm = dist_status + "_" + datetime.now().strftime("%Y%m%d_%H%M%S")
52 | if len(args.comment):
53 | logdir_nm += '_' + args.comment
54 |
55 | logdir = os.path.join('runs', logdir_nm)
56 | os.makedirs(logdir, exist_ok=True)
57 |
58 | args.logdir = logdir
59 | print('Args: \n', args)
60 | return args
61 |
62 |
63 | def set_seed_and_logger(seed, logdir, log_level, comment, dist_helper):
64 | """
65 | Set up random seed number and global logger.
66 | """
67 | # Setup random seed
68 | if dist_helper.is_ddp:
69 | seed += dist.get_rank()
70 | else:
71 | pass
72 | random.seed(seed)
73 | np.random.seed(seed)
74 | torch.manual_seed(seed)
75 | torch.cuda.manual_seed_all(seed)
76 |
77 | # torch numerical accuracy flags
78 | # reference: https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
79 | # The flag below controls whether to allow TF32 on matmul. This flag defaults to True.
80 | torch.backends.cuda.matmul.allow_tf32 = False
81 | # The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
82 | torch.backends.cudnn.allow_tf32 = False
83 |
84 | # Setup logger
85 | if dist_helper.is_ddp:
86 | log_file = os.path.join(logdir, "ddp_rank_{:02d}_".format(dist.get_rank()) + log_level.lower() + ".log")
87 | else:
88 | log_file = os.path.join(logdir, log_level.lower() + ".log")
89 | logger_format = comment + '| %(asctime)s %(message)s'
90 | fh = logging.FileHandler(log_file)
91 | fh.setLevel(log_level)
92 | for handler in logging.root.handlers[:]:
93 | logging.root.removeHandler(handler)
94 | logging.basicConfig(level=logging.DEBUG, format=logger_format,
95 | datefmt='%m-%d %H:%M:%S',
96 | handlers=[
97 | fh,
98 | logging.StreamHandler(sys.stdout)
99 | ])
100 | logging.getLogger('matplotlib.font_manager').setLevel(logging.INFO) # remove excessive matplotlib messages
101 | logging.getLogger('matplotlib').setLevel(logging.INFO) # remove excessive matplotlib messages
102 | logging.info('EXPERIMENT BEGIN: ' + comment)
103 | logging.info('logging into %s', log_file)
104 |
105 | # Setup tensorboard logger
106 | if get_ddp_save_flag():
107 | writer = SummaryWriter(log_dir=logdir)
108 | else:
109 | writer = None
110 | return writer
111 |
112 |
113 | def backup_code(logdir):
114 | if get_ddp_save_flag():
115 | code_path = os.path.join(logdir, 'code')
116 | dirs_to_save = ['utils']
117 | os.makedirs(code_path, exist_ok=True)
118 |
119 | # save_name = os.path.join(code_path, 'config.yaml')
120 | # yaml.dump(dict(config), open(save_name, 'w'), default_flow_style=False)
121 |
122 | os.system('cp ./*py ' + code_path)
123 | [shutil.copytree(os.path.join('./', this_dir), os.path.join(code_path, this_dir)) for this_dir in dirs_to_save]
124 |
--------------------------------------------------------------------------------
/utils/dist_training.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import pdb
4 |
5 | import torch
6 | from torch import distributed as dist, nn as nn
7 | from torch.nn.parallel import DistributedDataParallel as DDP
8 |
9 |
10 | class DistributedHelper(object):
11 | def __init__(self, flag_dp, flag_ddp, ddp_gpu_ids, init_method):
12 | self.flag_dp = flag_dp
13 | self.flag_ddp = flag_ddp
14 | self.ddp_gpu_ids = ddp_gpu_ids
15 | self.init_method = init_method
16 |
17 | if (self.flag_dp or self.flag_ddp) and ddp_gpu_ids is None:
18 | assert torch.cuda.device_count() > 1, "Number of GPU must be more than one to use distributed learning!"
19 | assert not all((flag_dp, flag_ddp)), \
20 | "Flag DP ({:}) and flag DDP ({:}) cannot be both true!".format(flag_dp, flag_ddp)
21 |
22 | self.gpu_name = 'dummy'
23 | self.init_ddp()
24 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25 |
26 | def init_ddp(self):
27 | """
28 | Initialize DDP distributed training if necessary.
29 | Note: we have to initialize DDP mode before initialize the logging file, otherwise the multiple DDP
30 | processes' loggings will interfere with each other.
31 | """
32 | print("Number of available GPU to use: {}".format(torch.cuda.device_count()))
33 | if self.flag_ddp:
34 | self.init_ddp_backend()
35 | self.gpu_name = torch.cuda.get_device_name()
36 | print("Setup DDP for process {:d} using GPUs {} (ID) with NCCL backend. GPU for this process: {:s}".format(
37 | os.getpid(), self.ddp_gpu_ids, self.gpu_name))
38 | elif self.flag_dp:
39 | gpu_specs = [torch.cuda.get_device_name(i_gpu) for i_gpu in range(torch.cuda.device_count())]
40 | self.gpu_name = ','.join(gpu_specs)
41 | print("Setup DP using {:d} GPUs, specs: {:s}.".format(torch.cuda.device_count(), self.gpu_name))
42 | else:
43 | self.gpu_name = torch.cuda.get_device_name()
44 | print("Single GPU mode, specs: {:s}.".format(self.gpu_name))
45 |
46 | def init_ddp_backend(self):
47 | """
48 | Start DDP engine using NCCL backend.
49 | """
50 | ddp_status, env_dict = self.get_ddp_status()
51 | local_rank = env_dict['LOCAL_RANK']
52 |
53 | if self.ddp_gpu_ids is not None:
54 | assert isinstance(self.ddp_gpu_ids, list)
55 | num_gpus = len(self.ddp_gpu_ids)
56 | gpu_id = int(self.ddp_gpu_ids[local_rank % num_gpus])
57 | torch.cuda.set_device(gpu_id) # set single gpu device per process
58 | else:
59 | torch.cuda.set_device(local_rank) # set single gpu device per process
60 | dist.init_process_group(backend="nccl", init_method=self.init_method, rank=env_dict['WORLD_RANK'], world_size=env_dict['WORLD_SIZE'])
61 |
62 | def dist_adapt_model(self, model):
63 | """
64 | Setup distributed learning for network.
65 | """
66 | logging.info("Adapt the model for distributed training...")
67 | if self.flag_ddp:
68 | # DDP
69 | model = DDP(model.cuda(), device_ids=[torch.cuda.current_device()]) # single CUDA device per process
70 | logging.info("Distributed ON. Mode: DDP. Backend: {:s}, Rank: {:d} / World size: {:d}. "
71 | "Current device: {}, spec: {}".format(
72 | dist.get_backend(), dist.get_rank(), dist.get_world_size(),
73 | torch.cuda.current_device(), self.gpu_name))
74 | elif self.flag_dp:
75 | # DP
76 | model = nn.DataParallel(model)
77 | model.to(torch.device("cuda")) # multiple devices per process, controlled by CUDA_VISIBLE_DEVICES
78 | logging.info("Distributed ON. Mode: DP. Number of available GPU to use: {}, specs: {}".format(
79 | torch.cuda.device_count(), self.gpu_name))
80 | else:
81 | # single GPU
82 | logging.info("Distributed OFF. Single-GPU training, specs: {}.".format(self.gpu_name))
83 |
84 | return model
85 |
86 | def ddp_sync(self):
87 | if self.flag_ddp and dist.is_initialized():
88 | dist.barrier()
89 | else:
90 | pass
91 |
92 | def clean_up(self):
93 | self.ddp_sync()
94 | if self.flag_ddp and dist.is_initialized():
95 | dist.destroy_process_group()
96 | else:
97 | pass
98 |
99 | @staticmethod
100 | def get_ddp_status():
101 | """
102 | Get DDP-related env. parameters.
103 | """
104 | if 'LOCAL_RANK' in os.environ:
105 | # Environment variables set by torch.distributed.launch or torchrun
106 | local_rank = int(os.environ['LOCAL_RANK'])
107 | world_size = int(os.environ['WORLD_SIZE'])
108 | world_rank = int(os.environ['RANK'])
109 | elif 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ:
110 | # Environment variables set by mpirun
111 | local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
112 | world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
113 | world_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
114 | else:
115 | raise NotImplementedError
116 |
117 | env_dict = {
118 | 'MASTER_ADDR': os.environ['MASTER_ADDR'],
119 | 'MASTER_PORT': os.environ['MASTER_PORT'],
120 | 'LOCAL_RANK': local_rank,
121 | 'WORLD_SIZE': world_size,
122 | 'WORLD_RANK': world_rank,
123 | }
124 | ddp_status = "Process PID: {}. DDP setup: {} ".format(os.getpid(), env_dict)
125 | return ddp_status, env_dict
126 |
127 | @property
128 | def is_ddp(self):
129 | """
130 | DDP flag.
131 | """
132 | return self.flag_ddp
133 |
134 | @property
135 | def is_dp(self):
136 | """
137 | DP flag.
138 | """
139 | return self.flag_dp
140 |
141 | @property
142 | def is_distributed(self):
143 | """
144 | Distributed learning flag.
145 | """
146 | return self.flag_dp or self.flag_ddp
147 |
148 |
149 | # Independent function helpers
150 | def get_ddp_save_flag():
151 | """
152 | Return saving flag for DDP mode, only rank 0 process makes the output.
153 | """
154 | flag_save = True
155 | if dist.is_initialized():
156 | if dist.get_rank() != 0:
157 | flag_save = False
158 | return flag_save
159 |
160 |
161 | def dist_save_model(data_to_save, to_save_path):
162 | """
163 | Wrapper to save based on DDP status (for main process only).
164 | """
165 | if get_ddp_save_flag():
166 | torch.save(data_to_save, to_save_path)
167 |
--------------------------------------------------------------------------------
/utils/learning_utils.py:
--------------------------------------------------------------------------------
1 | def count_model_params(model):
2 | """
3 | Go through the model parameters
4 | """
5 | param_strings = []
6 | max_string_len = 126
7 | for name, param in model.named_parameters():
8 | if param.requires_grad:
9 | line = '.' * max(0, max_string_len - len(name) - len(str(param.size())))
10 | param_strings.append(f"{name} {line} {param.size()}")
11 | param_string = '\n'.join(param_strings)
12 |
13 | total_params = sum(p.numel() for p in model.parameters())
14 | total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
15 | return param_string, total_params, total_trainable_params
16 |
17 |
18 | def _print_and_log(in_str, log_file):
19 | assert isinstance(in_str, str)
20 | print(in_str, flush=True)
21 | log_file.write(in_str + '\n')
22 | log_file.flush()
23 |
--------------------------------------------------------------------------------