├── requirements.txt
├── src
│   ├── ddp
│   │   ├── run_ddp.sh
│   │   ├── utils.py
│   │   ├── main.py
│   │   ├── config.py
│   │   ├── net.py
│   │   ├── dataset.py
│   │   └── trainer.py
│   ├── dp
│   │   ├── run_dp.sh
│   │   ├── main.py
│   │   ├── utils.py
│   │   ├── config.py
│   │   ├── net.py
│   │   ├── dataset.py
│   │   └── trainer.py
│   └── single
│       ├── run_single.sh
│       ├── main.py
│       ├── utils.py
│       ├── config.py
│       ├── net.py
│       ├── dataset.py
│       └── trainer.py
├── Dockerfile
├── README.md
├── .gitignore
└── LICENSE

/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboard==2.4.1
2 | tensorboardX==2.1
3 | torch==1.7.1
4 | torchvision==0.8.2
5 | tqdm==4.49.0
6 | PyYAML==5.4.1
--------------------------------------------------------------------------------
/src/ddp/run_ddp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | EPOCH=50
4 | BATCH_SIZE=256
5 | 
6 | LR=0.1
7 | LR_DECAY_STEP_SIZE=25
8 | LR_DECAY_GAMMA=0.1
9 | WEIGHT_DECAY=0.0001
10 | 
11 | SEED=42
12 | 
13 | python src/ddp/main.py\
14 |     --seed=${SEED}\
15 |     --epoch=${EPOCH}\
16 |     --batch-size=${BATCH_SIZE}\
17 |     --lr=${LR}\
18 |     --weight-decay=${WEIGHT_DECAY}\
19 |     --lr-decay-step-size=${LR_DECAY_STEP_SIZE}\
20 |     --lr-decay-gamma=${LR_DECAY_GAMMA}\
21 |     --amp\
22 |     --contain-test
--------------------------------------------------------------------------------
/src/dp/run_dp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | EPOCH=50
4 | BATCH_SIZE=128
5 | 
6 | LR=0.1
7 | LR_DECAY_STEP_SIZE=25
8 | LR_DECAY_GAMMA=0.1
9 | WEIGHT_DECAY=0.0001
10 | 
11 | SEED=42
12 | 
13 | python src/dp/main.py\
14 |     --seed=${SEED}\
15 |     --epoch=${EPOCH}\
16 |     --batch-size=${BATCH_SIZE}\
17 |     --lr=${LR}\
18 |     --weight-decay=${WEIGHT_DECAY}\
19 |     --lr-decay-step-size=${LR_DECAY_STEP_SIZE}\
20 |     --lr-decay-gamma=${LR_DECAY_GAMMA}\
21 |     --amp\
22 |     --contain-test
--------------------------------------------------------------------------------
/src/single/run_single.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | EPOCH=50
4 | BATCH_SIZE=128
5 | 
6 | LR=0.1
7 | LR_DECAY_STEP_SIZE=25
8 | LR_DECAY_GAMMA=0.1
9 | WEIGHT_DECAY=0.0001
10 | 
11 | SEED=42
12 | 
13 | python src/single/main.py\
14 |     --seed=${SEED}\
15 |     --epoch=${EPOCH}\
16 |     --batch-size=${BATCH_SIZE}\
17 |     --lr=${LR}\
18 |     --weight-decay=${WEIGHT_DECAY}\
19 |     --lr-decay-step-size=${LR_DECAY_STEP_SIZE}\
20 |     --lr-decay-gamma=${LR_DECAY_GAMMA}\
21 |     --amp\
22 |     --contain-test
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
2 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
3 | ENV LC_ALL=C.UTF-8
4 | 
5 | ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
6 | RUN apt-get update -y
7 | 
8 | RUN apt-get update \
9 |     && apt-get install -y python3-pip python3-dev \
10 |     && cd /usr/local/bin \
11 |     && ln -s /usr/bin/python3 python \
12 |     && pip3 install --upgrade pip
13 | 
14 | COPY requirements.txt /tmp
15 | WORKDIR /tmp
16 | RUN pip install -r requirements.txt
17 | 
18 | ARG UNAME
19 | ARG UID
20 | ARG GID
21 | RUN groupadd -g $GID -o $UNAME
22 | RUN useradd -m -u $UID -g $GID -o -s /bin/bash $UNAME
23 | USER $UNAME
--------------------------------------------------------------------------------
/src/dp/main.py:
-------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import torch 5 | 6 | from config import load_config 7 | from net import ResNet18 8 | from trainer import Trainer 9 | from utils import fix_seed 10 | 11 | 12 | def main(hparams): 13 | fix_seed(hparams.seed) 14 | scaler = torch.cuda.amp.GradScaler() if hparams.amp else None 15 | model = ResNet18() 16 | 17 | # training phase 18 | trainer = Trainer(hparams, model, scaler) 19 | version = trainer.fit() 20 | 21 | # testing phase 22 | if hparams.contain_test: 23 | state_dict = torch.load( 24 | glob.glob( 25 | os.path.join(hparams.ckpt_path, f"version-{version}/best_model_*.pt") 26 | )[0] 27 | ) 28 | trainer.test(state_dict) 29 | 30 | 31 | if __name__ == "__main__": 32 | hparams = load_config() 33 | main(hparams) 34 | -------------------------------------------------------------------------------- /src/single/main.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import torch 5 | 6 | from config import load_config 7 | from net import ResNet18 8 | from trainer import Trainer 9 | from utils import fix_seed 10 | 11 | 12 | def main(hparams): 13 | fix_seed(hparams.seed) 14 | scaler = torch.cuda.amp.GradScaler() if hparams.amp else None 15 | model = ResNet18() 16 | 17 | # training phase 18 | trainer = Trainer(hparams, model, scaler) 19 | version = trainer.fit() 20 | 21 | # testing phase 22 | if hparams.contain_test: 23 | state_dict = torch.load( 24 | glob.glob( 25 | os.path.join(hparams.ckpt_path, f"version-{version}/best_model_*.pt") 26 | )[0] 27 | ) 28 | trainer.test(state_dict) 29 | 30 | 31 | if __name__ == "__main__": 32 | hparams = load_config() 33 | main(hparams) 34 | -------------------------------------------------------------------------------- /src/dp/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def fix_seed(seed: int) -> None: 8 | torch.manual_seed(seed) 9 | torch.cuda.manual_seed(seed) 10 | torch.cuda.manual_seed_all(seed) 11 | torch.backends.cudnn.deterministic = True 12 | torch.backends.cudnn.benchmark = False 13 | np.random.seed(seed) 14 | random.seed(seed) 15 | 16 | 17 | def accuracy(output, target, topk=(1,)): 18 | """Computes the precision@k for the specified values of k""" 19 | maxk = max(topk) 20 | batch_size = target.size(0) 21 | 22 | _, pred = output.topk(maxk, 1, True, True) 23 | pred = pred.t() 24 | correct = pred.eq(target.reshape(1, -1).expand_as(pred)) 25 | 26 | res = [] 27 | for k in topk: 28 | correct_k = correct[:k].reshape(-1).float().sum(0) 29 | res.append(correct_k.mul_(100.0 / batch_size)) 30 | return res 31 | 32 | 33 | class AverageMeter: 34 | def __init__(self): 35 | self.reset() 36 | 37 | def reset(self): 38 | self.val = 0 39 | self.avg = 0 40 | self.sum = 0 41 | self.count = 0 42 | 43 | def update(self, val: float, n: int = 1): 44 | self.val = val 45 | self.sum += val * n 46 | self.count += n 47 | self.avg = self.sum / self.count 48 | -------------------------------------------------------------------------------- /src/single/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def fix_seed(seed: int) -> None: 8 | torch.manual_seed(seed) 9 | torch.cuda.manual_seed(seed) 10 | torch.cuda.manual_seed_all(seed) 11 | torch.backends.cudnn.deterministic = 
True 12 | torch.backends.cudnn.benchmark = False 13 | np.random.seed(seed) 14 | random.seed(seed) 15 | 16 | 17 | def accuracy(output, target, topk=(1,)): 18 | """Computes the precision@k for the specified values of k""" 19 | maxk = max(topk) 20 | batch_size = target.size(0) 21 | 22 | _, pred = output.topk(maxk, 1, True, True) 23 | pred = pred.t() 24 | correct = pred.eq(target.reshape(1, -1).expand_as(pred)) 25 | 26 | res = [] 27 | for k in topk: 28 | correct_k = correct[:k].reshape(-1).float().sum(0) 29 | res.append(correct_k.mul_(100.0 / batch_size)) 30 | return res 31 | 32 | 33 | class AverageMeter: 34 | def __init__(self): 35 | self.reset() 36 | 37 | def reset(self): 38 | self.val = 0 39 | self.avg = 0 40 | self.sum = 0 41 | self.count = 0 42 | 43 | def update(self, val: float, n: int = 1): 44 | self.val = val 45 | self.sum += val * n 46 | self.count += n 47 | self.avg = self.sum / self.count 48 | -------------------------------------------------------------------------------- /src/ddp/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import torch 5 | import torch.distributed as dist 6 | 7 | 8 | def fix_seed(seed: int) -> None: 9 | torch.manual_seed(seed) 10 | torch.cuda.manual_seed(seed) 11 | torch.cuda.manual_seed_all(seed) 12 | torch.backends.cudnn.deterministic = True 13 | torch.backends.cudnn.benchmark = False 14 | np.random.seed(seed) 15 | random.seed(seed) 16 | 17 | 18 | def accuracy(output, target, topk=(1,)): 19 | """Computes the precision@k for the specified values of k""" 20 | maxk = max(topk) 21 | batch_size = target.size(0) 22 | 23 | _, pred = output.topk(maxk, 1, True, True) 24 | pred = pred.t() 25 | correct = pred.eq(target.reshape(1, -1).expand_as(pred)) 26 | 27 | res = [] 28 | for k in topk: 29 | correct_k = correct[:k].reshape(-1).float().sum(0) 30 | res.append(correct_k.mul_(100.0 / batch_size)) 31 | return res 32 | 33 | 34 | class AverageMeter: 35 | def __init__(self): 36 | self.reset() 37 | 38 | def reset(self): 39 | self.val = 0 40 | self.avg = 0 41 | self.sum = 0 42 | self.count = 0 43 | 44 | def update(self, val: float, n: int = 1): 45 | self.val = val 46 | self.sum += val * n 47 | self.count += n 48 | self.avg = self.sum / self.count 49 | -------------------------------------------------------------------------------- /src/dp/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def load_config(): 5 | parser = argparse.ArgumentParser() 6 | 7 | # default hparams 8 | parser.add_argument("--dset", type=str, default="cifar100") 9 | parser.add_argument("--dpath", type=str, default="data/") 10 | parser.add_argument("--ckpt-path", type=str, default="src/dp/checkpoints/") 11 | 12 | parser.add_argument("--seed", type=int, default=42, help="Seed for reproducibility") 13 | parser.add_argument("--workers", type=int, default=4) 14 | parser.add_argument("--eval-step", type=int, default=300) 15 | parser.add_argument( 16 | "--amp", action="store_true", default=False, help="PyTorch(>=1.6.x) AMP" 17 | ) 18 | parser.add_argument("--contain-test", action="store_true", default=False) 19 | 20 | # training hparams 21 | parser.add_argument("--epoch", type=int, default=100) 22 | parser.add_argument("--batch-size", type=int, default=128) 23 | parser.add_argument("--model", type=str, default="resnet18") 24 | 25 | parser.add_argument("--lr", type=float, default=0.1) 26 | parser.add_argument("--weight-decay", type=float, 
default=0.0001) 27 | parser.add_argument("--lr-decay-step-size", type=int, default=60) 28 | parser.add_argument("--lr-decay-gamma", type=float, default=0.1) 29 | 30 | args = parser.parse_args() 31 | return args -------------------------------------------------------------------------------- /src/single/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def load_config(): 5 | parser = argparse.ArgumentParser() 6 | 7 | # default hparams 8 | parser.add_argument("--dset", type=str, default="cifar100") 9 | parser.add_argument("--dpath", type=str, default="data/") 10 | parser.add_argument("--ckpt-path", type=str, default="src/single/checkpoints/") 11 | 12 | parser.add_argument("--seed", type=int, default=42, help="Seed for reproducibility") 13 | parser.add_argument("--workers", type=int, default=4) 14 | parser.add_argument("--eval-step", type=int, default=300) 15 | parser.add_argument( 16 | "--amp", action="store_true", default=False, help="PyTorch(>=1.6.x) AMP" 17 | ) 18 | parser.add_argument("--contain-test", action="store_true", default=False) 19 | 20 | # training hparams 21 | parser.add_argument("--epoch", type=int, default=200) 22 | parser.add_argument("--batch-size", type=int, default=128) 23 | parser.add_argument("--model", type=str, default="resnet18") 24 | 25 | parser.add_argument("--lr", type=float, default=0.1) 26 | parser.add_argument("--weight-decay", type=float, default=0.0001) 27 | parser.add_argument("--lr-decay-step-size", type=int, default=60) 28 | parser.add_argument("--lr-decay-gamma", type=float, default=0.1) 29 | 30 | args = parser.parse_args() 31 | return args -------------------------------------------------------------------------------- /src/ddp/main.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import torch 5 | import torch.distributed as dist 6 | import torch.multiprocessing as mp 7 | 8 | from config import load_config 9 | from net import ResNet18 10 | from trainer import Trainer 11 | from utils import fix_seed 12 | 13 | 14 | def main_worker(rank, ngpus_per_node, hparams): 15 | print(f"Use GPU {rank} for training") 16 | fix_seed(hparams.seed) 17 | hparams.rank = hparams.rank * ngpus_per_node + rank 18 | dist.init_process_group( 19 | backend=hparams.dist_backend, 20 | init_method=hparams.dist_url, 21 | world_size=hparams.world_size, 22 | rank=hparams.rank, 23 | ) 24 | 25 | scaler = torch.cuda.amp.GradScaler() if hparams.amp else None 26 | model = ResNet18() 27 | 28 | # training phase 29 | trainer = Trainer(hparams, model, scaler, rank, ngpus_per_node) 30 | version = trainer.fit() 31 | 32 | # testing phase 33 | if rank == 0 and hparams.contain_test: 34 | state_dict = torch.load( 35 | glob.glob( 36 | os.path.join(hparams.ckpt_path, f"version-{version}/best_model_*.pt") 37 | )[0] 38 | ) 39 | trainer.test(state_dict) 40 | 41 | 42 | if __name__ == "__main__": 43 | hparams = load_config() 44 | 45 | # 'world_size' means total number of processes to run 46 | ngpus_per_node = torch.cuda.device_count() 47 | hparams.world_size = ngpus_per_node * hparams.world_size 48 | 49 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, hparams)) 50 | -------------------------------------------------------------------------------- /src/ddp/config.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def load_config(): 5 | parser = argparse.ArgumentParser() 6 | 7 | # 
default hparams
8 |     parser.add_argument("--dset", type=str, default="cifar100")
9 |     parser.add_argument("--dpath", type=str, default="data/")
10 |     parser.add_argument("--ckpt-path", type=str, default="src/ddp/checkpoints/")
11 | 
12 |     parser.add_argument("--seed", type=int, default=42, help="Seed for reproducibility")
13 |     parser.add_argument("--workers", type=int, default=4)
14 |     parser.add_argument("--eval-step", type=int, default=300)
15 |     parser.add_argument(
16 |         "--amp", action="store_true", default=False, help="PyTorch(>=1.6.x) AMP"
17 |     )
18 |     parser.add_argument("--contain-test", action="store_true", default=False)
19 | 
20 |     # ddp hparams
21 |     parser.add_argument(
22 |         "--world-size", type=int, default=1, help="Number of nodes; the total process count is derived in main.py"
23 |     )
24 |     parser.add_argument("--rank", type=int, default=0)
25 |     parser.add_argument("--dist-backend", type=str, default="nccl")
26 |     parser.add_argument("--dist-url", default="tcp://127.0.0.1:3456", type=str)
27 | 
28 |     # training hparams
29 |     parser.add_argument("--epoch", type=int, default=100)
30 |     parser.add_argument("--batch-size", type=int, default=128)
31 |     parser.add_argument("--model", type=str, default="resnet18")
32 | 
33 |     parser.add_argument("--lr", type=float, default=0.1)
34 |     parser.add_argument("--weight-decay", type=float, default=0.0001)
35 |     parser.add_argument("--lr-decay-step-size", type=int, default=60)
36 |     parser.add_argument("--lr-decay-gamma", type=float, default=0.1)
37 | 
38 |     args = parser.parse_args()
39 |     return args
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Distributed Training in PyTorch
2 | 
3 | There are several distributed training options you can try, as described in the [PyTorch documentation](https://pytorch.org/tutorials/beginner/dist_overview.html).
4 | 
5 | 
6 | > PyTorch provides several options for data-parallel training. For applications that gradually grow from simple to complex and from prototype to production, the common development trajectory would be:
7 | > 1. Use **single-device** training, if the data and model can fit in one GPU, and the training speed is not a concern.
8 | > 2. Use **single-machine multi-GPU DataParallel**, if there are multiple GPUs on the server, and you would like to speed up training with the minimum code change.
9 | > 3. Use **single-machine multi-GPU DistributedDataParallel**, if you would like to further speed up training and are willing to write a little more code to set it up.
10 | > 4. Use **multi-machine DistributedDataParallel** and the launching script, if the application needs to scale across machine boundaries.
11 | > 5. Use torchelastic to launch distributed training, if errors (e.g., OOM) are expected or if the resources can join and leave dynamically during the training.
12 | 
13 | 
14 | In this repo, I compare **single-device (1)**, **single-machine multi-GPU DataParallel (2)**, and **single-machine multi-GPU DistributedDataParallel (3)** training.
15 | 
16 | ## Environment
17 | - Nvidia RTX 2080 Ti * 2
18 | - torch==1.7.1
19 | - torchvision==0.8.2
20 | 
21 | All dependencies are listed in [requirements.txt](https://github.com/youngerous/distributed-training-comparison/blob/main/requirements.txt), and you can also build the environment from the [Dockerfile](https://github.com/youngerous/distributed-training-comparison/blob/main/Dockerfile).
22 | 
23 | ## How to Run
24 | All three folders - ```src/single/```, ```src/dp/```, and ```src/ddp/``` - are independent, self-contained implementations.
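### Docker (optional)
If you use Docker, note that the Dockerfile declares `UNAME`, `UID`, and `GID` build arguments so the container user matches your host user. A minimal build-and-run sketch (the image tag, mount point, and GPU flags below are illustrative assumptions, not part of this repo; `--gpus all` requires the NVIDIA Container Toolkit):

```sh
# Build the image; UNAME/UID/GID are the build args declared in the Dockerfile.
$ docker build -t dist-training \
    --build-arg UNAME=$(whoami) --build-arg UID=$(id -u) --build-arg GID=$(id -g) .

# Run with GPU access; --ipc=host avoids DataLoader shared-memory limits.
$ docker run --rm -it --gpus all --ipc=host \
    -v $(pwd):/workspace -w /workspace dist-training sh src/ddp/run_ddp.sh
```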
25 | 
26 | ### Single
27 | ```sh
28 | $ sh src/single/run_single.sh
29 | ```
30 | ### DataParallel
31 | ```sh
32 | $ sh src/dp/run_dp.sh
33 | ```
34 | ### DistributedDataParallel
35 | ```sh
36 | $ sh src/ddp/run_ddp.sh
37 | ```
38 | 
39 | ## Result
40 | Batch size is set to 128 or 256. It is recommended to use [SyncBatchNorm](https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html) in DDP training, but I used vanilla BatchNorm, so the DDP model was trained only with batch size 256. The best model is selected according to validation top-1 accuracy.
41 | 
42 | 
43 | I did not tune the hyperparameters carefully, so you can adjust the settings to improve performance (e.g., using the Adam optimizer).
44 | 
45 | | Dataset | Model | Test Loss | Top-1 Acc | Top-5 Acc | Batch Size | Method |
46 | | :-------: | :-------: | :--------: | :--------: | :--------: | :--------: | :---------------------------: |
47 | | CIFAR-100 | ResNet-18 | 1.3728 | 70.99% | 91.57% | 128 | Single |
48 | | CIFAR-100 | ResNet-18 | 1.3394 | 70.64% | 91.60% | 256 | Single |
49 | | CIFAR-100 | ResNet-18 | 1.2974 | **71.48%** | 91.65% | 128 | DataParallel (DP) |
50 | | CIFAR-100 | ResNet-18 | 1.3373 | 71.20% | 91.53% | 256 | DataParallel (DP) |
51 | | CIFAR-100 | ResNet-18 | **1.2268** | 71.17% | **91.84%** | 256 | DistributedDataParallel (DDP) |
52 | 
53 | - Experiment results are averaged over random seeds 2, 4, and 42.
54 | - Automatic Mixed Precision (AMP) is applied to every experiment.
55 | 
56 | ## Reference
57 | - [[Docs] Distributed Communication Package - torch.distributed](https://pytorch.org/docs/stable/distributed.html#)
58 | - [[Post] Technologies behind Distributed Deep Learning - AllReduce :: Keisuke Fukuda](https://tech.preferred.jp/en/blog/technologies-behind-distributed-deep-learning-allreduce/)
59 | - [[Post] PyTorch Distributed Training :: leimao blog](https://leimao.github.io/blog/PyTorch-Distributed-Training/)
60 | - [[Post] Distributed data parallel training in Pytorch :: yangkky blog](https://yangkky.github.io/2019/07/08/distributed-pytorch-tutorial.html)
61 | - [[Repo] PyTorch Official Example](https://github.com/pytorch/examples/blob/master/imagenet/main.py)
62 | - [[Repo] pytorch-distributed :: tczhangzhi](https://github.com/tczhangzhi/pytorch-distributed)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | checkpoints/
2 | data/
3 | result.csv
4 | single.txt
5 | dp.txt
6 | ddp.txt
7 | 
8 | # Created by https://www.toptal.com/developers/gitignore/api/macos,windows,python,jupyternotebooks
9 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos,windows,python,jupyternotebooks
10 | 
11 | ### JupyterNotebooks ###
12 | # gitignore template for Jupyter Notebooks
13 | # website: http://jupyter.org/
14 | 
15 | .ipynb_checkpoints
16 | */.ipynb_checkpoints/*
17 | 
18 | # IPython
19 | profile_default/
20 | ipython_config.py
21 | 
22 | # Remove previous ipynb_checkpoints
23 | #   git rm -r .ipynb_checkpoints/
24 | 
25 | ### macOS ###
26 | # General
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Icon must end with two \r
32 | Icon
33 | 
34 | 
35 | # Thumbnails
36 | ._*
37 | 
38 | # Files that might appear in the root of a volume
39 | .DocumentRevisions-V100
40 | .fseventsd
41 | .Spotlight-V100
42 | .TemporaryItems
43 | .Trashes
44 | .VolumeIcon.icns
45 | .com.apple.timemachine.donotpresent
46 | 
47 | # Directories potentially created on remote AFP share
48 | .AppleDB
49
| .AppleDesktop 50 | Network Trash Folder 51 | Temporary Items 52 | .apdisk 53 | 54 | ### Python ### 55 | # Byte-compiled / optimized / DLL files 56 | __pycache__/ 57 | *.py[cod] 58 | *$py.class 59 | 60 | # C extensions 61 | *.so 62 | 63 | # Distribution / packaging 64 | .Python 65 | build/ 66 | develop-eggs/ 67 | dist/ 68 | downloads/ 69 | eggs/ 70 | .eggs/ 71 | lib/ 72 | lib64/ 73 | parts/ 74 | sdist/ 75 | var/ 76 | wheels/ 77 | pip-wheel-metadata/ 78 | share/python-wheels/ 79 | *.egg-info/ 80 | .installed.cfg 81 | *.egg 82 | MANIFEST 83 | 84 | # PyInstaller 85 | # Usually these files are written by a python script from a template 86 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 87 | *.manifest 88 | *.spec 89 | 90 | # Installer logs 91 | pip-log.txt 92 | pip-delete-this-directory.txt 93 | 94 | # Unit test / coverage reports 95 | htmlcov/ 96 | .tox/ 97 | .nox/ 98 | .coverage 99 | .coverage.* 100 | .cache 101 | nosetests.xml 102 | coverage.xml 103 | *.cover 104 | *.py,cover 105 | .hypothesis/ 106 | .pytest_cache/ 107 | pytestdebug.log 108 | 109 | # Translations 110 | *.mo 111 | *.pot 112 | 113 | # Django stuff: 114 | *.log 115 | local_settings.py 116 | db.sqlite3 117 | db.sqlite3-journal 118 | 119 | # Flask stuff: 120 | instance/ 121 | .webassets-cache 122 | 123 | # Scrapy stuff: 124 | .scrapy 125 | 126 | # Sphinx documentation 127 | docs/_build/ 128 | doc/_build/ 129 | 130 | # PyBuilder 131 | target/ 132 | 133 | # Jupyter Notebook 134 | 135 | # IPython 136 | 137 | # pyenv 138 | .python-version 139 | 140 | # pipenv 141 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 142 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 143 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 144 | # install all needed dependencies. 145 | #Pipfile.lock 146 | 147 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 148 | __pypackages__/ 149 | 150 | # Celery stuff 151 | celerybeat-schedule 152 | celerybeat.pid 153 | 154 | # SageMath parsed files 155 | *.sage.py 156 | 157 | # Environments 158 | .env 159 | .venv 160 | env/ 161 | venv/ 162 | ENV/ 163 | env.bak/ 164 | venv.bak/ 165 | pythonenv* 166 | 167 | # Spyder project settings 168 | .spyderproject 169 | .spyproject 170 | 171 | # Rope project settings 172 | .ropeproject 173 | 174 | # mkdocs documentation 175 | /site 176 | 177 | # mypy 178 | .mypy_cache/ 179 | .dmypy.json 180 | dmypy.json 181 | 182 | # Pyre type checker 183 | .pyre/ 184 | 185 | # pytype static type analyzer 186 | .pytype/ 187 | 188 | # profiling data 189 | .prof 190 | 191 | ### Windows ### 192 | # Windows thumbnail cache files 193 | Thumbs.db 194 | Thumbs.db:encryptable 195 | ehthumbs.db 196 | ehthumbs_vista.db 197 | 198 | # Dump file 199 | *.stackdump 200 | 201 | # Folder config file 202 | [Dd]esktop.ini 203 | 204 | # Recycle Bin used on file shares 205 | $RECYCLE.BIN/ 206 | 207 | # Windows Installer files 208 | *.cab 209 | *.msi 210 | *.msix 211 | *.msm 212 | *.msp 213 | 214 | # Windows shortcuts 215 | *.lnk 216 | 217 | # End of https://www.toptal.com/developers/gitignore/api/macos,windows,python,jupyternotebooks -------------------------------------------------------------------------------- /src/ddp/net.py: -------------------------------------------------------------------------------- 1 | """ResNet in PyTorch. 2 | For Pre-activation ResNet, see 'preact_resnet.py'. 
3 | Reference: 4 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 5 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 6 | Ref: https://github.com/kuangliu/pytorch-cifar 7 | """ 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class BasicBlock(nn.Module): 14 | expansion = 1 15 | 16 | def __init__(self, in_planes, planes, stride=1): 17 | super(BasicBlock, self).__init__() 18 | self.conv1 = nn.Conv2d( 19 | in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 20 | ) 21 | self.bn1 = nn.BatchNorm2d(planes) 22 | self.conv2 = nn.Conv2d( 23 | planes, planes, kernel_size=3, stride=1, padding=1, bias=False 24 | ) 25 | self.bn2 = nn.BatchNorm2d(planes) 26 | 27 | self.shortcut = nn.Sequential() 28 | if stride != 1 or in_planes != self.expansion * planes: 29 | self.shortcut = nn.Sequential( 30 | nn.Conv2d( 31 | in_planes, 32 | self.expansion * planes, 33 | kernel_size=1, 34 | stride=stride, 35 | bias=False, 36 | ), 37 | nn.BatchNorm2d(self.expansion * planes), 38 | ) 39 | 40 | def forward(self, x): 41 | out = F.relu(self.bn1(self.conv1(x))) 42 | out = self.bn2(self.conv2(out)) 43 | out += self.shortcut(x) 44 | out = F.relu(out) 45 | return out 46 | 47 | 48 | class Bottleneck(nn.Module): 49 | expansion = 4 50 | 51 | def __init__(self, in_planes, planes, stride=1): 52 | super(Bottleneck, self).__init__() 53 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 54 | self.bn1 = nn.BatchNorm2d(planes) 55 | self.conv2 = nn.Conv2d( 56 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 57 | ) 58 | self.bn2 = nn.BatchNorm2d(planes) 59 | self.conv3 = nn.Conv2d( 60 | planes, self.expansion * planes, kernel_size=1, bias=False 61 | ) 62 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 63 | 64 | self.shortcut = nn.Sequential() 65 | if stride != 1 or in_planes != self.expansion * planes: 66 | self.shortcut = nn.Sequential( 67 | nn.Conv2d( 68 | in_planes, 69 | self.expansion * planes, 70 | kernel_size=1, 71 | stride=stride, 72 | bias=False, 73 | ), 74 | nn.BatchNorm2d(self.expansion * planes), 75 | ) 76 | 77 | def forward(self, x): 78 | out = F.relu(self.bn1(self.conv1(x))) 79 | out = F.relu(self.bn2(self.conv2(out))) 80 | out = self.bn3(self.conv3(out)) 81 | out += self.shortcut(x) 82 | out = F.relu(out) 83 | return out 84 | 85 | 86 | class ResNet(nn.Module): 87 | def __init__(self, block, num_blocks, num_classes=100): 88 | super(ResNet, self).__init__() 89 | self.in_planes = 64 90 | 91 | self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) 92 | self.bn1 = nn.BatchNorm2d(64) 93 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 94 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 95 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 96 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 97 | self.linear = nn.Linear(512 * block.expansion, num_classes) 98 | 99 | def _make_layer(self, block, planes, num_blocks, stride): 100 | strides = [stride] + [1] * (num_blocks - 1) 101 | layers = [] 102 | for stride in strides: 103 | layers.append(block(self.in_planes, planes, stride)) 104 | self.in_planes = planes * block.expansion 105 | return nn.Sequential(*layers) 106 | 107 | def forward(self, x): 108 | out = F.relu(self.bn1(self.conv1(x))) 109 | out = self.layer1(out) 110 | out = self.layer2(out) 111 | out = self.layer3(out) 112 | out = self.layer4(out) 113 | out = F.avg_pool2d(out, 4) 114 | out = 
out.view(out.size(0), -1) 115 | out = self.linear(out) 116 | return out 117 | 118 | 119 | def ResNet18(): 120 | return ResNet(BasicBlock, [2, 2, 2, 2]) 121 | 122 | 123 | def ResNet34(): 124 | return ResNet(BasicBlock, [3, 4, 6, 3]) 125 | 126 | 127 | def ResNet50(): 128 | return ResNet(Bottleneck, [3, 4, 6, 3]) 129 | 130 | 131 | def ResNet101(): 132 | return ResNet(Bottleneck, [3, 4, 23, 3]) 133 | 134 | 135 | def ResNet152(): 136 | return ResNet(Bottleneck, [3, 8, 36, 3]) 137 | 138 | 139 | def test(): 140 | net = ResNet18() 141 | y = net(torch.randn(1, 3, 32, 32)) 142 | print(y.size()) 143 | 144 | 145 | # test() -------------------------------------------------------------------------------- /src/dp/net.py: -------------------------------------------------------------------------------- 1 | """ResNet in PyTorch. 2 | For Pre-activation ResNet, see 'preact_resnet.py'. 3 | Reference: 4 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 5 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 6 | Ref: https://github.com/kuangliu/pytorch-cifar 7 | """ 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class BasicBlock(nn.Module): 14 | expansion = 1 15 | 16 | def __init__(self, in_planes, planes, stride=1): 17 | super(BasicBlock, self).__init__() 18 | self.conv1 = nn.Conv2d( 19 | in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 20 | ) 21 | self.bn1 = nn.BatchNorm2d(planes) 22 | self.conv2 = nn.Conv2d( 23 | planes, planes, kernel_size=3, stride=1, padding=1, bias=False 24 | ) 25 | self.bn2 = nn.BatchNorm2d(planes) 26 | 27 | self.shortcut = nn.Sequential() 28 | if stride != 1 or in_planes != self.expansion * planes: 29 | self.shortcut = nn.Sequential( 30 | nn.Conv2d( 31 | in_planes, 32 | self.expansion * planes, 33 | kernel_size=1, 34 | stride=stride, 35 | bias=False, 36 | ), 37 | nn.BatchNorm2d(self.expansion * planes), 38 | ) 39 | 40 | def forward(self, x): 41 | out = F.relu(self.bn1(self.conv1(x))) 42 | out = self.bn2(self.conv2(out)) 43 | out += self.shortcut(x) 44 | out = F.relu(out) 45 | return out 46 | 47 | 48 | class Bottleneck(nn.Module): 49 | expansion = 4 50 | 51 | def __init__(self, in_planes, planes, stride=1): 52 | super(Bottleneck, self).__init__() 53 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 54 | self.bn1 = nn.BatchNorm2d(planes) 55 | self.conv2 = nn.Conv2d( 56 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 57 | ) 58 | self.bn2 = nn.BatchNorm2d(planes) 59 | self.conv3 = nn.Conv2d( 60 | planes, self.expansion * planes, kernel_size=1, bias=False 61 | ) 62 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 63 | 64 | self.shortcut = nn.Sequential() 65 | if stride != 1 or in_planes != self.expansion * planes: 66 | self.shortcut = nn.Sequential( 67 | nn.Conv2d( 68 | in_planes, 69 | self.expansion * planes, 70 | kernel_size=1, 71 | stride=stride, 72 | bias=False, 73 | ), 74 | nn.BatchNorm2d(self.expansion * planes), 75 | ) 76 | 77 | def forward(self, x): 78 | out = F.relu(self.bn1(self.conv1(x))) 79 | out = F.relu(self.bn2(self.conv2(out))) 80 | out = self.bn3(self.conv3(out)) 81 | out += self.shortcut(x) 82 | out = F.relu(out) 83 | return out 84 | 85 | 86 | class ResNet(nn.Module): 87 | def __init__(self, block, num_blocks, num_classes=100): 88 | super(ResNet, self).__init__() 89 | self.in_planes = 64 90 | 91 | self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) 92 | self.bn1 = nn.BatchNorm2d(64) 93 | self.layer1 = 
self._make_layer(block, 64, num_blocks[0], stride=1) 94 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 95 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 96 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 97 | self.linear = nn.Linear(512 * block.expansion, num_classes) 98 | 99 | def _make_layer(self, block, planes, num_blocks, stride): 100 | strides = [stride] + [1] * (num_blocks - 1) 101 | layers = [] 102 | for stride in strides: 103 | layers.append(block(self.in_planes, planes, stride)) 104 | self.in_planes = planes * block.expansion 105 | return nn.Sequential(*layers) 106 | 107 | def forward(self, x): 108 | out = F.relu(self.bn1(self.conv1(x))) 109 | out = self.layer1(out) 110 | out = self.layer2(out) 111 | out = self.layer3(out) 112 | out = self.layer4(out) 113 | out = F.avg_pool2d(out, 4) 114 | out = out.view(out.size(0), -1) 115 | out = self.linear(out) 116 | return out 117 | 118 | 119 | def ResNet18(): 120 | return ResNet(BasicBlock, [2, 2, 2, 2]) 121 | 122 | 123 | def ResNet34(): 124 | return ResNet(BasicBlock, [3, 4, 6, 3]) 125 | 126 | 127 | def ResNet50(): 128 | return ResNet(Bottleneck, [3, 4, 6, 3]) 129 | 130 | 131 | def ResNet101(): 132 | return ResNet(Bottleneck, [3, 4, 23, 3]) 133 | 134 | 135 | def ResNet152(): 136 | return ResNet(Bottleneck, [3, 8, 36, 3]) 137 | 138 | 139 | def test(): 140 | net = ResNet18() 141 | y = net(torch.randn(1, 3, 32, 32)) 142 | print(y.size()) 143 | 144 | 145 | # test() -------------------------------------------------------------------------------- /src/single/net.py: -------------------------------------------------------------------------------- 1 | """ResNet in PyTorch. 2 | For Pre-activation ResNet, see 'preact_resnet.py'. 3 | Reference: 4 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 5 | Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 6 | Ref: https://github.com/kuangliu/pytorch-cifar 7 | """ 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class BasicBlock(nn.Module): 14 | expansion = 1 15 | 16 | def __init__(self, in_planes, planes, stride=1): 17 | super(BasicBlock, self).__init__() 18 | self.conv1 = nn.Conv2d( 19 | in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 20 | ) 21 | self.bn1 = nn.BatchNorm2d(planes) 22 | self.conv2 = nn.Conv2d( 23 | planes, planes, kernel_size=3, stride=1, padding=1, bias=False 24 | ) 25 | self.bn2 = nn.BatchNorm2d(planes) 26 | 27 | self.shortcut = nn.Sequential() 28 | if stride != 1 or in_planes != self.expansion * planes: 29 | self.shortcut = nn.Sequential( 30 | nn.Conv2d( 31 | in_planes, 32 | self.expansion * planes, 33 | kernel_size=1, 34 | stride=stride, 35 | bias=False, 36 | ), 37 | nn.BatchNorm2d(self.expansion * planes), 38 | ) 39 | 40 | def forward(self, x): 41 | out = F.relu(self.bn1(self.conv1(x))) 42 | out = self.bn2(self.conv2(out)) 43 | out += self.shortcut(x) 44 | out = F.relu(out) 45 | return out 46 | 47 | 48 | class Bottleneck(nn.Module): 49 | expansion = 4 50 | 51 | def __init__(self, in_planes, planes, stride=1): 52 | super(Bottleneck, self).__init__() 53 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 54 | self.bn1 = nn.BatchNorm2d(planes) 55 | self.conv2 = nn.Conv2d( 56 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False 57 | ) 58 | self.bn2 = nn.BatchNorm2d(planes) 59 | self.conv3 = nn.Conv2d( 60 | planes, self.expansion * planes, kernel_size=1, bias=False 61 | ) 62 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 63 | 64 | self.shortcut = nn.Sequential() 65 | if stride != 1 or in_planes != self.expansion * planes: 66 | self.shortcut = nn.Sequential( 67 | nn.Conv2d( 68 | in_planes, 69 | self.expansion * planes, 70 | kernel_size=1, 71 | stride=stride, 72 | bias=False, 73 | ), 74 | nn.BatchNorm2d(self.expansion * planes), 75 | ) 76 | 77 | def forward(self, x): 78 | out = F.relu(self.bn1(self.conv1(x))) 79 | out = F.relu(self.bn2(self.conv2(out))) 80 | out = self.bn3(self.conv3(out)) 81 | out += self.shortcut(x) 82 | out = F.relu(out) 83 | return out 84 | 85 | 86 | class ResNet(nn.Module): 87 | def __init__(self, block, num_blocks, num_classes=100): 88 | super(ResNet, self).__init__() 89 | self.in_planes = 64 90 | 91 | self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) 92 | self.bn1 = nn.BatchNorm2d(64) 93 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 94 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 95 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 96 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 97 | self.linear = nn.Linear(512 * block.expansion, num_classes) 98 | 99 | def _make_layer(self, block, planes, num_blocks, stride): 100 | strides = [stride] + [1] * (num_blocks - 1) 101 | layers = [] 102 | for stride in strides: 103 | layers.append(block(self.in_planes, planes, stride)) 104 | self.in_planes = planes * block.expansion 105 | return nn.Sequential(*layers) 106 | 107 | def forward(self, x): 108 | out = F.relu(self.bn1(self.conv1(x))) 109 | out = self.layer1(out) 110 | out = self.layer2(out) 111 | out = self.layer3(out) 112 | out = self.layer4(out) 113 | out = F.avg_pool2d(out, 4) 114 | out = out.view(out.size(0), -1) 115 | out = self.linear(out) 116 | return out 117 | 118 | 119 | def ResNet18(): 120 | return 
ResNet(BasicBlock, [2, 2, 2, 2]) 121 | 122 | 123 | def ResNet34(): 124 | return ResNet(BasicBlock, [3, 4, 6, 3]) 125 | 126 | 127 | def ResNet50(): 128 | return ResNet(Bottleneck, [3, 4, 6, 3]) 129 | 130 | 131 | def ResNet101(): 132 | return ResNet(Bottleneck, [3, 4, 23, 3]) 133 | 134 | 135 | def ResNet152(): 136 | return ResNet(Bottleneck, [3, 8, 36, 3]) 137 | 138 | 139 | def test(): 140 | net = ResNet18() 141 | y = net(torch.randn(1, 3, 32, 32)) 142 | print(y.size()) 143 | 144 | 145 | # test() -------------------------------------------------------------------------------- /src/dp/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ref: https://gist.github.com/kevinzakka/d33bf8d6c7f06a9d8c76d97a7879f5cb 3 | Create train, valid, test iterators for CIFAR-100 [1]. 4 | [1]: https://discuss.pytorch.org/t/feedback-on-pytorch-for-kaggle-competitions/2252/4 5 | """ 6 | 7 | import numpy as np 8 | import torch 9 | from torch.utils.data.sampler import SubsetRandomSampler 10 | from torchvision import datasets, transforms 11 | 12 | 13 | def get_trn_val_loader( 14 | data_dir: str, 15 | batch_size: int, 16 | valid_size: float = 0.1, 17 | shuffle: bool = True, 18 | num_workers: int = 1, 19 | pin_memory: bool = True, 20 | ): 21 | """ 22 | Utility function for loading and returning train and valid 23 | multi-process iterators over the CIFAR-100 dataset. A sample 24 | 9x9 grid of the images can be optionally displayed. 25 | If using CUDA, num_workers should be set to 1 and pin_memory to True. 26 | :param data_dir: path directory to the dataset. 27 | :param batch_size: how many samples per batch to load. 28 | :param valid_size: percentage split of the training set used for 29 | the validation set. Should be a float in the range [0, 1]. 30 | :param shuffle: whether to shuffle the train/validation indices. 31 | :param num_workers: number of subprocesses to use when loading the dataset. 32 | :param pin_memory: whether to copy tensors into CUDA pinned memory. 33 | Set it to True if using GPU. 34 | 35 | :return train_loader: training set iterator. 36 | :return valid_loader: validation set iterator. 37 | """ 38 | error_msg = "[!] valid_size should be in the range [0, 1]." 
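    # A note on normalization: the mean/std used below are the widely used
    # CIFAR-10 statistics, while get_tst_loader() further down normalizes with
    # ImageNet statistics, so train/valid and test inputs are scaled differently.
    # Also, the two CIFAR100 datasets created below wrap the same training files;
    # they differ only in transforms, and SubsetRandomSampler draws disjoint
    # index sets, so the train and valid loaders never share samples.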
39 | assert (valid_size >= 0) and (valid_size <= 1), error_msg 40 | 41 | normalize = transforms.Normalize( 42 | mean=[0.4914, 0.4822, 0.4465], 43 | std=[0.2023, 0.1994, 0.2010], 44 | ) 45 | 46 | # define transforms 47 | valid_transform = transforms.Compose( 48 | [ 49 | transforms.ToTensor(), 50 | normalize, 51 | ] 52 | ) 53 | 54 | # augmentation 55 | train_transform = transforms.Compose( 56 | [ 57 | transforms.RandomCrop(32, padding=4), 58 | transforms.RandomHorizontalFlip(), 59 | transforms.ToTensor(), 60 | normalize, 61 | ] 62 | ) 63 | 64 | # load the dataset 65 | train_dataset = datasets.CIFAR100( 66 | root=data_dir, 67 | train=True, 68 | download=True, 69 | transform=train_transform, 70 | ) 71 | 72 | valid_dataset = datasets.CIFAR100( 73 | root=data_dir, 74 | train=True, 75 | download=True, 76 | transform=valid_transform, 77 | ) 78 | 79 | # train/valid split 80 | num_train = len(train_dataset) 81 | indices = list(range(num_train)) 82 | split = int(np.floor(valid_size * num_train)) 83 | 84 | if shuffle: 85 | np.random.shuffle(indices) 86 | 87 | train_idx, valid_idx = indices[split:], indices[:split] 88 | train_sampler = SubsetRandomSampler(train_idx) 89 | valid_sampler = SubsetRandomSampler(valid_idx) 90 | 91 | train_loader = torch.utils.data.DataLoader( 92 | train_dataset, 93 | batch_size=batch_size, 94 | sampler=train_sampler, 95 | num_workers=num_workers, 96 | pin_memory=pin_memory, 97 | drop_last=True, 98 | ) 99 | valid_loader = torch.utils.data.DataLoader( 100 | valid_dataset, 101 | batch_size=batch_size, 102 | sampler=valid_sampler, 103 | num_workers=num_workers, 104 | pin_memory=pin_memory, 105 | ) 106 | 107 | return train_loader, valid_loader 108 | 109 | 110 | def get_tst_loader( 111 | data_dir: str, 112 | batch_size: int, 113 | shuffle: bool = True, 114 | num_workers: int = 4, 115 | pin_memory: bool = False, 116 | ): 117 | """ 118 | Utility function for loading and returning a multi-process 119 | test iterator over the CIFAR-100 dataset. 120 | If using CUDA, num_workers should be set to 1 and pin_memory to True. 121 | :param data_dir: path directory to the dataset. 122 | :param batch_size: how many samples per batch to load. 123 | :param shuffle: whether to shuffle the dataset after every epoch. 124 | :param num_workers: number of subprocesses to use when loading the dataset. 125 | :param pin_memory: whether to copy tensors into CUDA pinned memory. 126 | Set it to True if using GPU. 127 | 128 | :return data_loader: test set iterator. 129 | """ 130 | normalize = transforms.Normalize( 131 | mean=[0.485, 0.456, 0.406], 132 | std=[0.229, 0.224, 0.225], 133 | ) 134 | 135 | # define transform 136 | transform = transforms.Compose( 137 | [ 138 | transforms.ToTensor(), 139 | normalize, 140 | ] 141 | ) 142 | 143 | dataset = datasets.CIFAR100( 144 | root=data_dir, 145 | train=False, 146 | download=True, 147 | transform=transform, 148 | ) 149 | 150 | data_loader = torch.utils.data.DataLoader( 151 | dataset, 152 | batch_size=batch_size, 153 | shuffle=shuffle, 154 | num_workers=num_workers, 155 | pin_memory=pin_memory, 156 | ) 157 | 158 | return data_loader -------------------------------------------------------------------------------- /src/single/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ref: https://gist.github.com/kevinzakka/d33bf8d6c7f06a9d8c76d97a7879f5cb 3 | Create train, valid, test iterators for CIFAR-100 [1]. 
4 | [1]: https://discuss.pytorch.org/t/feedback-on-pytorch-for-kaggle-competitions/2252/4 5 | """ 6 | 7 | import numpy as np 8 | import torch 9 | from torch.utils.data.sampler import SubsetRandomSampler 10 | from torchvision import datasets, transforms 11 | 12 | 13 | def get_trn_val_loader( 14 | data_dir: str, 15 | batch_size: int, 16 | valid_size: float = 0.1, 17 | shuffle: bool = True, 18 | num_workers: int = 1, 19 | pin_memory: bool = True, 20 | ): 21 | """ 22 | Utility function for loading and returning train and valid 23 | multi-process iterators over the CIFAR-100 dataset. A sample 24 | 9x9 grid of the images can be optionally displayed. 25 | If using CUDA, num_workers should be set to 1 and pin_memory to True. 26 | :param data_dir: path directory to the dataset. 27 | :param batch_size: how many samples per batch to load. 28 | :param valid_size: percentage split of the training set used for 29 | the validation set. Should be a float in the range [0, 1]. 30 | :param shuffle: whether to shuffle the train/validation indices. 31 | :param num_workers: number of subprocesses to use when loading the dataset. 32 | :param pin_memory: whether to copy tensors into CUDA pinned memory. 33 | Set it to True if using GPU. 34 | 35 | :return train_loader: training set iterator. 36 | :return valid_loader: validation set iterator. 37 | """ 38 | error_msg = "[!] valid_size should be in the range [0, 1]." 39 | assert (valid_size >= 0) and (valid_size <= 1), error_msg 40 | 41 | normalize = transforms.Normalize( 42 | mean=[0.4914, 0.4822, 0.4465], 43 | std=[0.2023, 0.1994, 0.2010], 44 | ) 45 | 46 | # define transforms 47 | valid_transform = transforms.Compose( 48 | [ 49 | transforms.ToTensor(), 50 | normalize, 51 | ] 52 | ) 53 | 54 | # augmentation 55 | train_transform = transforms.Compose( 56 | [ 57 | transforms.RandomCrop(32, padding=4), 58 | transforms.RandomHorizontalFlip(), 59 | transforms.ToTensor(), 60 | normalize, 61 | ] 62 | ) 63 | 64 | # load the dataset 65 | train_dataset = datasets.CIFAR100( 66 | root=data_dir, 67 | train=True, 68 | download=True, 69 | transform=train_transform, 70 | ) 71 | 72 | valid_dataset = datasets.CIFAR100( 73 | root=data_dir, 74 | train=True, 75 | download=True, 76 | transform=valid_transform, 77 | ) 78 | 79 | # train/valid split 80 | num_train = len(train_dataset) 81 | indices = list(range(num_train)) 82 | split = int(np.floor(valid_size * num_train)) 83 | 84 | if shuffle: 85 | np.random.shuffle(indices) 86 | 87 | train_idx, valid_idx = indices[split:], indices[:split] 88 | train_sampler = SubsetRandomSampler(train_idx) 89 | valid_sampler = SubsetRandomSampler(valid_idx) 90 | 91 | train_loader = torch.utils.data.DataLoader( 92 | train_dataset, 93 | batch_size=batch_size, 94 | sampler=train_sampler, 95 | num_workers=num_workers, 96 | pin_memory=pin_memory, 97 | drop_last=True, 98 | ) 99 | valid_loader = torch.utils.data.DataLoader( 100 | valid_dataset, 101 | batch_size=batch_size, 102 | sampler=valid_sampler, 103 | num_workers=num_workers, 104 | pin_memory=pin_memory, 105 | ) 106 | 107 | return train_loader, valid_loader 108 | 109 | 110 | def get_tst_loader( 111 | data_dir: str, 112 | batch_size: int, 113 | shuffle: bool = True, 114 | num_workers: int = 4, 115 | pin_memory: bool = False, 116 | ): 117 | """ 118 | Utility function for loading and returning a multi-process 119 | test iterator over the CIFAR-100 dataset. 120 | If using CUDA, num_workers should be set to 1 and pin_memory to True. 121 | :param data_dir: path directory to the dataset. 
122 | :param batch_size: how many samples per batch to load. 123 | :param shuffle: whether to shuffle the dataset after every epoch. 124 | :param num_workers: number of subprocesses to use when loading the dataset. 125 | :param pin_memory: whether to copy tensors into CUDA pinned memory. 126 | Set it to True if using GPU. 127 | 128 | :return data_loader: test set iterator. 129 | """ 130 | normalize = transforms.Normalize( 131 | mean=[0.485, 0.456, 0.406], 132 | std=[0.229, 0.224, 0.225], 133 | ) 134 | 135 | # define transform 136 | transform = transforms.Compose( 137 | [ 138 | transforms.ToTensor(), 139 | normalize, 140 | ] 141 | ) 142 | 143 | dataset = datasets.CIFAR100( 144 | root=data_dir, 145 | train=False, 146 | download=True, 147 | transform=transform, 148 | ) 149 | 150 | data_loader = torch.utils.data.DataLoader( 151 | dataset, 152 | batch_size=batch_size, 153 | shuffle=shuffle, 154 | num_workers=num_workers, 155 | pin_memory=pin_memory, 156 | ) 157 | 158 | return data_loader -------------------------------------------------------------------------------- /src/ddp/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ref: https://gist.github.com/kevinzakka/d33bf8d6c7f06a9d8c76d97a7879f5cb 3 | Create train, valid, test iterators for CIFAR-100 [1]. 4 | [1]: https://discuss.pytorch.org/t/feedback-on-pytorch-for-kaggle-competitions/2252/4 5 | """ 6 | 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import Subset 10 | from torch.utils.data.distributed import DistributedSampler 11 | from torch.utils.data.sampler import SubsetRandomSampler 12 | from torchvision import datasets, transforms 13 | 14 | 15 | def get_trn_val_loader( 16 | data_dir: str, 17 | batch_size: int, 18 | valid_size: float = 0.1, 19 | shuffle: bool = True, 20 | num_workers: int = 1, 21 | pin_memory: bool = True, 22 | ): 23 | """ 24 | Utility function for loading and returning train and valid 25 | multi-process iterators over the CIFAR-100 dataset. A sample 26 | 9x9 grid of the images can be optionally displayed. 27 | If using CUDA, num_workers should be set to 1 and pin_memory to True. 28 | :param data_dir: path directory to the dataset. 29 | :param batch_size: how many samples per batch to load. 30 | :param valid_size: percentage split of the training set used for 31 | the validation set. Should be a float in the range [0, 1]. 32 | :param shuffle: whether to shuffle the train/validation indices. 33 | :param num_workers: number of subprocesses to use when loading the dataset. 34 | :param pin_memory: whether to copy tensors into CUDA pinned memory. 35 | Set it to True if using GPU. 36 | 37 | :return train_loader: training set iterator. 38 | :return valid_loader: validation set iterator. 39 | """ 40 | error_msg = "[!] valid_size should be in the range [0, 1]." 41 | assert (valid_size >= 0) and (valid_size <= 1), error_msg 42 | 43 | normalize = transforms.Normalize( 44 | mean=[0.4914, 0.4822, 0.4465], 45 | std=[0.2023, 0.1994, 0.2010], 46 | ) 47 | 48 | # define transforms 49 | valid_transform = transforms.Compose( 50 | [ 51 | transforms.ToTensor(), 52 | normalize, 53 | ] 54 | ) 55 | 56 | # augmentation 57 | train_transform = transforms.Compose( 58 | [ 59 | transforms.RandomCrop(32, padding=4), 60 | transforms.RandomHorizontalFlip(), 61 | transforms.ToTensor(), 62 | normalize, 63 | ] 64 | ) 65 | 66 | # load the dataset 67 | ## Actually, download should be set to be False, because it is not multiprocess safe. 68 | ## So you should prefetch dataset. 
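    ## A one-time prefetch before launching the processes could look like this
    ## (a hedged sketch; run it from the repo root so root= matches --dpath):
    ##   python -c "from torchvision import datasets; datasets.CIFAR100(root='data/', download=True)"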
69 | ## Ref: https://leimao.github.io/blog/PyTorch-Distributed-Training/ 70 | train_dataset = datasets.CIFAR100( 71 | root=data_dir, 72 | train=True, 73 | download=True, ## 74 | transform=train_transform, 75 | ) 76 | 77 | valid_dataset = datasets.CIFAR100( 78 | root=data_dir, 79 | train=True, 80 | download=True, ## 81 | transform=valid_transform, 82 | ) 83 | 84 | # train/valid split 85 | num_train = len(train_dataset) 86 | indices = list(range(num_train)) 87 | split = int(np.floor(valid_size * num_train)) 88 | 89 | if shuffle: 90 | np.random.shuffle(indices) 91 | 92 | train_idx, valid_idx = indices[split:], indices[:split] 93 | 94 | # split indice explicitly before DistributedSampler 95 | train_dataset = Subset(train_dataset, train_idx) 96 | valid_dataset = Subset(valid_dataset, valid_idx) 97 | 98 | train_sampler = DistributedSampler(train_dataset) 99 | 100 | train_loader = torch.utils.data.DataLoader( 101 | train_dataset, 102 | batch_size=batch_size, 103 | sampler=train_sampler, 104 | num_workers=num_workers, 105 | pin_memory=pin_memory, 106 | drop_last=True, 107 | shuffle=(train_sampler is None), 108 | ) 109 | valid_loader = torch.utils.data.DataLoader( 110 | valid_dataset, 111 | batch_size=batch_size, 112 | num_workers=num_workers, 113 | pin_memory=pin_memory, 114 | ) 115 | 116 | return train_loader, train_sampler, valid_loader 117 | 118 | 119 | def get_tst_loader( 120 | data_dir: str, 121 | batch_size: int, 122 | shuffle: bool = True, 123 | num_workers: int = 4, 124 | pin_memory: bool = False, 125 | ): 126 | """ 127 | Utility function for loading and returning a multi-process 128 | test iterator over the CIFAR-100 dataset. 129 | If using CUDA, num_workers should be set to 1 and pin_memory to True. 130 | :param data_dir: path directory to the dataset. 131 | :param batch_size: how many samples per batch to load. 132 | :param shuffle: whether to shuffle the dataset after every epoch. 133 | :param num_workers: number of subprocesses to use when loading the dataset. 134 | :param pin_memory: whether to copy tensors into CUDA pinned memory. 135 | Set it to True if using GPU. 136 | 137 | :return data_loader: test set iterator. 
138 |     """
139 |     normalize = transforms.Normalize(
140 |         mean=[0.485, 0.456, 0.406],
141 |         std=[0.229, 0.224, 0.225],
142 |     )
143 | 
144 |     # define transform
145 |     transform = transforms.Compose(
146 |         [
147 |             transforms.ToTensor(),
148 |             normalize,
149 |         ]
150 |     )
151 | 
152 |     dataset = datasets.CIFAR100(
153 |         root=data_dir,
154 |         train=False,
155 |         download=True,
156 |         transform=transform,
157 |     )
158 |     sampler = DistributedSampler(dataset)
159 |     data_loader = torch.utils.data.DataLoader(
160 |         dataset,
161 |         batch_size=batch_size,
162 |         # NOTE: the `shuffle` argument is deliberately not forwarded here.
163 |         # DataLoader raises a ValueError when shuffle=True is combined with a
164 |         # custom sampler, and DistributedSampler already handles shuffling.
165 |         sampler=sampler,
166 |         num_workers=num_workers,
167 |         pin_memory=pin_memory,
168 |     )
169 | 
170 |     return data_loader
--------------------------------------------------------------------------------
/src/single/trainer.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import logging
3 | import os
4 | import random
5 | from typing import *
6 | 
7 | import torch
8 | import torch.nn as nn
9 | import torch.optim as optim
10 | import yaml
11 | from tensorboardX import SummaryWriter
12 | from tqdm import tqdm
13 | 
14 | from dataset import get_trn_val_loader, get_tst_loader
15 | from utils import AverageMeter, accuracy
16 | 
17 | 
18 | class Trainer:
19 |     def __init__(self, hparams, model, scaler):
20 |         self.hparams = hparams
21 |         self.device = "cuda" if torch.cuda.is_available() else "cpu"
22 |         self.dset = hparams.dset
23 | 
24 |         self.model_name = hparams.model
25 |         self.model = model
26 |         self.model = model.to(self.device)
27 |         self.scaler = scaler
28 | 
29 |         # optimizer, scheduler
30 |         self.optimizer, self.lr_scheduler = self.configure_optimizers()
31 | 
32 |         # metric
33 |         self.criterion = nn.CrossEntropyLoss()
34 | 
35 |         # dataloader
36 |         self.train_loader, self.val_loader = get_trn_val_loader(
37 |             data_dir=hparams.dpath.strip(),
38 |             batch_size=hparams.batch_size,
39 |             valid_size=0.1,
40 |             num_workers=hparams.workers,
41 |             pin_memory=True,
42 |         )
43 |         self.test_loader = get_tst_loader(
44 |             data_dir=hparams.dpath.strip(),
45 |             batch_size=hparams.batch_size,
46 |             shuffle=False,
47 |             num_workers=1,
48 |             pin_memory=True,
49 |         )
50 | 
51 |         # model-saving options
52 |         self.version = 0
53 |         while True:
54 |             self.save_path = os.path.join(hparams.ckpt_path, f"version-{self.version}")
55 |             if not os.path.exists(self.save_path):
56 |                 os.makedirs(self.save_path)
57 |                 break
58 |             else:
59 |                 self.version += 1
60 |         self.summarywriter = SummaryWriter(self.save_path)
61 |         self.global_step = 0
62 |         self.global_val_loss = 1e5
63 |         self.global_top1_acc = 0
64 |         self.eval_step = hparams.eval_step
65 |         logging.basicConfig(
66 |             filename=os.path.join(self.save_path, "experiment.log"),
67 |             level=logging.INFO,
68 |             format="%(asctime)s > %(message)s",
69 |         )
70 |         with open(
71 |             os.path.join(self.save_path, "hparams.yaml"), "w", encoding="utf8"
72 |         ) as outfile:
73 |             yaml.dump(hparams, outfile, default_flow_style=False, allow_unicode=True)
74 | 
75 |         # experiment-logging options
76 |         self.best_result = {"version": self.version}
77 | 
78 |     def configure_optimizers(self):
79 |         # optimizer
80 |         optimizer = optim.SGD(
81 |             self.model.parameters(),
82 |             lr=self.hparams.lr,
83 |             weight_decay=self.hparams.weight_decay,
84 |             momentum=0.9,
85 |             nesterov=True,
86 |         )
87 | 
88 |         # lr scheduler (optional)
89 |         scheduler = optim.lr_scheduler.StepLR(
90 |             optimizer,
91 |             step_size=self.hparams.lr_decay_step_size,
92 |             gamma=self.hparams.lr_decay_gamma,
93 |         )
94 |         return optimizer, scheduler
95 | 
96 |     def 
save_checkpoint(self, epoch: int, val_acc: float, model: nn.Module) -> None: 97 | logging.info( 98 | f"Val acc increased ({self.global_top1_acc:.4f} → {val_acc:.4f}). Saving model ..." 99 | ) 100 | new_path = os.path.join( 101 | self.save_path, f"best_model_epoch_{epoch}_acc_{val_acc:.4f}.pt" 102 | ) 103 | 104 | for filename in glob.glob(os.path.join(self.save_path, "*.pt")): 105 | os.remove(filename) # remove old checkpoint 106 | torch.save(model.state_dict(), new_path) 107 | self.global_top1_acc = val_acc 108 | 109 | def fit(self) -> dict: 110 | for epoch in tqdm(range(self.hparams.epoch), desc="epoch"): 111 | logging.info(f"* Learning Rate: {self.optimizer.param_groups[0]['lr']:.5f}") 112 | result = self._train_epoch(epoch) 113 | 114 | # update checkpoint 115 | if result["val_acc"] > self.global_top1_acc: 116 | self.save_checkpoint(epoch, result["val_acc"], self.model) 117 | self.lr_scheduler.step() 118 | 119 | self.summarywriter.close() 120 | return self.version 121 | 122 | def _train_epoch(self, epoch: int) -> dict: 123 | train_loss = AverageMeter() 124 | 125 | self.model.train() 126 | for step, batch in tqdm( 127 | enumerate(self.train_loader), 128 | desc="train_steps", 129 | total=len(self.train_loader), 130 | ): 131 | img, label = map(lambda x: x.to(self.device), batch) 132 | 133 | self.optimizer.zero_grad() 134 | if self.hparams.amp: 135 | with torch.cuda.amp.autocast(): 136 | logit = self.model(img) 137 | loss = self.criterion(logit, label) 138 | self.scaler.scale(loss).backward() 139 | self.scaler.step(self.optimizer) 140 | self.scaler.update() 141 | else: 142 | logit = self.model(img) 143 | loss = self.criterion(logit, label) 144 | loss.backward() 145 | self.optimizer.step() 146 | 147 | train_loss.update(loss.item()) 148 | 149 | self.global_step += 1 150 | if self.global_step % self.eval_step == 0: 151 | logging.info( 152 | f"[Single Version {self.version} Epoch {epoch}] global step: {self.global_step}, train loss: {loss.item():.3f}" 153 | ) 154 | 155 | train_loss = train_loss.avg 156 | val_loss, val_acc = self.validate(epoch) 157 | 158 | # tensorboard writing 159 | self.summarywriter.add_scalars( 160 | "lr", {"lr": self.optimizer.param_groups[0]["lr"]}, epoch 161 | ) 162 | self.summarywriter.add_scalars( 163 | "loss/step", {"val": val_loss, "train": train_loss}, self.global_step 164 | ) 165 | self.summarywriter.add_scalars( 166 | "loss/epoch", {"val": val_loss, "train": train_loss}, epoch 167 | ) 168 | self.summarywriter.add_scalars("acc/epoch", {"val": val_acc}, epoch) 169 | logging.info( 170 | f"** global step: {self.global_step}, val loss: {val_loss:.3f}, val_acc: {val_acc:.2f}%" 171 | ) 172 | 173 | return {"val_loss": val_loss, "val_acc": val_acc} 174 | 175 | def validate(self, epoch: int) -> Tuple[float]: 176 | val_loss = AverageMeter() 177 | top1 = AverageMeter() 178 | 179 | self.model.eval() 180 | with torch.no_grad(): 181 | for step, batch in tqdm( 182 | enumerate(self.val_loader), 183 | desc="valid_steps", 184 | total=len(self.val_loader), 185 | ): 186 | img, label = map(lambda x: x.to(self.device), batch) 187 | pred = self.model(img) 188 | loss = self.criterion(pred, label) 189 | val_loss.update(loss.item()) 190 | 191 | prec1 = accuracy(pred, label, topk=(1,))[0] 192 | top1.update(prec1.item()) 193 | 194 | return val_loss.avg, top1.avg 195 | 196 | def test(self, state_dict) -> dict: 197 | test_loss = AverageMeter() 198 | top1 = AverageMeter() 199 | top5 = AverageMeter() 200 | 201 | self.model.load_state_dict(state_dict) 202 | self.model.eval() 203 | with 
torch.no_grad(): 204 | for step, batch in tqdm( 205 | enumerate(self.test_loader), 206 | desc="tst_steps", 207 | total=len(self.test_loader), 208 | ): 209 | img, label = map(lambda x: x.to(self.device), batch) 210 | pred = self.model(img) 211 | 212 | loss = self.criterion(pred, label) 213 | test_loss.update(loss.item()) 214 | 215 | prec1, prec5 = accuracy(pred, label, topk=(1, 5)) 216 | top1.update(prec1.item()) 217 | top5.update(prec5.item()) 218 | 219 | print() 220 | print(f"** Test Loss: {test_loss.avg:.4f}") 221 | print(f"** Top-1 Accuracy: {top1.avg:.4f}%") 222 | print(f"** Top-5 Accuracy: {top5.avg:.4f}%") 223 | print() 224 | return { 225 | "test_loss": test_loss.avg, 226 | "top_1_acc": top1.avg, 227 | "top_5_acc": top5.avg, 228 | } -------------------------------------------------------------------------------- /src/dp/trainer.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import os 4 | import random 5 | from typing import * 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as optim 10 | import yaml 11 | from tensorboardX import SummaryWriter 12 | from tqdm import tqdm 13 | 14 | from dataset import get_trn_val_loader, get_tst_loader 15 | from utils import AverageMeter, accuracy 16 | 17 | 18 | class Trainer: 19 | def __init__(self, hparams, model, scaler): 20 | self.hparams = hparams 21 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 22 | self.dset = hparams.dset 23 | 24 | self.model_name = hparams.model 25 | self.model = model 26 | self.model = model.to(self.device) 27 | self.model = nn.DataParallel(self.model) 28 | self.scaler = scaler 29 | 30 | # optimizer, scheduler 31 | self.optimizer, self.lr_scheduler = self.configure_optimizers() 32 | 33 | # metric 34 | self.criterion = nn.CrossEntropyLoss() 35 | 36 | # dataloader 37 | self.train_loader, self.val_loader = get_trn_val_loader( 38 | data_dir=hparams.dpath.strip(), 39 | batch_size=hparams.batch_size, 40 | valid_size=0.1, 41 | num_workers=hparams.workers, 42 | pin_memory=True, 43 | ) 44 | self.test_loader = get_tst_loader( 45 | data_dir=hparams.dpath.strip(), 46 | batch_size=hparams.batch_size, 47 | shuffle=False, 48 | num_workers=1, 49 | pin_memory=True, 50 | ) 51 | 52 | # model-saving options 53 | self.version = 0 54 | while True: 55 | self.save_path = os.path.join(hparams.ckpt_path, f"version-{self.version}") 56 | if not os.path.exists(self.save_path): 57 | os.makedirs(self.save_path) 58 | break 59 | else: 60 | self.version += 1 61 | self.summarywriter = SummaryWriter(self.save_path) 62 | self.global_step = 0 63 | self.global_val_loss = 1e5 64 | self.global_top1_acc = 0 65 | self.eval_step = hparams.eval_step 66 | logging.basicConfig( 67 | filename=os.path.join(self.save_path, "experiment.log"), 68 | level=logging.INFO, 69 | format="%(asctime)s > %(message)s", 70 | ) 71 | with open( 72 | os.path.join(self.save_path, "hparams.yaml"), "w", encoding="utf8" 73 | ) as outfile: 74 | yaml.dump(hparams, outfile, default_flow_style=False, allow_unicode=True) 75 | 76 | # experiment-logging options 77 | self.best_result = {"version": self.version} 78 | 79 | def configure_optimizers(self): 80 | # optimizer 81 | optimizer = optim.SGD( 82 | self.model.parameters(), 83 | lr=self.hparams.lr, 84 | weight_decay=self.hparams.weight_decay, 85 | momentum=0.9, 86 | nesterov=True, 87 | ) 88 | 89 | # lr scheduler (optional) 90 | scheduler = optim.lr_scheduler.StepLR( 91 | optimizer, 92 | step_size=self.hparams.lr_decay_step_size, 93 | 
gamma=self.hparams.lr_decay_gamma,
94 | )
95 | return optimizer, scheduler
96 |
97 | def save_checkpoint(self, epoch: int, val_acc: float, model: nn.Module) -> None:
98 | logging.info(
99 | f"Val acc increased ({self.global_top1_acc:.4f} → {val_acc:.4f}). Saving model ..."
100 | )
101 | new_path = os.path.join(
102 | self.save_path, f"best_model_epoch_{epoch}_acc_{val_acc:.4f}.pt"
103 | )
104 |
105 | for filename in glob.glob(os.path.join(self.save_path, "*.pt")):
106 | os.remove(filename)  # remove old checkpoint
107 | torch.save(model.state_dict(), new_path)  # keys keep the DataParallel "module." prefix; test() loads into the wrapped model
108 | self.global_top1_acc = val_acc
109 |
110 | def fit(self) -> int:  # returns the version index of this run
111 | for epoch in tqdm(range(self.hparams.epoch), desc="epoch"):
112 | logging.info(f"* Learning Rate: {self.optimizer.param_groups[0]['lr']:.5f}")
113 | result = self._train_epoch(epoch)
114 |
115 | # update checkpoint
116 | if result["val_acc"] > self.global_top1_acc:
117 | self.save_checkpoint(epoch, result["val_acc"], self.model)
118 | self.lr_scheduler.step()
119 |
120 | self.summarywriter.close()
121 | return self.version
122 |
123 | def _train_epoch(self, epoch: int) -> dict:
124 | train_loss = AverageMeter()
125 |
126 | self.model.train()
127 | for step, batch in tqdm(
128 | enumerate(self.train_loader),
129 | desc="train_steps",
130 | total=len(self.train_loader),
131 | ):
132 | img, label = map(lambda x: x.to(self.device), batch)
133 |
134 | self.optimizer.zero_grad()
135 | if self.hparams.amp:
136 | with torch.cuda.amp.autocast():
137 | logit = self.model(img)
138 | loss = self.criterion(logit, label)
139 | self.scaler.scale(loss).backward()
140 | self.scaler.step(self.optimizer)
141 | self.scaler.update()
142 | else:
143 | logit = self.model(img)
144 | loss = self.criterion(logit, label)
145 | loss.backward()
146 | self.optimizer.step()
147 |
148 | train_loss.update(loss.item())
149 |
150 | self.global_step += 1
151 | if self.global_step % self.eval_step == 0:
152 | logging.info(
153 | f"[DP Version {self.version} Epoch {epoch}] global step: {self.global_step}, train loss: {loss.item():.3f}"
154 | )
155 |
156 | train_loss = train_loss.avg
157 | val_loss, val_acc = self.validate(epoch)
158 |
159 | # tensorboard writing
160 | self.summarywriter.add_scalars(
161 | "lr", {"lr": self.optimizer.param_groups[0]["lr"]}, epoch
162 | )
163 | self.summarywriter.add_scalars(
164 | "loss/step", {"val": val_loss, "train": train_loss}, self.global_step
165 | )
166 | self.summarywriter.add_scalars(
167 | "loss/epoch", {"val": val_loss, "train": train_loss}, epoch
168 | )
169 | self.summarywriter.add_scalars("acc/epoch", {"val": val_acc}, epoch)
170 | logging.info(
171 | f"** global step: {self.global_step}, val loss: {val_loss:.3f}, val_acc: {val_acc:.2f}%"
172 | )
173 |
174 | return {"val_loss": val_loss, "val_acc": val_acc}
175 |
176 | def validate(self, epoch: int) -> Tuple[float, float]:
177 | val_loss = AverageMeter()
178 | top1 = AverageMeter()
179 |
180 | self.model.eval()
181 | with torch.no_grad():
182 | for step, batch in tqdm(
183 | enumerate(self.val_loader),
184 | desc="valid_steps",
185 | total=len(self.val_loader),
186 | ):
187 | img, label = map(lambda x: x.to(self.device), batch)
188 | pred = self.model(img)
189 | loss = self.criterion(pred, label)
190 | val_loss.update(loss.item())
191 |
192 | prec1 = accuracy(pred, label, topk=(1,))[0]
193 | top1.update(prec1.item())
194 |
195 | return val_loss.avg, top1.avg
196 |
197 | def test(self, state_dict) -> dict:
198 | test_loss = AverageMeter()
199 | top1 = AverageMeter()
200 | top5 = AverageMeter()
201 |
202 |
self.model.load_state_dict(state_dict) 203 | self.model.eval() 204 | with torch.no_grad(): 205 | for step, batch in tqdm( 206 | enumerate(self.test_loader), 207 | desc="tst_steps", 208 | total=len(self.test_loader), 209 | ): 210 | img, label = map(lambda x: x.to(self.device), batch) 211 | pred = self.model(img) 212 | 213 | loss = self.criterion(pred, label) 214 | test_loss.update(loss.item()) 215 | 216 | prec1, prec5 = accuracy(pred, label, topk=(1, 5)) 217 | top1.update(prec1.item()) 218 | top5.update(prec5.item()) 219 | 220 | print() 221 | print(f"** Test Loss: {test_loss.avg:.4f}") 222 | print(f"** Top-1 Accuracy: {top1.avg:.4f}%") 223 | print(f"** Top-5 Accuracy: {top5.avg:.4f}%") 224 | print() 225 | return { 226 | "test_loss": test_loss.avg, 227 | "top_1_acc": top1.avg, 228 | "top_5_acc": top5.avg, 229 | } -------------------------------------------------------------------------------- /src/ddp/trainer.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import os 4 | import random 5 | from typing import * 6 | 7 | import torch 8 | import torch.distributed as dist 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | import yaml 12 | from tensorboardX import SummaryWriter 13 | from torch.nn.parallel import DistributedDataParallel as DDP 14 | from tqdm import tqdm 15 | 16 | from dataset import get_trn_val_loader, get_tst_loader 17 | from utils import AverageMeter, accuracy 18 | 19 | 20 | class Trainer: 21 | def __init__(self, hparams, model, scaler, rank, ngpus_per_node): 22 | self.hparams = hparams 23 | self.rank = rank 24 | self.nprocs = torch.cuda.device_count() 25 | self.device = f"cuda:{rank}" if torch.cuda.is_available() else "cpu" 26 | self.dset = hparams.dset 27 | 28 | self.model_name = hparams.model 29 | self.model = model 30 | self.model = model.to(self.device, non_blocking=True) 31 | self.model = DDP(self.model, device_ids=[rank], find_unused_parameters=True) 32 | self.scaler = scaler 33 | 34 | hparams.batch_size = int(hparams.batch_size / ngpus_per_node) 35 | 36 | # optimizer, scheduler 37 | self.optimizer, self.lr_scheduler = self.configure_optimizers() 38 | 39 | # metric 40 | self.criterion = nn.CrossEntropyLoss() 41 | 42 | # dataloader and distributed sampler 43 | self.train_loader, self.train_sampler, self.val_loader = get_trn_val_loader( 44 | data_dir=hparams.dpath.strip(), 45 | batch_size=hparams.batch_size, 46 | valid_size=0.1, 47 | num_workers=hparams.workers, 48 | pin_memory=True, 49 | ) 50 | self.test_loader = get_tst_loader( 51 | data_dir=hparams.dpath.strip(), 52 | batch_size=hparams.batch_size, 53 | shuffle=False, 54 | num_workers=1, 55 | pin_memory=True, 56 | ) 57 | 58 | # model-saving options (only at rank 0) 59 | if self.rank == 0: 60 | self.version = 0 61 | while True: 62 | self.save_path = os.path.join( 63 | hparams.ckpt_path, f"version-{self.version}" 64 | ) 65 | if not os.path.exists(self.save_path): 66 | os.makedirs(self.save_path) 67 | break 68 | else: 69 | self.version += 1 70 | self.summarywriter = SummaryWriter(self.save_path) 71 | self.global_step = 0 72 | self.global_val_loss = 1e5 73 | self.global_top1_acc = 0 74 | self.eval_step = hparams.eval_step 75 | logging.basicConfig( 76 | filename=os.path.join(self.save_path, "experiment.log"), 77 | level=logging.INFO, 78 | format="%(asctime)s > %(message)s", 79 | ) 80 | with open( 81 | os.path.join(self.save_path, "hparams.yaml"), "w", encoding="utf8" 82 | ) as outfile: 83 | yaml.dump( 84 | hparams, outfile, 
default_flow_style=False, allow_unicode=True
85 | )
86 |
87 | # experiment-logging options
88 | self.best_result = {"version": self.version}
89 |
90 | def configure_optimizers(self):
91 | # optimizer
92 | optimizer = optim.SGD(
93 | self.model.parameters(),
94 | lr=self.hparams.lr,
95 | weight_decay=self.hparams.weight_decay,
96 | momentum=0.9,
97 | nesterov=True,
98 | )
99 |
100 | # lr scheduler (optional)
101 | scheduler = optim.lr_scheduler.StepLR(
102 | optimizer,
103 | step_size=self.hparams.lr_decay_step_size,
104 | gamma=self.hparams.lr_decay_gamma,
105 | )
106 | return optimizer, scheduler
107 |
108 | def save_checkpoint(self, epoch: int, val_acc: float, model: nn.Module) -> None:
109 | logging.info(
110 | f"Val acc increased ({self.global_top1_acc:.4f} → {val_acc:.4f}). Saving model ..."
111 | )
112 | new_path = os.path.join(
113 | self.save_path, f"best_model_epoch_{epoch}_acc_{val_acc:.4f}.pt"
114 | )
115 |
116 | for filename in glob.glob(os.path.join(self.save_path, "*.pt")):
117 | os.remove(filename)  # remove old checkpoint
118 | torch.save(model.state_dict(), new_path)  # keys keep the DDP "module." prefix; test() loads into the wrapped model
119 | self.global_top1_acc = val_acc
120 |
121 | def fit(self) -> Optional[int]:  # rank 0 returns the version index; other ranks return None
122 | for epoch in tqdm(
123 | range(self.hparams.epoch), desc="epoch", disable=self.rank not in [0]
124 | ):
125 | self.train_sampler.set_epoch(epoch)  # reshuffle shards every epoch
126 |
127 | logging.info(f"* Learning Rate: {self.optimizer.param_groups[0]['lr']:.5f}")
128 | result = self._train_epoch(epoch)
129 |
130 | # update checkpoint (only rank 0 tracks the best accuracy)
131 | if self.rank == 0 and result["val_acc"] > self.global_top1_acc:
132 | self.save_checkpoint(epoch, result["val_acc"], self.model)
133 | self.lr_scheduler.step()
134 |
135 | if self.rank == 0:
136 | self.summarywriter.close()
137 | return self.version if self.rank == 0 else None
138 |
139 | def _train_epoch(self, epoch: int) -> dict:
140 | train_loss = AverageMeter()
141 |
142 | self.model.train()
143 | for step, batch in tqdm(
144 | enumerate(self.train_loader),
145 | desc="train_steps",
146 | total=len(self.train_loader),
147 | disable=self.rank not in [0],  # show the progress bar only on rank 0
148 | ):
149 | img, label = map(lambda x: x.to(self.device, non_blocking=True), batch)
150 |
151 | self.optimizer.zero_grad()
152 | if self.hparams.amp:
153 | with torch.cuda.amp.autocast():
154 | logit = self.model(img)
155 | loss = self.criterion(logit, label)
156 | dist.barrier()  # synchronize ranks before backward
157 | self.scaler.scale(loss).backward()
158 | self.scaler.step(self.optimizer)
159 | self.scaler.update()
160 | else:
161 | logit = self.model(img)
162 | loss = self.criterion(logit, label)
163 | dist.barrier()  # synchronize ranks before backward
164 | loss.backward()
165 | self.optimizer.step()
166 |
167 | train_loss.update(loss.item())
168 |
169 | if self.rank == 0:
170 | self.global_step += 1
171 | if self.global_step % self.eval_step == 0:
172 | logging.info(
173 | f"[DDP Version {self.version} Epoch {epoch}] global step: {self.global_step}, train loss: {loss.item():.3f}"
174 | )
175 |
176 | train_loss = train_loss.avg
177 |
178 | if self.rank == 0:
179 | val_loss, val_acc = self.validate(epoch)
180 |
181 | # tensorboard writing
182 | self.summarywriter.add_scalars(
183 | "lr", {"lr": self.optimizer.param_groups[0]["lr"]}, epoch
184 | )
185 | self.summarywriter.add_scalars(
186 | "loss/step", {"val": val_loss, "train": train_loss}, self.global_step
187 | )
188 | self.summarywriter.add_scalars(
189 | "loss/epoch", {"val": val_loss, "train": train_loss}, epoch
190 | )
191 | self.summarywriter.add_scalars("acc/epoch", {"val": val_acc}, epoch)
192 | logging.info(
193 | f"** global step: {self.global_step}, val loss: {val_loss:.3f}, val_acc: {val_acc:.2f}%"
194 | )
195 |
196 | return {"val_loss": val_loss, "val_acc": val_acc}
197 | return None
198 |
199 | def validate(self, epoch: int) -> Tuple[float, float]:
200 | val_loss = AverageMeter()
201 | top1 = AverageMeter()
202 |
203 | self.model.eval()
204 | with torch.no_grad():
205 | for step, batch in tqdm(
206 | enumerate(self.val_loader),
207 | desc="valid_steps",
208 | total=len(self.val_loader),
209 | ):
210 | img, label = map(lambda x: x.to(self.device, non_blocking=True), batch)
211 | pred = self.model(img)
212 |
213 | loss = self.criterion(pred, label)
214 | val_loss.update(loss.item())
215 |
216 | prec1 = accuracy(pred, label, topk=(1,))[0]
217 | top1.update(prec1.item())
218 |
219 | return val_loss.avg, top1.avg
220 |
221 | def test(self, state_dict) -> dict:
222 | test_loss = AverageMeter()
223 | top1 = AverageMeter()
224 | top5 = AverageMeter()
225 |
226 | self.model.load_state_dict(state_dict)
227 | self.model.eval()
228 | with torch.no_grad():
229 | for step, batch in tqdm(
230 | enumerate(self.test_loader),
231 | desc="tst_steps",
232 | total=len(self.test_loader),
233 | ):
234 | img, label = map(lambda x: x.to(self.device, non_blocking=True), batch)
235 | pred = self.model(img)
236 |
237 | loss = self.criterion(pred, label)
238 | test_loss.update(loss.item())
239 |
240 | prec1, prec5 = accuracy(pred, label, topk=(1, 5))
241 | top1.update(prec1.item())
242 | top5.update(prec5.item())
243 |
244 | logging.info(f"** Test Loss: {test_loss.avg:.4f}")
245 | logging.info(f"** Top-1 Accuracy: {top1.avg:.4f}%")
246 | logging.info(f"** Top-5 Accuracy: {top5.avg:.4f}%")
247 |
248 | return {
249 | "test_loss": test_loss.avg,
250 | "top_1_acc": top1.avg,
251 | "top_5_acc": top5.avg,
252 | }
253 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------
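Addendum: a minimal sketch of the torch.distributed bootstrap that an entry point like src/ddp/main.py typically performs before constructing the DDP Trainer above. This is illustrative only, not the repo's actual main.py; the helper name main_worker, the localhost address/port defaults, and the single-node assumption are assumptions introduced here.

    import os

    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp


    def main_worker(rank: int, world_size: int) -> None:
        # one process per GPU; every process joins the default group via env://
        os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
        os.environ.setdefault("MASTER_PORT", "29500")
        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
        torch.cuda.set_device(rank)

        # build hparams/model/scaler as in src/ddp/main.py, then:
        # trainer = Trainer(hparams, model, scaler, rank, world_size)
        # trainer.fit()

        dist.destroy_process_group()


    if __name__ == "__main__":
        world_size = torch.cuda.device_count()  # single-node assumption
        mp.spawn(main_worker, args=(world_size,), nprocs=world_size)

With this pattern, mp.spawn passes each spawned process its index as the first argument, which lines up with the rank parameter the DDP Trainer expects; per-rank device placement then follows from torch.cuda.set_device(rank).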