├── .gitignore ├── LICENSE ├── README.md ├── data_parallel ├── model.py └── train.py ├── dist_parallel ├── model.py └── train.py └── single_gpu ├── model.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | data/ 107 | logs/ 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Woongwon Lee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pytorch-multigpu
Multi-GPU training code for deep learning with PyTorch. Trains PyramidNet on the CIFAR-10 classification task and compares several ways of multi-GPU training.

# Requirements
- Python 3
- PyTorch 1.0.0+
- torchvision
- tensorboardX

# Usage
### Single GPU
```
cd single_gpu
python train.py
```

### DataParallel
```
cd data_parallel
python train.py --gpu_devices 0 1 2 3 --batch_size 768
```

### DistributedDataParallel
```
cd dist_parallel
python train.py --gpu_devices 0 1 2 3 --batch_size 768
```

# Performance
### Single GPU
- batch size: 240
- batch time: 6 s
- training time: 22 min
- GPU utilization: 99 %
- GPU memory: 10 GB

### DataParallel (4× K80)
- batch size: 768
- batch time: 5 s
- training time: 5 min
- GPU utilization: 99 %
- GPU memory: 10 GB × 4
--------------------------------------------------------------------------------
/data_parallel/model.py:
--------------------------------------------------------------------------------
import os
import time

import torch
import torch.nn as nn
import torch.nn.functional as F


# code from https://github.com/KellerJordan/ResNet-PyTorch-CIFAR10/blob/master/model.py
class IdentityPadding(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(IdentityPadding, self).__init__()

        if stride == 2:
            self.pooling = nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True)
        else:
            self.pooling = None

        self.add_channels = out_channels - in_channels

    def forward(self, x):
        # zero-pad the channel dimension so the shortcut matches out_channels
        out = F.pad(x, (0, 0, 0, 0, 0, self.add_channels))
        if self.pooling is not None:
            out = self.pooling(out)
        return out


class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        self.down_sample = IdentityPadding(in_channels, out_channels, stride)

        self.stride = stride

    def forward(self, x):
        shortcut = self.down_sample(x)
        out = self.bn1(x)
        out = self.conv1(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn3(out)

        out += shortcut
        return out


class PyramidNet(nn.Module):
    def __init__(self, num_layers, alpha, block, num_classes=10):
        super(PyramidNet, self).__init__()
        self.in_channels = 16

        # num_layers = (110 - 2)/6 = 18
        self.num_layers = num_layers
        self.addrate = alpha / (3*self.num_layers*1.0)

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)

        # feature map size = 32x32
        self.layer1 = self.get_layers(block, stride=1)
        # feature map size
= 16x16 73 | self.layer2 = self.get_layers(block, stride=2) 74 | # feature map size = 8x8 75 | self.layer3 = self.get_layers(block, stride=2) 76 | 77 | self.out_channels = int(round(self.out_channels)) 78 | self.bn_out= nn.BatchNorm2d(self.out_channels) 79 | self.relu_out = nn.ReLU(inplace=True) 80 | self.avgpool = nn.AvgPool2d(8, stride=1) 81 | self.fc_out = nn.Linear(self.out_channels, num_classes) 82 | 83 | for m in self.modules(): 84 | if isinstance(m, nn.Conv2d): 85 | nn.init.kaiming_normal_(m.weight, mode='fan_out', 86 | nonlinearity='relu') 87 | elif isinstance(m, nn.BatchNorm2d): 88 | nn.init.constant_(m.weight, 1) 89 | nn.init.constant_(m.bias, 0) 90 | 91 | def get_layers(self, block, stride): 92 | layers_list = [] 93 | for _ in range(self.num_layers - 1): 94 | self.out_channels = self.in_channels + self.addrate 95 | layers_list.append(block(int(round(self.in_channels)), 96 | int(round(self.out_channels)), 97 | stride)) 98 | self.in_channels = self.out_channels 99 | stride=1 100 | 101 | return nn.Sequential(*layers_list) 102 | 103 | def forward(self, x): 104 | x = self.conv1(x) 105 | x = self.bn1(x) 106 | 107 | x = self.layer1(x) 108 | x = self.layer2(x) 109 | x = self.layer3(x) 110 | 111 | x = self.bn_out(x) 112 | x = self.relu_out(x) 113 | x = self.avgpool(x) 114 | x = x.view(x.size(0), -1) 115 | x = self.fc_out(x) 116 | return x 117 | 118 | def save(self, path_to_checkpoints_dir, step): 119 | path_to_checkpoint = os.path.join(path_to_checkpoints_dir, 'model-{:s}-{:d}.pth'.format(time.strftime('%Y%m%d%H%M'), step)) 120 | torch.save(self.state_dict(), path_to_checkpoint) 121 | return path_to_checkpoint 122 | 123 | def load(self, path_to_checkpoint): 124 | self.load_state_dict(torch.load(path_to_checkpoint)) 125 | return self 126 | 127 | def pyramidnet(): 128 | block = ResidualBlock 129 | model = PyramidNet(num_layers=18, alpha=270, block=block) 130 | return model 131 | -------------------------------------------------------------------------------- /data_parallel/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import datetime 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.optim import lr_scheduler 9 | import torch.backends.cudnn as cudnn 10 | 11 | import torchvision 12 | import torchvision.transforms as transforms 13 | from torchvision.datasets import CIFAR10 14 | from torch.utils.data import DataLoader 15 | 16 | from model import pyramidnet 17 | import argparse 18 | from tensorboardX import SummaryWriter 19 | 20 | 21 | parser = argparse.ArgumentParser(description='cifar10 classification models') 22 | parser.add_argument('--lr', default=0.1, help='') 23 | parser.add_argument('--resume', default=None, help='') 24 | parser.add_argument('--batch_size', type=int, default=768, help='') 25 | parser.add_argument('--num_worker', type=int, default=4, help='') 26 | parser.add_argument("--gpu_devices", type=int, nargs='+', default=None, help="") 27 | args = parser.parse_args() 28 | 29 | gpu_devices = ','.join([str(id) for id in args.gpu_devices]) 30 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_devices 31 | 32 | 33 | def main(): 34 | best_acc = 0 35 | 36 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 37 | 38 | print('==> Preparing data..') 39 | transforms_train = transforms.Compose([ 40 | transforms.RandomCrop(32, padding=4), 41 | transforms.RandomHorizontalFlip(), 42 | transforms.ToTensor(), 43 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 
0.2010))]) 44 | 45 | dataset_train = CIFAR10(root='../data', train=True, download=True, 46 | transform=transforms_train) 47 | 48 | train_loader = DataLoader(dataset_train, batch_size=args.batch_size, 49 | shuffle=True, num_workers=args.num_worker) 50 | 51 | # there are 10 classes so the dataset name is cifar-10 52 | classes = ('plane', 'car', 'bird', 'cat', 'deer', 53 | 'dog', 'frog', 'horse', 'ship', 'truck') 54 | 55 | print('==> Making model..') 56 | 57 | net = pyramidnet() 58 | net = nn.DataParallel(net) 59 | net = net.to(device) 60 | num_params = sum(p.numel() for p in net.parameters() if p.requires_grad) 61 | print('The number of parameters of model is', num_params) 62 | 63 | criterion = nn.CrossEntropyLoss() 64 | optimizer = optim.Adam(net.parameters(), lr=args.lr) 65 | # optimizer = optim.SGD(net.parameters(), lr=args.lr, 66 | # momentum=0.9, weight_decay=1e-4) 67 | 68 | train(net, criterion, optimizer, train_loader, device) 69 | 70 | 71 | def train(net, criterion, optimizer, train_loader, device): 72 | net.train() 73 | 74 | train_loss = 0 75 | correct = 0 76 | total = 0 77 | 78 | epoch_start = time.time() 79 | for batch_idx, (inputs, targets) in enumerate(train_loader): 80 | start = time.time() 81 | 82 | inputs = inputs.to(device) 83 | targets = targets.to(device) 84 | outputs = net(inputs) 85 | loss = criterion(outputs, targets) 86 | 87 | optimizer.zero_grad() 88 | loss.backward() 89 | optimizer.step() 90 | 91 | train_loss += loss.item() 92 | _, predicted = outputs.max(1) 93 | total += targets.size(0) 94 | correct += predicted.eq(targets).sum().item() 95 | 96 | acc = 100 * correct / total 97 | 98 | batch_time = time.time() - start 99 | 100 | if batch_idx % 20 == 0: 101 | print('Epoch: [{}/{}]| loss: {:.3f} | acc: {:.3f} | batch time: {:.3f}s '.format( 102 | batch_idx, len(train_loader), train_loss/(batch_idx+1), acc, batch_time)) 103 | 104 | elapse_time = time.time() - epoch_start 105 | elapse_time = datetime.timedelta(seconds=elapse_time) 106 | print("Training time {}".format(elapse_time)) 107 | 108 | 109 | if __name__=='__main__': 110 | main() -------------------------------------------------------------------------------- /dist_parallel/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | # code from https://github.com/KellerJordan/ResNet-PyTorch-CIFAR10/blob/master/model.py 7 | class IdentityPadding(nn.Module): 8 | def __init__(self, in_channels, out_channels, stride=1): 9 | super(IdentityPadding, self).__init__() 10 | 11 | if stride == 2: 12 | self.pooling = nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True) 13 | else: 14 | self.pooling = None 15 | 16 | self.add_channels = out_channels - in_channels 17 | 18 | def forward(self, x): 19 | out = F.pad(x, (0, 0, 0, 0, 0, self.add_channels)) 20 | if self.pooling is not None: 21 | out = self.pooling(out) 22 | return out 23 | 24 | 25 | class ResidualBlock(nn.Module): 26 | def __init__(self, in_channels, out_channels, stride=1): 27 | super(ResidualBlock, self).__init__() 28 | self.bn1 = nn.BatchNorm2d(in_channels) 29 | self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, 30 | stride=stride, padding=1, bias=False) 31 | self.bn2 = nn.BatchNorm2d(out_channels) 32 | self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, 33 | stride=1, padding=1, bias=False) 34 | self.bn3 = nn.BatchNorm2d(out_channels) 35 | self.relu = nn.ReLU(inplace=True) 36 | 37 | self.down_sample = 
IdentityPadding(in_channels, out_channels, stride) 38 | 39 | self.stride = stride 40 | 41 | def forward(self, x): 42 | shortcut = self.down_sample(x) 43 | out = self.bn1(x) 44 | out = self.conv1(out) 45 | out = self.bn2(out) 46 | out = self.relu(out) 47 | out = self.conv2(out) 48 | out = self.bn3(out) 49 | 50 | out += shortcut 51 | return out 52 | 53 | 54 | class PyramidNet(nn.Module): 55 | def __init__(self, num_layers, alpha, block, num_classes=10): 56 | super(PyramidNet, self).__init__() 57 | self.in_channels = 16 58 | 59 | # num_layers = (110 - 2)/6 = 18 60 | self.num_layers = num_layers 61 | self.addrate = alpha / (3*self.num_layers*1.0) 62 | 63 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, 64 | stride=1, padding=1, bias=False) 65 | self.bn1 = nn.BatchNorm2d(16) 66 | 67 | # feature map size = 32x32 68 | self.layer1 = self.get_layers(block, stride=1) 69 | # feature map size = 16x16 70 | self.layer2 = self.get_layers(block, stride=2) 71 | # feature map size = 8x8 72 | self.layer3 = self.get_layers(block, stride=2) 73 | 74 | self.out_channels = int(round(self.out_channels)) 75 | self.bn_out= nn.BatchNorm2d(self.out_channels) 76 | self.relu_out = nn.ReLU(inplace=True) 77 | self.avgpool = nn.AvgPool2d(8, stride=1) 78 | self.fc_out = nn.Linear(self.out_channels, num_classes) 79 | 80 | for m in self.modules(): 81 | if isinstance(m, nn.Conv2d): 82 | nn.init.kaiming_normal_(m.weight, mode='fan_out', 83 | nonlinearity='relu') 84 | elif isinstance(m, nn.BatchNorm2d): 85 | nn.init.constant_(m.weight, 1) 86 | nn.init.constant_(m.bias, 0) 87 | 88 | def get_layers(self, block, stride): 89 | layers_list = [] 90 | for _ in range(self.num_layers - 1): 91 | self.out_channels = self.in_channels + self.addrate 92 | layers_list.append(block(int(round(self.in_channels)), 93 | int(round(self.out_channels)), 94 | stride)) 95 | self.in_channels = self.out_channels 96 | stride=1 97 | 98 | return nn.Sequential(*layers_list) 99 | 100 | def forward(self, x): 101 | x = self.conv1(x) 102 | x = self.bn1(x) 103 | 104 | x = self.layer1(x) 105 | x = self.layer2(x) 106 | x = self.layer3(x) 107 | 108 | x = self.bn_out(x) 109 | x = self.relu_out(x) 110 | x = self.avgpool(x) 111 | x = x.view(x.size(0), -1) 112 | x = self.fc_out(x) 113 | return x 114 | 115 | 116 | def pyramidnet(): 117 | block = ResidualBlock 118 | model = PyramidNet(num_layers=18, alpha=270, block=block) 119 | return model 120 | -------------------------------------------------------------------------------- /dist_parallel/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import datetime 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.optim import lr_scheduler 9 | import torch.backends.cudnn as cudnn 10 | 11 | import torchvision 12 | import torchvision.transforms as transforms 13 | from torchvision.datasets import CIFAR10 14 | from torch.utils.data import DataLoader 15 | 16 | import torch.distributed as dist 17 | import torch.multiprocessing as mp 18 | import torch.utils.data.distributed 19 | 20 | from model import pyramidnet 21 | import argparse 22 | from tensorboardX import SummaryWriter 23 | 24 | 25 | parser = argparse.ArgumentParser(description='cifar10 classification models') 26 | parser.add_argument('--lr', default=0.1, help='') 27 | parser.add_argument('--resume', default=None, help='') 28 | parser.add_argument('--batch_size', type=int, default=768, help='') 29 | parser.add_argument('--num_workers', 
type=int, default=4, help='') 30 | parser.add_argument("--gpu_devices", type=int, nargs='+', default=None, help="") 31 | 32 | parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.') 33 | parser.add_argument('--dist-url', default='tcp://127.0.0.1:3456', type=str, help='') 34 | parser.add_argument('--dist-backend', default='nccl', type=str, help='') 35 | parser.add_argument('--rank', default=0, type=int, help='') 36 | parser.add_argument('--world_size', default=1, type=int, help='') 37 | parser.add_argument('--distributed', action='store_true', help='') 38 | args = parser.parse_args() 39 | 40 | gpu_devices = ','.join([str(id) for id in args.gpu_devices]) 41 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_devices 42 | 43 | 44 | def main(): 45 | args = parser.parse_args() 46 | 47 | ngpus_per_node = torch.cuda.device_count() 48 | 49 | args.world_size = ngpus_per_node * args.world_size 50 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 51 | 52 | 53 | def main_worker(gpu, ngpus_per_node, args): 54 | args.gpu = gpu 55 | ngpus_per_node = torch.cuda.device_count() 56 | print("Use GPU: {} for training".format(args.gpu)) 57 | 58 | args.rank = args.rank * ngpus_per_node + gpu 59 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 60 | world_size=args.world_size, rank=args.rank) 61 | 62 | print('==> Making model..') 63 | net = pyramidnet() 64 | torch.cuda.set_device(args.gpu) 65 | net.cuda(args.gpu) 66 | args.batch_size = int(args.batch_size / ngpus_per_node) 67 | args.num_workers = int(args.num_workers / ngpus_per_node) 68 | net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.gpu]) 69 | num_params = sum(p.numel() for p in net.parameters() if p.requires_grad) 70 | print('The number of parameters of model is', num_params) 71 | 72 | print('==> Preparing data..') 73 | transforms_train = transforms.Compose([ 74 | transforms.RandomCrop(32, padding=4), 75 | transforms.RandomHorizontalFlip(), 76 | transforms.ToTensor(), 77 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) 78 | 79 | dataset_train = CIFAR10(root='../data', train=True, download=True, 80 | transform=transforms_train) 81 | train_sampler = torch.utils.data.distributed.DistributedSampler(dataset_train) 82 | train_loader = DataLoader(dataset_train, batch_size=args.batch_size, 83 | shuffle=(train_sampler is None), num_workers=args.num_workers, 84 | sampler=train_sampler) 85 | 86 | # there are 10 classes so the dataset name is cifar-10 87 | classes = ('plane', 'car', 'bird', 'cat', 'deer', 88 | 'dog', 'frog', 'horse', 'ship', 'truck') 89 | 90 | criterion = nn.CrossEntropyLoss() 91 | optimizer = optim.SGD(net.parameters(), lr=args.lr, 92 | momentum=0.9, weight_decay=1e-4) 93 | 94 | train(net, criterion, optimizer, train_loader, args.gpu) 95 | 96 | 97 | def train(net, criterion, optimizer, train_loader, device): 98 | net.train() 99 | 100 | train_loss = 0 101 | correct = 0 102 | total = 0 103 | 104 | epoch_start = time.time() 105 | for batch_idx, (inputs, targets) in enumerate(train_loader): 106 | start = time.time() 107 | 108 | inputs = inputs.cuda(device) 109 | targets = targets.cuda(device) 110 | outputs = net(inputs) 111 | loss = criterion(outputs, targets) 112 | 113 | optimizer.zero_grad() 114 | loss.backward() 115 | optimizer.step() 116 | 117 | train_loss += loss.item() 118 | _, predicted = outputs.max(1) 119 | total += targets.size(0) 120 | correct += predicted.eq(targets).sum().item() 121 | 122 | acc = 100 * correct / total 123 | 124 
| batch_time = time.time() - start 125 | 126 | if batch_idx % 20 == 0: 127 | print('Epoch: [{}/{}]| loss: {:.3f} | acc: {:.3f} | batch time: {:.3f}s '.format( 128 | batch_idx, len(train_loader), train_loss/(batch_idx+1), acc, batch_time)) 129 | 130 | elapse_time = time.time() - epoch_start 131 | elapse_time = datetime.timedelta(seconds=elapse_time) 132 | print("Training time {}".format(elapse_time)) 133 | 134 | 135 | if __name__=='__main__': 136 | main() -------------------------------------------------------------------------------- /single_gpu/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | # code from https://github.com/KellerJordan/ResNet-PyTorch-CIFAR10/blob/master/model.py 10 | class IdentityPadding(nn.Module): 11 | def __init__(self, in_channels, out_channels, stride=1): 12 | super(IdentityPadding, self).__init__() 13 | 14 | if stride == 2: 15 | self.pooling = nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True) 16 | else: 17 | self.pooling = None 18 | 19 | self.add_channels = out_channels - in_channels 20 | 21 | def forward(self, x): 22 | out = F.pad(x, (0, 0, 0, 0, 0, self.add_channels)) 23 | if self.pooling is not None: 24 | out = self.pooling(out) 25 | return out 26 | 27 | 28 | class ResidualBlock(nn.Module): 29 | def __init__(self, in_channels, out_channels, stride=1): 30 | super(ResidualBlock, self).__init__() 31 | self.bn1 = nn.BatchNorm2d(in_channels) 32 | self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, 33 | stride=stride, padding=1, bias=False) 34 | self.bn2 = nn.BatchNorm2d(out_channels) 35 | self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, 36 | stride=1, padding=1, bias=False) 37 | self.bn3 = nn.BatchNorm2d(out_channels) 38 | self.relu = nn.ReLU(inplace=True) 39 | 40 | self.down_sample = IdentityPadding(in_channels, out_channels, stride) 41 | 42 | self.stride = stride 43 | 44 | def forward(self, x): 45 | shortcut = self.down_sample(x) 46 | out = self.bn1(x) 47 | out = self.conv1(out) 48 | out = self.bn2(out) 49 | out = self.relu(out) 50 | out = self.conv2(out) 51 | out = self.bn3(out) 52 | 53 | out += shortcut 54 | return out 55 | 56 | 57 | class PyramidNet(nn.Module): 58 | def __init__(self, num_layers, alpha, block, num_classes=10): 59 | super(PyramidNet, self).__init__() 60 | self.in_channels = 16 61 | 62 | # num_layers = (110 - 2)/6 = 18 63 | self.num_layers = num_layers 64 | self.addrate = alpha / (3*self.num_layers*1.0) 65 | 66 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, 67 | stride=1, padding=1, bias=False) 68 | self.bn1 = nn.BatchNorm2d(16) 69 | 70 | # feature map size = 32x32 71 | self.layer1 = self.get_layers(block, stride=1) 72 | # feature map size = 16x16 73 | self.layer2 = self.get_layers(block, stride=2) 74 | # feature map size = 8x8 75 | self.layer3 = self.get_layers(block, stride=2) 76 | 77 | self.out_channels = int(round(self.out_channels)) 78 | self.bn_out= nn.BatchNorm2d(self.out_channels) 79 | self.relu_out = nn.ReLU(inplace=True) 80 | self.avgpool = nn.AvgPool2d(8, stride=1) 81 | self.fc_out = nn.Linear(self.out_channels, num_classes) 82 | 83 | for m in self.modules(): 84 | if isinstance(m, nn.Conv2d): 85 | nn.init.kaiming_normal_(m.weight, mode='fan_out', 86 | nonlinearity='relu') 87 | elif isinstance(m, nn.BatchNorm2d): 88 | nn.init.constant_(m.weight, 1) 89 | nn.init.constant_(m.bias, 0) 90 | 91 | def get_layers(self, 
block, stride): 92 | layers_list = [] 93 | for _ in range(self.num_layers - 1): 94 | self.out_channels = self.in_channels + self.addrate 95 | layers_list.append(block(int(round(self.in_channels)), 96 | int(round(self.out_channels)), 97 | stride)) 98 | self.in_channels = self.out_channels 99 | stride=1 100 | 101 | return nn.Sequential(*layers_list) 102 | 103 | def forward(self, x): 104 | x = self.conv1(x) 105 | x = self.bn1(x) 106 | 107 | x = self.layer1(x) 108 | x = self.layer2(x) 109 | x = self.layer3(x) 110 | 111 | x = self.bn_out(x) 112 | x = self.relu_out(x) 113 | x = self.avgpool(x) 114 | x = x.view(x.size(0), -1) 115 | x = self.fc_out(x) 116 | return x 117 | 118 | def save(self, path_to_checkpoints_dir, step): 119 | path_to_checkpoint = os.path.join(path_to_checkpoints_dir, 'model-{:s}-{:d}.pth'.format(time.strftime('%Y%m%d%H%M'), step)) 120 | torch.save(self.state_dict(), path_to_checkpoint) 121 | return path_to_checkpoint 122 | 123 | def load(self, path_to_checkpoint): 124 | self.load_state_dict(torch.load(path_to_checkpoint)) 125 | return self 126 | 127 | def pyramidnet(): 128 | block = ResidualBlock 129 | model = PyramidNet(num_layers=18, alpha=270, block=block) 130 | return model 131 | -------------------------------------------------------------------------------- /single_gpu/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import datetime 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.optim import lr_scheduler 9 | import torch.backends.cudnn as cudnn 10 | 11 | import torchvision 12 | import torchvision.transforms as transforms 13 | from torchvision.datasets import CIFAR10 14 | from torch.utils.data import DataLoader 15 | 16 | from model import pyramidnet 17 | import argparse 18 | from tensorboardX import SummaryWriter 19 | 20 | 21 | parser = argparse.ArgumentParser(description='cifar10 classification models') 22 | parser.add_argument('--lr', default=0.1, help='') 23 | parser.add_argument('--resume', default=None, help='') 24 | parser.add_argument('--batch_size', type=int, default=512, help='') 25 | parser.add_argument('--num_worker', type=int, default=4, help='') 26 | args = parser.parse_args() 27 | 28 | 29 | def main(): 30 | best_acc = 0 31 | 32 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 33 | 34 | print('==> Preparing data..') 35 | transforms_train = transforms.Compose([ 36 | transforms.RandomCrop(32, padding=4), 37 | transforms.RandomHorizontalFlip(), 38 | transforms.ToTensor(), 39 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) 40 | 41 | dataset_train = CIFAR10(root='../data', train=True, download=True, 42 | transform=transforms_train) 43 | 44 | train_loader = DataLoader(dataset_train, batch_size=args.batch_size, 45 | shuffle=True, num_workers=args.num_worker) 46 | 47 | # there are 10 classes so the dataset name is cifar-10 48 | classes = ('plane', 'car', 'bird', 'cat', 'deer', 49 | 'dog', 'frog', 'horse', 'ship', 'truck') 50 | 51 | print('==> Making model..') 52 | 53 | net = pyramidnet() 54 | net = net.to(device) 55 | num_params = sum(p.numel() for p in net.parameters() if p.requires_grad) 56 | print('The number of parameters of model is', num_params) 57 | 58 | criterion = nn.CrossEntropyLoss() 59 | optimizer = optim.SGD(net.parameters(), lr=args.lr, 60 | momentum=0.9, weight_decay=1e-4) 61 | 62 | train(net, criterion, optimizer, train_loader, device) 63 | 64 | 65 | def train(net, criterion, optimizer, 
          train_loader, device):
    net.train()

    train_loss = 0
    correct = 0
    total = 0

    epoch_start = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        start = time.time()

        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        acc = 100 * correct / total

        batch_time = time.time() - start

        if batch_idx % 20 == 0:
            # batch_idx counts batches within the single training epoch
            print('Batch: [{}/{}] | loss: {:.3f} | acc: {:.3f} | batch time: {:.3f}s'.format(
                batch_idx, len(train_loader), train_loss / (batch_idx + 1), acc, batch_time))

    elapsed_time = time.time() - epoch_start
    elapsed_time = datetime.timedelta(seconds=elapsed_time)
    print("Training time {}".format(elapsed_time))


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
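The train.py scripts above run a single training epoch and never touch the CIFAR-10 test split, even though `single_gpu/model.py` defines `save`/`load` helpers. Below is a minimal evaluation sketch, not part of the original repository: it assumes it is run from `single_gpu/` (so `from model import pyramidnet` resolves), and the checkpoint path and the `evaluate` function name are illustrative.

```python
# evaluate.py -- minimal sketch, not part of the original repository.
import torch
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

from model import pyramidnet  # single_gpu/model.py


def evaluate(checkpoint_path, batch_size=256):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # No augmentation at test time; same normalization as the training scripts.
    transforms_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

    dataset_test = CIFAR10(root='../data', train=False, download=True,
                           transform=transforms_test)
    test_loader = DataLoader(dataset_test, batch_size=batch_size,
                             shuffle=False, num_workers=4)

    net = pyramidnet()
    # PyramidNet.load() would also work; load_state_dict with map_location is
    # used here so CPU-only machines can still run the evaluation.
    net.load_state_dict(torch.load(checkpoint_path, map_location=device))
    net = net.to(device)
    net.eval()

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    print('Test accuracy: {:.2f}%'.format(100 * correct / total))


if __name__ == '__main__':
    evaluate('logs/model-201901010000-0.pth')  # hypothetical checkpoint name
```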
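Note also that `dist_parallel/train.py` makes a single pass over the training set, so it never needs `DistributedSampler.set_epoch()`. If the loop were extended to multiple epochs, the sampler has to be told the epoch number, otherwise every process reuses the same shuffle order each epoch. A sketch of such an extension, reusing the objects created in `main_worker()` (the epoch count and checkpoint filename are assumptions, not part of the repository):

```python
# Sketch: multi-epoch variant of the call at the end of main_worker() in
# dist_parallel/train.py. Assumes net, criterion, optimizer, train_loader,
# train_sampler and args are already set up as in that function.
num_epochs = 90  # assumed value

for epoch in range(num_epochs):
    # Reseed the per-process shard; without this, DistributedSampler yields
    # the same ordering every epoch.
    train_sampler.set_epoch(epoch)
    train(net, criterion, optimizer, train_loader, args.gpu)

    # Usually only rank 0 writes checkpoints so processes do not clobber
    # each other's files; the filename is illustrative.
    if args.rank == 0:
        torch.save(net.module.state_dict(), 'pyramidnet_ddp.pth')
```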