├── .gitignore
├── LICENSE
├── README.md
├── mnist
│   ├── Dockerfile
│   ├── README.md
│   ├── docker-compose-gpu.yml
│   ├── docker-compose.yml
│   └── main.py
├── setup.cfg
└── toy
    ├── README.md
    └── main.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

data/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 なるみ

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PyTorch Distributed Example

If you are using a previous version of PyTorch, use the matching tag:

- [v1.1.0](https://github.com/narumiruna/pytorch-distributed-example/tree/v1.1.0)
- [v1.0.1](https://github.com/narumiruna/pytorch-distributed-example/tree/v1.0.1)
- [v0.4.1](https://github.com/narumiruna/pytorch-distributed-example/tree/v0.4.1)

## Requirements

- PyTorch
- torchvision

## References

- [Distributed communication package - torch.distributed](http://pytorch.org/docs/master/distributed.html)
- [Writing Distributed Applications with PyTorch](http://pytorch.org/tutorials/intermediate/dist_tuto.html)
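Both examples follow the same basic pattern: every process calls `init_process_group` with a shared rendezvous address, then exchanges tensors with collective operations. A minimal sketch of that pattern (assuming the `gloo` backend and two processes started by hand):

```python
import torch
import torch.distributed as dist

# Run this once per process, pointing both processes
# at the same rendezvous address.
dist.init_process_group(
    backend='gloo',
    init_method='tcp://127.0.0.1:23456',
    world_size=2,
    rank=0,  # use 1 in the second process
)

tensor = torch.tensor(1)
dist.all_reduce(tensor)  # default op is SUM: adds the tensor across all ranks
print(tensor.item())     # prints 2 on both ranks
```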
--------------------------------------------------------------------------------
/mnist/Dockerfile:
--------------------------------------------------------------------------------
FROM pytorch/pytorch:1.2-cuda10.0-cudnn7-runtime

RUN pip install torchvision \
    && rm -rf ~/.cache/pip

ENV GLOO_SOCKET_IFNAME=eth0
ENV NCCL_SOCKET_IFNAME=eth0

WORKDIR /work
RUN python -c "from torchvision import datasets;datasets.MNIST('data', download=True)"
COPY main.py .

--------------------------------------------------------------------------------
/mnist/README.md:
--------------------------------------------------------------------------------
# MNIST Example

Tell the gloo backend which network interface to use:

```shell
export GLOO_SOCKET_IFNAME=eth0
```

Rank 0
```
$ python3 main.py --init-method tcp://127.0.0.1:23456 --rank 0 --world-size 2
```

Rank 1
```
$ python3 main.py --init-method tcp://127.0.0.1:23456 --rank 1 --world-size 2
```

## Use a specific root directory when running both ranks on a single machine

This keeps the two processes from writing to the same dataset directory.

Rank 0
```
$ python3 main.py --init-method tcp://127.0.0.1:23456 --rank 0 --world-size 2 --root data0
```

Rank 1
```
$ python3 main.py --init-method tcp://127.0.0.1:23456 --rank 1 --world-size 2 --root data1
```
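Instead of starting each rank by hand, both processes can also be spawned from a single script. A hypothetical launcher (`launch_local.py` is not part of this repository), sketched with `torch.multiprocessing`:

```python
# launch_local.py -- hypothetical helper, not included in this repo.
import torch.multiprocessing as mp
from torch import distributed


def worker(rank, world_size):
    distributed.init_process_group(
        backend='gloo',
        init_method='tcp://127.0.0.1:23456',
        world_size=world_size,
        rank=rank,
    )
    # ... build the model and loaders and train, as in main.py ...
    distributed.destroy_process_group()


if __name__ == '__main__':
    world_size = 2
    # mp.spawn passes the process index as the first argument to worker.
    mp.spawn(worker, args=(world_size,), nprocs=world_size)
```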
## Run in docker

Install [docker](https://docs.docker.com/install/), [docker-compose](https://docs.docker.com/compose/install/) and [NVIDIA docker](https://github.com/NVIDIA/nvidia-docker) (if you want to run with a GPU), then build the image and start both ranks:

```
$ docker build --file Dockerfile --tag pytorch-distributed-example .
$ docker-compose up
```

For GPU:

```
$ docker-compose --file docker-compose-gpu.yml up
```

--------------------------------------------------------------------------------
/mnist/docker-compose-gpu.yml:
--------------------------------------------------------------------------------
version: "2.3"
services:
  rank0:
    image: pytorch-distributed-example
    runtime: nvidia
    networks:
      bridge:
        ipv4_address: 10.1.0.10
    command: python -u main.py --backend nccl --init-method tcp://10.1.0.10:23456 --world-size 2 --rank 0
  rank1:
    image: pytorch-distributed-example
    runtime: nvidia
    networks:
      bridge:
        ipv4_address: 10.1.0.11
    command: python -u main.py --backend nccl --init-method tcp://10.1.0.10:23456 --world-size 2 --rank 1
networks:
  bridge:
    driver: bridge
    ipam:
      config:
        - subnet: 10.1.0.0/16

--------------------------------------------------------------------------------
/mnist/docker-compose.yml:
--------------------------------------------------------------------------------
version: "2.3"
services:
  rank0:
    image: pytorch-distributed-example
    networks:
      bridge:
        ipv4_address: 10.1.0.10
    command: python -u main.py --init-method tcp://10.1.0.10:23456 --world-size 2 --rank 0
  rank1:
    image: pytorch-distributed-example
    networks:
      bridge:
        ipv4_address: 10.1.0.11
    command: python -u main.py --init-method tcp://10.1.0.10:23456 --world-size 2 --rank 1
networks:
  bridge:
    driver: bridge
    ipam:
      config:
        - subnet: 10.1.0.0/16

--------------------------------------------------------------------------------
/mnist/main.py:
--------------------------------------------------------------------------------
from __future__ import division, print_function

import argparse

import torch
import torch.nn.functional as F
from torch import distributed, nn
from torch.utils import data
from torchvision import datasets, transforms


def distributed_is_initialized():
    return distributed.is_available() and distributed.is_initialized()


class Average(object):
    """Running average of a scalar, weighted by batch size."""

    def __init__(self):
        self.sum = 0
        self.count = 0

    def __str__(self):
        return '{:.6f}'.format(self.average)

    @property
    def average(self):
        return self.sum / self.count

    def update(self, value, number):
        self.sum += value * number
        self.count += number


class Accuracy(object):
    """Running classification accuracy."""

    def __init__(self):
        self.correct = 0
        self.count = 0

    def __str__(self):
        return '{:.2f}%'.format(self.accuracy * 100)

    @property
    def accuracy(self):
        return self.correct / self.count

    @torch.no_grad()
    def update(self, output, target):
        pred = output.argmax(dim=1)
        correct = pred.eq(target).sum().item()

        self.correct += correct
        self.count += output.size(0)
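
# NOTE: in a distributed run each process sees only its own shard of the
# data, so the Average/Accuracy meters above report per-process numbers.
# A global metric would require an explicit reduction across ranks,
# sketched here (not done in this example):
#
#     t = torch.tensor([meter.sum, meter.count], dtype=torch.float)
#     distributed.all_reduce(t)  # element-wise sum over all ranks
#     global_average = (t[0] / t[1]).item()
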

class Trainer(object):

    def __init__(self, model, optimizer, train_loader, test_loader, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.device = device

    def fit(self, epochs):
        for epoch in range(1, epochs + 1):
            train_loss, train_acc = self.train()
            test_loss, test_acc = self.evaluate()

            print(
                'Epoch: {}/{},'.format(epoch, epochs),
                'train loss: {}, train acc: {},'.format(train_loss, train_acc),
                'test loss: {}, test acc: {}.'.format(test_loss, test_acc),
            )

    def train(self):
        self.model.train()

        train_loss = Average()
        train_acc = Accuracy()

        for data, target in self.train_loader:
            data = data.to(self.device)
            target = target.to(self.device)

            output = self.model(data)
            loss = F.cross_entropy(output, target)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            train_loss.update(loss.item(), data.size(0))
            train_acc.update(output, target)

        return train_loss, train_acc

    @torch.no_grad()
    def evaluate(self):
        self.model.eval()

        test_loss = Average()
        test_acc = Accuracy()

        for data, target in self.test_loader:
            data = data.to(self.device)
            target = target.to(self.device)

            output = self.model(data)
            loss = F.cross_entropy(output, target)

            test_loss.update(loss.item(), data.size(0))
            test_acc.update(output, target)

        return test_loss, test_acc


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(784, 10)

    def forward(self, x):
        return self.fc(x.view(x.size(0), -1))


class MNISTDataLoader(data.DataLoader):

    def __init__(self, root, batch_size, train=True):
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ])

        dataset = datasets.MNIST(root, train=train, transform=transform, download=True)
        sampler = None
        if train and distributed_is_initialized():
            sampler = data.DistributedSampler(dataset)

        super(MNISTDataLoader, self).__init__(
            dataset,
            batch_size=batch_size,
            shuffle=(sampler is None),
            sampler=sampler,
        )
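
# NOTE: DistributedSampler gives each process a disjoint shard and derives
# its shuffle order from a seed based on the current epoch. A loop that
# wants a different shuffle every epoch would call
#
#     loader.sampler.set_epoch(epoch)
#
# at the start of each epoch; this example keeps the training loop minimal
# and skips that step.
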

def run(args):
    device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')

    model = Net()
    if distributed_is_initialized():
        model.to(device)
        model = nn.parallel.DistributedDataParallel(model)
    else:
        model = nn.DataParallel(model)
        model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    train_loader = MNISTDataLoader(args.root, args.batch_size, train=True)
    test_loader = MNISTDataLoader(args.root, args.batch_size, train=False)

    trainer = Trainer(model, optimizer, train_loader, test_loader, device)
    trainer.fit(args.epochs)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--backend', type=str, default='gloo', help='Name of the backend to use.')
    parser.add_argument('-i',
                        '--init-method',
                        type=str,
                        default='tcp://127.0.0.1:23456',
                        help='URL specifying how to initialize the package.')
    parser.add_argument('-s', '--world-size', type=int, default=1, help='Number of processes participating in the job.')
    parser.add_argument('-r', '--rank', type=int, default=0, help='Rank of the current process.')
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--no-cuda', action='store_true')
    parser.add_argument('-lr', '--learning-rate', type=float, default=1e-3)
    parser.add_argument('--root', type=str, default='data')
    parser.add_argument('--batch-size', type=int, default=128)
    args = parser.parse_args()
    print(args)

    if args.world_size > 1:
        distributed.init_process_group(
            backend=args.backend,
            init_method=args.init_method,
            world_size=args.world_size,
            rank=args.rank,
        )

    run(args)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[yapf]
based_on_style = google
column_limit = 120

--------------------------------------------------------------------------------
/toy/README.md:
--------------------------------------------------------------------------------
# Toy Example

Rank 0
```
$ python3 main.py --init-method tcp://127.0.0.1:23456 --rank 0 --world-size 2
```

Rank 1
```
$ python3 main.py --init-method tcp://127.0.0.1:23456 --rank 1 --world-size 2
```

--------------------------------------------------------------------------------
/toy/main.py:
--------------------------------------------------------------------------------
import argparse
from random import randint
from time import sleep

import torch
import torch.distributed as dist


def run(world_size, rank, steps):
    # Build a group containing every rank once, up front; dist.new_group is a
    # collective call and should not be repeated on every step.
    group = dist.new_group(ranks=list(range(world_size)))

    for step in range(1, steps + 1):
        # Each rank draws its own random int.
        value = randint(0, 10)

        # Sum the values from all ranks; every rank receives the result.
        tensor = torch.tensor(value, dtype=torch.int)
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)

        print('rank: {}, step: {}, value: {}, reduced sum: {}.'.format(rank, step, value, tensor.item()))

        sleep(1)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--backend', type=str, default='gloo', help='Name of the backend to use.')
    parser.add_argument(
        '-i',
        '--init-method',
        type=str,
        default='tcp://127.0.0.1:23456',
        help='URL specifying how to initialize the package.')
    parser.add_argument('-s', '--world-size', type=int, help='Number of processes participating in the job.')
    parser.add_argument('-r', '--rank', type=int, help='Rank of the current process.')
    parser.add_argument('--steps', type=int, default=20)
    args = parser.parse_args()
    print(args)

    dist.init_process_group(
        backend=args.backend,
        init_method=args.init_method,
        world_size=args.world_size,
        rank=args.rank,
    )

    run(args.world_size, args.rank, args.steps)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------