├── Chapter02
│   ├── main.py
│   ├── my_net.py
│   ├── parameter_server.py
│   └── worker.py
├── Chapter03
│   ├── ddp
│   │   ├── main.py
│   │   └── my_net.py
│   └── dp
│       ├── main.py
│       └── my_net.py
├── Chapter07
│   ├── main.py
│   └── my_net.py
├── LICENSE
└── README.md

--------------------------------------------------------------------------------
/Chapter02/main.py:
--------------------------------------------------------------------------------
import torch
from torchvision import datasets, transforms

from my_net import *
from worker import *
from parameter_server import *

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./mnist_data', download=True, train=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))])),
    batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./mnist_data', download=True, train=False,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))])),
    batch_size=128, shuffle=True)

def main():
    ps = ParameterServer()
    worker = Worker()

    # One synchronous round per batch: the worker pulls the latest weights,
    # computes gradients on its batch, and the server applies the update.
    for batch_idx, (data, target) in enumerate(train_loader):
        params = ps.get_weights()
        worker.pull_weights(params)
        grads = worker.push_gradients(batch_idx, data, target)
        ps.update_model(grads)
    print("Done Training")

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/Chapter02/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        if torch.cuda.is_available():
            device = torch.device("cuda:0")
        else:
            device = torch.device("cpu")
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.dropout1 = nn.Dropout2d(0.5).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        self.dropout2 = nn.Dropout2d(0.75).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 20).to(device)
        self.fc3 = nn.Linear(20, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/Chapter02/parameter_server.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from my_net import *

class ParameterServer(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = MyNet()

        if torch.cuda.is_available():
            self.input_device = torch.device("cuda:0")
        else:
            self.input_device = torch.device("cpu")

        self.optimizer = optim.SGD(self.model.parameters(), lr=0.05)

    def get_weights(self):
        return self.model.state_dict()

    def update_model(self, grads):
        # Install the worker's gradients on the server's parameters,
        # then take one optimizer step.
        for para, grad in zip(self.model.parameters(), grads):
            para.grad = grad
        self.optimizer.step()
        self.optimizer.zero_grad()

--------------------------------------------------------------------------------
/Chapter02/worker.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

from my_net import *

class Worker(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = MyNet()
        if torch.cuda.is_available():
            self.input_device = torch.device("cuda:0")
        else:
            self.input_device = torch.device("cpu")

    def pull_weights(self, model_params):
        # Overwrite the local replica with the server's latest weights.
        self.model.load_state_dict(model_params)

    def push_gradients(self, batch_idx, data, target):
        data, target = data.to(self.input_device), target.to(self.input_device)
        # Clear gradients left over from the previous batch so they do not
        # accumulate across rounds.
        self.model.zero_grad()
        output = self.model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        grads = []
        for layer in self.parameters():
            grads.append(layer.grad)
        print(f"batch {batch_idx} training :: loss {loss.item()}")
        return grads
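Chapter02/main.py above drives a single worker, but the same pull/push round extends naturally to several workers whose gradients the server averages before stepping. The sketch below is an illustration of that extension, not part of the repo; multi_worker_round, its arguments, and the batch-distribution logic are hypothetical.

import torch
from parameter_server import *
from worker import *

def multi_worker_round(ps, workers, batches):
    # batches: one (batch_idx, data, target) tuple per worker.
    params = ps.get_weights()
    all_grads = []
    for worker, (batch_idx, data, target) in zip(workers, batches):
        worker.pull_weights(params)
        all_grads.append(worker.push_gradients(batch_idx, data, target))
    # Average each layer's gradient across workers before updating.
    avg_grads = [torch.stack(gs).mean(dim=0) for gs in zip(*all_grads)]
    ps.update_model(avg_grads)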
--------------------------------------------------------------------------------
/Chapter03/ddp/main.py:
--------------------------------------------------------------------------------
import argparse
import os
import datetime

import torch
from my_net import *
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision import datasets, transforms
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler as DDP_sampler

train_all_set = datasets.MNIST('./mnist_data', download=True, train=True,
                               transform=transforms.Compose([
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.1307,), (0.3081,))]))
train_set, val_set = torch.utils.data.random_split(train_all_set,
                                                   [50000, 10000])

test_set = datasets.MNIST('./mnist_data', download=True, train=False,
                          transform=transforms.Compose([
                              transforms.ToTensor(),
                              transforms.Normalize((0.1307,), (0.3081,))]))

def net_setup():
    # MASTER_ADDR must point at the machine that runs rank 0; the IP below
    # is an example and has to be changed for your own cluster.
    os.environ['MASTER_ADDR'] = '172.31.26.15'
    os.environ['MASTER_PORT'] = '12345'

def checkpointing(rank, epoch, net, optimizer, loss):
    path = f"model{rank}.pt"
    torch.save({
        'epoch': epoch,
        'model_state': net.state_dict(),
        'loss': loss,
        'optim_state': optimizer.state_dict(),
    }, path)
    print(f"Checkpointing model {rank} done.")

def load_checkpoint(rank, machines):
    path = f"model{rank}.pt"
    checkpoint = torch.load(path)
    # DataParallel uses the same "module." key prefix as DDP, so the DDP
    # checkpoint loads directly into this wrapper.
    model = torch.nn.DataParallel(MyNet(), device_ids=[rank % machines])
    optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)

    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    model.load_state_dict(checkpoint['model_state'])
    optimizer.load_state_dict(checkpoint['optim_state'])
    return model, optimizer, epoch, loss

def validation(model, val_set):
    model.eval()
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=128)
    correct_total = 0
    with torch.no_grad():
        for idx, (data, target) in enumerate(val_loader):
            data, target = data.cuda(), target.cuda()
            output = model(data)
            predict = output.argmax(dim=1, keepdim=True)
            correct = predict.eq(target.view_as(predict)).sum().item()
            correct_total += correct
    acc = correct_total / len(val_loader.dataset)
    print(f"Validation Accuracy {acc}")

def train(local_rank, args):
    torch.manual_seed(123)
    world_size = args.machines * args.gpus
    rank = args.mid * args.gpus + local_rank
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            timeout=datetime.timedelta(seconds=60))

    torch.cuda.set_device(local_rank)
    model = MyNet()
    # Each process trains on its own shard of the training set.
    local_train_sampler = DDP_sampler(train_all_set, rank=rank,
                                      num_replicas=world_size)
    local_train_loader = torch.utils.data.DataLoader(train_all_set,
                                                     batch_size=128,
                                                     shuffle=False,
                                                     sampler=local_train_sampler)

    optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)
    model = DDP(model, device_ids=[local_rank])

    for epoch in range(args.epochs):
        print(f"Epoch {epoch}")
        for idx, (data, target) in enumerate(local_train_loader):
            data = data.cuda()
            target = target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            print(f"batch {idx} training :: loss {loss.item()}")
        checkpointing(rank, epoch, model, optimizer, loss.item())
        validation(model, val_set)
    print("Training Done!")
    dist.destroy_process_group()

def test(local_rank, args):
    world_size = args.machines * args.gpus
    rank = args.mid * args.gpus + local_rank
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            timeout=datetime.timedelta(seconds=60))

    torch.cuda.set_device(local_rank)
    print(f"Load checkpoint {rank}")
    model, optimizer, epoch, loss = load_checkpoint(rank, args.machines)
    print("Checkpoint loading done!")

    local_test_sampler = DDP_sampler(test_set, rank=rank,
                                     num_replicas=world_size)

    model.eval()
    local_test_loader = torch.utils.data.DataLoader(test_set,
                                                    batch_size=128,
                                                    shuffle=False,
                                                    sampler=local_test_sampler)
    correct_total = 0
    with torch.no_grad():
        for idx, (data, target) in enumerate(local_test_loader):
            output = model(data)
            predict = output.argmax(dim=1, keepdim=True).cuda()
            target = target.cuda()
            correct = predict.eq(target.view_as(predict)).sum().item()
            correct_total += correct
    acc = correct_total / len(local_test_loader.dataset)
    print(f"GPU {rank}, Test Accuracy {acc}")
    print("Test Done!")
    dist.destroy_process_group()

def main():
    parser = argparse.ArgumentParser(description='distributed data parallel training')
    parser.add_argument('-m', '--machines', default=2, type=int, help='number of machines')
    parser.add_argument('-g', '--gpus', default=4, type=int, help='number of GPUs in a machine')
    parser.add_argument('-id', '--mid', default=0, type=int, help='machine id number')
    parser.add_argument('-e', '--epochs', default=10, type=int, help='number of epochs')
    args = parser.parse_args()
    net_setup()
    mp.spawn(train, nprocs=args.gpus, args=(args,), join=True)
    mp.spawn(test, nprocs=args.gpus, args=(args,), join=True)

if __name__ == '__main__':
    main()
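A note on shuffling: DistributedSampler derives its shuffle seed from the current epoch, so without further calls every epoch visits the samples in the same order. A minimal adjustment (not in the original file) is to reseed the sampler at the top of each epoch:

for epoch in range(args.epochs):
    local_train_sampler.set_epoch(epoch)  # reseed this replica's shuffle
    for idx, (data, target) in enumerate(local_train_loader):
        ...

With the defaults above, the script would be launched once per machine, for example `python main.py -m 2 -g 4 -id 0` on the first machine and `python main.py -m 2 -g 4 -id 1` on the second, after pointing MASTER_ADDR at the first machine's IP.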
--------------------------------------------------------------------------------
/Chapter03/ddp/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.dropout1 = nn.Dropout2d(0.5).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        self.dropout2 = nn.Dropout2d(0.75).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 20).to(device)
        self.fc3 = nn.Linear(20, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/Chapter03/dp/main.py:
--------------------------------------------------------------------------------
import torch
from torch.utils.data import DataLoader
from my_net import *

from torchvision import datasets, transforms

train_set = datasets.MNIST('./mnist_data', download=True, train=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))]))

test_set = datasets.MNIST('./mnist_data', download=True, train=False,
                          transform=transforms.Compose([
                              transforms.ToTensor(),
                              transforms.Normalize((0.1307,), (0.3081,))]))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader = DataLoader(train_set, batch_size=128, shuffle=True, pin_memory=True)

train_epoch = 2

def main():
    model = MyNet()
    print("Using", torch.cuda.device_count(), "GPUs for data parallel training")
    optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)
    # DataParallel replicates the model on every visible GPU and splits
    # each input batch across the replicas.
    model = nn.DataParallel(model)
    model.to(device)
    # Training
    for epoch in range(train_epoch):
        print(f"Epoch {epoch}")
        for idx, (data, target) in enumerate(train_loader):
            data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            print(f"batch {idx}, loss {loss.item()}")
    print("Training Done!")

if __name__ == '__main__':
    main()
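Because nn.DataParallel stores the real network under its module attribute, checkpoints are usually taken from the unwrapped model so they can be reloaded without the wrapper. A short sketch of that pattern (the filename dp_model.pt is illustrative, not from the repo):

# Save the unwrapped weights so the checkpoint loads into a plain MyNet.
torch.save(model.module.state_dict(), "dp_model.pt")

# Reload without the DataParallel wrapper:
restored = MyNet()
restored.load_state_dict(torch.load("dp_model.pt"))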
--------------------------------------------------------------------------------
/Chapter03/dp/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.dropout1 = nn.Dropout2d(0.5).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        self.dropout2 = nn.Dropout2d(0.75).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 20).to(device)
        self.fc3 = nn.Linear(20, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/Chapter07/main.py:
--------------------------------------------------------------------------------
import argparse

import torch
from my_net import *
from torchvision import datasets, transforms

def train(args):
    model = MyNet()
    model.train()
    trainset = datasets.MNIST('./mnist_data', download=True, train=True,
                              transform=transforms.Compose([
                                  transforms.ToTensor(),
                                  transforms.Normalize((0.1307,), (0.3081,))]))
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                              shuffle=True, num_workers=4)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    for epoch in range(args.epochs):
        print(f"Epoch {epoch}")
        for idx, (data, target) in enumerate(trainloader):
            # The first model stage lives on cuda:0, so inputs go there;
            # the model returns its output from the last stage's device.
            data = data.to('cuda:0')
            optimizer.zero_grad()
            output = model(data)
            target = target.to(output.device)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            print(f"batch {idx} training :: loss {loss.item()}")
    print("Training Done!")
    return model

def test(args, model):
    model.eval()
    testset = datasets.MNIST('./mnist_data', download=True, train=False,
                             transform=transforms.Compose([
                                 transforms.ToTensor(),
                                 transforms.Normalize((0.1307,), (0.3081,))]))
    testloader = torch.utils.data.DataLoader(testset, batch_size=128,
                                             shuffle=False, num_workers=4)
    correct_total = 0
    with torch.no_grad():
        for idx, (data, target) in enumerate(testloader):
            output = model(data.to('cuda:0'))
            predict = output.argmax(dim=1, keepdim=True)
            target = target.to(output.device)
            correct = predict.eq(target.view_as(predict)).sum().item()
            correct_total += correct
    acc = correct_total / len(testloader.dataset)
    print(f"Test Accuracy {acc}")
    print("Test Done!")

def main():
    parser = argparse.ArgumentParser(description='model parallel training')
    parser.add_argument('-e', '--epochs', default=4, type=int, help='number of epochs')
    args = parser.parse_args()
    trained_model = train(args)
    test(args, trained_model)

if __name__ == '__main__':
    main()
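Since my_net.py (next file) pins its layers to cuda:0 and cuda:2 and routes activations through cuda:1, this chapter's code assumes at least three visible GPUs. A defensive check along these lines (an illustrative addition, not in the repo) would fail fast on smaller machines:

import torch
assert torch.cuda.device_count() >= 3, "Chapter07's MyNet expects at least 3 GPUs"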
--------------------------------------------------------------------------------
/Chapter07/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        # Stage 1: convolutional layers on the first GPU.
        self.seq1 = nn.Sequential(
            nn.Conv2d(1, 32, 3, 1),
            nn.Dropout2d(0.5),
            nn.Conv2d(32, 64, 3, 1),
            nn.Dropout2d(0.75)).to('cuda:0')
        # Stage 2: fully connected layers on the third GPU.
        self.seq2 = nn.Sequential(
            nn.Linear(9216, 128),
            nn.Linear(128, 20),
            nn.Linear(20, 10)).to('cuda:2')

    def forward(self, x):
        x = self.seq1(x.to('cuda:0'))
        # The pooled activations hop through cuda:1 on their way from
        # stage 1 to stage 2; every cross-GPU move is explicit.
        x = F.max_pool2d(x, 2).to('cuda:1')
        x = torch.flatten(x, 1).to('cuda:1')
        x = self.seq2(x.to('cuda:2'))
        output = F.log_softmax(x, dim=1)
        return output
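To make the placement pattern concrete, here is a reduced two-device variant of the same idea (a sketch under the assumption that only cuda:0 and cuda:1 are available; TwoStageNet is not part of the repo):

import torch
import torch.nn as nn
import torch.nn.functional as F

class TwoStageNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Stage 1: convolutions on the first GPU.
        self.stage1 = nn.Sequential(
            nn.Conv2d(1, 32, 3, 1),
            nn.Conv2d(32, 64, 3, 1)).to('cuda:0')
        # Stage 2: classifier on the second GPU.
        self.stage2 = nn.Linear(9216, 10).to('cuda:1')

    def forward(self, x):
        x = self.stage1(x.to('cuda:0'))
        x = torch.flatten(F.max_pool2d(x, 2), 1)
        # Move activations across the GPU boundary explicitly.
        return F.log_softmax(self.stage2(x.to('cuda:1')), dim=1)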
[](https://packt.link/JGIEY)
5 | 3 Days, 20+ AI Experts, 25+ Workshops and Power Talks 6 | 7 | Code: USD75OFF 8 | 9 | 10 | 11 | 12 | # Distributed Machine Learning with Python 13 | 14 |