├── Chapter02
│   ├── main.py
│   ├── my_net.py
│   ├── parameter_server.py
│   └── worker.py
├── Chapter03
│   ├── ddp
│   │   ├── main.py
│   │   └── my_net.py
│   └── dp
│       ├── main.py
│       └── my_net.py
├── Chapter07
│   ├── main.py
│   └── my_net.py
├── LICENSE
└── README.md

/Chapter02/main.py:
--------------------------------------------------------------------------------
import torch
from torchvision import datasets, transforms

from my_net import *
from worker import *
from parameter_server import *

train_loader = torch.utils.data.DataLoader(datasets.MNIST('./mnist_data', download=True, train=True,
                                           transform=transforms.Compose([transforms.ToTensor(),
                                                                         transforms.Normalize((0.1307,), (0.3081,))])),
                                           batch_size=128, shuffle=True)
# Kept for experimentation; shuffling is unnecessary for evaluation.
test_loader = torch.utils.data.DataLoader(datasets.MNIST('./mnist_data', download=True, train=False,
                                          transform=transforms.Compose([transforms.ToTensor(),
                                                                        transforms.Normalize((0.1307,), (0.3081,))])),
                                          batch_size=128, shuffle=False)

def main():
    ps = ParameterServer()
    worker = Worker()

    # One synchronous round per batch: the worker pulls the latest weights,
    # computes gradients on the batch, and the parameter server applies them.
    for batch_idx, (data, target) in enumerate(train_loader):
        params = ps.get_weights()
        worker.pull_weights(params)
        grads = worker.push_gradients(batch_idx, data, target)
        ps.update_model(grads)
    print("Done Training")

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/Chapter02/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        if torch.cuda.is_available():
            device = torch.device("cuda:0")
        else:
            device = torch.device("cpu")
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.dropout1 = nn.Dropout2d(0.5).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        self.dropout2 = nn.Dropout2d(0.75).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 20).to(device)
        self.fc3 = nn.Linear(20, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/Chapter02/parameter_server.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from my_net import *

class ParameterServer(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = MyNet()

        if torch.cuda.is_available():
            self.input_device = torch.device("cuda:0")
        else:
            self.input_device = torch.device("cpu")

        self.optimizer = optim.SGD(self.model.parameters(), lr=0.05)

    def get_weights(self):
        return self.model.state_dict()

    def update_model(self, grads):
        # Install the worker-computed gradients on the server's parameter
        # copies, then take one SGD step.
        for para, grad in zip(self.model.parameters(), grads):
            para.grad = grad
        self.optimizer.step()
        self.optimizer.zero_grad()

--------------------------------------------------------------------------------
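For intuition, the update that `ParameterServer.update_model` applies is plain SGD, para ← para − lr · grad with lr = 0.05. A minimal standalone sketch with made-up values (not part of the repository files):

```
import torch

# The SGD rule behind update_model: para <- para - lr * grad, lr = 0.05 as above.
lr = 0.05
para = torch.tensor([1.0, 2.0])   # hypothetical parameter values
grad = torch.tensor([0.5, -0.5])  # hypothetical gradient from a worker
para -= lr * grad
print(para)  # tensor([0.9750, 2.0250])
```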
/Chapter02/worker.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

from my_net import *

class Worker(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = MyNet()
        if torch.cuda.is_available():
            self.input_device = torch.device("cuda:0")
        else:
            self.input_device = torch.device("cpu")

    def pull_weights(self, model_params):
        self.model.load_state_dict(model_params)

    def push_gradients(self, batch_idx, data, target):
        data, target = data.to(self.input_device), target.to(self.input_device)
        # Clear gradients left over from the previous batch: load_state_dict
        # refreshes the weights but does not reset .grad.
        self.model.zero_grad()
        output = self.model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        grads = []
        for para in self.parameters():
            grads.append(para.grad)
        print(f"batch {batch_idx} training :: loss {loss.item()}")
        return grads

--------------------------------------------------------------------------------
/Chapter03/ddp/main.py:
--------------------------------------------------------------------------------
import argparse
import os
import datetime

import torch
import torch.nn.functional as F
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision import datasets, transforms
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler as DDP_sampler

from my_net import *

train_all_set = datasets.MNIST('./mnist_data', download=True, train=True,
                               transform=transforms.Compose([transforms.ToTensor(),
                                                             transforms.Normalize((0.1307,), (0.3081,))]))
train_set, val_set = torch.utils.data.random_split(train_all_set,
                                                   [50000, 10000])

test_set = datasets.MNIST('./mnist_data', download=True, train=False,
                          transform=transforms.Compose([transforms.ToTensor(),
                                                        transforms.Normalize((0.1307,), (0.3081,))]))

def net_setup():
    # Rendezvous address and port; MASTER_ADDR must be the master node's IP.
    os.environ['MASTER_ADDR'] = '172.31.26.15'
    os.environ['MASTER_PORT'] = '12345'

def checkpointing(rank, epoch, net, optimizer, loss):
    path = f"model{rank}.pt"
    torch.save({
        'epoch': epoch,
        'model_state': net.state_dict(),
        'loss': loss,
        'optim_state': optimizer.state_dict(),
    }, path)
    print(f"Checkpointing model {rank} done.")

def load_checkpoint(rank, gpus):
    path = f"model{rank}.pt"
    checkpoint = torch.load(path)
    # rank % gpus recovers this process's local GPU index on its machine.
    model = torch.nn.DataParallel(MyNet(), device_ids=[rank % gpus])
    optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)

    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    model.load_state_dict(checkpoint['model_state'])
    optimizer.load_state_dict(checkpoint['optim_state'])
    return model, optimizer, epoch, loss

def validation(model, val_set):
    model.eval()
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=128)
    correct_total = 0
    with torch.no_grad():
        for idx, (data, target) in enumerate(val_loader):
            data, target = data.cuda(), target.cuda()
            output = model(data)
            predict = output.argmax(dim=1, keepdim=True)
            correct = predict.eq(target.view_as(predict)).sum().item()
            correct_total += correct
    acc = correct_total / len(val_loader.dataset)
    print(f"Validation Accuracy {acc}")

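# A side note on the checkpoint round-trip above: checkpointing() saves the
# state_dict of the DDP-wrapped model, whose parameter keys carry a "module."
# prefix. load_checkpoint() wraps MyNet in DataParallel, which uses the same
# "module." prefix, so the saved keys line up when load_state_dict() is called.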
def train(local_rank, args):
    torch.manual_seed(123)
    world_size = args.machines * args.gpus
    rank = args.mid * args.gpus + local_rank
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            timeout=datetime.timedelta(seconds=60))

    torch.cuda.set_device(local_rank)
    model = MyNet()
    local_train_sampler = DDP_sampler(train_all_set, rank=rank, num_replicas=world_size)
    local_train_loader = torch.utils.data.DataLoader(train_all_set,
                                                     batch_size=128,
                                                     shuffle=False,
                                                     sampler=local_train_sampler)

    optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)
    model = DDP(model, device_ids=[local_rank])

    for epoch in range(args.epochs):
        print(f"Epoch {epoch}")
        # Reshuffle each rank's partition of the data every epoch.
        local_train_sampler.set_epoch(epoch)
        for idx, (data, target) in enumerate(local_train_loader):
            data = data.cuda()
            target = target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            print(f"batch {idx} training :: loss {loss.item()}")
        checkpointing(rank, epoch, model, optimizer, loss.item())
        validation(model, val_set)
    print("Training Done!")
    dist.destroy_process_group()

def test(local_rank, args):
    world_size = args.machines * args.gpus
    rank = args.mid * args.gpus + local_rank
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            timeout=datetime.timedelta(seconds=60))

    torch.cuda.set_device(local_rank)
    print(f"Load checkpoint {rank}")
    model, optimizer, epoch, loss = load_checkpoint(rank, args.gpus)
    print("Checkpoint loading done!")

    local_test_sampler = DDP_sampler(test_set, rank=rank, num_replicas=world_size)

    model.eval()
    local_test_loader = torch.utils.data.DataLoader(test_set,
                                                    batch_size=128,
                                                    shuffle=False,
                                                    sampler=local_test_sampler)
    correct_total = 0
    with torch.no_grad():
        for idx, (data, target) in enumerate(local_test_loader):
            data, target = data.cuda(), target.cuda()
            output = model(data)
            predict = output.argmax(dim=1, keepdim=True)
            correct = predict.eq(target.view_as(predict)).sum().item()
            correct_total += correct
    # Each rank only sees its own shard, so normalize by the shard size.
    acc = correct_total / len(local_test_sampler)
    print(f"GPU {rank}, Test Accuracy {acc}")
    print("Test Done!")
    dist.destroy_process_group()

def main():
    parser = argparse.ArgumentParser(description='distributed data parallel training')
    parser.add_argument('-m', '--machines', default=2, type=int, help='number of machines')
    parser.add_argument('-g', '--gpus', default=4, type=int, help='number of GPUs in a machine')
    parser.add_argument('-id', '--mid', default=0, type=int, help='machine id number')
    parser.add_argument('-e', '--epochs', default=10, type=int, help='number of epochs')
    args = parser.parse_args()
    net_setup()
    mp.spawn(train, nprocs=args.gpus, args=(args,), join=True)
    mp.spawn(test, nprocs=args.gpus, args=(args,), join=True)

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
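A quick standalone illustration of the rank arithmetic that `train()` and `test()` use (the 2-machine, 4-GPU numbers are just the script's argparse defaults): each machine runs the script once with its own `--mid`, and every spawned process derives a unique global rank from the machine id and its local GPU index.

```
# Sketch of the rank layout for the default 2 machines x 4 GPUs.
machines, gpus = 2, 4
world_size = machines * gpus            # 8 processes in total
for mid in range(machines):
    for local_rank in range(gpus):
        rank = mid * gpus + local_rank  # global rank in [0, world_size)
        print(f"machine {mid}, GPU {local_rank} -> rank {rank}/{world_size}")
```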
/Chapter03/ddp/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.dropout1 = nn.Dropout2d(0.5).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        self.dropout2 = nn.Dropout2d(0.75).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 20).to(device)
        self.fc3 = nn.Linear(20, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/Chapter03/dp/main.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torchvision import datasets, transforms

from my_net import *

train_set = datasets.MNIST('./mnist_data', download=True, train=True,
                           transform=transforms.Compose([transforms.ToTensor(),
                                                         transforms.Normalize((0.1307,), (0.3081,))]))

test_set = datasets.MNIST('./mnist_data', download=True, train=False,
                          transform=transforms.Compose([transforms.ToTensor(),
                                                        transforms.Normalize((0.1307,), (0.3081,))]))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader = DataLoader(train_set, batch_size=128, shuffle=True, pin_memory=True)

train_epoch = 2

def main():
    model = MyNet()
    print("Using ", torch.cuda.device_count(), "GPUs for data parallel training")
    optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)
    model = nn.DataParallel(model)
    model.to(device)
    # Training
    for epoch in range(train_epoch):
        print(f"Epoch {epoch}")
        for idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            print(f"batch {idx}, loss {loss.item()}")
    print("Training Done!")

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
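As an aside, the `nn.DataParallel` wrapper used above splits each input batch across the visible GPUs, replicates the module onto them for the forward pass, and gathers the outputs back. A minimal self-contained sketch of that behavior (a toy linear layer, not part of this repository; it falls back to a plain forward pass when no GPU is present):

```
import torch
import torch.nn as nn

# Toy DataParallel round-trip: a batch of 8 is split across available GPUs
# (or run directly on CPU when none are visible).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(nn.Linear(4, 2)).to(device)
x = torch.randn(8, 4).to(device)
print(model(x).shape)  # torch.Size([8, 2])
```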
/Chapter03/dp/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.dropout1 = nn.Dropout2d(0.5).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        self.dropout2 = nn.Dropout2d(0.75).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 20).to(device)
        self.fc3 = nn.Linear(20, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/Chapter07/main.py:
--------------------------------------------------------------------------------
import argparse

import torch
import torch.nn.functional as F
from torchvision import datasets, transforms

from my_net import *

train_all_set = datasets.MNIST('./mnist_data', download=True, train=True,
                               transform=transforms.Compose([transforms.ToTensor(),
                                                             transforms.Normalize((0.1307,), (0.3081,))]))
train_set, val_set = torch.utils.data.random_split(train_all_set,
                                                   [50000, 10000])

test_set = datasets.MNIST('./mnist_data', download=True, train=False,
                          transform=transforms.Compose([transforms.ToTensor(),
                                                        transforms.Normalize((0.1307,), (0.3081,))]))

def train(args):
    model = MyNet()
    model.train()
    trainloader = torch.utils.data.DataLoader(train_all_set, batch_size=128, shuffle=True, num_workers=4)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    for epoch in range(args.epochs):
        print(f"Epoch {epoch}")
        for idx, (data, target) in enumerate(trainloader):
            # Inputs enter the model-parallel pipeline on its first stage.
            data = data.to('cuda:0')
            optimizer.zero_grad()
            output = model(data)
            # The loss is computed on whichever device holds the final stage.
            target = target.to(output.device)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            print(f"batch {idx} training :: loss {loss.item()}")
    print("Training Done!")
    return model

def test(args, model):
    model.eval()
    testloader = torch.utils.data.DataLoader(test_set, batch_size=128, shuffle=False, num_workers=4)
    correct_total = 0
    with torch.no_grad():
        for idx, (data, target) in enumerate(testloader):
            output = model(data.to('cuda:0'))
            predict = output.argmax(dim=1, keepdim=True)
            target = target.to(output.device)
            correct = predict.eq(target.view_as(predict)).sum().item()
            correct_total += correct
    acc = correct_total / len(testloader.dataset)
    print(f"Test Accuracy {acc}")
    print("Test Done!")

def main():
    parser = argparse.ArgumentParser(description='model parallel training')
    parser.add_argument('-e', '--epochs', default=4, type=int, help='number of epochs')
    args = parser.parse_args()
    trained_model = train(args)
    test(args, trained_model)

if __name__ == '__main__':
    main()
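# Note on device requirements: MyNet in my_net.py pins seq1 to cuda:0, the
# pooling/flatten stage to cuda:1, and seq2 to cuda:2, so running this script
# as-is assumes a machine with at least three visible GPUs.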
--------------------------------------------------------------------------------
/Chapter07/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        # First (convolutional) pipeline stage, placed on GPU 0.
        self.seq1 = nn.Sequential(
            nn.Conv2d(1, 32, 3, 1),
            nn.Dropout2d(0.5),
            nn.Conv2d(32, 64, 3, 1),
            nn.Dropout2d(0.75)).to('cuda:0')
        # Final (fully connected) pipeline stage, placed on GPU 2.
        self.seq2 = nn.Sequential(
            nn.Linear(9216, 128),
            nn.Linear(128, 20),
            nn.Linear(20, 10)).to('cuda:2')

    def forward(self, x):
        x = self.seq1(x.to('cuda:0'))
        # Pooling and flattening are staged on GPU 1 between the two blocks.
        x = F.max_pool2d(x, 2).to('cuda:1')
        x = torch.flatten(x, 1)
        x = self.seq2(x.to('cuda:2'))
        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

### [Packt Conference : Put Generative AI to work on Oct 11-13 (Virtual)](https://packt.link/JGIEY)

[![Packt Conference](https://hub.packtpub.com/wp-content/uploads/2023/08/put-generative-ai-to-work-packt.png)](https://packt.link/JGIEY)

3 Days, 20+ AI Experts, 25+ Workshops and Power Talks

Code: USD75OFF

# Distributed Machine Learning with Python

This is the code repository for [Distributed Machine Learning with Python](https://www.packtpub.com/product/distributed-machine-learning-with-python/9781801815697?utm_source=github&utm_medium=repository&utm_campaign=9781801815697), published by Packt.

**Accelerating model training and serving with distributed systems**

## What is this book about?
Reducing the time cost of machine learning leads to a shorter waiting time for model training and a faster model updating cycle. Distributed machine learning enables machine learning practitioners to shorten model training and inference time by orders of magnitude.

This book covers the following exciting features:
* Deploy distributed model training and serving pipelines
* Get to grips with the advanced features in TensorFlow and PyTorch
* Mitigate system bottlenecks during in-parallel model training and serving
* Discover the latest techniques on top of classical parallelism paradigms
* Explore advanced features in Megatron-LM and Mesh-TensorFlow
* Use state-of-the-art hardware such as NVLink, NVSwitch, and GPUs

If you feel this book is for you, get your [copy](https://www.amazon.com/dp/B09NC5XJ6D) today!

## Instructions and Navigations
All of the code is organized into folders.

The code will look like the following:
```
# Connect to API through subscription key and endpoint
subscription_key = "<subscription key>"
endpoint = "https://<endpoint>.cognitiveservices.azure.com/"
# Authenticate
credential = AzureKeyCredential(subscription_key)
cog_client = TextAnalyticsClient(endpoint=endpoint,
                                 credential=credential)
```

**Following is what you need for this book:**
This book is for data scientists, machine learning engineers, and ML practitioners in both academia and industry. A fundamental understanding of machine learning concepts and working knowledge of Python programming is assumed. Prior experience implementing ML/DL models with TensorFlow or PyTorch will be beneficial. You'll find this book useful if you are interested in using distributed systems to boost machine learning model training and serving speed.

With the following software and hardware list you can run all code files present in the book (Chapters 1-12).

### Software and Hardware List

| Chapter | Software required | OS required                        |
| ------- | ----------------- | ---------------------------------- |
| 1-12    | PyTorch           | Windows, Mac OS X, and Linux (Any) |
| 1-12    | TensorFlow        | Windows, Mac OS X, and Linux (Any) |
| 1-12    | Python            | Windows, Mac OS X, and Linux (Any) |
|         | CUDA/C            |                                    |
|         | NVprofiler/Nsight |                                    |

We assume you have Linux/Ubuntu as your operating system. We assume you use NVIDIA GPUs and have installed the proper NVIDIA driver as well. We also assume you have basic knowledge about machine learning in general and are familiar with popular deep learning models.

We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://static.packt-cdn.com/downloads/9781801815697_ColorImages.pdf).
### Related products
* Distributed Data Systems with Azure Databricks [[Packt]](https://www.packtpub.com/product/distributed-data-systems-with-azure-databricks/9781838647216?utm_source=github&utm_medium=repository&utm_campaign=9781838647216) [[Amazon]](https://www.amazon.com/dp/B0946QSSBM)

* Machine Learning with the Elastic Stack - Second Edition [[Packt]](https://www.packtpub.com/product/machine-learning-with-the-elastic-stack-second-edition/9781801070034?utm_source=github&utm_medium=repository&utm_campaign=9781801070034) [[Amazon]](https://www.amazon.com/dp/1801070032)

## Get to Know the Author
**Guanhua Wang** is a final-year computer science Ph.D. student in the RISELab at UC Berkeley, advised by Professor Ion Stoica. His research lies primarily in the machine learning systems area, including fast collective communication, efficient in-parallel model training, and real-time model serving. His research has gained lots of attention from both academia and industry. He has been invited to give talks at top-tier universities (MIT, Stanford, CMU, Princeton) and big tech companies (Facebook/Meta, Microsoft). He received his master's degree from HKUST and a bachelor's degree from Southeast University in China. He has also done some cool research on wireless networks. He likes playing soccer and has run multiple half-marathons in the Bay Area of California.

### Download a free PDF

If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.

https://packt.link/free-ebook/9781801815697

--------------------------------------------------------------------------------