├── Chapter02
│   ├── main.py
│   ├── my_net.py
│   ├── parameter_server.py
│   └── worker.py
├── Chapter03
│   ├── ddp
│   │   ├── main.py
│   │   └── my_net.py
│   └── dp
│       ├── main.py
│       └── my_net.py
├── Chapter07
│   ├── main.py
│   └── my_net.py
├── LICENSE
└── README.md

/Chapter02/main.py:
--------------------------------------------------------------------------------
import torch
from torchvision import datasets, transforms

from my_net import *
from worker import *
from parameter_server import *

train_loader = torch.utils.data.DataLoader(datasets.MNIST('./mnist_data', download=True, train=True,
                                           transform=transforms.Compose([transforms.ToTensor(),
                                                                         transforms.Normalize((0.1307,), (0.3081,))])),
                                           batch_size=128, shuffle=True)
# Kept for experimentation; shuffling is unnecessary for evaluation.
test_loader = torch.utils.data.DataLoader(datasets.MNIST('./mnist_data', download=True, train=False,
                                          transform=transforms.Compose([transforms.ToTensor(),
                                                                        transforms.Normalize((0.1307,), (0.3081,))])),
                                          batch_size=128, shuffle=False)

def main():
    ps = ParameterServer()
    worker = Worker()

    # One synchronous round per batch: the worker pulls the latest weights,
    # computes gradients on the batch, and the parameter server applies them.
    for batch_idx, (data, target) in enumerate(train_loader):
        params = ps.get_weights()
        worker.pull_weights(params)
        grads = worker.push_gradients(batch_idx, data, target)
        ps.update_model(grads)
    print("Done Training")

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/Chapter02/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        if torch.cuda.is_available():
            device = torch.device("cuda:0")
        else:
            device = torch.device("cpu")
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.dropout1 = nn.Dropout2d(0.5).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        self.dropout2 = nn.Dropout2d(0.75).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 20).to(device)
        self.fc3 = nn.Linear(20, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/Chapter02/parameter_server.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from my_net import *

class ParameterServer(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = MyNet()

        if torch.cuda.is_available():
            self.input_device = torch.device("cuda:0")
        else:
            self.input_device = torch.device("cpu")

        self.optimizer = optim.SGD(self.model.parameters(), lr=0.05)

    def get_weights(self):
        return self.model.state_dict()

    def update_model(self, grads):
        # Install the worker-computed gradients on the server's parameter
        # copies, then take one SGD step.
        for para, grad in zip(self.model.parameters(), grads):
            para.grad = grad
        self.optimizer.step()
        self.optimizer.zero_grad()

--------------------------------------------------------------------------------
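For intuition, the update that `ParameterServer.update_model` applies is plain SGD, para ← para − lr · grad with lr = 0.05. A minimal standalone sketch with made-up values (not part of the repository files):

```
import torch

# The SGD rule behind update_model: para <- para - lr * grad, lr = 0.05 as above.
lr = 0.05
para = torch.tensor([1.0, 2.0])   # hypothetical parameter values
grad = torch.tensor([0.5, -0.5])  # hypothetical gradient from a worker
para -= lr * grad
print(para)  # tensor([0.9750, 2.0250])
```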
/Chapter02/worker.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

from my_net import *

class Worker(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = MyNet()
        if torch.cuda.is_available():
            self.input_device = torch.device("cuda:0")
        else:
            self.input_device = torch.device("cpu")

    def pull_weights(self, model_params):
        self.model.load_state_dict(model_params)

    def push_gradients(self, batch_idx, data, target):
        data, target = data.to(self.input_device), target.to(self.input_device)
        # Clear gradients left over from the previous batch: load_state_dict
        # refreshes the weights but does not reset .grad.
        self.model.zero_grad()
        output = self.model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        grads = []
        for para in self.parameters():
            grads.append(para.grad)
        print(f"batch {batch_idx} training :: loss {loss.item()}")
        return grads

--------------------------------------------------------------------------------
/Chapter03/ddp/main.py:
--------------------------------------------------------------------------------
import argparse
import os
import datetime

import torch
import torch.nn.functional as F
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision import datasets, transforms
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler as DDP_sampler

from my_net import *

train_all_set = datasets.MNIST('./mnist_data', download=True, train=True,
                               transform=transforms.Compose([transforms.ToTensor(),
                                                             transforms.Normalize((0.1307,), (0.3081,))]))
train_set, val_set = torch.utils.data.random_split(train_all_set,
                                                   [50000, 10000])

test_set = datasets.MNIST('./mnist_data', download=True, train=False,
                          transform=transforms.Compose([transforms.ToTensor(),
                                                        transforms.Normalize((0.1307,), (0.3081,))]))

def net_setup():
    # Rendezvous address and port; MASTER_ADDR must be the master node's IP.
    os.environ['MASTER_ADDR'] = '172.31.26.15'
    os.environ['MASTER_PORT'] = '12345'

def checkpointing(rank, epoch, net, optimizer, loss):
    path = f"model{rank}.pt"
    torch.save({
        'epoch': epoch,
        'model_state': net.state_dict(),
        'loss': loss,
        'optim_state': optimizer.state_dict(),
    }, path)
    print(f"Checkpointing model {rank} done.")

def load_checkpoint(rank, gpus):
    path = f"model{rank}.pt"
    checkpoint = torch.load(path)
    # rank % gpus recovers this process's local GPU index on its machine.
    model = torch.nn.DataParallel(MyNet(), device_ids=[rank % gpus])
    optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)

    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    model.load_state_dict(checkpoint['model_state'])
    optimizer.load_state_dict(checkpoint['optim_state'])
    return model, optimizer, epoch, loss

def validation(model, val_set):
    model.eval()
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=128)
    correct_total = 0
    with torch.no_grad():
        for idx, (data, target) in enumerate(val_loader):
            data, target = data.cuda(), target.cuda()
            output = model(data)
            predict = output.argmax(dim=1, keepdim=True)
            correct = predict.eq(target.view_as(predict)).sum().item()
            correct_total += correct
    acc = correct_total / len(val_loader.dataset)
    print(f"Validation Accuracy {acc}")

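# A side note on the checkpoint round-trip above: checkpointing() saves the
# state_dict of the DDP-wrapped model, whose parameter keys carry a "module."
# prefix. load_checkpoint() wraps MyNet in DataParallel, which uses the same
# "module." prefix, so the saved keys line up when load_state_dict() is called.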
def train(local_rank, args):
    torch.manual_seed(123)
    world_size = args.machines * args.gpus
    rank = args.mid * args.gpus + local_rank
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            timeout=datetime.timedelta(seconds=60))

    torch.cuda.set_device(local_rank)
    model = MyNet()
    local_train_sampler = DDP_sampler(train_all_set, rank=rank, num_replicas=world_size)
    local_train_loader = torch.utils.data.DataLoader(train_all_set,
                                                     batch_size=128,
                                                     shuffle=False,
                                                     sampler=local_train_sampler)

    optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)
    model = DDP(model, device_ids=[local_rank])

    for epoch in range(args.epochs):
        print(f"Epoch {epoch}")
        # Reshuffle each rank's partition of the data every epoch.
        local_train_sampler.set_epoch(epoch)
        for idx, (data, target) in enumerate(local_train_loader):
            data = data.cuda()
            target = target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            print(f"batch {idx} training :: loss {loss.item()}")
        checkpointing(rank, epoch, model, optimizer, loss.item())
        validation(model, val_set)
    print("Training Done!")
    dist.destroy_process_group()

def test(local_rank, args):
    world_size = args.machines * args.gpus
    rank = args.mid * args.gpus + local_rank
    dist.init_process_group('nccl', rank=rank, world_size=world_size,
                            timeout=datetime.timedelta(seconds=60))

    torch.cuda.set_device(local_rank)
    print(f"Load checkpoint {rank}")
    model, optimizer, epoch, loss = load_checkpoint(rank, args.gpus)
    print("Checkpoint loading done!")

    local_test_sampler = DDP_sampler(test_set, rank=rank, num_replicas=world_size)

    model.eval()
    local_test_loader = torch.utils.data.DataLoader(test_set,
                                                    batch_size=128,
                                                    shuffle=False,
                                                    sampler=local_test_sampler)
    correct_total = 0
    with torch.no_grad():
        for idx, (data, target) in enumerate(local_test_loader):
            data, target = data.cuda(), target.cuda()
            output = model(data)
            predict = output.argmax(dim=1, keepdim=True)
            correct = predict.eq(target.view_as(predict)).sum().item()
            correct_total += correct
    # Each rank only sees its own shard, so normalize by the shard size.
    acc = correct_total / len(local_test_sampler)
    print(f"GPU {rank}, Test Accuracy {acc}")
    print("Test Done!")
    dist.destroy_process_group()

def main():
    parser = argparse.ArgumentParser(description='distributed data parallel training')
    parser.add_argument('-m', '--machines', default=2, type=int, help='number of machines')
    parser.add_argument('-g', '--gpus', default=4, type=int, help='number of GPUs in a machine')
    parser.add_argument('-id', '--mid', default=0, type=int, help='machine id number')
    parser.add_argument('-e', '--epochs', default=10, type=int, help='number of epochs')
    args = parser.parse_args()
    net_setup()
    mp.spawn(train, nprocs=args.gpus, args=(args,), join=True)
    mp.spawn(test, nprocs=args.gpus, args=(args,), join=True)

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
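A quick standalone illustration of the rank arithmetic that `train()` and `test()` use (the 2-machine, 4-GPU numbers are just the script's argparse defaults): each machine runs the script once with its own `--mid`, and every spawned process derives a unique global rank from the machine id and its local GPU index.

```
# Sketch of the rank layout for the default 2 machines x 4 GPUs.
machines, gpus = 2, 4
world_size = machines * gpus            # 8 processes in total
for mid in range(machines):
    for local_rank in range(gpus):
        rank = mid * gpus + local_rank  # global rank in [0, world_size)
        print(f"machine {mid}, GPU {local_rank} -> rank {rank}/{world_size}")
```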
/Chapter03/ddp/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.dropout1 = nn.Dropout2d(0.5).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        self.dropout2 = nn.Dropout2d(0.75).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 20).to(device)
        self.fc3 = nn.Linear(20, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/Chapter03/dp/main.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torchvision import datasets, transforms

from my_net import *

train_set = datasets.MNIST('./mnist_data', download=True, train=True,
                           transform=transforms.Compose([transforms.ToTensor(),
                                                         transforms.Normalize((0.1307,), (0.3081,))]))

test_set = datasets.MNIST('./mnist_data', download=True, train=False,
                          transform=transforms.Compose([transforms.ToTensor(),
                                                        transforms.Normalize((0.1307,), (0.3081,))]))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_loader = DataLoader(train_set, batch_size=128, shuffle=True, pin_memory=True)

train_epoch = 2

def main():
    model = MyNet()
    print("Using ", torch.cuda.device_count(), "GPUs for data parallel training")
    optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)
    model = nn.DataParallel(model)
    model.to(device)
    # Training
    for epoch in range(train_epoch):
        print(f"Epoch {epoch}")
        for idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            print(f"batch {idx}, loss {loss.item()}")
    print("Training Done!")

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
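As an aside, the `nn.DataParallel` wrapper used above splits each input batch across the visible GPUs, replicates the module onto them for the forward pass, and gathers the outputs back. A minimal self-contained sketch of that behavior (a toy linear layer, not part of this repository; it falls back to a plain forward pass when no GPU is present):

```
import torch
import torch.nn as nn

# Toy DataParallel round-trip: a batch of 8 is split across available GPUs
# (or run directly on CPU when none are visible).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.DataParallel(nn.Linear(4, 2)).to(device)
x = torch.randn(8, 4).to(device)
print(model(x).shape)  # torch.Size([8, 2])
```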
/Chapter03/dp/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        self.conv1 = nn.Conv2d(1, 32, 3, 1).to(device)
        self.dropout1 = nn.Dropout2d(0.5).to(device)
        self.conv2 = nn.Conv2d(32, 64, 3, 1).to(device)
        self.dropout2 = nn.Dropout2d(0.75).to(device)
        self.fc1 = nn.Linear(9216, 128).to(device)
        self.fc2 = nn.Linear(128, 20).to(device)
        self.fc3 = nn.Linear(20, 10).to(device)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)

        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/Chapter07/main.py:
--------------------------------------------------------------------------------
import argparse

import torch
import torch.nn.functional as F
from torchvision import datasets, transforms

from my_net import *

train_all_set = datasets.MNIST('./mnist_data', download=True, train=True,
                               transform=transforms.Compose([transforms.ToTensor(),
                                                             transforms.Normalize((0.1307,), (0.3081,))]))
train_set, val_set = torch.utils.data.random_split(train_all_set,
                                                   [50000, 10000])

test_set = datasets.MNIST('./mnist_data', download=True, train=False,
                          transform=transforms.Compose([transforms.ToTensor(),
                                                        transforms.Normalize((0.1307,), (0.3081,))]))

def train(args):
    model = MyNet()
    model.train()
    trainloader = torch.utils.data.DataLoader(train_all_set, batch_size=128, shuffle=True, num_workers=4)
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

    for epoch in range(args.epochs):
        print(f"Epoch {epoch}")
        for idx, (data, target) in enumerate(trainloader):
            # Inputs enter the model-parallel pipeline on its first stage.
            data = data.to('cuda:0')
            optimizer.zero_grad()
            output = model(data)
            # The loss is computed on whichever device holds the final stage.
            target = target.to(output.device)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            print(f"batch {idx} training :: loss {loss.item()}")
    print("Training Done!")
    return model

def test(args, model):
    model.eval()
    testloader = torch.utils.data.DataLoader(test_set, batch_size=128, shuffle=False, num_workers=4)
    correct_total = 0
    with torch.no_grad():
        for idx, (data, target) in enumerate(testloader):
            output = model(data.to('cuda:0'))
            predict = output.argmax(dim=1, keepdim=True)
            target = target.to(output.device)
            correct = predict.eq(target.view_as(predict)).sum().item()
            correct_total += correct
    acc = correct_total / len(testloader.dataset)
    print(f"Test Accuracy {acc}")
    print("Test Done!")

def main():
    parser = argparse.ArgumentParser(description='model parallel training')
    parser.add_argument('-e', '--epochs', default=4, type=int, help='number of epochs')
    args = parser.parse_args()
    trained_model = train(args)
    test(args, trained_model)

if __name__ == '__main__':
    main()
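# Note on device requirements: MyNet in my_net.py pins seq1 to cuda:0, the
# pooling/flatten stage to cuda:1, and seq2 to cuda:2, so running this script
# as-is assumes a machine with at least three visible GPUs.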
--------------------------------------------------------------------------------
/Chapter07/my_net.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyNet(nn.Module):
    def __init__(self):
        super(MyNet, self).__init__()
        # First (convolutional) pipeline stage, placed on GPU 0.
        self.seq1 = nn.Sequential(
            nn.Conv2d(1, 32, 3, 1),
            nn.Dropout2d(0.5),
            nn.Conv2d(32, 64, 3, 1),
            nn.Dropout2d(0.75)).to('cuda:0')
        # Final (fully connected) pipeline stage, placed on GPU 2.
        self.seq2 = nn.Sequential(
            nn.Linear(9216, 128),
            nn.Linear(128, 20),
            nn.Linear(20, 10)).to('cuda:2')

    def forward(self, x):
        x = self.seq1(x.to('cuda:0'))
        # Pooling and flattening are staged on GPU 1 between the two blocks.
        x = F.max_pool2d(x, 2).to('cuda:1')
        x = torch.flatten(x, 1)
        x = self.seq2(x.to('cuda:2'))
        output = F.log_softmax(x, dim=1)
        return output

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

### [Packt Conference : Put Generative AI to work on Oct 11-13 (Virtual)](https://packt.link/JGIEY)

[![Packt Conference](https://hub.packtpub.com/wp-content/uploads/2023/08/put-generative-ai-to-work-packt.png)](https://packt.link/JGIEY)

3 Days, 20+ AI Experts, 25+ Workshops and Power Talks

Code: USD75OFF

# Distributed Machine Learning with Python

This is the code repository for [Distributed Machine Learning with Python](https://www.packtpub.com/product/distributed-machine-learning-with-python/9781801815697?utm_source=github&utm_medium=repository&utm_campaign=9781801815697), published by Packt.

**Accelerating model training and serving with distributed systems**

## What is this book about?
Reducing the time cost of machine learning leads to a shorter waiting time for model training and a faster model updating cycle. Distributed machine learning enables machine learning practitioners to shorten model training and inference time by orders of magnitude.

This book covers the following exciting features:
* Deploy distributed model training and serving pipelines
* Get to grips with the advanced features in TensorFlow and PyTorch
* Mitigate system bottlenecks during in-parallel model training and serving
* Discover the latest techniques on top of classical parallelism paradigms
* Explore advanced features in Megatron-LM and Mesh-TensorFlow
* Use state-of-the-art hardware such as NVLink, NVSwitch, and GPUs

If you feel this book is for you, get your [copy](https://www.amazon.com/dp/B09NC5XJ6D) today!

## Instructions and Navigations
All of the code is organized into folders.

The code will look like the following:
```
# Connect to API through subscription key and endpoint
subscription_key = "<subscription key>"
endpoint = "https://<endpoint>.cognitiveservices.azure.com/"
# Authenticate
credential = AzureKeyCredential(subscription_key)
cog_client = TextAnalyticsClient(endpoint=endpoint,
                                 credential=credential)
```

**Following is what you need for this book:**
This book is for data scientists, machine learning engineers, and ML practitioners in both academia and industry. A fundamental understanding of machine learning concepts and working knowledge of Python programming is assumed. Prior experience implementing ML/DL models with TensorFlow or PyTorch will be beneficial. You'll find this book useful if you are interested in using distributed systems to boost machine learning model training and serving speed.

With the following software and hardware list you can run all code files present in the book (Chapters 1-12).

### Software and Hardware List

| Chapter | Software required | OS required                        |
| ------- | ----------------- | ---------------------------------- |
| 1-12    | PyTorch           | Windows, Mac OS X, and Linux (Any) |
| 1-12    | TensorFlow        | Windows, Mac OS X, and Linux (Any) |
| 1-12    | Python            | Windows, Mac OS X, and Linux (Any) |
|         | CUDA/C            |                                    |
|         | NVprofiler/Nsight |                                    |

We assume you have Linux/Ubuntu as your operating system. We assume you use NVIDIA GPUs and have installed the proper NVIDIA driver as well. We also assume you have basic knowledge about machine learning in general and are familiar with popular deep learning models.

We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://static.packt-cdn.com/downloads/9781801815697_ColorImages.pdf).
### Related products
* Distributed Data Systems with Azure Databricks [[Packt]](https://www.packtpub.com/product/distributed-data-systems-with-azure-databricks/9781838647216?utm_source=github&utm_medium=repository&utm_campaign=9781838647216) [[Amazon]](https://www.amazon.com/dp/B0946QSSBM)

* Machine Learning with the Elastic Stack - Second Edition [[Packt]](https://www.packtpub.com/product/machine-learning-with-the-elastic-stack-second-edition/9781801070034?utm_source=github&utm_medium=repository&utm_campaign=9781801070034) [[Amazon]](https://www.amazon.com/dp/1801070032)

## Get to Know the Author
**Guanhua Wang** is a final-year computer science Ph.D. student in the RISELab at UC Berkeley, advised by Professor Ion Stoica. His research lies primarily in the machine learning systems area, including fast collective communication, efficient in-parallel model training, and real-time model serving. His research has gained lots of attention from both academia and industry. He has been invited to give talks at top-tier universities (MIT, Stanford, CMU, Princeton) and big tech companies (Facebook/Meta, Microsoft). He received his master's degree from HKUST and a bachelor's degree from Southeast University in China. He has also done some cool research on wireless networks. He likes playing soccer and has run multiple half-marathons in the Bay Area of California.

### Download a free PDF

If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.

https://packt.link/free-ebook/9781801815697

--------------------------------------------------------------------------------