├── .gitignore ├── requirements.txt ├── LICENSE ├── conda.yml ├── src ├── basic │ ├── level_05_pretrained_model │ │ └── pretrained_model.py │ ├── level_01_lightning_module │ │ └── lightning_module.py │ ├── level_02_validation_and_testing │ │ └── validate_and_test_model.py │ ├── level_03_checkpointing │ │ ├── 03_disabling_and_resuming_checkpoints.py │ │ ├── 02_checkpoints_with_nn_module.py │ │ └── 01_saving_and_loading_checkpoints.py │ ├── level_07_inference │ │ └── lightining_predict_step.py │ ├── level_04_early_stopping │ │ └── early_stopping.py │ └── level_06_debugging_model │ │ ├── 01_quick_run.py │ │ └── 02_model_summary.py ├── intermediate │ ├── level_12_deploying_models │ │ └── 01_onnx.py │ ├── level_11_scaling_techniques │ │ ├── 01_precision_training.py │ │ ├── 04_stochastic_weight_averaging.py │ │ ├── 02_gradient_accumulation.py │ │ └── 03_gradient_clipping.py │ ├── level_08_accelerated_hardware │ │ ├── 02_tpu_traininig.py │ │ └── 01_gpu_training.py │ ├── level_13_profiler │ │ ├── 02_profile_pytorch_operations.py │ │ └── 01_advanced_profiler.py │ └── level_09_modularize │ │ └── 01_lightning_datamodule.py └── advanced │ ├── level_15_modify_trainer │ ├── 01_create_callbacks.py │ └── 02_customize_progress_bar.py │ ├── level_14_run_with_config_file │ └── run_with_yaml.py │ ├── level_18_ipu │ └── ipu.py │ ├── level_17_advanced_checkpointing │ └── checkpoint.py │ ├── level_19_hpu │ └── hpu.py │ └── level_16_own_the_training_loop │ └── 01_enable_manual_optimization.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | lightning_logs 2 | __pycache__ 3 | MNIST 4 | .DS_STORE 5 | cifar* -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.5 2 | aiosignal==1.3.1 3 | async-timeout==4.0.3 4 | attrs==23.2.0 5 | black==24.4.2 6 | click==8.1.7 7 | coloredlogs==15.0.1 8 | docstring_parser==0.16 9 | filelock==3.14.0 10 | flatbuffers==24.3.25 11 | frozenlist==1.4.1 12 | fsspec==2024.3.1 13 | humanfriendly==10.0 14 | idna==3.7 15 | importlib_resources==6.4.0 16 | isort==5.13.2 17 | Jinja2==3.1.3 18 | jsonargparse==4.28.0 19 | lightning==2.2.4 20 | lightning-utilities==0.11.2 21 | markdown-it-py==3.0.0 22 | MarkupSafe==2.1.5 23 | mdurl==0.1.2 24 | mpmath==1.3.0 25 | multidict==6.0.5 26 | mypy-extensions==1.0.0 27 | networkx==3.3 28 | numpy==1.26.4 29 | onnx==1.16.0 30 | onnxruntime==1.17.3 31 | packaging==24.0 32 | pathspec==0.12.1 33 | pillow==10.3.0 34 | platformdirs==4.2.1 35 | protobuf==5.26.1 36 | psutil==5.9.8 37 | Pygments==2.18.0 38 | pytorch-lightning==2.2.4 39 | PyYAML==6.0.1 40 | rich==13.7.1 41 | sympy==1.12 42 | tomli==2.0.1 43 | torch==2.3.0 44 | torchmetrics==1.3.2 45 | torchvision==0.18.0 46 | tqdm==4.66.4 47 | typeshed_client==2.5.1 48 | typing_extensions==4.11.0 49 | yarl==1.9.4 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Ishan Dutta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom 
the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conda.yml: -------------------------------------------------------------------------------- 1 | name: lit-env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - bzip2=1.0.8 6 | - ca-certificates=2024.3.11 7 | - libffi=3.4.4 8 | - ncurses=6.4 9 | - openssl=3.0.13 10 | - pip=23.3.1 11 | - python=3.10.14 12 | - readline=8.2 13 | - setuptools=68.2.2 14 | - sqlite=3.45.3 15 | - tk=8.6.12 16 | - tzdata=2024a 17 | - wheel=0.43.0 18 | - xz=5.4.6 19 | - zlib=1.2.13 20 | - pip: 21 | - aiohttp==3.9.5 22 | - aiosignal==1.3.1 23 | - async-timeout==4.0.3 24 | - attrs==23.2.0 25 | - black==24.4.2 26 | - click==8.1.7 27 | - coloredlogs==15.0.1 28 | - docstring-parser==0.16 29 | - filelock==3.14.0 30 | - flatbuffers==24.3.25 31 | - frozenlist==1.4.1 32 | - fsspec==2024.3.1 33 | - humanfriendly==10.0 34 | - idna==3.7 35 | - importlib-resources==6.4.0 36 | - isort==5.13.2 37 | - jinja2==3.1.3 38 | - jsonargparse==4.28.0 39 | - lightning==2.2.4 40 | - lightning-utilities==0.11.2 41 | - markdown-it-py==3.0.0 42 | - markupsafe==2.1.5 43 | - mdurl==0.1.2 44 | - mpmath==1.3.0 45 | - multidict==6.0.5 46 | - mypy-extensions==1.0.0 47 | - networkx==3.3 48 | - numpy==1.26.4 49 | - onnx==1.16.0 50 | - onnxruntime==1.17.3 51 | - packaging==24.0 52 | - pathspec==0.12.1 53 | - pillow==10.3.0 54 | - platformdirs==4.2.1 55 | - protobuf==5.26.1 56 | - psutil==5.9.8 57 | - pygments==2.18.0 58 | - pytorch-lightning==2.2.4 59 | - pyyaml==6.0.1 60 | - rich==13.7.1 61 | - sympy==1.12 62 | - tomli==2.0.1 63 | - torch==2.3.0 64 | - torchmetrics==1.3.2 65 | - torchvision==0.18.0 66 | - tqdm==4.66.4 67 | - typeshed-client==2.5.1 68 | - typing-extensions==4.11.0 69 | - yarl==1.9.4 70 | -------------------------------------------------------------------------------- /src/basic/level_05_pretrained_model/pretrained_model.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/advanced/transfer_learning.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | import torchvision.models as models 9 | from torch import nn 10 | from torch.nn import functional as F 11 | from torchvision import datasets, transforms 12 | 13 | 14 | # Define the Lightning Module 15 | class ImagenetTransferLearning(pl.LightningModule): 16 | def __init__(self, learning_rate=1e-3): 17 | super().__init__() 18 | 19 | # init a pretrained resnet 20 | backbone = models.resnet50(weights="DEFAULT") 21 | num_filters = backbone.fc.in_features 22 | layers = list(backbone.children())[:-1] 23 | self.feature_extractor = nn.Sequential(*layers) 24 | 25 | # use the pretrained model to classify cifar-10 (10 image classes) 26 | num_target_classes = 10 27 | 
self.classifier = nn.Linear(num_filters, num_target_classes) 28 | self.learning_rate = learning_rate 29 | 30 | def forward(self, x): 31 | self.feature_extractor.eval() 32 | with torch.no_grad(): 33 | representations = self.feature_extractor(x).flatten(1) 34 | x = self.classifier(representations) 35 | return x 36 | 37 | def training_step(self, batch, batch_idx): 38 | x, y = batch 39 | y_hat = self(x) 40 | loss = F.cross_entropy(y_hat, y) 41 | return loss 42 | 43 | def configure_optimizers(self): 44 | return torch.optim.Adam(self.parameters(), lr=self.learning_rate) 45 | 46 | 47 | # Data preparation 48 | transform = transforms.Compose( 49 | [ 50 | transforms.Resize((224, 224)), # ResNet50 expects 224x224 input size 51 | transforms.ToTensor(), 52 | transforms.Normalize( 53 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 54 | ), # Normalization for Imagenet data 55 | ] 56 | ) 57 | 58 | train_dataset = datasets.CIFAR10( 59 | root="./", train=True, transform=transform, download=True 60 | ) 61 | # Use a subset of the training data for demonstration purposes 62 | train_dataset = torch.utils.data.Subset(train_dataset, indices=list(range(100))) 63 | 64 | train_dataloader = torch.utils.data.DataLoader( 65 | train_dataset, batch_size=32, shuffle=True 66 | ) 67 | 68 | # Training 69 | model = ImagenetTransferLearning() 70 | trainer = pl.Trainer(max_epochs=1) 71 | trainer.fit(model, train_dataloader) 72 | 73 | # Save the model checkpoint 74 | trainer.save_checkpoint("example_model.ckpt") 75 | 76 | # Inference 77 | loaded_model = ImagenetTransferLearning.load_from_checkpoint("example_model.ckpt") 78 | loaded_model.freeze() 79 | 80 | # Load some CIFAR10 images for prediction (assuming you're using the same transform as above) 81 | test_dataset = datasets.CIFAR10(root="./", train=False, transform=transform) 82 | test_dataloader = torch.utils.data.DataLoader( 83 | test_dataset, batch_size=5 84 | ) # Loading 5 images for demonstration 85 | some_images_from_cifar10, _ = next(iter(test_dataloader)) 86 | 87 | predictions = loaded_model(some_images_from_cifar10) 88 | print(predictions.argmax(dim=1)) 89 | -------------------------------------------------------------------------------- /src/basic/level_01_lightning_module/lightning_module.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/model/train_model_basic.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | from torch.utils.data import DataLoader 9 | from torchvision import transforms 10 | from torchvision.datasets import MNIST 11 | 12 | 13 | # A simple convolution based classifier model for MNIST 14 | class LitConvClassifier(pl.LightningModule): 15 | def __init__(self): 16 | super().__init__() 17 | # Define the layers for the model architecture 18 | 19 | # Convolutional layer with 32 filters of size 3x3 20 | # ReLU activation function introduces non-linearity to the model, enabling it to learn more complex patterns 21 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 22 | 23 | # Second convolutional layer with 64 filters of size 3x3 24 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 25 | 26 | # Fully connected layers for classification 27 | # The input size 64*7*7 corresponds to the flattened output of the last convolutional layer 28 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 29 | self.fc2 = nn.Linear(128, 10) 30 | 31 | def forward(self, x): 32 | # Define 
the forward pass through the network 33 | # Input shape: (batch_size, 1, 28, 28) 34 | x = F.relu(self.conv1(x)) # Shape: (batch_size, 32, 28, 28) 35 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 32, 14, 14) 36 | x = F.relu(self.conv2(x)) # Shape: (batch_size, 64, 14, 14) 37 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 64, 7, 7) 38 | x = x.view(x.size(0), -1) # Shape: (batch_size, 64*7*7) 39 | x = F.relu(self.fc1(x)) # Shape: (batch_size, 128) 40 | x = self.fc2(x) # Shape: (batch_size, 10) 41 | return x 42 | 43 | def training_step(self, batch, batch_idx): 44 | # Define the training step which includes 45 | # the forward pass, loss calculation and backpropagation 46 | 47 | x, y = batch # Unpack batch 48 | y_hat = self(x) # Forward pass, get predicted logits 49 | 50 | # Calculate loss using cross-entropy, which is suitable for multi-class classification 51 | loss = F.cross_entropy(y_hat, y) 52 | return loss 53 | 54 | def configure_optimizers(self): 55 | # Define the optimizer to use for training 56 | # Adam is a popular choice due to its adaptive learning rate and momentum 57 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) 58 | return optimizer 59 | 60 | 61 | # Load Dataset 62 | # MNIST is a widely used dataset for handwritten digit recognition 63 | dataset = MNIST("./", download=True, transform=transforms.ToTensor()) 64 | 65 | # Create a Dataloader with batch size of 32 66 | # Batch size is a hyperparameter that defines the number of 67 | # samples to work through before updating the model's weights 68 | train_dataloader = DataLoader(dataset, batch_size=32) 69 | 70 | # Initialise the model 71 | model = LitConvClassifier() 72 | 73 | # Initialise the trainer with 1 epoch 74 | # An epoch is a complete pass through the entire training dataset 75 | trainer = pl.Trainer(max_epochs=1) 76 | 77 | # Train the model 78 | trainer.fit(model, train_dataloader) 79 | -------------------------------------------------------------------------------- /src/basic/level_02_validation_and_testing/validate_and_test_model.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/evaluation_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class LitConvClassifier(pl.LightningModule): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 20 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 21 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 22 | self.fc2 = nn.Linear(128, 10) 23 | 24 | def forward(self, x): 25 | # Define the forward pass through the network 26 | # Input shape: (batch_size, 1, 28, 28) 27 | x = F.relu(self.conv1(x)) # Shape: (batch_size, 32, 28, 28) 28 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 32, 14, 14) 29 | x = F.relu(self.conv2(x)) # Shape: (batch_size, 64, 14, 14) 30 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 64, 7, 7) 31 | x = x.view(x.size(0), -1) # Shape: (batch_size, 64*7*7) 32 | x = F.relu(self.fc1(x)) # Shape: (batch_size, 128) 33 | x = self.fc2(x) # Shape: (batch_size, 10) 34 | return x 35 | 36 | def training_step(self, batch, batch_idx): 37 | x, y = batch 38 | y_hat = self(x) 39 | 40 | loss = F.cross_entropy(y_hat, y) 41 | return loss 42 | 43 | def validation_step(self, 
batch, batch_idx): 44 | # The validation step is performed once per batch of data from the validation set. 45 | # It's used to check the model's performance on the validation set during training. 46 | x, y = batch 47 | y_hat = self(x) 48 | 49 | loss = F.cross_entropy(y_hat, y) 50 | return loss 51 | 52 | def test_step(self, batch, batch_idx): 53 | # The test step is performed once per batch of data from the test set. 54 | # It's used to assess the model's performance on unseen data after training is complete. 55 | x, y = batch 56 | y_hat = self(x) 57 | 58 | loss = F.cross_entropy(y_hat, y) 59 | return loss 60 | 61 | def configure_optimizers(self): 62 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) 63 | return optimizer 64 | 65 | 66 | train_dataset = MNIST( 67 | "./", download=True, train=True, transform=transforms.ToTensor() 68 | ) 69 | 70 | # Calculate training and validation split 71 | # We will keep 80% data for training and 20% for validation 72 | train_size = int(0.8 * len(train_dataset)) 73 | val_size = len(train_dataset) - train_size 74 | 75 | # Split the dataset into training and validation 76 | seed = torch.Generator().manual_seed(42) 77 | train_dataset, val_dataset = torch.utils.data.random_split( 78 | train_dataset, [train_size, val_size], generator=seed 79 | ) 80 | 81 | test_dataset = MNIST( 82 | "./", download=True, train=False, transform=transforms.ToTensor() 83 | ) 84 | 85 | # Create data loaders for loading the data in batches 86 | train_dataloader = DataLoader(train_dataset, batch_size=32) 87 | val_dataloader = DataLoader(val_dataset, batch_size=32) 88 | test_dataloader = DataLoader(test_dataset, batch_size=32) 89 | 90 | model = LitConvClassifier() 91 | 92 | trainer = pl.Trainer(max_epochs=1) 93 | 94 | trainer.fit(model, train_dataloader, val_dataloader) 95 | 96 | # Test the model on the test set after training is complete 97 | trainer.test(model, test_dataloader) 98 | -------------------------------------------------------------------------------- /src/basic/level_03_checkpointing/03_disabling_and_resuming_checkpoints.py: -------------------------------------------------------------------------------- 1 | # Documentation Link: 2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#save-a-checkpoint 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class LitConvClassifier(pl.LightningModule): 16 | def __init__(self, learning_rate=1e-3): 17 | super().__init__() 18 | self.save_hyperparameters() 19 | 20 | self.learning_rate = learning_rate 21 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 22 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 23 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 24 | self.fc2 = nn.Linear(128, 10) 25 | 26 | def forward(self, x): 27 | x = F.relu(self.conv1(x)) 28 | x = F.max_pool2d(x, 2) 29 | x = F.relu(self.conv2(x)) 30 | x = F.max_pool2d(x, 2) 31 | x = x.view(x.size(0), -1) 32 | x = F.relu(self.fc1(x)) 33 | x = self.fc2(x) 34 | return x 35 | 36 | def training_step(self, batch, batch_idx): 37 | x, y = batch 38 | y_hat = self(x) 39 | loss = F.cross_entropy(y_hat, y) 40 | return loss 41 | 42 | def validation_step(self, batch, batch_idx): 43 | x, y = batch 44 | y_hat = self(x) 45 | loss = F.cross_entropy(y_hat, y) 46 | return loss 47 | 48 | def test_step(self, batch, batch_idx): 
49 | x, y = batch 50 | y_hat = self(x) 51 | loss = F.cross_entropy(y_hat, y) 52 | return loss 53 | 54 | def configure_optimizers(self): 55 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 56 | return optimizer 57 | 58 | 59 | def prepare_dataloaders(): 60 | train_dataset = MNIST( 61 | "./", download=True, train=True, transform=transforms.ToTensor() 62 | ) 63 | 64 | train_size = int(0.8 * len(train_dataset)) 65 | val_size = len(train_dataset) - train_size 66 | 67 | seed = torch.Generator().manual_seed(42) 68 | train_dataset, val_dataset = torch.utils.data.random_split( 69 | train_dataset, [train_size, val_size], generator=seed 70 | ) 71 | 72 | test_dataset = MNIST( 73 | "./", download=True, train=False, transform=transforms.ToTensor() 74 | ) 75 | 76 | train_dataloader = DataLoader(train_dataset, batch_size=32) 77 | val_dataloader = DataLoader(val_dataset, batch_size=32) 78 | test_dataloader = DataLoader(test_dataset, batch_size=32) 79 | 80 | return train_dataloader, val_dataloader, test_dataloader 81 | 82 | 83 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 84 | 85 | model = LitConvClassifier() 86 | 87 | # You can disable checkpointing by setting the Trainer's enable_checkpointing to False 88 | trainer = pl.Trainer( 89 | max_epochs=1, default_root_dir="experiments/", enable_checkpointing=False 90 | ) 91 | 92 | trainer.fit(model, train_dataloader, val_dataloader) 93 | 94 | # To resume training from a checkpoint pass it directly to the fit method 95 | checkpoint_path = ( 96 | "experiments/lightning_logs/version_0/checkpoints/epoch=0-step=1500.ckpt" 97 | ) 98 | 99 | model = LitConvClassifier() 100 | 101 | trainer = pl.Trainer(max_epochs=1, default_root_dir="experiments/") 102 | 103 | trainer.fit(model, train_dataloader, val_dataloader, ckpt_path=checkpoint_path) 104 | -------------------------------------------------------------------------------- /src/basic/level_03_checkpointing/02_checkpoints_with_nn_module.py: -------------------------------------------------------------------------------- 1 | # Documentation Link: 2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#save-a-checkpoint 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class ConvClassifier(nn.Module): 16 | def __init__(self): 17 | super(ConvClassifier, self).__init__() 18 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 19 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 20 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 21 | self.fc2 = nn.Linear(128, 10) 22 | 23 | def forward(self, x): 24 | x = F.relu(self.conv1(x)) 25 | x = F.max_pool2d(x, 2) 26 | x = F.relu(self.conv2(x)) 27 | x = F.max_pool2d(x, 2) 28 | x = x.view(x.size(0), -1) 29 | x = F.relu(self.fc1(x)) 30 | x = self.fc2(x) 31 | return x 32 | 33 | 34 | class LitConvClassifier(pl.LightningModule): 35 | def __init__(self, learning_rate=1e-3): 36 | super().__init__() 37 | self.save_hyperparameters() 38 | 39 | self.learning_rate = learning_rate 40 | self.model = ConvClassifier() 41 | 42 | def forward(self, x): 43 | return self.model(x) 44 | 45 | def training_step(self, batch, batch_idx): 46 | x, y = batch 47 | y_hat = self(x) 48 | loss = F.cross_entropy(y_hat, y) 49 | return loss 50 | 51 | def validation_step(self, batch, batch_idx): 52 | x, y = batch 53 | 
y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | return loss 56 | 57 | def test_step(self, batch, batch_idx): 58 | x, y = batch 59 | y_hat = self(x) 60 | loss = F.cross_entropy(y_hat, y) 61 | return loss 62 | 63 | def configure_optimizers(self): 64 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 65 | return optimizer 66 | 67 | 68 | def prepare_dataloaders(): 69 | train_dataset = MNIST( 70 | "./", download=True, train=True, transform=transforms.ToTensor() 71 | ) 72 | 73 | train_size = int(0.8 * len(train_dataset)) 74 | val_size = len(train_dataset) - train_size 75 | 76 | seed = torch.Generator().manual_seed(42) 77 | train_dataset, val_dataset = torch.utils.data.random_split( 78 | train_dataset, [train_size, val_size], generator=seed 79 | ) 80 | 81 | test_dataset = MNIST( 82 | "./", download=True, train=False, transform=transforms.ToTensor() 83 | ) 84 | 85 | train_dataloader = DataLoader(train_dataset, batch_size=32) 86 | val_dataloader = DataLoader(val_dataset, batch_size=32) 87 | test_dataloader = DataLoader(test_dataset, batch_size=32) 88 | 89 | return train_dataloader, val_dataloader, test_dataloader 90 | 91 | 92 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 93 | 94 | # Train the Model 95 | model = LitConvClassifier() 96 | 97 | trainer = pl.Trainer(max_epochs=1, default_root_dir="experiments/") 98 | 99 | trainer.fit(model, train_dataloader, val_dataloader) 100 | 101 | # Load the module using the state dict 102 | 103 | checkpoint_path = ( 104 | "experiments/lightning_logs/version_0/checkpoints/epoch=0-step=1500.ckpt" 105 | ) 106 | 107 | checkpoint = torch.load(checkpoint_path) 108 | state_dict = checkpoint["state_dict"] 109 | 110 | classifier_model = ConvClassifier() 111 | classifier_model.load_state_dict(state_dict) 112 | -------------------------------------------------------------------------------- /src/basic/level_07_inference/lightining_predict_step.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/levels/core_level_6.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class LitConvClassifier(pl.LightningModule): 18 | def __init__(self, learning_rate=1e-3): 19 | super().__init__() 20 | self.save_hyperparameters() 21 | self.example_input_array = torch.rand(1, 1, 28, 28) 22 | 23 | self.learning_rate = learning_rate 24 | 25 | # Define blocks of layers as submodules 26 | self.conv_block1 = nn.Sequential( 27 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 28 | ) 29 | 30 | self.conv_block2 = nn.Sequential( 31 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 32 | ) 33 | 34 | self.fc_block = nn.Sequential( 35 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 36 | ) 37 | 38 | def forward(self, x): 39 | x = self.conv_block1(x) 40 | x = self.conv_block2(x) 41 | x = x.view(x.size(0), -1) 42 | x = self.fc_block(x) 43 | return x 44 | 45 | def training_step(self, batch, batch_idx): 46 | x, y = batch 47 | y_hat = self(x) 48 | loss = F.cross_entropy(y_hat, y) 49 | return loss 50 | 51 | def validation_step(self, 
batch, batch_idx): 52 | x, y = batch 53 | y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | self.log("val_loss", loss) 56 | return loss 57 | 58 | def test_step(self, batch, batch_idx): 59 | x, y = batch 60 | y_hat = self(x) 61 | loss = F.cross_entropy(y_hat, y) 62 | return loss 63 | 64 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 65 | x, _ = batch 66 | return self(x) 67 | 68 | def configure_optimizers(self): 69 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 70 | return optimizer 71 | 72 | 73 | def prepare_dataloaders(): 74 | train_dataset = MNIST( 75 | "./", download=True, train=True, transform=transforms.ToTensor() 76 | ) 77 | 78 | train_size = int(0.8 * len(train_dataset)) 79 | val_size = len(train_dataset) - train_size 80 | 81 | seed = torch.Generator().manual_seed(42) 82 | train_dataset, val_dataset = torch.utils.data.random_split( 83 | train_dataset, [train_size, val_size], generator=seed 84 | ) 85 | 86 | test_dataset = MNIST( 87 | "./", download=True, train=False, transform=transforms.ToTensor() 88 | ) 89 | 90 | train_dataloader = DataLoader(train_dataset, batch_size=32) 91 | val_dataloader = DataLoader(val_dataset, batch_size=32) 92 | test_dataloader = DataLoader(test_dataset, batch_size=32) 93 | 94 | return train_dataloader, val_dataloader, test_dataloader 95 | 96 | 97 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 98 | 99 | model = LitConvClassifier() 100 | 101 | trainer = pl.Trainer( 102 | max_epochs=1, 103 | default_root_dir="experiments/", 104 | callbacks=[ 105 | EarlyStopping(monitor="val_loss", mode="min"), 106 | ModelSummary(max_depth=-1), 107 | ], 108 | ) 109 | trainer.fit(model, train_dataloader, val_dataloader) 110 | 111 | # Here we use the test_dataloader to get the predictions for the test set 112 | # You can use the predict_step() method for the required data 113 | predictions = trainer.predict(model, test_dataloader) 114 | print(len(predictions)) 115 | -------------------------------------------------------------------------------- /src/basic/level_03_checkpointing/01_saving_and_loading_checkpoints.py: -------------------------------------------------------------------------------- 1 | # Documentation Link: 2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#save-a-checkpoint 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class LitConvClassifier(pl.LightningModule): 16 | def __init__(self, learning_rate=1e-3): 17 | super().__init__() 18 | 19 | # You can save the hyperparameters initialized in the __init__ method 20 | # by calling self.save_hyperparameters() in the __init__ method. 21 | # Here we save the learning_rate hyperparameter. 
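# The saved values then live under self.hparams (e.g. self.hparams.learning_rate) and are
# written into every checkpoint, so load_from_checkpoint() can rebuild the module without
# re-passing them.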
22 | self.save_hyperparameters() 23 | 24 | self.learning_rate = learning_rate 25 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 26 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 27 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 28 | self.fc2 = nn.Linear(128, 10) 29 | 30 | def forward(self, x): 31 | x = F.relu(self.conv1(x)) 32 | x = F.max_pool2d(x, 2) 33 | x = F.relu(self.conv2(x)) 34 | x = F.max_pool2d(x, 2) 35 | x = x.view(x.size(0), -1) 36 | x = F.relu(self.fc1(x)) 37 | x = self.fc2(x) 38 | return x 39 | 40 | def training_step(self, batch, batch_idx): 41 | x, y = batch 42 | y_hat = self(x) 43 | loss = F.cross_entropy(y_hat, y) 44 | return loss 45 | 46 | def validation_step(self, batch, batch_idx): 47 | x, y = batch 48 | y_hat = self(x) 49 | loss = F.cross_entropy(y_hat, y) 50 | return loss 51 | 52 | def test_step(self, batch, batch_idx): 53 | x, y = batch 54 | y_hat = self(x) 55 | loss = F.cross_entropy(y_hat, y) 56 | return loss 57 | 58 | def configure_optimizers(self): 59 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 60 | return optimizer 61 | 62 | 63 | def prepare_dataloaders(): 64 | train_dataset = MNIST( 65 | "./", download=True, train=True, transform=transforms.ToTensor() 66 | ) 67 | 68 | train_size = int(0.8 * len(train_dataset)) 69 | val_size = len(train_dataset) - train_size 70 | 71 | seed = torch.Generator().manual_seed(42) 72 | train_dataset, val_dataset = torch.utils.data.random_split( 73 | train_dataset, [train_size, val_size], generator=seed 74 | ) 75 | 76 | test_dataset = MNIST( 77 | "./", download=True, train=False, transform=transforms.ToTensor() 78 | ) 79 | 80 | train_dataloader = DataLoader(train_dataset, batch_size=32) 81 | val_dataloader = DataLoader(val_dataset, batch_size=32) 82 | test_dataloader = DataLoader(test_dataset, batch_size=32) 83 | 84 | return train_dataloader, val_dataloader, test_dataloader 85 | 86 | 87 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 88 | 89 | model = LitConvClassifier() 90 | 91 | # Lightning automatically saves a checkpoint for you in your current working directory, 92 | # with the state of your last training epoch. 93 | # Or you can specify the path to save the checkpoint to. 
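# (A minimal sketch, not used in this script: for finer-grained control over what gets saved,
# a ModelCheckpoint callback could be passed to the Trainer; the values below are illustrative.)
#
#   from lightning.pytorch.callbacks import ModelCheckpoint
#   checkpoint_callback = ModelCheckpoint(monitor="val_loss", mode="min", save_top_k=1)
#   trainer = pl.Trainer(max_epochs=1, callbacks=[checkpoint_callback])
#
# The simplest option, used below, is to point default_root_dir at a directory of your choice.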
94 | trainer = pl.Trainer(max_epochs=1, default_root_dir="experiments/") 95 | 96 | trainer.fit(model, train_dataloader, val_dataloader) 97 | 98 | # Load the checkpoint from the path 99 | # You can modify the path to the checkpoint file you want to load 100 | checkpoint_path = ( 101 | "experiments/lightning_logs/version_0/checkpoints/epoch=0-step=1500.ckpt" 102 | ) 103 | 104 | # By default, the checkpoint loads the model with the same parameters as the original model 105 | model = LitConvClassifier.load_from_checkpoint(checkpoint_path) 106 | print(f"Original Model Learning Rate: {model.learning_rate}") # prints 0.001 107 | 108 | # You can also load the checkpoint with different parameters 109 | model = LitConvClassifier.load_from_checkpoint(checkpoint_path, learning_rate=0.01) 110 | print(f"Updated Model Learning Rate: {model.learning_rate}") # prints 0.01 111 | -------------------------------------------------------------------------------- /src/basic/level_04_early_stopping/early_stopping.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/early_stopping.html 3 | 4 | 5 | import os 6 | 7 | import lightning.pytorch as pl 8 | import torch 9 | 10 | # Import the early stopping callback 11 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | class LitConvClassifier(pl.LightningModule): 20 | def __init__(self, learning_rate=1e-3): 21 | super().__init__() 22 | self.save_hyperparameters() 23 | 24 | self.learning_rate = learning_rate 25 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 26 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 27 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 28 | self.fc2 = nn.Linear(128, 10) 29 | 30 | def forward(self, x): 31 | x = F.relu(self.conv1(x)) 32 | x = F.max_pool2d(x, 2) 33 | x = F.relu(self.conv2(x)) 34 | x = F.max_pool2d(x, 2) 35 | x = x.view(x.size(0), -1) 36 | x = F.relu(self.fc1(x)) 37 | x = self.fc2(x) 38 | return x 39 | 40 | def training_step(self, batch, batch_idx): 41 | x, y = batch 42 | y_hat = self(x) 43 | loss = F.cross_entropy(y_hat, y) 44 | return loss 45 | 46 | def validation_step(self, batch, batch_idx): 47 | x, y = batch 48 | y_hat = self(x) 49 | loss = F.cross_entropy(y_hat, y) 50 | 51 | # First we log the loss of interest 52 | self.log("val_loss", loss) 53 | return loss 54 | 55 | def test_step(self, batch, batch_idx): 56 | x, y = batch 57 | y_hat = self(x) 58 | loss = F.cross_entropy(y_hat, y) 59 | return loss 60 | 61 | def configure_optimizers(self): 62 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 63 | return optimizer 64 | 65 | 66 | def prepare_dataloaders(): 67 | train_dataset = MNIST( 68 | "./", download=True, train=True, transform=transforms.ToTensor() 69 | ) 70 | 71 | train_size = int(0.8 * len(train_dataset)) 72 | val_size = len(train_dataset) - train_size 73 | 74 | seed = torch.Generator().manual_seed(42) 75 | train_dataset, val_dataset = torch.utils.data.random_split( 76 | train_dataset, [train_size, val_size], generator=seed 77 | ) 78 | 79 | test_dataset = MNIST( 80 | "./", download=True, train=False, transform=transforms.ToTensor() 81 | ) 82 | 83 | train_dataloader = DataLoader(train_dataset, batch_size=32) 84 | val_dataloader = DataLoader(val_dataset, 
batch_size=32) 85 | test_dataloader = DataLoader(test_dataset, batch_size=32) 86 | 87 | return train_dataloader, val_dataloader, test_dataloader 88 | 89 | 90 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 91 | 92 | model = LitConvClassifier() 93 | 94 | # Then pass the callback to the trainer 95 | trainer = pl.Trainer( 96 | max_epochs=3, 97 | default_root_dir="experiments/", 98 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")], 99 | ) 100 | trainer.fit(model, train_dataloader, val_dataloader) 101 | 102 | # Or customize the early stopping callback and pass it to the trainer 103 | early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=3, verbose=True) 104 | trainer = pl.Trainer( 105 | max_epochs=3, default_root_dir="experiments/", callbacks=[early_stopping] 106 | ) 107 | 108 | trainer.fit(model, train_dataloader, val_dataloader) 109 | 110 | # Additional parameters that stop training at extreme points: 111 | # --> stopping_threshold: Stops training immediately once the monitored quantity reaches this threshold. 112 | # It is useful when we know that going beyond a certain optimal value does not further benefit us. 113 | 114 | # --> divergence_threshold: Stops training as soon as the monitored quantity becomes worse than this threshold. 115 | # When reaching a value this bad, we believe the model cannot recover anymore 116 | # and it is better to stop early and run with different initial conditions. 117 | 118 | # --> check_finite: When turned on, it stops training if the monitored metric becomes NaN or infinite. 119 | 120 | # --> check_on_train_epoch_end: When turned on, it checks the metric at the end of a training epoch. 121 | # Use this only when you are monitoring a metric logged within training-specific 122 | # hooks at the epoch level.
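
# A minimal sketch combining the arguments described above. The threshold values are purely
# illustrative and this callback is not passed to any Trainer in this script.
early_stopping_extremes = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=3,
    stopping_threshold=0.02,  # stop as soon as val_loss is already good enough
    divergence_threshold=5.0,  # stop immediately if val_loss becomes this bad
    check_finite=True,  # stop if val_loss turns NaN or infinite
    check_on_train_epoch_end=False,  # check the metric when validation ends, not at the end of the training epoch
)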
123 | -------------------------------------------------------------------------------- /src/intermediate/level_12_deploying_models/01_onnx.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/deploy/production_advanced.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import onnxruntime 8 | import torch 9 | from lightning.pytorch.callbacks import ModelSummary, StochasticWeightAveraging 10 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 11 | from torch import nn 12 | from torch.nn import functional as F 13 | from torch.utils.data import DataLoader, random_split 14 | from torchvision import transforms 15 | from torchvision.datasets import MNIST 16 | 17 | 18 | class MNISTDataModule(pl.LightningDataModule): 19 | def __init__(self, data_dir: str = "./"): 20 | super().__init__() 21 | self.data_dir = data_dir 22 | self.transform = transforms.Compose( 23 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 24 | ) 25 | 26 | def prepare_data(self): 27 | # download 28 | MNIST(self.data_dir, train=True, download=True) 29 | MNIST(self.data_dir, train=False, download=True) 30 | 31 | def setup(self, stage: str): 32 | # Assign train/val datasets for use in dataloaders 33 | if stage == "fit": 34 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 35 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 36 | 37 | # Assign test dataset for use in dataloader(s) 38 | if stage == "test": 39 | self.mnist_test = MNIST( 40 | self.data_dir, train=False, transform=self.transform 41 | ) 42 | 43 | if stage == "predict": 44 | self.mnist_predict = MNIST( 45 | self.data_dir, train=False, transform=self.transform 46 | ) 47 | 48 | def train_dataloader(self): 49 | return DataLoader(self.mnist_train, batch_size=32) 50 | 51 | def val_dataloader(self): 52 | return DataLoader(self.mnist_val, batch_size=32) 53 | 54 | def test_dataloader(self): 55 | return DataLoader(self.mnist_test, batch_size=32) 56 | 57 | def predict_dataloader(self): 58 | return DataLoader(self.mnist_predict, batch_size=32) 59 | 60 | 61 | class LitConvClassifier(pl.LightningModule): 62 | def __init__(self, learning_rate=1e-3): 63 | super().__init__() 64 | self.save_hyperparameters() 65 | self.example_input_array = torch.rand(1, 1, 28, 28) 66 | 67 | self.learning_rate = learning_rate 68 | 69 | # Define blocks of layers as submodules 70 | self.conv_block1 = nn.Sequential( 71 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 72 | ) 73 | 74 | self.conv_block2 = nn.Sequential( 75 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 76 | ) 77 | 78 | self.fc_block = nn.Sequential( 79 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 80 | ) 81 | 82 | def forward(self, x): 83 | x = self.conv_block1(x) 84 | x = self.conv_block2(x) 85 | x = x.view(x.size(0), -1) 86 | x = self.fc_block(x) 87 | return x 88 | 89 | def training_step(self, batch, batch_idx): 90 | x, y = batch 91 | y_hat = self(x) 92 | loss = F.cross_entropy(y_hat, y) 93 | return loss 94 | 95 | def validation_step(self, batch, batch_idx): 96 | x, y = batch 97 | y_hat = self(x) 98 | loss = F.cross_entropy(y_hat, y) 99 | self.log("val_loss", loss) 100 | return loss 101 | 102 | def test_step(self, batch, batch_idx): 103 | x, y = batch 104 | y_hat = self(x) 105 | loss = F.cross_entropy(y_hat, y) 106 | return loss 107 | 108 | def predict_step(self, batch, 
batch_idx, dataloader_idx=None): 109 | x, _ = batch 110 | return self(x) 111 | 112 | def configure_optimizers(self): 113 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 114 | return optimizer 115 | 116 | 117 | data_module = MNISTDataModule() 118 | model = LitConvClassifier() 119 | 120 | # ONNX is a package developed by Microsoft to optimize inference. 121 | # ONNX allows the model to be independent of PyTorch and run on any ONNX Runtime. 122 | filepath = "model.onnx" 123 | model.to_onnx(filepath, export_params=True) 124 | 125 | # Once you have the exported model, you can run it on your ONNX runtime in the following way: 126 | ort_session = onnxruntime.InferenceSession(filepath) 127 | input_name = ort_session.get_inputs()[0].name 128 | ort_inputs = {input_name: torch.rand(1, 1, 28, 28).numpy()} 129 | ort_outs = ort_session.run(None, ort_inputs) 130 | -------------------------------------------------------------------------------- /src/basic/level_06_debugging_model/01_quick_run.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/debug/debugging_basic.html 3 | 4 | import os 5 | import time 6 | 7 | import lightning.pytorch as pl 8 | import torch 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class LitConvClassifier(pl.LightningModule): 18 | def __init__(self, learning_rate=1e-3): 19 | super().__init__() 20 | self.save_hyperparameters() 21 | 22 | self.learning_rate = learning_rate 23 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 24 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 25 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 26 | self.fc2 = nn.Linear(128, 10) 27 | 28 | def forward(self, x): 29 | x = F.relu(self.conv1(x)) 30 | x = F.max_pool2d(x, 2) 31 | x = F.relu(self.conv2(x)) 32 | x = F.max_pool2d(x, 2) 33 | x = x.view(x.size(0), -1) 34 | x = F.relu(self.fc1(x)) 35 | x = self.fc2(x) 36 | return x 37 | 38 | def training_step(self, batch, batch_idx): 39 | x, y = batch 40 | y_hat = self(x) 41 | loss = F.cross_entropy(y_hat, y) 42 | return loss 43 | 44 | def validation_step(self, batch, batch_idx): 45 | x, y = batch 46 | y_hat = self(x) 47 | loss = F.cross_entropy(y_hat, y) 48 | self.log("val_loss", loss) 49 | return loss 50 | 51 | def test_step(self, batch, batch_idx): 52 | x, y = batch 53 | y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | return loss 56 | 57 | def configure_optimizers(self): 58 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 59 | return optimizer 60 | 61 | 62 | def prepare_dataloaders(): 63 | train_dataset = MNIST( 64 | "./", download=True, train=True, transform=transforms.ToTensor() 65 | ) 66 | 67 | train_size = int(0.8 * len(train_dataset)) 68 | val_size = len(train_dataset) - train_size 69 | 70 | seed = torch.Generator().manual_seed(42) 71 | train_dataset, val_dataset = torch.utils.data.random_split( 72 | train_dataset, [train_size, val_size], generator=seed 73 | ) 74 | 75 | test_dataset = MNIST( 76 | "./", download=True, train=False, transform=transforms.ToTensor() 77 | ) 78 | 79 | train_dataloader = DataLoader(train_dataset, batch_size=32) 80 | val_dataloader = DataLoader(val_dataset, batch_size=32) 81 | test_dataloader = DataLoader(test_dataset, batch_size=32) 
82 | 83 | return train_dataloader, val_dataloader, test_dataloader 84 | 85 | 86 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 87 | 88 | model = LitConvClassifier() 89 | 90 | # Default 91 | start = time.time() 92 | trainer = pl.Trainer( 93 | max_epochs=1, 94 | default_root_dir="experiments/", 95 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")], 96 | ) 97 | trainer.fit(model, train_dataloader, val_dataloader) 98 | end = time.time() 99 | print(f"\nDefault Training time: {end - start}") 100 | 101 | # fast_dev_run 102 | # The fast_dev_run argument in the trainer runs a single batch of training, validation, 103 | # test and prediction data through your trainer to see if there are any bugs. 104 | # To run more, set the argument to an integer number of batches. 105 | # This argument will disable the tuner, checkpoint callbacks, early stopping callbacks, 106 | # loggers and logger callbacks like LearningRateMonitor and DeviceStatsMonitor. 107 | start = time.time() 108 | trainer = pl.Trainer(fast_dev_run=True) 109 | trainer.fit(model, train_dataloader, val_dataloader) 110 | end = time.time() 111 | print(f"\nFast Dev Run Training time: {end - start}") 112 | 113 | # Shorten Epoch Length 114 | # Here we use only 10% of the training data and 10% of the validation data. 115 | # You can also specify the number of batches as integers. 116 | start = time.time() 117 | trainer = pl.Trainer(max_epochs=1, limit_train_batches=0.1, limit_val_batches=0.1) 118 | trainer.fit(model, train_dataloader, val_dataloader) 119 | end = time.time() 120 | print(f"\nShortened Epoch Training time: {end - start}") 121 | 122 | # Sanity Check 123 | # Lightning runs 2 steps of validation in the beginning of training. 124 | # This avoids crashing in the validation loop sometime deep into a lengthy training loop.
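# Set num_sanity_val_steps=0 in the Trainer to skip this sanity check entirely.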
125 | start = time.time() 126 | trainer = pl.Trainer(max_epochs=1, num_sanity_val_steps=2) 127 | trainer.fit(model, train_dataloader, val_dataloader) 128 | end = time.time() 129 | print(f"\nSanity Check Training time: {end - start}") 130 | -------------------------------------------------------------------------------- /src/intermediate/level_11_scaling_techniques/01_precision_training.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/precision_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class MNISTDataModule(pl.LightningDataModule): 18 | def __init__(self, data_dir: str = "./"): 19 | super().__init__() 20 | self.data_dir = data_dir 21 | self.transform = transforms.Compose( 22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 23 | ) 24 | 25 | def prepare_data(self): 26 | # download 27 | MNIST(self.data_dir, train=True, download=True) 28 | MNIST(self.data_dir, train=False, download=True) 29 | 30 | def setup(self, stage: str): 31 | # Assign train/val datasets for use in dataloaders 32 | if stage == "fit": 33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 35 | 36 | # Assign test dataset for use in dataloader(s) 37 | if stage == "test": 38 | self.mnist_test = MNIST( 39 | self.data_dir, train=False, transform=self.transform 40 | ) 41 | 42 | if stage == "predict": 43 | self.mnist_predict = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | def train_dataloader(self): 48 | return DataLoader(self.mnist_train, batch_size=32) 49 | 50 | def val_dataloader(self): 51 | return DataLoader(self.mnist_val, batch_size=32) 52 | 53 | def test_dataloader(self): 54 | return DataLoader(self.mnist_test, batch_size=32) 55 | 56 | def predict_dataloader(self): 57 | return DataLoader(self.mnist_predict, batch_size=32) 58 | 59 | 60 | class LitConvClassifier(pl.LightningModule): 61 | def __init__(self, learning_rate=1e-3): 62 | super().__init__() 63 | self.save_hyperparameters() 64 | self.example_input_array = torch.rand(1, 1, 28, 28) 65 | 66 | self.learning_rate = learning_rate 67 | 68 | # Define blocks of layers as submodules 69 | self.conv_block1 = nn.Sequential( 70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 71 | ) 72 | 73 | self.conv_block2 = nn.Sequential( 74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.fc_block = nn.Sequential( 78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv_block1(x) 83 | x = self.conv_block2(x) 84 | x = x.view(x.size(0), -1) 85 | x = self.fc_block(x) 86 | return x 87 | 88 | def training_step(self, batch, batch_idx): 89 | x, y = batch 90 | y_hat = self(x) 91 | loss = F.cross_entropy(y_hat, y) 92 | return loss 93 | 94 | def validation_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | self.log("val_loss", loss) 99 | return loss 100 | 
101 | def test_step(self, batch, batch_idx): 102 | x, y = batch 103 | y_hat = self(x) 104 | loss = F.cross_entropy(y_hat, y) 105 | return loss 106 | 107 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 108 | x, _ = batch 109 | return self(x) 110 | 111 | def configure_optimizers(self): 112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 113 | return optimizer 114 | 115 | 116 | data_module = MNISTDataModule() 117 | model = LitConvClassifier() 118 | 119 | # Lower precision, such as 16-bit floating-point, requires less memory and 120 | # enables training and deploying larger models. Higher precision, 121 | # such as the 64-bit floating-point, can be used for highly sensitive use-cases. 122 | 123 | trainer = pl.Trainer( 124 | max_epochs=1, 125 | default_root_dir="experiments/", 126 | callbacks=[ 127 | EarlyStopping(monitor="val_loss", mode="min"), 128 | ModelSummary(max_depth=-1), 129 | ], 130 | precision="16-mixed", 131 | ) 132 | 133 | trainer.fit(model, data_module) 134 | 135 | # Get Predictions 136 | predictions = trainer.predict(model, data_module) 137 | print(len(predictions)) 138 | -------------------------------------------------------------------------------- /src/intermediate/level_08_accelerated_hardware/02_tpu_traininig.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/accelerators/tpu_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class LitConvClassifier(pl.LightningModule): 18 | def __init__(self, learning_rate=1e-3): 19 | super().__init__() 20 | self.save_hyperparameters() 21 | self.example_input_array = torch.rand(1, 1, 28, 28) 22 | 23 | self.learning_rate = learning_rate 24 | 25 | # Define blocks of layers as submodules 26 | self.conv_block1 = nn.Sequential( 27 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 28 | ) 29 | 30 | self.conv_block2 = nn.Sequential( 31 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 32 | ) 33 | 34 | self.fc_block = nn.Sequential( 35 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 36 | ) 37 | 38 | def forward(self, x): 39 | x = self.conv_block1(x) 40 | x = self.conv_block2(x) 41 | x = x.view(x.size(0), -1) 42 | x = self.fc_block(x) 43 | return x 44 | 45 | def training_step(self, batch, batch_idx): 46 | x, y = batch 47 | y_hat = self(x) 48 | loss = F.cross_entropy(y_hat, y) 49 | return loss 50 | 51 | def validation_step(self, batch, batch_idx): 52 | x, y = batch 53 | y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | self.log("val_loss", loss) 56 | return loss 57 | 58 | def test_step(self, batch, batch_idx): 59 | x, y = batch 60 | y_hat = self(x) 61 | loss = F.cross_entropy(y_hat, y) 62 | return loss 63 | 64 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 65 | x, _ = batch 66 | return self(x) 67 | 68 | def configure_optimizers(self): 69 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 70 | return optimizer 71 | 72 | 73 | def prepare_dataloaders(): 74 | train_dataset = MNIST( 75 | "./", download=True, train=True, 
transform=transforms.ToTensor() 76 | ) 77 | 78 | train_size = int(0.8 * len(train_dataset)) 79 | val_size = len(train_dataset) - train_size 80 | 81 | seed = torch.Generator().manual_seed(42) 82 | train_dataset, val_dataset = torch.utils.data.random_split( 83 | train_dataset, [train_size, val_size], generator=seed 84 | ) 85 | 86 | test_dataset = MNIST( 87 | "./", download=True, train=False, transform=transforms.ToTensor() 88 | ) 89 | 90 | train_dataloader = DataLoader(train_dataset, batch_size=32) 91 | val_dataloader = DataLoader(val_dataset, batch_size=32) 92 | test_dataloader = DataLoader(test_dataset, batch_size=32) 93 | 94 | return train_dataloader, val_dataloader, test_dataloader 95 | 96 | 97 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 98 | 99 | model = LitConvClassifier() 100 | 101 | # Tensor Processing Unit (TPU) is an AI accelerator application-specific integrated circuit (ASIC) developed by 102 | # Google specifically for neural networks. 103 | 104 | # A TPU has 8 cores where each core is optimized for 128x128 matrix multiplies. 105 | # In general, a single TPU is about as fast as 5 V100 GPUs! 106 | 107 | # A TPU pod hosts many TPUs on it. Currently, TPU v3 Pod has up to 2048 TPU cores and 32 TiB of memory! 108 | # You can request a full pod from Google cloud or a “slice” which gives you some subset of those 2048 cores. 109 | 110 | # run on as many TPUs as available by default 111 | trainer = pl.Trainer( 112 | max_epochs=5, 113 | default_root_dir="experiments/", 114 | accelerator="auto", 115 | devices="auto", 116 | strategy="auto", 117 | ) 118 | # equivalent to 119 | trainer = pl.Trainer() 120 | 121 | # run on one TPU core 122 | trainer = pl.Trainer( 123 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices=1 124 | ) 125 | 126 | # run on multiple TPU cores 127 | trainer = pl.Trainer( 128 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices=8 129 | ) 130 | 131 | # run on the 5th core 132 | trainer = pl.Trainer( 133 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices=[5] 134 | ) 135 | 136 | # choose the number of cores automatically 137 | trainer = pl.Trainer( 138 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices="auto" 139 | ) 140 | 141 | trainer.fit(model, train_dataloader, val_dataloader) 142 | -------------------------------------------------------------------------------- /src/basic/level_06_debugging_model/02_model_summary.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/debug/debugging_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | 9 | # Used for child modules in the model summary 10 | from lightning.pytorch.callbacks import ModelSummary 11 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | # We have updated the model to use nn.Sequential() and named the blocks of layers. 20 | # This will help us understand the Model Summary output. 
21 | class LitConvClassifier(pl.LightningModule): 22 | def __init__(self, learning_rate=1e-3): 23 | super().__init__() 24 | self.save_hyperparameters() 25 | 26 | # Another debugging tool is to display the intermediate input- and output sizes of 27 | # all your layers by setting the example_input_array attribute in your LightningModule. 28 | self.example_input_array = torch.rand(1, 1, 28, 28) 29 | 30 | self.learning_rate = learning_rate 31 | 32 | # Define blocks of layers as submodules 33 | self.conv_block1 = nn.Sequential( 34 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 35 | ) 36 | 37 | self.conv_block2 = nn.Sequential( 38 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 39 | ) 40 | 41 | self.fc_block = nn.Sequential( 42 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 43 | ) 44 | 45 | def forward(self, x): 46 | x = self.conv_block1(x) 47 | x = self.conv_block2(x) 48 | x = x.view(x.size(0), -1) 49 | x = self.fc_block(x) 50 | return x 51 | 52 | def training_step(self, batch, batch_idx): 53 | x, y = batch 54 | y_hat = self(x) 55 | loss = F.cross_entropy(y_hat, y) 56 | return loss 57 | 58 | def validation_step(self, batch, batch_idx): 59 | x, y = batch 60 | y_hat = self(x) 61 | loss = F.cross_entropy(y_hat, y) 62 | self.log("val_loss", loss) 63 | return loss 64 | 65 | def test_step(self, batch, batch_idx): 66 | x, y = batch 67 | y_hat = self(x) 68 | loss = F.cross_entropy(y_hat, y) 69 | return loss 70 | 71 | def configure_optimizers(self): 72 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 73 | return optimizer 74 | 75 | 76 | def prepare_dataloaders(): 77 | train_dataset = MNIST( 78 | "./", download=True, train=True, transform=transforms.ToTensor() 79 | ) 80 | 81 | train_size = int(0.8 * len(train_dataset)) 82 | val_size = len(train_dataset) - train_size 83 | 84 | seed = torch.Generator().manual_seed(42) 85 | train_dataset, val_dataset = torch.utils.data.random_split( 86 | train_dataset, [train_size, val_size], generator=seed 87 | ) 88 | 89 | test_dataset = MNIST( 90 | "./", download=True, train=False, transform=transforms.ToTensor() 91 | ) 92 | 93 | train_dataloader = DataLoader(train_dataset, batch_size=32) 94 | val_dataloader = DataLoader(val_dataset, batch_size=32) 95 | test_dataloader = DataLoader(test_dataset, batch_size=32) 96 | 97 | return train_dataloader, val_dataloader, test_dataloader 98 | 99 | 100 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 101 | 102 | model = LitConvClassifier() 103 | 104 | # Default 105 | # Whenever the .fit() function gets called, 106 | # the Trainer will print the weights summary for the LightningModule. 
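# Because example_input_array is set on the model, the summary also reports the input and
# output sizes of every layer.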
107 | print("\n----------------------------------") 108 | print("Default Model Summary") 109 | print("----------------------------------") 110 | trainer = pl.Trainer( 111 | max_epochs=1, 112 | default_root_dir="experiments/", 113 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")], 114 | ) 115 | trainer.fit(model, train_dataloader, val_dataloader) 116 | 117 | # Child Modules 118 | print("\n----------------------------------") 119 | print("Child Modules Model Summary") 120 | print("----------------------------------") 121 | trainer = pl.Trainer( 122 | max_epochs=1, 123 | default_root_dir="experiments/", 124 | callbacks=[ 125 | EarlyStopping(monitor="val_loss", mode="min"), 126 | ModelSummary(max_depth=-1), 127 | ], 128 | ) 129 | trainer.fit(model, train_dataloader, val_dataloader) 130 | 131 | # Turn off model summary 132 | print("\n----------------------------------") 133 | print("Turn off Model Summary") 134 | print("----------------------------------") 135 | trainer = pl.Trainer( 136 | max_epochs=1, 137 | default_root_dir="experiments/", 138 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")], 139 | enable_model_summary=False, 140 | ) 141 | trainer.fit(model, train_dataloader, val_dataloader) 142 | -------------------------------------------------------------------------------- /src/intermediate/level_13_profiler/02_profile_pytorch_operations.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/tuning/profiler_intermediate.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ( 9 | DeviceStatsMonitor, 10 | ModelSummary, 11 | StochasticWeightAveraging, 12 | ) 13 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 14 | from lightning.pytorch.profilers import PyTorchProfiler 15 | from torch import nn 16 | from torch.nn import functional as F 17 | from torch.utils.data import DataLoader, random_split 18 | from torchvision import transforms 19 | from torchvision.datasets import MNIST 20 | 21 | 22 | class MNISTDataModule(pl.LightningDataModule): 23 | def __init__(self, data_dir: str = "./"): 24 | super().__init__() 25 | self.data_dir = data_dir 26 | self.transform = transforms.Compose( 27 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 28 | ) 29 | 30 | def prepare_data(self): 31 | # download 32 | MNIST(self.data_dir, train=True, download=True) 33 | MNIST(self.data_dir, train=False, download=True) 34 | 35 | def setup(self, stage: str): 36 | # Assign train/val datasets for use in dataloaders 37 | if stage == "fit": 38 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 39 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 40 | 41 | # Assign test dataset for use in dataloader(s) 42 | if stage == "test": 43 | self.mnist_test = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | if stage == "predict": 48 | self.mnist_predict = MNIST( 49 | self.data_dir, train=False, transform=self.transform 50 | ) 51 | 52 | def train_dataloader(self): 53 | return DataLoader(self.mnist_train, batch_size=32) 54 | 55 | def val_dataloader(self): 56 | return DataLoader(self.mnist_val, batch_size=32) 57 | 58 | def test_dataloader(self): 59 | return DataLoader(self.mnist_test, batch_size=32) 60 | 61 | def predict_dataloader(self): 62 | return DataLoader(self.mnist_predict, batch_size=32) 63 | 64 | 65 | class 
LitConvClassifier(pl.LightningModule): 66 | def __init__(self, learning_rate=1e-3): 67 | super().__init__() 68 | self.save_hyperparameters() 69 | self.example_input_array = torch.rand(1, 1, 28, 28) 70 | 71 | self.learning_rate = learning_rate 72 | 73 | # Define blocks of layers as submodules 74 | self.conv_block1 = nn.Sequential( 75 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 76 | ) 77 | 78 | self.conv_block2 = nn.Sequential( 79 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 80 | ) 81 | 82 | self.fc_block = nn.Sequential( 83 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 84 | ) 85 | 86 | def forward(self, x): 87 | x = self.conv_block1(x) 88 | x = self.conv_block2(x) 89 | x = x.view(x.size(0), -1) 90 | x = self.fc_block(x) 91 | return x 92 | 93 | def training_step(self, batch, batch_idx): 94 | x, y = batch 95 | y_hat = self(x) 96 | loss = F.cross_entropy(y_hat, y) 97 | return loss 98 | 99 | def validation_step(self, batch, batch_idx): 100 | x, y = batch 101 | y_hat = self(x) 102 | loss = F.cross_entropy(y_hat, y) 103 | self.log("val_loss", loss) 104 | return loss 105 | 106 | def test_step(self, batch, batch_idx): 107 | x, y = batch 108 | y_hat = self(x) 109 | loss = F.cross_entropy(y_hat, y) 110 | return loss 111 | 112 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 113 | x, _ = batch 114 | return self(x) 115 | 116 | def configure_optimizers(self): 117 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 118 | return optimizer 119 | 120 | 121 | data_module = MNISTDataModule() 122 | model = LitConvClassifier() 123 | 124 | # To understand the cost of each PyTorch operation, 125 | # use the PyTorchProfiler built on top of the PyTorch profiler. 126 | trainer = pl.Trainer( 127 | max_epochs=1, 128 | default_root_dir="experiments/", 129 | callbacks=[ 130 | EarlyStopping(monitor="val_loss", mode="min"), 131 | ModelSummary(max_depth=-1), 132 | StochasticWeightAveraging(swa_lrs=1e-2), 133 | DeviceStatsMonitor(), 134 | ], 135 | precision="16-mixed", 136 | profiler=PyTorchProfiler(), 137 | limit_train_batches=0.1, 138 | limit_val_batches=0.01, 139 | ) 140 | 141 | trainer.fit(model, data_module) 142 | 143 | # Get Predictions 144 | predictions = trainer.predict(model, data_module) 145 | print(len(predictions)) 146 | -------------------------------------------------------------------------------- /src/intermediate/level_11_scaling_techniques/04_stochastic_weight_averaging.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary, StochasticWeightAveraging 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class MNISTDataModule(pl.LightningDataModule): 18 | def __init__(self, data_dir: str = "./"): 19 | super().__init__() 20 | self.data_dir = data_dir 21 | self.transform = transforms.Compose( 22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 23 | ) 24 | 25 | def prepare_data(self): 26 | # download 27 | MNIST(self.data_dir, train=True, download=True) 28 | MNIST(self.data_dir, 
train=False, download=True) 29 | 30 | def setup(self, stage: str): 31 | # Assign train/val datasets for use in dataloaders 32 | if stage == "fit": 33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 35 | 36 | # Assign test dataset for use in dataloader(s) 37 | if stage == "test": 38 | self.mnist_test = MNIST( 39 | self.data_dir, train=False, transform=self.transform 40 | ) 41 | 42 | if stage == "predict": 43 | self.mnist_predict = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | def train_dataloader(self): 48 | return DataLoader(self.mnist_train, batch_size=32) 49 | 50 | def val_dataloader(self): 51 | return DataLoader(self.mnist_val, batch_size=32) 52 | 53 | def test_dataloader(self): 54 | return DataLoader(self.mnist_test, batch_size=32) 55 | 56 | def predict_dataloader(self): 57 | return DataLoader(self.mnist_predict, batch_size=32) 58 | 59 | 60 | class LitConvClassifier(pl.LightningModule): 61 | def __init__(self, learning_rate=1e-3): 62 | super().__init__() 63 | self.save_hyperparameters() 64 | self.example_input_array = torch.rand(1, 1, 28, 28) 65 | 66 | self.learning_rate = learning_rate 67 | 68 | # Define blocks of layers as submodules 69 | self.conv_block1 = nn.Sequential( 70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 71 | ) 72 | 73 | self.conv_block2 = nn.Sequential( 74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.fc_block = nn.Sequential( 78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv_block1(x) 83 | x = self.conv_block2(x) 84 | x = x.view(x.size(0), -1) 85 | x = self.fc_block(x) 86 | return x 87 | 88 | def training_step(self, batch, batch_idx): 89 | x, y = batch 90 | y_hat = self(x) 91 | loss = F.cross_entropy(y_hat, y) 92 | return loss 93 | 94 | def validation_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | self.log("val_loss", loss) 99 | return loss 100 | 101 | def test_step(self, batch, batch_idx): 102 | x, y = batch 103 | y_hat = self(x) 104 | loss = F.cross_entropy(y_hat, y) 105 | return loss 106 | 107 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 108 | x, _ = batch 109 | return self(x) 110 | 111 | def configure_optimizers(self): 112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 113 | return optimizer 114 | 115 | 116 | data_module = MNISTDataModule() 117 | model = LitConvClassifier() 118 | 119 | # Stochastic Weight Averaging (SWA) can make your models generalize better at virtually no additional cost. 120 | # This can be used with both non-trained and trained models. 121 | # The SWA procedure smooths the loss landscape thus making it harder to end up in a local minimum during optimization. 
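# Conceptually, once the SWA phase starts the callback keeps a running average of the model weights,
# roughly (a sketch of the idea, not Lightning's internal code):
#
#   w_swa <- (w_swa * n_averaged + w_current) / (n_averaged + 1)   # updated once per epoch in the SWA phase
#
# Besides swa_lrs, the callback exposes knobs for when averaging starts and how the learning rate is
# annealed. The argument names below follow the Lightning docs; check them against your version:
#
#   StochasticWeightAveraging(swa_lrs=1e-2, swa_epoch_start=0.8, annealing_epochs=10, annealing_strategy="cos")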
122 | 123 | trainer = pl.Trainer( 124 | max_epochs=1, 125 | default_root_dir="experiments/", 126 | callbacks=[ 127 | EarlyStopping(monitor="val_loss", mode="min"), 128 | ModelSummary(max_depth=-1), 129 | StochasticWeightAveraging( 130 | swa_lrs=1e-2 131 | ), # Enable Stochastic Weight Averaging using the callback 132 | ], 133 | precision="16-mixed", 134 | ) 135 | 136 | trainer.fit(model, data_module) 137 | 138 | # Get Predictions 139 | predictions = trainer.predict(model, data_module) 140 | print(len(predictions)) 141 | -------------------------------------------------------------------------------- /src/intermediate/level_09_modularize/01_lightning_datamodule.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/levels/intermediate_level_9.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | # The LightningDataModule is a convenient way to manage data in PyTorch Lightning. 17 | # It encapsulates training, validation, testing, and prediction dataloaders, 18 | # as well as any necessary steps for data processing, downloads, and transformations. 19 | # By using a LightningDataModule, you can easily develop dataset-agnostic models, hot-swap different datasets, 20 | # and share data splits and transformations across projects. 21 | 22 | 23 | class MNISTDataModule(pl.LightningDataModule): 24 | def __init__(self, data_dir: str = "./"): 25 | super().__init__() 26 | self.data_dir = data_dir 27 | self.transform = transforms.Compose( 28 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 29 | ) 30 | 31 | def prepare_data(self): 32 | # download 33 | MNIST(self.data_dir, train=True, download=True) 34 | MNIST(self.data_dir, train=False, download=True) 35 | 36 | def setup(self, stage: str): 37 | # Assign train/val datasets for use in dataloaders 38 | if stage == "fit": 39 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 40 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 41 | 42 | # Assign test dataset for use in dataloader(s) 43 | if stage == "test": 44 | self.mnist_test = MNIST( 45 | self.data_dir, train=False, transform=self.transform 46 | ) 47 | 48 | if stage == "predict": 49 | self.mnist_predict = MNIST( 50 | self.data_dir, train=False, transform=self.transform 51 | ) 52 | 53 | def train_dataloader(self): 54 | return DataLoader(self.mnist_train, batch_size=32) 55 | 56 | def val_dataloader(self): 57 | return DataLoader(self.mnist_val, batch_size=32) 58 | 59 | def test_dataloader(self): 60 | return DataLoader(self.mnist_test, batch_size=32) 61 | 62 | def predict_dataloader(self): 63 | return DataLoader(self.mnist_predict, batch_size=32) 64 | 65 | 66 | class LitConvClassifier(pl.LightningModule): 67 | def __init__(self, learning_rate=1e-3): 68 | super().__init__() 69 | self.save_hyperparameters() 70 | self.example_input_array = torch.rand(1, 1, 28, 28) 71 | 72 | self.learning_rate = learning_rate 73 | 74 | # Define blocks of layers as submodules 75 | self.conv_block1 = nn.Sequential( 76 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 77 | 
) 78 | 79 | self.conv_block2 = nn.Sequential( 80 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 81 | ) 82 | 83 | self.fc_block = nn.Sequential( 84 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 85 | ) 86 | 87 | def forward(self, x): 88 | x = self.conv_block1(x) 89 | x = self.conv_block2(x) 90 | x = x.view(x.size(0), -1) 91 | x = self.fc_block(x) 92 | return x 93 | 94 | def training_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | return loss 99 | 100 | def validation_step(self, batch, batch_idx): 101 | x, y = batch 102 | y_hat = self(x) 103 | loss = F.cross_entropy(y_hat, y) 104 | self.log("val_loss", loss) 105 | return loss 106 | 107 | def test_step(self, batch, batch_idx): 108 | x, y = batch 109 | y_hat = self(x) 110 | loss = F.cross_entropy(y_hat, y) 111 | return loss 112 | 113 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 114 | x, _ = batch 115 | return self(x) 116 | 117 | def configure_optimizers(self): 118 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 119 | return optimizer 120 | 121 | 122 | data_module = MNISTDataModule() 123 | model = LitConvClassifier() 124 | 125 | trainer = pl.Trainer( 126 | max_epochs=1, 127 | default_root_dir="experiments/", 128 | callbacks=[ 129 | EarlyStopping(monitor="val_loss", mode="min"), 130 | ModelSummary(max_depth=-1), 131 | ], 132 | ) 133 | 134 | # Train Model 135 | # We can pass the data module directly to the trainer 136 | trainer.fit(model, data_module) 137 | 138 | # Get Predictions 139 | predictions = trainer.predict(model, data_module) 140 | print(len(predictions)) 141 | -------------------------------------------------------------------------------- /src/intermediate/level_11_scaling_techniques/02_gradient_accumulation.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class MNISTDataModule(pl.LightningDataModule): 18 | def __init__(self, data_dir: str = "./"): 19 | super().__init__() 20 | self.data_dir = data_dir 21 | self.transform = transforms.Compose( 22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 23 | ) 24 | 25 | def prepare_data(self): 26 | # download 27 | MNIST(self.data_dir, train=True, download=True) 28 | MNIST(self.data_dir, train=False, download=True) 29 | 30 | def setup(self, stage: str): 31 | # Assign train/val datasets for use in dataloaders 32 | if stage == "fit": 33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 35 | 36 | # Assign test dataset for use in dataloader(s) 37 | if stage == "test": 38 | self.mnist_test = MNIST( 39 | self.data_dir, train=False, transform=self.transform 40 | ) 41 | 42 | if stage == "predict": 43 | self.mnist_predict = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | def train_dataloader(self): 48 | return DataLoader(self.mnist_train, 
batch_size=32) 49 | 50 | def val_dataloader(self): 51 | return DataLoader(self.mnist_val, batch_size=32) 52 | 53 | def test_dataloader(self): 54 | return DataLoader(self.mnist_test, batch_size=32) 55 | 56 | def predict_dataloader(self): 57 | return DataLoader(self.mnist_predict, batch_size=32) 58 | 59 | 60 | class LitConvClassifier(pl.LightningModule): 61 | def __init__(self, learning_rate=1e-3): 62 | super().__init__() 63 | self.save_hyperparameters() 64 | self.example_input_array = torch.rand(1, 1, 28, 28) 65 | 66 | self.learning_rate = learning_rate 67 | 68 | # Define blocks of layers as submodules 69 | self.conv_block1 = nn.Sequential( 70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 71 | ) 72 | 73 | self.conv_block2 = nn.Sequential( 74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.fc_block = nn.Sequential( 78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv_block1(x) 83 | x = self.conv_block2(x) 84 | x = x.view(x.size(0), -1) 85 | x = self.fc_block(x) 86 | return x 87 | 88 | def training_step(self, batch, batch_idx): 89 | x, y = batch 90 | y_hat = self(x) 91 | loss = F.cross_entropy(y_hat, y) 92 | return loss 93 | 94 | def validation_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | self.log("val_loss", loss) 99 | return loss 100 | 101 | def test_step(self, batch, batch_idx): 102 | x, y = batch 103 | y_hat = self(x) 104 | loss = F.cross_entropy(y_hat, y) 105 | return loss 106 | 107 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 108 | x, _ = batch 109 | return self(x) 110 | 111 | def configure_optimizers(self): 112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 113 | return optimizer 114 | 115 | 116 | data_module = MNISTDataModule() 117 | model = LitConvClassifier() 118 | 119 | # Accumulated gradients run K small batches of size N before doing a backward pass. 120 | # The effect is a large effective batch size of size KxN, where N is the batch size. 121 | # Internally it doesn’t stack up the batches and do a forward pass rather 122 | # it accumulates the gradients for K batches and then do an optimizer.step 123 | # to make sure the effective batch size is increased but there is no memory overhead. 
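# For intuition, accumulate_grad_batches=K behaves roughly like the hand-written loop sketched in the
# comments below (plain PyTorch with placeholder names such as model, train_loader and optimizer; kept
# as comments so it does not execute as part of this script):
#
#   K = 7
#   for i, (x, y) in enumerate(train_loader):
#       loss = F.cross_entropy(model(x), y) / K    # scale so the summed gradients match one large batch
#       loss.backward()                            # gradients accumulate in .grad across the K micro-batches
#       if (i + 1) % K == 0:
#           optimizer.step()                       # one optimizer update per K micro-batches
#           optimizer.zero_grad()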
124 | 125 | trainer = pl.Trainer( 126 | max_epochs=1, 127 | default_root_dir="experiments/", 128 | callbacks=[ 129 | EarlyStopping(monitor="val_loss", mode="min"), 130 | ModelSummary(max_depth=-1), 131 | ], 132 | precision="16-mixed", 133 | accumulate_grad_batches=7, # Accumulate gradients for 7 batches 134 | ) 135 | 136 | trainer.fit(model, data_module) 137 | 138 | # Get Predictions 139 | predictions = trainer.predict(model, data_module) 140 | print(len(predictions)) 141 | -------------------------------------------------------------------------------- /src/intermediate/level_11_scaling_techniques/03_gradient_clipping.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class MNISTDataModule(pl.LightningDataModule): 18 | def __init__(self, data_dir: str = "./"): 19 | super().__init__() 20 | self.data_dir = data_dir 21 | self.transform = transforms.Compose( 22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 23 | ) 24 | 25 | def prepare_data(self): 26 | # download 27 | MNIST(self.data_dir, train=True, download=True) 28 | MNIST(self.data_dir, train=False, download=True) 29 | 30 | def setup(self, stage: str): 31 | # Assign train/val datasets for use in dataloaders 32 | if stage == "fit": 33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 35 | 36 | # Assign test dataset for use in dataloader(s) 37 | if stage == "test": 38 | self.mnist_test = MNIST( 39 | self.data_dir, train=False, transform=self.transform 40 | ) 41 | 42 | if stage == "predict": 43 | self.mnist_predict = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | def train_dataloader(self): 48 | return DataLoader(self.mnist_train, batch_size=32) 49 | 50 | def val_dataloader(self): 51 | return DataLoader(self.mnist_val, batch_size=32) 52 | 53 | def test_dataloader(self): 54 | return DataLoader(self.mnist_test, batch_size=32) 55 | 56 | def predict_dataloader(self): 57 | return DataLoader(self.mnist_predict, batch_size=32) 58 | 59 | 60 | class LitConvClassifier(pl.LightningModule): 61 | def __init__(self, learning_rate=1e-3): 62 | super().__init__() 63 | self.save_hyperparameters() 64 | self.example_input_array = torch.rand(1, 1, 28, 28) 65 | 66 | self.learning_rate = learning_rate 67 | 68 | # Define blocks of layers as submodules 69 | self.conv_block1 = nn.Sequential( 70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 71 | ) 72 | 73 | self.conv_block2 = nn.Sequential( 74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.fc_block = nn.Sequential( 78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv_block1(x) 83 | x = self.conv_block2(x) 84 | x = x.view(x.size(0), -1) 85 | x = self.fc_block(x) 86 | return x 87 | 88 | def training_step(self, batch, batch_idx): 89 | x, y = batch 90 | y_hat = self(x) 
91 | loss = F.cross_entropy(y_hat, y) 92 | return loss 93 | 94 | def validation_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | self.log("val_loss", loss) 99 | return loss 100 | 101 | def test_step(self, batch, batch_idx): 102 | x, y = batch 103 | y_hat = self(x) 104 | loss = F.cross_entropy(y_hat, y) 105 | return loss 106 | 107 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 108 | x, _ = batch 109 | return self(x) 110 | 111 | def configure_optimizers(self): 112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 113 | return optimizer 114 | 115 | 116 | data_module = MNISTDataModule() 117 | model = LitConvClassifier() 118 | 119 | # Gradient clipping can be enabled to avoid exploding gradients. 120 | # By default, this will clip the gradient norm by calling torch.nn.utils.clip_grad_norm_() 121 | # computed over all model parameters together. If the Trainer’s gradient_clip_algorithm is set to 122 | # 'value' ('norm' by default), this will use instead torch.nn.utils.clip_grad_value_() for each parameter instead. 123 | 124 | trainer = pl.Trainer( 125 | max_epochs=1, 126 | default_root_dir="experiments/", 127 | callbacks=[ 128 | EarlyStopping(monitor="val_loss", mode="min"), 129 | ModelSummary(max_depth=-1), 130 | ], 131 | precision="16-mixed", 132 | gradient_clip_val=0.5, # clip gradients' global norm to <=0.5 using gradient_clip_algorithm='norm' by default 133 | ) 134 | 135 | trainer.fit(model, data_module) 136 | 137 | # Get Predictions 138 | predictions = trainer.predict(model, data_module) 139 | print(len(predictions)) 140 | -------------------------------------------------------------------------------- /src/advanced/level_15_modify_trainer/01_create_callbacks.py: -------------------------------------------------------------------------------- 1 | # Documentaion Link 2 | # https://lightning.ai/docs/pytorch/stable/extensions/callbacks.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import Callback, ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader, random_split 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class MNISTDataModule(pl.LightningDataModule): 16 | def __init__(self, data_dir: str = "./"): 17 | super().__init__() 18 | self.data_dir = data_dir 19 | self.transform = transforms.Compose( 20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 21 | ) 22 | 23 | def prepare_data(self): 24 | # download 25 | MNIST(self.data_dir, train=True, download=True) 26 | MNIST(self.data_dir, train=False, download=True) 27 | 28 | def setup(self, stage: str): 29 | # Assign train/val datasets for use in dataloaders 30 | if stage == "fit": 31 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 32 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 33 | 34 | # Assign test dataset for use in dataloader(s) 35 | if stage == "test": 36 | self.mnist_test = MNIST( 37 | self.data_dir, train=False, transform=self.transform 38 | ) 39 | 40 | if stage == "predict": 41 | self.mnist_predict = MNIST( 42 | self.data_dir, train=False, transform=self.transform 43 | ) 44 | 45 | def train_dataloader(self): 46 | return DataLoader(self.mnist_train, batch_size=32) 47 | 48 | def val_dataloader(self): 49 | return 
DataLoader(self.mnist_val, batch_size=32) 50 | 51 | def test_dataloader(self): 52 | return DataLoader(self.mnist_test, batch_size=32) 53 | 54 | def predict_dataloader(self): 55 | return DataLoader(self.mnist_predict, batch_size=32) 56 | 57 | 58 | class LitConvClassifier(pl.LightningModule): 59 | def __init__(self, learning_rate=1e-3): 60 | super().__init__() 61 | self.save_hyperparameters() 62 | self.example_input_array = torch.rand(1, 1, 28, 28) 63 | 64 | self.learning_rate = learning_rate 65 | 66 | # Define blocks of layers as submodules 67 | self.conv_block1 = nn.Sequential( 68 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 69 | ) 70 | 71 | self.conv_block2 = nn.Sequential( 72 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 73 | ) 74 | 75 | self.fc_block = nn.Sequential( 76 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 77 | ) 78 | 79 | def forward(self, x): 80 | x = self.conv_block1(x) 81 | x = self.conv_block2(x) 82 | x = x.view(x.size(0), -1) 83 | x = self.fc_block(x) 84 | return x 85 | 86 | def training_step(self, batch, batch_idx): 87 | x, y = batch 88 | y_hat = self(x) 89 | loss = F.cross_entropy(y_hat, y) 90 | return loss 91 | 92 | def validation_step(self, batch, batch_idx): 93 | x, y = batch 94 | y_hat = self(x) 95 | loss = F.cross_entropy(y_hat, y) 96 | self.log("val_loss", loss) 97 | return loss 98 | 99 | def test_step(self, batch, batch_idx): 100 | x, y = batch 101 | y_hat = self(x) 102 | loss = F.cross_entropy(y_hat, y) 103 | return loss 104 | 105 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 106 | x, _ = batch 107 | return self(x) 108 | 109 | def configure_optimizers(self): 110 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 111 | return optimizer 112 | 113 | 114 | data_module = MNISTDataModule() 115 | model = LitConvClassifier() 116 | 117 | 118 | class CustomValidationCallback(Callback): 119 | """ 120 | This is a simple demonstration of creating a Custom Callback and 121 | passing it to the trainer. 122 | 123 | The Callback is used everytime the Validation starts and ends. 124 | You should be able to see it in the logs! 
125 | """ 126 | 127 | def on_validation_start(self, trainer, pl_module): 128 | print("Validation is starting.") 129 | 130 | def on_validation_end(self, trainer, pl_module): 131 | print("Validation has ended.") 132 | 133 | 134 | trainer = pl.Trainer( 135 | max_epochs=1, 136 | default_root_dir="experiments/", 137 | callbacks=[ 138 | EarlyStopping(monitor="val_loss", mode="min"), 139 | ModelSummary(max_depth=-1), 140 | CustomValidationCallback(), # Pass the custom callback to the trainer 141 | ], 142 | precision="16-mixed", 143 | limit_train_batches=0.1, 144 | limit_val_batches=0.01, 145 | ) 146 | 147 | trainer.fit(model, data_module) 148 | 149 | # Get Predictions 150 | predictions = trainer.predict(model, data_module) 151 | print(len(predictions)) 152 | -------------------------------------------------------------------------------- /src/advanced/level_15_modify_trainer/02_customize_progress_bar.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/progress_bar.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | 7 | # In this example we will use the RichProgressBar 8 | # Lightning by default uses tqdm 9 | from lightning.pytorch.callbacks import ModelSummary, RichProgressBar 10 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 11 | from lightning.pytorch.callbacks.progress.rich_progress import RichProgressBarTheme 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader, random_split 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | class MNISTDataModule(pl.LightningDataModule): 20 | def __init__(self, data_dir: str = "./"): 21 | super().__init__() 22 | self.data_dir = data_dir 23 | self.transform = transforms.Compose( 24 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 25 | ) 26 | 27 | def prepare_data(self): 28 | # download 29 | MNIST(self.data_dir, train=True, download=True) 30 | MNIST(self.data_dir, train=False, download=True) 31 | 32 | def setup(self, stage: str): 33 | # Assign train/val datasets for use in dataloaders 34 | if stage == "fit": 35 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 36 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 37 | 38 | # Assign test dataset for use in dataloader(s) 39 | if stage == "test": 40 | self.mnist_test = MNIST( 41 | self.data_dir, train=False, transform=self.transform 42 | ) 43 | 44 | if stage == "predict": 45 | self.mnist_predict = MNIST( 46 | self.data_dir, train=False, transform=self.transform 47 | ) 48 | 49 | def train_dataloader(self): 50 | return DataLoader(self.mnist_train, batch_size=32) 51 | 52 | def val_dataloader(self): 53 | return DataLoader(self.mnist_val, batch_size=32) 54 | 55 | def test_dataloader(self): 56 | return DataLoader(self.mnist_test, batch_size=32) 57 | 58 | def predict_dataloader(self): 59 | return DataLoader(self.mnist_predict, batch_size=32) 60 | 61 | 62 | class LitConvClassifier(pl.LightningModule): 63 | def __init__(self, learning_rate=1e-3): 64 | super().__init__() 65 | self.save_hyperparameters() 66 | self.example_input_array = torch.rand(1, 1, 28, 28) 67 | 68 | self.learning_rate = learning_rate 69 | 70 | # Define blocks of layers as submodules 71 | self.conv_block1 = nn.Sequential( 72 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 73 | ) 74 | 75 | self.conv_block2 = 
nn.Sequential( 76 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 77 | ) 78 | 79 | self.fc_block = nn.Sequential( 80 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 81 | ) 82 | 83 | def forward(self, x): 84 | x = self.conv_block1(x) 85 | x = self.conv_block2(x) 86 | x = x.view(x.size(0), -1) 87 | x = self.fc_block(x) 88 | return x 89 | 90 | def training_step(self, batch, batch_idx): 91 | x, y = batch 92 | y_hat = self(x) 93 | loss = F.cross_entropy(y_hat, y) 94 | return loss 95 | 96 | def validation_step(self, batch, batch_idx): 97 | x, y = batch 98 | y_hat = self(x) 99 | loss = F.cross_entropy(y_hat, y) 100 | self.log("val_loss", loss) 101 | return loss 102 | 103 | def test_step(self, batch, batch_idx): 104 | x, y = batch 105 | y_hat = self(x) 106 | loss = F.cross_entropy(y_hat, y) 107 | return loss 108 | 109 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 110 | x, _ = batch 111 | return self(x) 112 | 113 | def configure_optimizers(self): 114 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 115 | return optimizer 116 | 117 | 118 | data_module = MNISTDataModule() 119 | model = LitConvClassifier() 120 | 121 | # Define a custom theme for the RichProgressBar 122 | rich_progress_bar = RichProgressBar( 123 | theme=RichProgressBarTheme( 124 | description="green_yellow", 125 | progress_bar="green1", 126 | progress_bar_finished="green1", 127 | progress_bar_pulse="#6206E0", 128 | batch_progress="green_yellow", 129 | time="grey82", 130 | processing_speed="grey82", 131 | metrics="grey82", 132 | metrics_text_delimiter="\n", 133 | metrics_format=".3e", 134 | ) 135 | ) 136 | 137 | trainer = pl.Trainer( 138 | max_epochs=1, 139 | default_root_dir="experiments", 140 | callbacks=[ 141 | EarlyStopping(monitor="val_loss", mode="min"), 142 | ModelSummary(max_depth=-1), 143 | rich_progress_bar, # Pass the rich_progress_bar to the Trainer 144 | ], 145 | precision="16-mixed", 146 | limit_train_batches=0.1, 147 | limit_val_batches=0.01, 148 | ) 149 | 150 | trainer.fit(model, data_module) 151 | 152 | # Get Predictions 153 | predictions = trainer.predict(model, data_module) 154 | print(len(predictions)) 155 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zero-to-Lightning :zap:: Comprehensive PyTorch Lightning Tutorial Guide 2 | 3 | 4 | Open In Studio 5 | 6 | 7 | Welcome to the GitHub repository for Zero-to-Lightning! This project contains a collection of independent, executable scripts that showcase most of the available functionalities in PyTorch Lightning, each covering a new feature or technique. It's organized to help you smoothly progress from basic to advanced PyTorch Lightning concepts. 
8 | 9 | ## Project Demo 10 | 11 | https://github.com/ishandutta0098/zero-to-lightning/assets/47643789/a068e1d1-0ec8-4357-b4e2-d1c8090224fd 12 | 13 | 14 | ## Project Directory 15 | 16 | ``` 17 | zero-to-lightning 18 | |-src 19 | |-basic 20 | | |-level_01_lightning_module 21 | | |-level_02_validation_and_testing 22 | | |-level_03_checkpointing 23 | | |-level_04_early_stopping 24 | | |-level_05_pretrained_model 25 | | |-level_06_debugging_model 26 | | |-level_07_inference 27 | | 28 | |-intermediate 29 | | |-level_08_accelerated_hardware 30 | | |-level_09_modularize 31 | | |-level_11_scaling_techniques 32 | | |-level_12_deploying_models 33 | | |-level_13_profiler 34 | | 35 | |-advanced 36 | |-level_14_run_with_config_file 37 | |-level_15_modify_trainer 38 | |-level_16_enable_manual_optimization 39 | |-level_17_advanced_checkpointing 40 | |-level_18_ipu 41 | |-level_19_hpu 42 | 43 | ``` 44 | 45 | - **Basic**: 🏗 Foundational Lightning concepts like creating modules, validation and testing, checkpointing, early stopping, pretrained models, debugging, and inference. 46 | - **Intermediate**: 🚀 More specialized topics like accelerated hardware, modularization, scaling techniques, deployment, and profiling. 47 | - **Advanced**: 🔍 Deep dives into running with config files, modifying trainers, manual optimization, advanced checkpointing, IPUs, and HPUs. 48 | 49 | ## Overview 50 | 51 | Each sub-directory is designed to help users become familiar with a specific set of PyTorch Lightning functionalities and best practices. Whether you're just starting out or are an advanced user seeking to refine your techniques, the project provides structured guidance and practical examples. 52 | 53 | ## Features 54 | 55 | - **Compact, Executable Scripts**: 📦 Each script is designed to be concise, demonstrating how individual features, functions, or classes operate, making learning targeted and efficient. 56 | - **CPU-Friendly**: 🖥 Most scripts are optimized to run on standard CPUs, minimizing the need for specialized hardware. 57 | - **Quick Iteration**: ⏲ Each script executes in under a minute, enabling rapid testing, learning, and iteration. 58 | - **Official Documentation Links**: 📚 Every script is accompanied by relevant references to official Lightning documentation, helping you deepen your understanding. 59 | - **Independent Execution**: 🏃‍♂️ The scripts are modular, allowing you to explore features individually without needing to execute the entire project. 60 | - **Comprehensive Coverage**: 🌐 From basic modules and validation to advanced manual optimization and hardware-specific integrations, this guide ensures broad exposure to the various functionalities PyTorch Lightning offers. 61 | - **Step-by-Step Structure**: 🛠 Organized progressively, it enables users to gradually advance from foundational knowledge to more sophisticated techniques. 62 | 63 | 64 | 65 | ## Getting Started 66 | 67 | To get started with this project, clone the repository and follow the instructions below. 68 | 69 | ### Installation 70 | 71 | 1. Clone the repository: 72 | ```bash 73 | git clone https://github.com/ishandutta0098/zero-to-lightning.git 74 | ``` 75 | 76 | 2. Navigate to the project directory: 77 | ```bash 78 | cd zero-to-lightning 79 | ``` 80 | 81 | 3. 
Create the conda environment: 82 | ```bash 83 | # Create the conda environment 84 | conda env create -f conda.yml 85 | 86 | # Activate the environment 87 | conda activate lit-env 88 | ``` 89 | 90 | ### Usage 91 | You can run any script by passing its path directly as shown below. 92 | 93 | ```bash 94 | python <path-to-script> 95 | 96 | # Example 97 | python src/basic/level_01_lightning_module/lightning_module.py 98 | ``` 99 | 100 | Most of the scripts run directly. For one script, we use the LightningCLI. 101 | To run the script `src/advanced/level_14_run_with_config_file/run_with_yaml.py` follow the steps below 👇 102 | 103 | ```bash 104 | # There are 3 Steps to run this: 105 | # 1. Save the current configs in config.yaml 106 | python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --print_config > config.yaml 107 | 108 | # 2. Run the training using the config file 109 | python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml 110 | 111 | # 3. Modify the config file and run the training again 112 | # For example, try setting `max_epochs` to 3 in the config file and run the training again 113 | python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml 114 | ``` 115 | 116 | ## License 117 | This project is licensed under the MIT License - see the LICENSE file for details. 118 | 119 | -------------------------------------------------------------------------------- /src/intermediate/level_08_accelerated_hardware/01_gpu_training.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/accelerators/gpu_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class LitConvClassifier(pl.LightningModule): 18 | def __init__(self, learning_rate=1e-3): 19 | super().__init__() 20 | self.save_hyperparameters() 21 | self.example_input_array = torch.rand(1, 1, 28, 28) 22 | 23 | self.learning_rate = learning_rate 24 | 25 | # Define blocks of layers as submodules 26 | self.conv_block1 = nn.Sequential( 27 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 28 | ) 29 | 30 | self.conv_block2 = nn.Sequential( 31 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 32 | ) 33 | 34 | self.fc_block = nn.Sequential( 35 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 36 | ) 37 | 38 | def forward(self, x): 39 | x = self.conv_block1(x) 40 | x = self.conv_block2(x) 41 | x = x.view(x.size(0), -1) 42 | x = self.fc_block(x) 43 | return x 44 | 45 | def training_step(self, batch, batch_idx): 46 | x, y = batch 47 | y_hat = self(x) 48 | loss = F.cross_entropy(y_hat, y) 49 | return loss 50 | 51 | def validation_step(self, batch, batch_idx): 52 | x, y = batch 53 | y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | self.log("val_loss", loss) 56 | return loss 57 | 58 | def test_step(self, batch, batch_idx): 59 | x, y = batch 60 | y_hat = self(x) 61 | loss = F.cross_entropy(y_hat, y) 62 | return loss 63 | 64 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 65 | x, _ = batch 66 | return self(x) 67 | 68 | def configure_optimizers(self):

69 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 70 | return optimizer 71 | 72 | 73 | def prepare_dataloaders(): 74 | train_dataset = MNIST( 75 | "./", download=True, train=True, transform=transforms.ToTensor() 76 | ) 77 | 78 | train_size = int(0.8 * len(train_dataset)) 79 | val_size = len(train_dataset) - train_size 80 | 81 | seed = torch.Generator().manual_seed(42) 82 | train_dataset, val_dataset = torch.utils.data.random_split( 83 | train_dataset, [train_size, val_size], generator=seed 84 | ) 85 | 86 | test_dataset = MNIST( 87 | "./", download=True, train=False, transform=transforms.ToTensor() 88 | ) 89 | 90 | train_dataloader = DataLoader(train_dataset, batch_size=32) 91 | val_dataloader = DataLoader(val_dataset, batch_size=32) 92 | test_dataloader = DataLoader(test_dataset, batch_size=32) 93 | 94 | return train_dataloader, val_dataloader, test_dataloader 95 | 96 | 97 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 98 | 99 | model = LitConvClassifier() 100 | 101 | # The below code will train the model on the GPU if unavailable it will throw an error. 102 | # A Graphics Processing Unit (GPU), is a specialized hardware accelerator designed to speed up 103 | # mathematical computations used in gaming and deep learning. 104 | 105 | # Following are the different configuration examples which you can use to train your model on GPU 106 | # based on your hardware configuration. 107 | 108 | # run on as many GPUs as available by default 109 | trainer = pl.Trainer( 110 | max_epochs=5, 111 | default_root_dir="experiments/", 112 | accelerator="auto", 113 | devices="auto", 114 | strategy="auto", 115 | ) 116 | # equivalent to 117 | trainer = pl.Trainer(max_epochs=5, default_root_dir="experiments/") 118 | 119 | # run on one GPU 120 | trainer = pl.Trainer( 121 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=1 122 | ) 123 | 124 | # run on multiple GPUs 125 | trainer = pl.Trainer( 126 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=8 127 | ) 128 | 129 | # choose the number of devices automatically 130 | trainer = pl.Trainer( 131 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices="auto" 132 | ) 133 | 134 | # DEFAULT (int) specifies how many GPUs to use per node 135 | k = 2 136 | trainer = pl.Trainer( 137 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=k 138 | ) 139 | # equivalent to 140 | trainer = pl.Trainer( 141 | max_epochs=5, 142 | default_root_dir="experiments/", 143 | accelerator="gpu", 144 | devices=list(range(k)), 145 | ) 146 | 147 | # Specify which GPUs to use (don't use when running on cluster) 148 | trainer = pl.Trainer( 149 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=[0, 1] 150 | ) 151 | # equivalent to 152 | trainer = pl.Trainer( 153 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices="0, 1" 154 | ) 155 | 156 | # To use all available GPUs put -1 or '-1' 157 | # equivalent to `list(range(torch.cuda.device_count())) and `"auto"` 158 | trainer = pl.Trainer( 159 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=-1 160 | ) 161 | 162 | trainer.fit(model, train_dataloader, val_dataloader) 163 | -------------------------------------------------------------------------------- /src/advanced/level_14_run_with_config_file/run_with_yaml.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # 
https://lightning.ai/docs/pytorch/stable/levels/advanced_level_15.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | 9 | # We will use the LightningCLI to run the training 10 | from lightning.pytorch.cli import LightningCLI 11 | from torch import nn 12 | from torch.nn import functional as F 13 | from torch.utils.data import DataLoader, random_split 14 | from torchvision import transforms 15 | from torchvision.datasets import MNIST 16 | 17 | 18 | class MNISTDataModule(pl.LightningDataModule): 19 | def __init__(self, data_dir: str = "./"): 20 | super().__init__() 21 | self.data_dir = data_dir 22 | self.transform = transforms.Compose( 23 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 24 | ) 25 | 26 | def prepare_data(self): 27 | # download 28 | MNIST(self.data_dir, train=True, download=True) 29 | MNIST(self.data_dir, train=False, download=True) 30 | 31 | def setup(self, stage: str): 32 | # Assign train/val datasets for use in dataloaders 33 | if stage == "fit": 34 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 35 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 36 | 37 | # Assign test dataset for use in dataloader(s) 38 | if stage == "test": 39 | self.mnist_test = MNIST( 40 | self.data_dir, train=False, transform=self.transform 41 | ) 42 | 43 | if stage == "predict": 44 | self.mnist_predict = MNIST( 45 | self.data_dir, train=False, transform=self.transform 46 | ) 47 | 48 | def train_dataloader(self): 49 | return DataLoader(self.mnist_train, batch_size=32) 50 | 51 | def val_dataloader(self): 52 | return DataLoader(self.mnist_val, batch_size=32) 53 | 54 | def test_dataloader(self): 55 | return DataLoader(self.mnist_test, batch_size=32) 56 | 57 | def predict_dataloader(self): 58 | return DataLoader(self.mnist_predict, batch_size=32) 59 | 60 | 61 | class LitConvClassifier(pl.LightningModule): 62 | def __init__(self, learning_rate=1e-3): 63 | super().__init__() 64 | self.save_hyperparameters() 65 | self.example_input_array = torch.rand(1, 1, 28, 28) 66 | 67 | self.learning_rate = learning_rate 68 | 69 | # Define blocks of layers as submodules 70 | self.conv_block1 = nn.Sequential( 71 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 72 | ) 73 | 74 | self.conv_block2 = nn.Sequential( 75 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 76 | ) 77 | 78 | self.fc_block = nn.Sequential( 79 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 80 | ) 81 | 82 | def forward(self, x): 83 | x = self.conv_block1(x) 84 | x = self.conv_block2(x) 85 | x = x.view(x.size(0), -1) 86 | x = self.fc_block(x) 87 | return x 88 | 89 | def training_step(self, batch, batch_idx): 90 | x, y = batch 91 | y_hat = self(x) 92 | loss = F.cross_entropy(y_hat, y) 93 | return loss 94 | 95 | def validation_step(self, batch, batch_idx): 96 | x, y = batch 97 | y_hat = self(x) 98 | loss = F.cross_entropy(y_hat, y) 99 | self.log("val_loss", loss) 100 | return loss 101 | 102 | def test_step(self, batch, batch_idx): 103 | x, y = batch 104 | y_hat = self(x) 105 | loss = F.cross_entropy(y_hat, y) 106 | return loss 107 | 108 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 109 | x, _ = batch 110 | return self(x) 111 | 112 | def configure_optimizers(self): 113 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 114 | return optimizer 115 | 
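# For reference, the `--print_config` step described at the bottom of this file dumps a YAML file that
# looks roughly like the abridged sketch below (the real file lists every Trainer, model and data
# argument, and the exact keys depend on your Lightning version):
#
#   seed_everything: true
#   trainer:
#     max_epochs: 1
#     default_root_dir: experiments/
#     precision: 16-mixed
#   model:
#     learning_rate: 0.001
#   data:
#     data_dir: ./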
116 | 117 | data_module = MNISTDataModule 118 | model = LitConvClassifier 119 | 120 | # To use the CLI we do not initialize the trainer class separately 121 | # We pass the model, data module, trainer defaults to the LightningCLI function directly 122 | # As you can see here the trainer_defaults are the same as the ones we used in the previous example 123 | 124 | 125 | def cli_main(model, data_module): 126 | cli = LightningCLI( 127 | model_class=LitConvClassifier, 128 | datamodule_class=MNISTDataModule, 129 | trainer_class=pl.Trainer, 130 | trainer_defaults={ 131 | "max_epochs": 1, 132 | "default_root_dir": "experiments/", 133 | "callbacks": [ 134 | EarlyStopping(monitor="val_loss", mode="min"), 135 | ModelSummary(max_depth=-1), 136 | ], 137 | "precision": "16-mixed", 138 | "limit_train_batches": 0.1, 139 | "limit_val_batches": 0.01, 140 | }, 141 | ) 142 | 143 | 144 | if __name__ == "__main__": 145 | cli_main(model, data_module) 146 | 147 | # There are 3 Steps to run this: 148 | # 1. Save the current configs in config.yaml 149 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --print_config > config.yaml 150 | 151 | # 2. Run the training using the config file 152 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml 153 | 154 | # 3. Modify the config file and run the training again 155 | # Example, try making `max_epochs` as 3 in the config file and run the training again 156 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml 157 | -------------------------------------------------------------------------------- /src/advanced/level_18_ipu/ipu.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/integrations/ipu/basic.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader, random_split 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class MNISTDataModule(pl.LightningDataModule): 16 | def __init__(self, data_dir: str = "./"): 17 | super().__init__() 18 | self.data_dir = data_dir 19 | self.transform = transforms.Compose( 20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 21 | ) 22 | 23 | def prepare_data(self): 24 | MNIST(self.data_dir, train=True, download=True) 25 | MNIST(self.data_dir, train=False, download=True) 26 | 27 | def setup(self, stage: str): 28 | # Assign train/val datasets for use in dataloaders 29 | if stage == "fit": 30 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 31 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 32 | 33 | # Assign test dataset for use in dataloader(s) 34 | if stage == "test": 35 | self.mnist_test = MNIST( 36 | self.data_dir, train=False, transform=self.transform 37 | ) 38 | 39 | if stage == "predict": 40 | self.mnist_predict = MNIST( 41 | self.data_dir, train=False, transform=self.transform 42 | ) 43 | 44 | def train_dataloader(self): 45 | return DataLoader(self.mnist_train, batch_size=32) 46 | 47 | def val_dataloader(self): 48 | return DataLoader(self.mnist_val, batch_size=32) 49 | 50 | def test_dataloader(self): 51 | return DataLoader(self.mnist_test, batch_size=32) 52 | 53 
| def predict_dataloader(self): 54 | return DataLoader(self.mnist_predict, batch_size=32) 55 | 56 | 57 | class LitConvClassifier(pl.LightningModule): 58 | def __init__(self, learning_rate=1e-3): 59 | super().__init__() 60 | self.save_hyperparameters() 61 | self.example_input_array = torch.rand(1, 1, 28, 28) 62 | 63 | self.learning_rate = learning_rate 64 | 65 | # Define blocks of layers as submodules 66 | self.conv_block1 = nn.Sequential( 67 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 68 | ) 69 | 70 | self.conv_block2 = nn.Sequential( 71 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 72 | ) 73 | 74 | self.fc_block = nn.Sequential( 75 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 76 | ) 77 | 78 | def forward(self, x): 79 | x = self.conv_block1(x) 80 | x = self.conv_block2(x) 81 | x = x.view(x.size(0), -1) 82 | x = self.fc_block(x) 83 | return x 84 | 85 | def training_step(self, batch, batch_idx): 86 | x, y = batch 87 | y_hat = self(x) 88 | loss = F.cross_entropy(y_hat, y) 89 | return loss 90 | 91 | def validation_step(self, batch, batch_idx): 92 | x, y = batch 93 | y_hat = self(x) 94 | loss = F.cross_entropy(y_hat, y) 95 | self.log("val_loss", loss) 96 | return loss 97 | 98 | def test_step(self, batch, batch_idx): 99 | x, y = batch 100 | y_hat = self(x) 101 | loss = F.cross_entropy(y_hat, y) 102 | return loss 103 | 104 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 105 | x, _ = batch 106 | return self(x) 107 | 108 | def configure_optimizers(self): 109 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 110 | return optimizer 111 | 112 | 113 | data_module = MNISTDataModule() 114 | model = LitConvClassifier() 115 | 116 | # Run on as many IPUs as available by default 117 | trainer = pl.Trainer( 118 | max_epochs=3, 119 | default_root_dir="experiments", 120 | callbacks=[ 121 | EarlyStopping(monitor="val_loss", mode="min"), 122 | ModelSummary(max_depth=-1), 123 | ], 124 | precision="16-mixed", 125 | limit_train_batches=0.1, 126 | limit_val_batches=0.01, 127 | accelerator="auto", 128 | devices="auto", 129 | strategy="auto", 130 | ) 131 | 132 | # equivalent to 133 | trainer = pl.Trainer( 134 | max_epochs=3, 135 | default_root_dir="experiments", 136 | callbacks=[ 137 | EarlyStopping(monitor="val_loss", mode="min"), 138 | ModelSummary(max_depth=-1), 139 | ], 140 | precision="16-mixed", 141 | limit_train_batches=0.1, 142 | limit_val_batches=0.01, 143 | ) 144 | 145 | # Run on one IPU 146 | trainer = pl.Trainer( 147 | max_epochs=3, 148 | default_root_dir="experiments", 149 | callbacks=[ 150 | EarlyStopping(monitor="val_loss", mode="min"), 151 | ModelSummary(max_depth=-1), 152 | ], 153 | precision="16-mixed", 154 | limit_train_batches=0.1, 155 | limit_val_batches=0.01, 156 | accelerator="ipu", 157 | devices="1", 158 | ) 159 | 160 | # Run on multiple IPUs 161 | trainer = pl.Trainer( 162 | max_epochs=3, 163 | default_root_dir="experiments", 164 | callbacks=[ 165 | EarlyStopping(monitor="val_loss", mode="min"), 166 | ModelSummary(max_depth=-1), 167 | ], 168 | precision="16-mixed", 169 | limit_train_batches=0.1, 170 | limit_val_batches=0.01, 171 | accelerator="ipu", 172 | devices="8", 173 | ) 174 | 175 | trainer.fit(model, data_module) 176 | 177 | # Get Predictions 178 | predictions = trainer.predict(model, data_module) 179 | print(len(predictions)) 180 | -------------------------------------------------------------------------------- /src/advanced/level_17_advanced_checkpointing/checkpoint.py: 
-------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_advanced.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader, random_split 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class MNISTDataModule(pl.LightningDataModule): 16 | def __init__(self, data_dir: str = "./"): 17 | super().__init__() 18 | self.data_dir = data_dir 19 | self.transform = transforms.Compose( 20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 21 | ) 22 | 23 | def prepare_data(self): 24 | # download 25 | MNIST(self.data_dir, train=True, download=True) 26 | MNIST(self.data_dir, train=False, download=True) 27 | 28 | def setup(self, stage: str): 29 | # Assign train/val datasets for use in dataloaders 30 | if stage == "fit": 31 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 32 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 33 | 34 | # Assign test dataset for use in dataloader(s) 35 | if stage == "test": 36 | self.mnist_test = MNIST( 37 | self.data_dir, train=False, transform=self.transform 38 | ) 39 | 40 | if stage == "predict": 41 | self.mnist_predict = MNIST( 42 | self.data_dir, train=False, transform=self.transform 43 | ) 44 | 45 | def train_dataloader(self): 46 | return DataLoader(self.mnist_train, batch_size=32) 47 | 48 | def val_dataloader(self): 49 | return DataLoader(self.mnist_val, batch_size=32) 50 | 51 | def test_dataloader(self): 52 | return DataLoader(self.mnist_test, batch_size=32) 53 | 54 | def predict_dataloader(self): 55 | return DataLoader(self.mnist_predict, batch_size=32) 56 | 57 | 58 | # In this example we will learn how to modify a checkpoint 59 | # We create a custom attribute train_batches_processed and increment it in the training_step 60 | # We then modify the checkpoint to save this attribute 61 | class LitConvClassifier(pl.LightningModule): 62 | def __init__(self, learning_rate=1e-3): 63 | super().__init__() 64 | self.save_hyperparameters() 65 | self.example_input_array = torch.rand(1, 1, 28, 28) 66 | 67 | self.learning_rate = learning_rate 68 | 69 | # Custom attribute to keep track of training batches processed 70 | self.train_batches_processed = 0 71 | 72 | # Define blocks of layers as submodules 73 | self.conv_block1 = nn.Sequential( 74 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.conv_block2 = nn.Sequential( 78 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 79 | ) 80 | 81 | self.fc_block = nn.Sequential( 82 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 83 | ) 84 | 85 | def forward(self, x): 86 | x = self.conv_block1(x) 87 | x = self.conv_block2(x) 88 | x = x.view(x.size(0), -1) 89 | x = self.fc_block(x) 90 | return x 91 | 92 | def training_step(self, batch, batch_idx): 93 | x, y = batch 94 | y_hat = self(x) 95 | loss = F.cross_entropy(y_hat, y) 96 | 97 | # Increment custom attribute train_batches_processed 98 | self.train_batches_processed += 1 99 | self.log("train_batches_processed", self.train_batches_processed) 100 | 101 | return loss 102 | 103 | def on_save_checkpoint(self, checkpoint): 104 | # Add the custom 
attribute to the checkpoint 105 | checkpoint["train_batches_processed"] = self.train_batches_processed 106 | 107 | def on_load_checkpoint(self, checkpoint): 108 | # Load the custom attribute from the checkpoint 109 | self.train_batches_processed = checkpoint.get("train_batches_processed", 0) 110 | 111 | def validation_step(self, batch, batch_idx): 112 | x, y = batch 113 | y_hat = self(x) 114 | loss = F.cross_entropy(y_hat, y) 115 | self.log("val_loss", loss) 116 | return loss 117 | 118 | def test_step(self, batch, batch_idx): 119 | x, y = batch 120 | y_hat = self(x) 121 | loss = F.cross_entropy(y_hat, y) 122 | return loss 123 | 124 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 125 | x, _ = batch 126 | return self(x) 127 | 128 | def configure_optimizers(self): 129 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 130 | return optimizer 131 | 132 | 133 | data_module = MNISTDataModule() 134 | model = LitConvClassifier() 135 | 136 | trainer = pl.Trainer( 137 | max_epochs=3, 138 | default_root_dir="experiments", 139 | callbacks=[ 140 | EarlyStopping(monitor="val_loss", mode="min"), 141 | ModelSummary(max_depth=-1), 142 | ], 143 | precision="16-mixed", 144 | limit_train_batches=0.1, 145 | limit_val_batches=0.01, 146 | ) 147 | 148 | trainer.fit(model, data_module) 149 | 150 | # Manually load the saved checkpoint 151 | checkpoint_path = trainer.checkpoint_callback.best_model_path 152 | print(f"\nLoading checkpoint from: {checkpoint_path}") 153 | 154 | # Load the model from the checkpoint 155 | loaded_model = LitConvClassifier.load_from_checkpoint(checkpoint_path) 156 | 157 | # Print the custom attribute stored in the checkpoint 158 | # This is to check if the custom attribute is stored and loaded correctly 159 | print( 160 | f"\nTrain batches processed (from checkpoint): {loaded_model.train_batches_processed}" 161 | ) 162 | 163 | # Get Predictions 164 | predictions = trainer.predict(model, data_module) 165 | print(len(predictions)) 166 | -------------------------------------------------------------------------------- /src/intermediate/level_13_profiler/01_advanced_profiler.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/tuning/profiler_basic.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ( 7 | DeviceStatsMonitor, 8 | ModelSummary, 9 | StochasticWeightAveraging, 10 | ) 11 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader, random_split 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | class MNISTDataModule(pl.LightningDataModule): 20 | def __init__(self, data_dir: str = "./"): 21 | super().__init__() 22 | self.data_dir = data_dir 23 | self.transform = transforms.Compose( 24 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 25 | ) 26 | 27 | def prepare_data(self): 28 | # download 29 | MNIST(self.data_dir, train=True, download=True) 30 | MNIST(self.data_dir, train=False, download=True) 31 | 32 | def setup(self, stage: str): 33 | # Assign train/val datasets for use in dataloaders 34 | if stage == "fit": 35 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 36 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 37 | 38 | # Assign test dataset for use 
in dataloader(s) 39 | if stage == "test": 40 | self.mnist_test = MNIST( 41 | self.data_dir, train=False, transform=self.transform 42 | ) 43 | 44 | if stage == "predict": 45 | self.mnist_predict = MNIST( 46 | self.data_dir, train=False, transform=self.transform 47 | ) 48 | 49 | def train_dataloader(self): 50 | return DataLoader(self.mnist_train, batch_size=32) 51 | 52 | def val_dataloader(self): 53 | return DataLoader(self.mnist_val, batch_size=32) 54 | 55 | def test_dataloader(self): 56 | return DataLoader(self.mnist_test, batch_size=32) 57 | 58 | def predict_dataloader(self): 59 | return DataLoader(self.mnist_predict, batch_size=32) 60 | 61 | 62 | class LitConvClassifier(pl.LightningModule): 63 | def __init__(self, learning_rate=1e-3): 64 | super().__init__() 65 | self.save_hyperparameters() 66 | self.example_input_array = torch.rand(1, 1, 28, 28) 67 | 68 | self.learning_rate = learning_rate 69 | 70 | # Define blocks of layers as submodules 71 | self.conv_block1 = nn.Sequential( 72 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 73 | ) 74 | 75 | self.conv_block2 = nn.Sequential( 76 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 77 | ) 78 | 79 | self.fc_block = nn.Sequential( 80 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 81 | ) 82 | 83 | def forward(self, x): 84 | x = self.conv_block1(x) 85 | x = self.conv_block2(x) 86 | x = x.view(x.size(0), -1) 87 | x = self.fc_block(x) 88 | return x 89 | 90 | def training_step(self, batch, batch_idx): 91 | x, y = batch 92 | y_hat = self(x) 93 | loss = F.cross_entropy(y_hat, y) 94 | return loss 95 | 96 | def validation_step(self, batch, batch_idx): 97 | x, y = batch 98 | y_hat = self(x) 99 | loss = F.cross_entropy(y_hat, y) 100 | self.log("val_loss", loss) 101 | return loss 102 | 103 | def test_step(self, batch, batch_idx): 104 | x, y = batch 105 | y_hat = self(x) 106 | loss = F.cross_entropy(y_hat, y) 107 | return loss 108 | 109 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 110 | x, _ = batch 111 | return self(x) 112 | 113 | def configure_optimizers(self): 114 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 115 | return optimizer 116 | 117 | 118 | data_module = MNISTDataModule() 119 | model = LitConvClassifier() 120 | 121 | 122 | # Profiling helps you find bottlenecks in your code by capturing 123 | # analytics such as how long a function takes or how much memory is used. 124 | 125 | # The most basic profile measures all the key methods across Callbacks, 126 | # DataModules and the LightningModule in the training loop. 127 | print("------------------------------") 128 | print("Basic Profiler") 129 | print("------------------------------") 130 | trainer = pl.Trainer( 131 | max_epochs=1, 132 | default_root_dir="experiments/", 133 | callbacks=[ 134 | EarlyStopping(monitor="val_loss", mode="min"), 135 | ModelSummary(max_depth=-1), 136 | StochasticWeightAveraging(swa_lrs=1e-2), 137 | ], 138 | precision="16-mixed", 139 | profiler="simple", 140 | limit_train_batches=0.1, 141 | limit_val_batches=0.01, 142 | ) 143 | 144 | trainer.fit(model, data_module) 145 | 146 | # To profile the time within every function, use the AdvancedProfiler built on top of Python’s cProfiler. 
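# A minimal, hedged sketch, assuming the Lightning 2.x profiler API: instead of
# the profiler="advanced" string shortcut used below, a configured AdvancedProfiler
# instance can be passed to the Trainer, e.g. to write the report to a file.
# The dirpath/filename values here are illustrative choices, not repository defaults.
from lightning.pytorch.profilers import AdvancedProfiler

file_profiler = AdvancedProfiler(dirpath="experiments/", filename="advanced_profile")
# Usage: pl.Trainer(profiler=file_profiler, ...) instead of profiler="advanced".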
147 | print("------------------------------") 148 | print("Advanced Profiler") 149 | print("------------------------------") 150 | trainer = pl.Trainer( 151 | max_epochs=1, 152 | default_root_dir="experiments/", 153 | callbacks=[ 154 | EarlyStopping(monitor="val_loss", mode="min"), 155 | ModelSummary(max_depth=-1), 156 | StochasticWeightAveraging(swa_lrs=1e-2), 157 | ], 158 | precision="16-mixed", 159 | profiler="advanced", 160 | limit_train_batches=0.1, 161 | limit_val_batches=0.01, 162 | ) 163 | 164 | trainer.fit(model, data_module) 165 | 166 | # Get Predictions 167 | predictions = trainer.predict(model, data_module) 168 | print(len(predictions)) 169 | 170 | # Another helpful technique to detect bottlenecks is to ensure that 171 | # you’re using the full capacity of your accelerator (GPU/TPU/IPU/HPU). 172 | # This can be measured with the DeviceStatsMonitor: 173 | print("------------------------------") 174 | print("Device Stats Monitor") 175 | print("------------------------------") 176 | trainer = pl.Trainer( 177 | max_epochs=1, 178 | default_root_dir="experiments/", 179 | callbacks=[ 180 | EarlyStopping(monitor="val_loss", mode="min"), 181 | ModelSummary(max_depth=-1), 182 | StochasticWeightAveraging(swa_lrs=1e-2), 183 | DeviceStatsMonitor(), 184 | ], 185 | precision="16-mixed", 186 | profiler="advanced", 187 | limit_train_batches=0.1, 188 | limit_val_batches=0.01, 189 | ) 190 | 191 | trainer.fit(model, data_module) 192 | -------------------------------------------------------------------------------- /src/advanced/level_19_hpu/hpu.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/integrations/hpu/basic.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | 9 | # Import the HPUAccelerator 10 | from lightning_habana.pytorch.accelerator import HPUAccelerator 11 | from lightning_habana.pytorch.strategies import HPUParallelStrategy 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader, random_split 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | class MNISTDataModule(pl.LightningDataModule): 20 | def __init__(self, data_dir: str = "./"): 21 | super().__init__() 22 | self.data_dir = data_dir 23 | self.transform = transforms.Compose( 24 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 25 | ) 26 | 27 | def prepare_data(self): 28 | MNIST(self.data_dir, train=True, download=True) 29 | MNIST(self.data_dir, train=False, download=True) 30 | 31 | def setup(self, stage: str): 32 | # Assign train/val datasets for use in dataloaders 33 | if stage == "fit": 34 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 35 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 36 | 37 | # Assign test dataset for use in dataloader(s) 38 | if stage == "test": 39 | self.mnist_test = MNIST( 40 | self.data_dir, train=False, transform=self.transform 41 | ) 42 | 43 | if stage == "predict": 44 | self.mnist_predict = MNIST( 45 | self.data_dir, train=False, transform=self.transform 46 | ) 47 | 48 | def train_dataloader(self): 49 | return DataLoader(self.mnist_train, batch_size=32) 50 | 51 | def val_dataloader(self): 52 | return DataLoader(self.mnist_val, batch_size=32) 53 | 54 | def test_dataloader(self): 
55 | return DataLoader(self.mnist_test, batch_size=32) 56 | 57 | def predict_dataloader(self): 58 | return DataLoader(self.mnist_predict, batch_size=32) 59 | 60 | 61 | class LitConvClassifier(pl.LightningModule): 62 | def __init__(self, learning_rate=1e-3): 63 | super().__init__() 64 | self.save_hyperparameters() 65 | self.example_input_array = torch.rand(1, 1, 28, 28) 66 | 67 | self.learning_rate = learning_rate 68 | 69 | # Define blocks of layers as submodules 70 | self.conv_block1 = nn.Sequential( 71 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 72 | ) 73 | 74 | self.conv_block2 = nn.Sequential( 75 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 76 | ) 77 | 78 | self.fc_block = nn.Sequential( 79 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 80 | ) 81 | 82 | def forward(self, x): 83 | x = self.conv_block1(x) 84 | x = self.conv_block2(x) 85 | x = x.view(x.size(0), -1) 86 | x = self.fc_block(x) 87 | return x 88 | 89 | def training_step(self, batch, batch_idx): 90 | x, y = batch 91 | y_hat = self(x) 92 | loss = F.cross_entropy(y_hat, y) 93 | return loss 94 | 95 | def validation_step(self, batch, batch_idx): 96 | x, y = batch 97 | y_hat = self(x) 98 | loss = F.cross_entropy(y_hat, y) 99 | self.log("val_loss", loss) 100 | return loss 101 | 102 | def test_step(self, batch, batch_idx): 103 | x, y = batch 104 | y_hat = self(x) 105 | loss = F.cross_entropy(y_hat, y) 106 | return loss 107 | 108 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 109 | x, _ = batch 110 | return self(x) 111 | 112 | def configure_optimizers(self): 113 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 114 | return optimizer 115 | 116 | 117 | data_module = MNISTDataModule() 118 | model = LitConvClassifier() 119 | 120 | # Run on as many Gaudi devices as available by default 121 | trainer = pl.Trainer( 122 | max_epochs=3, 123 | default_root_dir="experiments", 124 | callbacks=[ 125 | EarlyStopping(monitor="val_loss", mode="min"), 126 | ModelSummary(max_depth=-1), 127 | ], 128 | precision="bf16-mixed", 129 | limit_train_batches=0.1, 130 | limit_val_batches=0.01, 131 | accelerator="auto", 132 | devices="auto", 133 | strategy="auto", 134 | ) 135 | 136 | # equivalent to 137 | trainer = pl.Trainer( 138 | max_epochs=3, 139 | default_root_dir="experiments", 140 | callbacks=[ 141 | EarlyStopping(monitor="val_loss", mode="min"), 142 | ModelSummary(max_depth=-1), 143 | ], 144 | precision="bf16-mixed", 145 | limit_train_batches=0.1, 146 | limit_val_batches=0.01, 147 | ) 148 | 149 | # Run on one Gaudi device 150 | trainer = pl.Trainer( 151 | max_epochs=3, 152 | default_root_dir="experiments", 153 | callbacks=[ 154 | EarlyStopping(monitor="val_loss", mode="min"), 155 | ModelSummary(max_depth=-1), 156 | ], 157 | precision="bf16-mixed", 158 | limit_train_batches=0.1, 159 | limit_val_batches=0.01, 160 | accelerator=HPUAccelerator(), 161 | devices="1", 162 | ) 163 | 164 | # Run on multiple Gaudi devices 165 | trainer = pl.Trainer( 166 | max_epochs=3, 167 | default_root_dir="experiments", 168 | callbacks=[ 169 | EarlyStopping(monitor="val_loss", mode="min"), 170 | ModelSummary(max_depth=-1), 171 | ], 172 | precision="bf16-mixed", 173 | limit_train_batches=0.1, 174 | limit_val_batches=0.01, 175 | accelerator=HPUAccelerator(), 176 | devices="8", 177 | ) 178 | 179 | # To train a Lightning model using multiple HPU nodes, 180 | # set the num_nodes parameter with the available nodes in the Trainer class. 
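# A hedged sketch, assuming auto_device_count() is exposed by HPUAccelerator
# (it is part of the Lightning accelerator interface): the Gaudi device count
# used below could be detected at runtime instead of hard-coded.
detected_hpus = HPUAccelerator.auto_device_count()
print(f"Detected Gaudi devices: {detected_hpus}")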
181 | hpus = 8 182 | parallel_hpus = [torch.device("hpu")] * hpus 183 | 184 | trainer = pl.Trainer( 185 | max_epochs=3, 186 | default_root_dir="experiments", 187 | callbacks=[ 188 | EarlyStopping(monitor="val_loss", mode="min"), 189 | ModelSummary(max_depth=-1), 190 | ], 191 | precision="bf16-mixed", 192 | limit_train_batches=0.1, 193 | limit_val_batches=0.01, 194 | accelerator=HPUAccelerator(), 195 | devices=hpus, 196 | strategy=HPUParallelStrategy(parallel_devices=parallel_hpus), 197 | num_nodes=2, 198 | ) 199 | 200 | trainer.fit(model, data_module) 201 | 202 | # Get Predictions 203 | predictions = trainer.predict(model, data_module) 204 | print(len(predictions)) 205 | -------------------------------------------------------------------------------- /src/advanced/level_16_own_the_training_loop/01_enable_manual_optimization.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/model/build_model_advanced.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader, random_split 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class MNISTDataModule(pl.LightningDataModule): 16 | def __init__(self, data_dir: str = "./"): 17 | super().__init__() 18 | self.data_dir = data_dir 19 | self.transform = transforms.Compose( 20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 21 | ) 22 | 23 | def prepare_data(self): 24 | # download 25 | MNIST(self.data_dir, train=True, download=True) 26 | MNIST(self.data_dir, train=False, download=True) 27 | 28 | def setup(self, stage: str): 29 | # Assign train/val datasets for use in dataloaders 30 | if stage == "fit": 31 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 32 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 33 | 34 | # Assign test dataset for use in dataloader(s) 35 | if stage == "test": 36 | self.mnist_test = MNIST( 37 | self.data_dir, train=False, transform=self.transform 38 | ) 39 | 40 | if stage == "predict": 41 | self.mnist_predict = MNIST( 42 | self.data_dir, train=False, transform=self.transform 43 | ) 44 | 45 | def train_dataloader(self): 46 | return DataLoader(self.mnist_train, batch_size=32) 47 | 48 | def val_dataloader(self): 49 | return DataLoader(self.mnist_val, batch_size=32) 50 | 51 | def test_dataloader(self): 52 | return DataLoader(self.mnist_test, batch_size=32) 53 | 54 | def predict_dataloader(self): 55 | return DataLoader(self.mnist_predict, batch_size=32) 56 | 57 | 58 | # Steps to enable Manual Optimization 59 | # 1. Set `self.automatic_optimization=False`` in your LightningModule’s __init__. 60 | 61 | # 2. 
Use the following functions and call them manually: 62 | 63 | # 2.1 `self.optimizers()` to access your optimizers (one or multiple) 64 | 65 | # 2.2 `optimizer.zero_grad()` to clear the gradients from the previous training step 66 | 67 | # 2.3 `self.manual_backward(loss)` instead of loss.backward() 68 | 69 | # 2.4 `optimizer.step()` to update your model parameters 70 | 71 | # 2.5 `self.toggle_optimizer()` and `self.untoggle_optimizer()` if needed 72 | 73 | 74 | class LitConvClassifier(pl.LightningModule): 75 | def __init__(self, learning_rate=1e-3): 76 | super().__init__() 77 | self.save_hyperparameters() 78 | self.example_input_array = torch.rand(1, 1, 28, 28) 79 | 80 | self.learning_rate = learning_rate 81 | 82 | # Enable manual optimization 83 | self.automatic_optimization = False 84 | 85 | self.conv_block1 = nn.Sequential( 86 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 87 | ) 88 | 89 | self.conv_block2 = nn.Sequential( 90 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 91 | ) 92 | 93 | self.fc_block = nn.Sequential( 94 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 95 | ) 96 | 97 | def forward(self, x): 98 | x = self.conv_block1(x) 99 | x = self.conv_block2(x) 100 | x = x.view(x.size(0), -1) 101 | x = self.fc_block(x) 102 | return x 103 | 104 | # Define the compute_loss method 105 | def compute_loss(self, batch): 106 | x, y = batch 107 | logits = self(x) # Pass inputs through the model 108 | return F.cross_entropy(logits, y) # Calculate cross-entropy loss 109 | 110 | # Here are three examples of how to use manual optimization in Lightning 111 | # Uncomment one of the examples to try it out! 112 | 113 | # Example 1: Basic Manual Optimization 114 | def training_step(self, batch, batch_idx): 115 | opt = self.optimizers() 116 | opt.zero_grad() 117 | loss = self.compute_loss(batch) 118 | self.manual_backward(loss) 119 | opt.step() 120 | 121 | return loss 122 | 123 | # # Example 2: Gradient Accumulation 124 | # def training_step(self, batch, batch_idx, N=5): 125 | # opt = self.optimizers() 126 | 127 | # # scale losses by 1/N (for N batches of gradient accumulation) 128 | # loss = self.compute_loss(batch) / N 129 | # self.manual_backward(loss) 130 | 131 | # # accumulate gradients of N batches 132 | # if (batch_idx + 1) % N == 0: 133 | # opt.step() 134 | # opt.zero_grad() 135 | 136 | # return loss 137 | 138 | # Example 3: Gradient Clipping 139 | # def training_step(self, batch, batch_idx): 140 | # opt = self.optimizers() 141 | 142 | # # compute loss 143 | # loss = self.compute_loss(batch) 144 | 145 | # opt.zero_grad() 146 | # self.manual_backward(loss) 147 | 148 | # # clip gradients 149 | # self.clip_gradients(opt, gradient_clip_val=0.5, gradient_clip_algorithm="norm") 150 | 151 | # opt.step() 152 | 153 | # return loss 154 | 155 | def validation_step(self, batch, batch_idx): 156 | x, y = batch 157 | y_hat = self(x) 158 | loss = F.cross_entropy(y_hat, y) 159 | self.log("val_loss", loss) 160 | return loss 161 | 162 | def test_step(self, batch, batch_idx): 163 | x, y = batch 164 | y_hat = self(x) 165 | loss = F.cross_entropy(y_hat, y) 166 | return loss 167 | 168 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 169 | x, _ = batch 170 | return self(x) 171 | 172 | def configure_optimizers(self): 173 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 174 | 175 | return optimizer 176 | 177 | 178 | data_module = MNISTDataModule() 179 | model = LitConvClassifier() 180 | 181 | trainer = pl.Trainer( 182 | 
max_epochs=3, 183 | default_root_dir="experiments", 184 | callbacks=[ 185 | EarlyStopping(monitor="val_loss", mode="min"), 186 | ModelSummary(max_depth=-1), 187 | ], 188 | precision="16-mixed", 189 | limit_train_batches=0.1, 190 | limit_val_batches=0.01, 191 | ) 192 | 193 | trainer.fit(model, data_module) 194 | 195 | # Get Predictions 196 | predictions = trainer.predict(model, data_module) 197 | print(len(predictions)) 198 | --------------------------------------------------------------------------------
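The three training_step examples above never exercise step 2.5. As a hedged sketch (the class name and encoder/head split below are illustrative, not taken from this repository), toggle_optimizer() and untoggle_optimizer() are typically used when configure_optimizers returns more than one optimizer, so that each manual update only touches the parameters owned by the toggled optimizer:

import lightning.pytorch as pl
import torch
from torch.nn import functional as F


class TwoOptimizerClassifier(pl.LightningModule):
    def __init__(self, learning_rate=1e-3):
        super().__init__()
        self.automatic_optimization = False  # manual optimization, as in step 1
        self.learning_rate = learning_rate
        self.encoder = torch.nn.Linear(28 * 28, 64)
        self.head = torch.nn.Linear(64, 10)

    def forward(self, x):
        return self.head(torch.relu(self.encoder(x.view(x.size(0), -1))))

    def training_step(self, batch, batch_idx):
        x, y = batch
        opt_encoder, opt_head = self.optimizers()

        # Update only the encoder parameters: toggle_optimizer() disables
        # requires_grad on every parameter not owned by opt_encoder.
        self.toggle_optimizer(opt_encoder)
        loss = F.cross_entropy(self(x), y)
        opt_encoder.zero_grad()
        self.manual_backward(loss)
        opt_encoder.step()
        self.untoggle_optimizer(opt_encoder)

        # Then update only the classification head on a fresh forward pass.
        self.toggle_optimizer(opt_head)
        loss = F.cross_entropy(self(x), y)
        opt_head.zero_grad()
        self.manual_backward(loss)
        opt_head.step()
        self.untoggle_optimizer(opt_head)

        return loss

    def configure_optimizers(self):
        opt_encoder = torch.optim.Adam(self.encoder.parameters(), lr=self.learning_rate)
        opt_head = torch.optim.Adam(self.head.parameters(), lr=self.learning_rate)
        return opt_encoder, opt_head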