├── .gitignore ├── requirements.txt ├── LICENSE ├── conda.yml ├── src ├── basic │ ├── level_05_pretrained_model │ │ └── pretrained_model.py │ ├── level_01_lightning_module │ │ └── lightning_module.py │ ├── level_02_validation_and_testing │ │ └── validate_and_test_model.py │ ├── level_03_checkpointing │ │ ├── 03_disabling_and_resuming_checkpoints.py │ │ ├── 02_checkpoints_with_nn_module.py │ │ └── 01_saving_and_loading_checkpoints.py │ ├── level_07_inference │ │ └── lightining_predict_step.py │ ├── level_04_early_stopping │ │ └── early_stopping.py │ └── level_06_debugging_model │ │ ├── 01_quick_run.py │ │ └── 02_model_summary.py ├── intermediate │ ├── level_12_deploying_models │ │ └── 01_onnx.py │ ├── level_11_scaling_techniques │ │ ├── 01_precision_training.py │ │ ├── 04_stochastic_weight_averaging.py │ │ ├── 02_gradient_accumulation.py │ │ └── 03_gradient_clipping.py │ ├── level_08_accelerated_hardware │ │ ├── 02_tpu_traininig.py │ │ └── 01_gpu_training.py │ ├── level_13_profiler │ │ ├── 02_profile_pytorch_operations.py │ │ └── 01_advanced_profiler.py │ └── level_09_modularize │ │ └── 01_lightning_datamodule.py └── advanced │ ├── level_15_modify_trainer │ ├── 01_create_callbacks.py │ └── 02_customize_progress_bar.py │ ├── level_14_run_with_config_file │ └── run_with_yaml.py │ ├── level_18_ipu │ └── ipu.py │ ├── level_17_advanced_checkpointing │ └── checkpoint.py │ ├── level_19_hpu │ └── hpu.py │ └── level_16_own_the_training_loop │ └── 01_enable_manual_optimization.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | lightning_logs 2 | __pycache__ 3 | MNIST 4 | .DS_STORE 5 | cifar* -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.5 2 | aiosignal==1.3.1 3 | async-timeout==4.0.3 4 | attrs==23.2.0 5 | black==24.4.2 6 | click==8.1.7 7 | coloredlogs==15.0.1 8 | docstring_parser==0.16 9 | filelock==3.14.0 10 | flatbuffers==24.3.25 11 | frozenlist==1.4.1 12 | fsspec==2024.3.1 13 | humanfriendly==10.0 14 | idna==3.7 15 | importlib_resources==6.4.0 16 | isort==5.13.2 17 | Jinja2==3.1.3 18 | jsonargparse==4.28.0 19 | lightning==2.2.4 20 | lightning-utilities==0.11.2 21 | markdown-it-py==3.0.0 22 | MarkupSafe==2.1.5 23 | mdurl==0.1.2 24 | mpmath==1.3.0 25 | multidict==6.0.5 26 | mypy-extensions==1.0.0 27 | networkx==3.3 28 | numpy==1.26.4 29 | onnx==1.16.0 30 | onnxruntime==1.17.3 31 | packaging==24.0 32 | pathspec==0.12.1 33 | pillow==10.3.0 34 | platformdirs==4.2.1 35 | protobuf==5.26.1 36 | psutil==5.9.8 37 | Pygments==2.18.0 38 | pytorch-lightning==2.2.4 39 | PyYAML==6.0.1 40 | rich==13.7.1 41 | sympy==1.12 42 | tomli==2.0.1 43 | torch==2.3.0 44 | torchmetrics==1.3.2 45 | torchvision==0.18.0 46 | tqdm==4.66.4 47 | typeshed_client==2.5.1 48 | typing_extensions==4.11.0 49 | yarl==1.9.4 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Ishan Dutta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom 
the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conda.yml: -------------------------------------------------------------------------------- 1 | name: lit-env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - bzip2=1.0.8 6 | - ca-certificates=2024.3.11 7 | - libffi=3.4.4 8 | - ncurses=6.4 9 | - openssl=3.0.13 10 | - pip=23.3.1 11 | - python=3.10.14 12 | - readline=8.2 13 | - setuptools=68.2.2 14 | - sqlite=3.45.3 15 | - tk=8.6.12 16 | - tzdata=2024a 17 | - wheel=0.43.0 18 | - xz=5.4.6 19 | - zlib=1.2.13 20 | - pip: 21 | - aiohttp==3.9.5 22 | - aiosignal==1.3.1 23 | - async-timeout==4.0.3 24 | - attrs==23.2.0 25 | - black==24.4.2 26 | - click==8.1.7 27 | - coloredlogs==15.0.1 28 | - docstring-parser==0.16 29 | - filelock==3.14.0 30 | - flatbuffers==24.3.25 31 | - frozenlist==1.4.1 32 | - fsspec==2024.3.1 33 | - humanfriendly==10.0 34 | - idna==3.7 35 | - importlib-resources==6.4.0 36 | - isort==5.13.2 37 | - jinja2==3.1.3 38 | - jsonargparse==4.28.0 39 | - lightning==2.2.4 40 | - lightning-utilities==0.11.2 41 | - markdown-it-py==3.0.0 42 | - markupsafe==2.1.5 43 | - mdurl==0.1.2 44 | - mpmath==1.3.0 45 | - multidict==6.0.5 46 | - mypy-extensions==1.0.0 47 | - networkx==3.3 48 | - numpy==1.26.4 49 | - onnx==1.16.0 50 | - onnxruntime==1.17.3 51 | - packaging==24.0 52 | - pathspec==0.12.1 53 | - pillow==10.3.0 54 | - platformdirs==4.2.1 55 | - protobuf==5.26.1 56 | - psutil==5.9.8 57 | - pygments==2.18.0 58 | - pytorch-lightning==2.2.4 59 | - pyyaml==6.0.1 60 | - rich==13.7.1 61 | - sympy==1.12 62 | - tomli==2.0.1 63 | - torch==2.3.0 64 | - torchmetrics==1.3.2 65 | - torchvision==0.18.0 66 | - tqdm==4.66.4 67 | - typeshed-client==2.5.1 68 | - typing-extensions==4.11.0 69 | - yarl==1.9.4 70 | -------------------------------------------------------------------------------- /src/basic/level_05_pretrained_model/pretrained_model.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/advanced/transfer_learning.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | import torchvision.models as models 9 | from torch import nn 10 | from torch.nn import functional as F 11 | from torchvision import datasets, transforms 12 | 13 | 14 | # Define the Lightning Module 15 | class ImagenetTransferLearning(pl.LightningModule): 16 | def __init__(self, learning_rate=1e-3): 17 | super().__init__() 18 | 19 | # init a pretrained resnet 20 | backbone = models.resnet50(weights="DEFAULT") 21 | num_filters = backbone.fc.in_features 22 | layers = list(backbone.children())[:-1] 23 | self.feature_extractor = nn.Sequential(*layers) 24 | 25 | # use the pretrained model to classify cifar-10 (10 image classes) 26 | num_target_classes = 10 27 | 
self.classifier = nn.Linear(num_filters, num_target_classes) 28 | self.learning_rate = learning_rate 29 | 30 | def forward(self, x): 31 | self.feature_extractor.eval() 32 | with torch.no_grad(): 33 | representations = self.feature_extractor(x).flatten(1) 34 | x = self.classifier(representations) 35 | return x 36 | 37 | def training_step(self, batch, batch_idx): 38 | x, y = batch 39 | y_hat = self(x) 40 | loss = F.cross_entropy(y_hat, y) 41 | return loss 42 | 43 | def configure_optimizers(self): 44 | return torch.optim.Adam(self.parameters(), lr=self.learning_rate) 45 | 46 | 47 | # Data preparation 48 | transform = transforms.Compose( 49 | [ 50 | transforms.Resize((224, 224)), # ResNet50 expects 224x224 input size 51 | transforms.ToTensor(), 52 | transforms.Normalize( 53 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 54 | ), # Normalization for Imagenet data 55 | ] 56 | ) 57 | 58 | train_dataset = datasets.CIFAR10( 59 | root="./", train=True, transform=transform, download=True 60 | ) 61 | # Use a subset of the training data for demonstration purposes 62 | train_dataset = torch.utils.data.Subset(train_dataset, indices=list(range(100))) 63 | 64 | train_dataloader = torch.utils.data.DataLoader( 65 | train_dataset, batch_size=32, shuffle=True 66 | ) 67 | 68 | # Training 69 | model = ImagenetTransferLearning() 70 | trainer = pl.Trainer(max_epochs=1) 71 | trainer.fit(model, train_dataloader) 72 | 73 | # Save the model checkpoint 74 | trainer.save_checkpoint("example_model.ckpt") 75 | 76 | # Inference 77 | loaded_model = ImagenetTransferLearning.load_from_checkpoint("example_model.ckpt") 78 | loaded_model.freeze() 79 | 80 | # Load some CIFAR10 images for prediction (assuming you're using the same transform as above) 81 | test_dataset = datasets.CIFAR10(root="./", train=False, transform=transform) 82 | test_dataloader = torch.utils.data.DataLoader( 83 | test_dataset, batch_size=5 84 | ) # Loading 5 images for demonstration 85 | some_images_from_cifar10, _ = next(iter(test_dataloader)) 86 | 87 | predictions = loaded_model(some_images_from_cifar10) 88 | print(predictions.argmax(dim=1)) 89 | -------------------------------------------------------------------------------- /src/basic/level_01_lightning_module/lightning_module.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/model/train_model_basic.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | from torch.utils.data import DataLoader 9 | from torchvision import transforms 10 | from torchvision.datasets import MNIST 11 | 12 | 13 | # A simple convolution based classifier model for MNIST 14 | class LitConvClassifier(pl.LightningModule): 15 | def __init__(self): 16 | super().__init__() 17 | # Define the layers for the model architecture 18 | 19 | # Convolutional layer with 32 filters of size 3x3 20 | # ReLU activation function introduces non-linearity to the model, enabling it to learn more complex patterns 21 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 22 | 23 | # Second convolutional layer with 64 filters of size 3x3 24 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 25 | 26 | # Fully connected layers for classification 27 | # The input size 64*7*7 corresponds to the flattened output of the last convolutional layer 28 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 29 | self.fc2 = nn.Linear(128, 10) 30 | 31 | def forward(self, x): 32 | # Define 
the forward pass through the network 33 | # Input shape: (batch_size, 1, 28, 28) 34 | x = F.relu(self.conv1(x)) # Shape: (batch_size, 32, 28, 28) 35 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 32, 14, 14) 36 | x = F.relu(self.conv2(x)) # Shape: (batch_size, 64, 14, 14) 37 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 64, 7, 7) 38 | x = x.view(x.size(0), -1) # Shape: (batch_size, 64*7*7) 39 | x = F.relu(self.fc1(x)) # Shape: (batch_size, 128) 40 | x = self.fc2(x) # Shape: (batch_size, 10) 41 | return x 42 | 43 | def training_step(self, batch, batch_idx): 44 | # Define the training step which includes 45 | # the forward pass, loss calculation and backpropagation 46 | 47 | x, y = batch # Unpack batch 48 | y_hat = self(x) # Forward pass, get predicted logits 49 | 50 | # Calculate loss using cross-entropy, which is suitable for multi-class classification 51 | loss = F.cross_entropy(y_hat, y) 52 | return loss 53 | 54 | def configure_optimizers(self): 55 | # Define the optimizer to use for training 56 | # Adam is a popular choice due to its adaptive learning rate and momentum 57 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) 58 | return optimizer 59 | 60 | 61 | # Load Dataset 62 | # MNIST is a widely used dataset for handwritten digit recognition 63 | dataset = MNIST("./", download=True, transform=transforms.ToTensor()) 64 | 65 | # Create a Dataloader with batch size of 32 66 | # Batch size is a hyperparameter that defines the number of 67 | # samples to work through before updating the model's weights 68 | train_dataloader = DataLoader(dataset, batch_size=32) 69 | 70 | # Initialise the model 71 | model = LitConvClassifier() 72 | 73 | # Initialise the trainer with 1 epoch 74 | # An epoch is a complete pass through the entire training dataset 75 | trainer = pl.Trainer(max_epochs=1) 76 | 77 | # Train the model 78 | trainer.fit(model, train_dataloader) 79 | -------------------------------------------------------------------------------- /src/basic/level_02_validation_and_testing/validate_and_test_model.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/evaluation_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class LitConvClassifier(pl.LightningModule): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 20 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 21 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 22 | self.fc2 = nn.Linear(128, 10) 23 | 24 | def forward(self, x): 25 | # Define the forward pass through the network 26 | # Input shape: (batch_size, 1, 28, 28) 27 | x = F.relu(self.conv1(x)) # Shape: (batch_size, 32, 28, 28) 28 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 32, 14, 14) 29 | x = F.relu(self.conv2(x)) # Shape: (batch_size, 64, 14, 14) 30 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 64, 7, 7) 31 | x = x.view(x.size(0), -1) # Shape: (batch_size, 64*7*7) 32 | x = F.relu(self.fc1(x)) # Shape: (batch_size, 128) 33 | x = self.fc2(x) # Shape: (batch_size, 10) 34 | return x 35 | 36 | def training_step(self, batch, batch_idx): 37 | x, y = batch 38 | y_hat = self(x) 39 | 40 | loss = F.cross_entropy(y_hat, y) 41 | return loss 42 | 43 | def validation_step(self, 
batch, batch_idx): 44 | # The validation step is performed once per batch of data from the validation set. 45 | # It's used to check the model's performance on the validation set during training. 46 | x, y = batch 47 | y_hat = self(x) 48 | 49 | loss = F.cross_entropy(y_hat, y) 50 | return loss 51 | 52 | def test_step(self, batch, batch_idx): 53 | # The test step is performed once per batch of data from the test set. 54 | # It's used to assess the model's performance on unseen data after training is complete. 55 | x, y = batch 56 | y_hat = self(x) 57 | 58 | loss = F.cross_entropy(y_hat, y) 59 | return loss 60 | 61 | def configure_optimizers(self): 62 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3) 63 | return optimizer 64 | 65 | 66 | train_dataset = MNIST( 67 | "./", download=True, train=True, transform=transforms.ToTensor() 68 | ) 69 | 70 | # Calculate training and validation split 71 | # We will keep 80% data for training and 20% for validation 72 | train_size = int(0.8 * len(train_dataset)) 73 | val_size = len(train_dataset) - train_size 74 | 75 | # Split the dataset into training and validation 76 | seed = torch.Generator().manual_seed(42) 77 | train_dataset, val_dataset = torch.utils.data.random_split( 78 | train_dataset, [train_size, val_size], generator=seed 79 | ) 80 | 81 | test_dataset = MNIST( 82 | "./", download=True, train=False, transform=transforms.ToTensor() 83 | ) 84 | 85 | # Create data loaders for loading the data in batches 86 | train_dataloader = DataLoader(train_dataset, batch_size=32) 87 | val_dataloader = DataLoader(val_dataset, batch_size=32) 88 | test_dataloader = DataLoader(test_dataset, batch_size=32) 89 | 90 | model = LitConvClassifier() 91 | 92 | trainer = pl.Trainer(max_epochs=1) 93 | 94 | trainer.fit(model, train_dataloader, val_dataloader) 95 | 96 | # Test the model on the test set after training is complete 97 | trainer.test(model, test_dataloader) 98 | -------------------------------------------------------------------------------- /src/basic/level_03_checkpointing/03_disabling_and_resuming_checkpoints.py: -------------------------------------------------------------------------------- 1 | # Documentation Link: 2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#save-a-checkpoint 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class LitConvClassifier(pl.LightningModule): 16 | def __init__(self, learning_rate=1e-3): 17 | super().__init__() 18 | self.save_hyperparameters() 19 | 20 | self.learning_rate = learning_rate 21 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 22 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 23 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 24 | self.fc2 = nn.Linear(128, 10) 25 | 26 | def forward(self, x): 27 | x = F.relu(self.conv1(x)) 28 | x = F.max_pool2d(x, 2) 29 | x = F.relu(self.conv2(x)) 30 | x = F.max_pool2d(x, 2) 31 | x = x.view(x.size(0), -1) 32 | x = F.relu(self.fc1(x)) 33 | x = self.fc2(x) 34 | return x 35 | 36 | def training_step(self, batch, batch_idx): 37 | x, y = batch 38 | y_hat = self(x) 39 | loss = F.cross_entropy(y_hat, y) 40 | return loss 41 | 42 | def validation_step(self, batch, batch_idx): 43 | x, y = batch 44 | y_hat = self(x) 45 | loss = F.cross_entropy(y_hat, y) 46 | return loss 47 | 48 | def test_step(self, batch, batch_idx): 
49 | x, y = batch 50 | y_hat = self(x) 51 | loss = F.cross_entropy(y_hat, y) 52 | return loss 53 | 54 | def configure_optimizers(self): 55 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 56 | return optimizer 57 | 58 | 59 | def prepare_dataloaders(): 60 | train_dataset = MNIST( 61 | "./", download=True, train=True, transform=transforms.ToTensor() 62 | ) 63 | 64 | train_size = int(0.8 * len(train_dataset)) 65 | val_size = len(train_dataset) - train_size 66 | 67 | seed = torch.Generator().manual_seed(42) 68 | train_dataset, val_dataset = torch.utils.data.random_split( 69 | train_dataset, [train_size, val_size], generator=seed 70 | ) 71 | 72 | test_dataset = MNIST( 73 | "./", download=True, train=False, transform=transforms.ToTensor() 74 | ) 75 | 76 | train_dataloader = DataLoader(train_dataset, batch_size=32) 77 | val_dataloader = DataLoader(val_dataset, batch_size=32) 78 | test_dataloader = DataLoader(test_dataset, batch_size=32) 79 | 80 | return train_dataloader, val_dataloader, test_dataloader 81 | 82 | 83 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 84 | 85 | model = LitConvClassifier() 86 | 87 | # You can disable checkpointing by setting the Trainer's enable_checkpointing to False 88 | trainer = pl.Trainer( 89 | max_epochs=1, default_root_dir="experiments/", enable_checkpointing=False 90 | ) 91 | 92 | trainer.fit(model, train_dataloader, val_dataloader) 93 | 94 | # To resume training from a checkpoint pass it directly to the fit method 95 | checkpoint_path = ( 96 | "experiments/lightning_logs/version_0/checkpoints/epoch=0-step=1500.ckpt" 97 | ) 98 | 99 | model = LitConvClassifier() 100 | 101 | trainer = pl.Trainer(max_epochs=1, default_root_dir="experiments/") 102 | 103 | trainer.fit(model, train_dataloader, val_dataloader, ckpt_path=checkpoint_path) 104 | -------------------------------------------------------------------------------- /src/basic/level_03_checkpointing/02_checkpoints_with_nn_module.py: -------------------------------------------------------------------------------- 1 | # Documentation Link: 2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#save-a-checkpoint 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class ConvClassifier(nn.Module): 16 | def __init__(self): 17 | super(ConvClassifier, self).__init__() 18 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 19 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 20 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 21 | self.fc2 = nn.Linear(128, 10) 22 | 23 | def forward(self, x): 24 | x = F.relu(self.conv1(x)) 25 | x = F.max_pool2d(x, 2) 26 | x = F.relu(self.conv2(x)) 27 | x = F.max_pool2d(x, 2) 28 | x = x.view(x.size(0), -1) 29 | x = F.relu(self.fc1(x)) 30 | x = self.fc2(x) 31 | return x 32 | 33 | 34 | class LitConvClassifier(pl.LightningModule): 35 | def __init__(self, learning_rate=1e-3): 36 | super().__init__() 37 | self.save_hyperparameters() 38 | 39 | self.learning_rate = learning_rate 40 | self.model = ConvClassifier() 41 | 42 | def forward(self, x): 43 | return self.model(x) 44 | 45 | def training_step(self, batch, batch_idx): 46 | x, y = batch 47 | y_hat = self(x) 48 | loss = F.cross_entropy(y_hat, y) 49 | return loss 50 | 51 | def validation_step(self, batch, batch_idx): 52 | x, y = batch 53 | 
y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | return loss 56 | 57 | def test_step(self, batch, batch_idx): 58 | x, y = batch 59 | y_hat = self(x) 60 | loss = F.cross_entropy(y_hat, y) 61 | return loss 62 | 63 | def configure_optimizers(self): 64 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 65 | return optimizer 66 | 67 | 68 | def prepare_dataloaders(): 69 | train_dataset = MNIST( 70 | "./", download=True, train=True, transform=transforms.ToTensor() 71 | ) 72 | 73 | train_size = int(0.8 * len(train_dataset)) 74 | val_size = len(train_dataset) - train_size 75 | 76 | seed = torch.Generator().manual_seed(42) 77 | train_dataset, val_dataset = torch.utils.data.random_split( 78 | train_dataset, [train_size, val_size], generator=seed 79 | ) 80 | 81 | test_dataset = MNIST( 82 | "./", download=True, train=False, transform=transforms.ToTensor() 83 | ) 84 | 85 | train_dataloader = DataLoader(train_dataset, batch_size=32) 86 | val_dataloader = DataLoader(val_dataset, batch_size=32) 87 | test_dataloader = DataLoader(test_dataset, batch_size=32) 88 | 89 | return train_dataloader, val_dataloader, test_dataloader 90 | 91 | 92 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 93 | 94 | # Train the Model 95 | model = LitConvClassifier() 96 | 97 | trainer = pl.Trainer(max_epochs=1, default_root_dir="experiments/") 98 | 99 | trainer.fit(model, train_dataloader, val_dataloader) 100 | 101 | # Load the module using the state dict 102 | 103 | checkpoint_path = ( 104 | "experiments/lightning_logs/version_0/checkpoints/epoch=0-step=1500.ckpt" 105 | ) 106 | 107 | checkpoint = torch.load(checkpoint_path) 108 | state_dict = checkpoint["state_dict"] 109 | 110 | classifier_model = ConvClassifier() 111 | classifier_model.load_state_dict(state_dict) 112 | -------------------------------------------------------------------------------- /src/basic/level_07_inference/lightining_predict_step.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/levels/core_level_6.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class LitConvClassifier(pl.LightningModule): 18 | def __init__(self, learning_rate=1e-3): 19 | super().__init__() 20 | self.save_hyperparameters() 21 | self.example_input_array = torch.rand(1, 1, 28, 28) 22 | 23 | self.learning_rate = learning_rate 24 | 25 | # Define blocks of layers as submodules 26 | self.conv_block1 = nn.Sequential( 27 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 28 | ) 29 | 30 | self.conv_block2 = nn.Sequential( 31 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 32 | ) 33 | 34 | self.fc_block = nn.Sequential( 35 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 36 | ) 37 | 38 | def forward(self, x): 39 | x = self.conv_block1(x) 40 | x = self.conv_block2(x) 41 | x = x.view(x.size(0), -1) 42 | x = self.fc_block(x) 43 | return x 44 | 45 | def training_step(self, batch, batch_idx): 46 | x, y = batch 47 | y_hat = self(x) 48 | loss = F.cross_entropy(y_hat, y) 49 | return loss 50 | 51 | def validation_step(self, 
batch, batch_idx): 52 | x, y = batch 53 | y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | self.log("val_loss", loss) 56 | return loss 57 | 58 | def test_step(self, batch, batch_idx): 59 | x, y = batch 60 | y_hat = self(x) 61 | loss = F.cross_entropy(y_hat, y) 62 | return loss 63 | 64 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 65 | x, _ = batch 66 | return self(x) 67 | 68 | def configure_optimizers(self): 69 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 70 | return optimizer 71 | 72 | 73 | def prepare_dataloaders(): 74 | train_dataset = MNIST( 75 | "./", download=True, train=True, transform=transforms.ToTensor() 76 | ) 77 | 78 | train_size = int(0.8 * len(train_dataset)) 79 | val_size = len(train_dataset) - train_size 80 | 81 | seed = torch.Generator().manual_seed(42) 82 | train_dataset, val_dataset = torch.utils.data.random_split( 83 | train_dataset, [train_size, val_size], generator=seed 84 | ) 85 | 86 | test_dataset = MNIST( 87 | "./", download=True, train=False, transform=transforms.ToTensor() 88 | ) 89 | 90 | train_dataloader = DataLoader(train_dataset, batch_size=32) 91 | val_dataloader = DataLoader(val_dataset, batch_size=32) 92 | test_dataloader = DataLoader(test_dataset, batch_size=32) 93 | 94 | return train_dataloader, val_dataloader, test_dataloader 95 | 96 | 97 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 98 | 99 | model = LitConvClassifier() 100 | 101 | trainer = pl.Trainer( 102 | max_epochs=1, 103 | default_root_dir="experiments/", 104 | callbacks=[ 105 | EarlyStopping(monitor="val_loss", mode="min"), 106 | ModelSummary(max_depth=-1), 107 | ], 108 | ) 109 | trainer.fit(model, train_dataloader, val_dataloader) 110 | 111 | # Here we use the test_dataloader to get the predictions for the test set 112 | # You can use the predict_step() method for the required data 113 | predictions = trainer.predict(model, test_dataloader) 114 | print(len(predictions)) 115 | -------------------------------------------------------------------------------- /src/basic/level_03_checkpointing/01_saving_and_loading_checkpoints.py: -------------------------------------------------------------------------------- 1 | # Documentation Link: 2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#save-a-checkpoint 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class LitConvClassifier(pl.LightningModule): 16 | def __init__(self, learning_rate=1e-3): 17 | super().__init__() 18 | 19 | # You can save the hyperparameters initialized in the __init__ method 20 | # by calling self.save_hyperparameters() in the __init__ method. 21 | # Here we save the learning_rate hyperparameter. 
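# The saved values then live under self.hparams (e.g. self.hparams.learning_rate) and are
# written into every checkpoint, so load_from_checkpoint() can rebuild the module without
# re-passing them.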
22 | self.save_hyperparameters() 23 | 24 | self.learning_rate = learning_rate 25 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 26 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 27 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 28 | self.fc2 = nn.Linear(128, 10) 29 | 30 | def forward(self, x): 31 | x = F.relu(self.conv1(x)) 32 | x = F.max_pool2d(x, 2) 33 | x = F.relu(self.conv2(x)) 34 | x = F.max_pool2d(x, 2) 35 | x = x.view(x.size(0), -1) 36 | x = F.relu(self.fc1(x)) 37 | x = self.fc2(x) 38 | return x 39 | 40 | def training_step(self, batch, batch_idx): 41 | x, y = batch 42 | y_hat = self(x) 43 | loss = F.cross_entropy(y_hat, y) 44 | return loss 45 | 46 | def validation_step(self, batch, batch_idx): 47 | x, y = batch 48 | y_hat = self(x) 49 | loss = F.cross_entropy(y_hat, y) 50 | return loss 51 | 52 | def test_step(self, batch, batch_idx): 53 | x, y = batch 54 | y_hat = self(x) 55 | loss = F.cross_entropy(y_hat, y) 56 | return loss 57 | 58 | def configure_optimizers(self): 59 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 60 | return optimizer 61 | 62 | 63 | def prepare_dataloaders(): 64 | train_dataset = MNIST( 65 | "./", download=True, train=True, transform=transforms.ToTensor() 66 | ) 67 | 68 | train_size = int(0.8 * len(train_dataset)) 69 | val_size = len(train_dataset) - train_size 70 | 71 | seed = torch.Generator().manual_seed(42) 72 | train_dataset, val_dataset = torch.utils.data.random_split( 73 | train_dataset, [train_size, val_size], generator=seed 74 | ) 75 | 76 | test_dataset = MNIST( 77 | "./", download=True, train=False, transform=transforms.ToTensor() 78 | ) 79 | 80 | train_dataloader = DataLoader(train_dataset, batch_size=32) 81 | val_dataloader = DataLoader(val_dataset, batch_size=32) 82 | test_dataloader = DataLoader(test_dataset, batch_size=32) 83 | 84 | return train_dataloader, val_dataloader, test_dataloader 85 | 86 | 87 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 88 | 89 | model = LitConvClassifier() 90 | 91 | # Lightning automatically saves a checkpoint for you in your current working directory, 92 | # with the state of your last training epoch. 93 | # Or you can specify the path to save the checkpoint to. 
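# (A minimal sketch, not used in this script: for finer-grained control over what gets saved,
# a ModelCheckpoint callback could be passed to the Trainer; the values below are illustrative.)
#
#   from lightning.pytorch.callbacks import ModelCheckpoint
#   checkpoint_callback = ModelCheckpoint(monitor="val_loss", mode="min", save_top_k=1)
#   trainer = pl.Trainer(max_epochs=1, callbacks=[checkpoint_callback])
#
# The simplest option, used below, is to point default_root_dir at a directory of your choice.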
94 | trainer = pl.Trainer(max_epochs=1, default_root_dir="experiments/") 95 | 96 | trainer.fit(model, train_dataloader, val_dataloader) 97 | 98 | # Load the checkpoint from the path 99 | # You can modify the path to the checkpoint file you want to load 100 | checkpoint_path = ( 101 | "experiments/lightning_logs/version_0/checkpoints/epoch=0-step=1500.ckpt" 102 | ) 103 | 104 | # By default, the checkpoint loads the model with the same parameters as the original model 105 | model = LitConvClassifier.load_from_checkpoint(checkpoint_path) 106 | print(f"Original Model Learning Rate: {model.learning_rate}") # prints 0.001 107 | 108 | # You can also load the checkpoint with different parameters 109 | model = LitConvClassifier.load_from_checkpoint(checkpoint_path, learning_rate=0.01) 110 | print(f"Updated Model Learning Rate: {model.learning_rate}") # prints 0.01 111 | -------------------------------------------------------------------------------- /src/basic/level_04_early_stopping/early_stopping.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/early_stopping.html 3 | 4 | 5 | import os 6 | 7 | import lightning.pytorch as pl 8 | import torch 9 | 10 | # Import the early stopping callback 11 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | class LitConvClassifier(pl.LightningModule): 20 | def __init__(self, learning_rate=1e-3): 21 | super().__init__() 22 | self.save_hyperparameters() 23 | 24 | self.learning_rate = learning_rate 25 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 26 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 27 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 28 | self.fc2 = nn.Linear(128, 10) 29 | 30 | def forward(self, x): 31 | x = F.relu(self.conv1(x)) 32 | x = F.max_pool2d(x, 2) 33 | x = F.relu(self.conv2(x)) 34 | x = F.max_pool2d(x, 2) 35 | x = x.view(x.size(0), -1) 36 | x = F.relu(self.fc1(x)) 37 | x = self.fc2(x) 38 | return x 39 | 40 | def training_step(self, batch, batch_idx): 41 | x, y = batch 42 | y_hat = self(x) 43 | loss = F.cross_entropy(y_hat, y) 44 | return loss 45 | 46 | def validation_step(self, batch, batch_idx): 47 | x, y = batch 48 | y_hat = self(x) 49 | loss = F.cross_entropy(y_hat, y) 50 | 51 | # First we log the loss of interest 52 | self.log("val_loss", loss) 53 | return loss 54 | 55 | def test_step(self, batch, batch_idx): 56 | x, y = batch 57 | y_hat = self(x) 58 | loss = F.cross_entropy(y_hat, y) 59 | return loss 60 | 61 | def configure_optimizers(self): 62 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 63 | return optimizer 64 | 65 | 66 | def prepare_dataloaders(): 67 | train_dataset = MNIST( 68 | "./", download=True, train=True, transform=transforms.ToTensor() 69 | ) 70 | 71 | train_size = int(0.8 * len(train_dataset)) 72 | val_size = len(train_dataset) - train_size 73 | 74 | seed = torch.Generator().manual_seed(42) 75 | train_dataset, val_dataset = torch.utils.data.random_split( 76 | train_dataset, [train_size, val_size], generator=seed 77 | ) 78 | 79 | test_dataset = MNIST( 80 | "./", download=True, train=False, transform=transforms.ToTensor() 81 | ) 82 | 83 | train_dataloader = DataLoader(train_dataset, batch_size=32) 84 | val_dataloader = DataLoader(val_dataset, 
batch_size=32) 85 | test_dataloader = DataLoader(test_dataset, batch_size=32) 86 | 87 | return train_dataloader, val_dataloader, test_dataloader 88 | 89 | 90 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 91 | 92 | model = LitConvClassifier() 93 | 94 | # Then pass the callback to the trainer 95 | trainer = pl.Trainer( 96 | max_epochs=3, 97 | default_root_dir="experiments/", 98 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")], 99 | ) 100 | trainer.fit(model, train_dataloader, val_dataloader) 101 | 102 | # Or customize the early stopping callback and pass it to the trainer 103 | early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=3, verbose=True) 104 | trainer = pl.Trainer( 105 | max_epochs=3, default_root_dir="experiments/", callbacks=[early_stopping] 106 | ) 107 | 108 | trainer.fit(model, train_dataloader, val_dataloader) 109 | 110 | # Additional parameters that stop training at extreme points: 111 | # --> stopping_threshold: Stops training immediately once the monitored quantity reaches this threshold. 112 | # It is useful when we know that going beyond a certain optimal value does not further benefit us. 113 | 114 | # --> divergence_threshold: Stops training as soon as the monitored quantity becomes worse than this threshold. 115 | # When reaching a value this bad, we believe the model cannot recover anymore 116 | # and it is better to stop early and run with different initial conditions. 117 | 118 | # --> check_finite: When turned on, it stops training if the monitored metric becomes NaN or infinite. 119 | 120 | # --> check_on_train_epoch_end: When turned on, it checks the metric at the end of a training epoch. 121 | # Use this only when you are monitoring a metric logged within training-specific 122 | # hooks at the epoch level.
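
# A minimal sketch combining the arguments described above. The threshold values are purely
# illustrative and this callback is not passed to any Trainer in this script.
early_stopping_extremes = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=3,
    stopping_threshold=0.02,  # stop as soon as val_loss is already good enough
    divergence_threshold=5.0,  # stop immediately if val_loss becomes this bad
    check_finite=True,  # stop if val_loss turns NaN or infinite
    check_on_train_epoch_end=False,  # check the metric when validation ends, not at the end of the training epoch
)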
123 | -------------------------------------------------------------------------------- /src/intermediate/level_12_deploying_models/01_onnx.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/deploy/production_advanced.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import onnxruntime 8 | import torch 9 | from lightning.pytorch.callbacks import ModelSummary, StochasticWeightAveraging 10 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 11 | from torch import nn 12 | from torch.nn import functional as F 13 | from torch.utils.data import DataLoader, random_split 14 | from torchvision import transforms 15 | from torchvision.datasets import MNIST 16 | 17 | 18 | class MNISTDataModule(pl.LightningDataModule): 19 | def __init__(self, data_dir: str = "./"): 20 | super().__init__() 21 | self.data_dir = data_dir 22 | self.transform = transforms.Compose( 23 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 24 | ) 25 | 26 | def prepare_data(self): 27 | # download 28 | MNIST(self.data_dir, train=True, download=True) 29 | MNIST(self.data_dir, train=False, download=True) 30 | 31 | def setup(self, stage: str): 32 | # Assign train/val datasets for use in dataloaders 33 | if stage == "fit": 34 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 35 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 36 | 37 | # Assign test dataset for use in dataloader(s) 38 | if stage == "test": 39 | self.mnist_test = MNIST( 40 | self.data_dir, train=False, transform=self.transform 41 | ) 42 | 43 | if stage == "predict": 44 | self.mnist_predict = MNIST( 45 | self.data_dir, train=False, transform=self.transform 46 | ) 47 | 48 | def train_dataloader(self): 49 | return DataLoader(self.mnist_train, batch_size=32) 50 | 51 | def val_dataloader(self): 52 | return DataLoader(self.mnist_val, batch_size=32) 53 | 54 | def test_dataloader(self): 55 | return DataLoader(self.mnist_test, batch_size=32) 56 | 57 | def predict_dataloader(self): 58 | return DataLoader(self.mnist_predict, batch_size=32) 59 | 60 | 61 | class LitConvClassifier(pl.LightningModule): 62 | def __init__(self, learning_rate=1e-3): 63 | super().__init__() 64 | self.save_hyperparameters() 65 | self.example_input_array = torch.rand(1, 1, 28, 28) 66 | 67 | self.learning_rate = learning_rate 68 | 69 | # Define blocks of layers as submodules 70 | self.conv_block1 = nn.Sequential( 71 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 72 | ) 73 | 74 | self.conv_block2 = nn.Sequential( 75 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 76 | ) 77 | 78 | self.fc_block = nn.Sequential( 79 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 80 | ) 81 | 82 | def forward(self, x): 83 | x = self.conv_block1(x) 84 | x = self.conv_block2(x) 85 | x = x.view(x.size(0), -1) 86 | x = self.fc_block(x) 87 | return x 88 | 89 | def training_step(self, batch, batch_idx): 90 | x, y = batch 91 | y_hat = self(x) 92 | loss = F.cross_entropy(y_hat, y) 93 | return loss 94 | 95 | def validation_step(self, batch, batch_idx): 96 | x, y = batch 97 | y_hat = self(x) 98 | loss = F.cross_entropy(y_hat, y) 99 | self.log("val_loss", loss) 100 | return loss 101 | 102 | def test_step(self, batch, batch_idx): 103 | x, y = batch 104 | y_hat = self(x) 105 | loss = F.cross_entropy(y_hat, y) 106 | return loss 107 | 108 | def predict_step(self, batch, 
batch_idx, dataloader_idx=None): 109 | x, _ = batch 110 | return self(x) 111 | 112 | def configure_optimizers(self): 113 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 114 | return optimizer 115 | 116 | 117 | data_module = MNISTDataModule() 118 | model = LitConvClassifier() 119 | 120 | # ONNX is a package developed by Microsoft to optimize inference. 121 | # ONNX allows the model to be independent of PyTorch and run on any ONNX Runtime. 122 | filepath = "model.onnx" 123 | model.to_onnx(filepath, export_params=True) 124 | 125 | # Once you have the exported model, you can run it on your ONNX runtime in the following way: 126 | ort_session = onnxruntime.InferenceSession(filepath) 127 | input_name = ort_session.get_inputs()[0].name 128 | ort_inputs = {input_name: torch.rand(1, 1, 28, 28).numpy()} 129 | ort_outs = ort_session.run(None, ort_inputs) 130 | -------------------------------------------------------------------------------- /src/basic/level_06_debugging_model/01_quick_run.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/debug/debugging_basic.html 3 | 4 | import os 5 | import time 6 | 7 | import lightning.pytorch as pl 8 | import torch 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class LitConvClassifier(pl.LightningModule): 18 | def __init__(self, learning_rate=1e-3): 19 | super().__init__() 20 | self.save_hyperparameters() 21 | 22 | self.learning_rate = learning_rate 23 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1) 24 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1) 25 | self.fc1 = nn.Linear(64 * 7 * 7, 128) 26 | self.fc2 = nn.Linear(128, 10) 27 | 28 | def forward(self, x): 29 | x = F.relu(self.conv1(x)) 30 | x = F.max_pool2d(x, 2) 31 | x = F.relu(self.conv2(x)) 32 | x = F.max_pool2d(x, 2) 33 | x = x.view(x.size(0), -1) 34 | x = F.relu(self.fc1(x)) 35 | x = self.fc2(x) 36 | return x 37 | 38 | def training_step(self, batch, batch_idx): 39 | x, y = batch 40 | y_hat = self(x) 41 | loss = F.cross_entropy(y_hat, y) 42 | return loss 43 | 44 | def validation_step(self, batch, batch_idx): 45 | x, y = batch 46 | y_hat = self(x) 47 | loss = F.cross_entropy(y_hat, y) 48 | self.log("val_loss", loss) 49 | return loss 50 | 51 | def test_step(self, batch, batch_idx): 52 | x, y = batch 53 | y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | return loss 56 | 57 | def configure_optimizers(self): 58 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 59 | return optimizer 60 | 61 | 62 | def prepare_dataloaders(): 63 | train_dataset = MNIST( 64 | "./", download=True, train=True, transform=transforms.ToTensor() 65 | ) 66 | 67 | train_size = int(0.8 * len(train_dataset)) 68 | val_size = len(train_dataset) - train_size 69 | 70 | seed = torch.Generator().manual_seed(42) 71 | train_dataset, val_dataset = torch.utils.data.random_split( 72 | train_dataset, [train_size, val_size], generator=seed 73 | ) 74 | 75 | test_dataset = MNIST( 76 | "./", download=True, train=False, transform=transforms.ToTensor() 77 | ) 78 | 79 | train_dataloader = DataLoader(train_dataset, batch_size=32) 80 | val_dataloader = DataLoader(val_dataset, batch_size=32) 81 | test_dataloader = DataLoader(test_dataset, batch_size=32) 
82 | 83 | return train_dataloader, val_dataloader, test_dataloader 84 | 85 | 86 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 87 | 88 | model = LitConvClassifier() 89 | 90 | # Default 91 | start = time.time() 92 | trainer = pl.Trainer( 93 | max_epochs=1, 94 | default_root_dir="experiments/", 95 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")], 96 | ) 97 | trainer.fit(model, train_dataloader, val_dataloader) 98 | end = time.time() 99 | print(f"\nDefault Training time: {end - start}") 100 | 101 | # fast_dev_run 102 | # The fast_dev_run argument in the trainer runs a single batch of training, validation, 103 | # test and prediction data through your trainer to see if there are any bugs. 104 | # To run more, set the argument to an integer number of batches. 105 | # This argument will disable the tuner, checkpoint callbacks, early stopping callbacks, 106 | # loggers and logger callbacks like LearningRateMonitor and DeviceStatsMonitor. 107 | start = time.time() 108 | trainer = pl.Trainer(fast_dev_run=True) 109 | trainer.fit(model, train_dataloader, val_dataloader) 110 | end = time.time() 111 | print(f"\nFast Dev Run Training time: {end - start}") 112 | 113 | # Shorten Epoch Length 114 | # Here we use only 10% of the training data and 10% of the validation data. 115 | # You can also specify the number of batches as integers. 116 | start = time.time() 117 | trainer = pl.Trainer(max_epochs=1, limit_train_batches=0.1, limit_val_batches=0.1) 118 | trainer.fit(model, train_dataloader, val_dataloader) 119 | end = time.time() 120 | print(f"\nShortened Epoch Training time: {end - start}") 121 | 122 | # Sanity Check 123 | # Lightning runs 2 steps of validation in the beginning of training. 124 | # This avoids crashing in the validation loop sometime deep into a lengthy training loop.
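# Set num_sanity_val_steps=0 in the Trainer to skip this sanity check entirely.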
125 | start = time.time() 126 | trainer = pl.Trainer(max_epochs=1, num_sanity_val_steps=2) 127 | trainer.fit(model, train_dataloader, val_dataloader) 128 | end = time.time() 129 | print(f"\nSanity Check Training time: {end - start}") 130 | -------------------------------------------------------------------------------- /src/intermediate/level_11_scaling_techniques/01_precision_training.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/precision_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class MNISTDataModule(pl.LightningDataModule): 18 | def __init__(self, data_dir: str = "./"): 19 | super().__init__() 20 | self.data_dir = data_dir 21 | self.transform = transforms.Compose( 22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 23 | ) 24 | 25 | def prepare_data(self): 26 | # download 27 | MNIST(self.data_dir, train=True, download=True) 28 | MNIST(self.data_dir, train=False, download=True) 29 | 30 | def setup(self, stage: str): 31 | # Assign train/val datasets for use in dataloaders 32 | if stage == "fit": 33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 35 | 36 | # Assign test dataset for use in dataloader(s) 37 | if stage == "test": 38 | self.mnist_test = MNIST( 39 | self.data_dir, train=False, transform=self.transform 40 | ) 41 | 42 | if stage == "predict": 43 | self.mnist_predict = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | def train_dataloader(self): 48 | return DataLoader(self.mnist_train, batch_size=32) 49 | 50 | def val_dataloader(self): 51 | return DataLoader(self.mnist_val, batch_size=32) 52 | 53 | def test_dataloader(self): 54 | return DataLoader(self.mnist_test, batch_size=32) 55 | 56 | def predict_dataloader(self): 57 | return DataLoader(self.mnist_predict, batch_size=32) 58 | 59 | 60 | class LitConvClassifier(pl.LightningModule): 61 | def __init__(self, learning_rate=1e-3): 62 | super().__init__() 63 | self.save_hyperparameters() 64 | self.example_input_array = torch.rand(1, 1, 28, 28) 65 | 66 | self.learning_rate = learning_rate 67 | 68 | # Define blocks of layers as submodules 69 | self.conv_block1 = nn.Sequential( 70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 71 | ) 72 | 73 | self.conv_block2 = nn.Sequential( 74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.fc_block = nn.Sequential( 78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv_block1(x) 83 | x = self.conv_block2(x) 84 | x = x.view(x.size(0), -1) 85 | x = self.fc_block(x) 86 | return x 87 | 88 | def training_step(self, batch, batch_idx): 89 | x, y = batch 90 | y_hat = self(x) 91 | loss = F.cross_entropy(y_hat, y) 92 | return loss 93 | 94 | def validation_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | self.log("val_loss", loss) 99 | return loss 100 | 
101 | def test_step(self, batch, batch_idx): 102 | x, y = batch 103 | y_hat = self(x) 104 | loss = F.cross_entropy(y_hat, y) 105 | return loss 106 | 107 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 108 | x, _ = batch 109 | return self(x) 110 | 111 | def configure_optimizers(self): 112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 113 | return optimizer 114 | 115 | 116 | data_module = MNISTDataModule() 117 | model = LitConvClassifier() 118 | 119 | # Lower precision, such as 16-bit floating-point, requires less memory and 120 | # enables training and deploying larger models. Higher precision, 121 | # such as the 64-bit floating-point, can be used for highly sensitive use-cases. 122 | 123 | trainer = pl.Trainer( 124 | max_epochs=1, 125 | default_root_dir="experiments/", 126 | callbacks=[ 127 | EarlyStopping(monitor="val_loss", mode="min"), 128 | ModelSummary(max_depth=-1), 129 | ], 130 | precision="16-mixed", 131 | ) 132 | 133 | trainer.fit(model, data_module) 134 | 135 | # Get Predictions 136 | predictions = trainer.predict(model, data_module) 137 | print(len(predictions)) 138 | -------------------------------------------------------------------------------- /src/intermediate/level_08_accelerated_hardware/02_tpu_traininig.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/accelerators/tpu_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class LitConvClassifier(pl.LightningModule): 18 | def __init__(self, learning_rate=1e-3): 19 | super().__init__() 20 | self.save_hyperparameters() 21 | self.example_input_array = torch.rand(1, 1, 28, 28) 22 | 23 | self.learning_rate = learning_rate 24 | 25 | # Define blocks of layers as submodules 26 | self.conv_block1 = nn.Sequential( 27 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 28 | ) 29 | 30 | self.conv_block2 = nn.Sequential( 31 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 32 | ) 33 | 34 | self.fc_block = nn.Sequential( 35 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 36 | ) 37 | 38 | def forward(self, x): 39 | x = self.conv_block1(x) 40 | x = self.conv_block2(x) 41 | x = x.view(x.size(0), -1) 42 | x = self.fc_block(x) 43 | return x 44 | 45 | def training_step(self, batch, batch_idx): 46 | x, y = batch 47 | y_hat = self(x) 48 | loss = F.cross_entropy(y_hat, y) 49 | return loss 50 | 51 | def validation_step(self, batch, batch_idx): 52 | x, y = batch 53 | y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | self.log("val_loss", loss) 56 | return loss 57 | 58 | def test_step(self, batch, batch_idx): 59 | x, y = batch 60 | y_hat = self(x) 61 | loss = F.cross_entropy(y_hat, y) 62 | return loss 63 | 64 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 65 | x, _ = batch 66 | return self(x) 67 | 68 | def configure_optimizers(self): 69 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 70 | return optimizer 71 | 72 | 73 | def prepare_dataloaders(): 74 | train_dataset = MNIST( 75 | "./", download=True, train=True, 
transform=transforms.ToTensor() 76 | ) 77 | 78 | train_size = int(0.8 * len(train_dataset)) 79 | val_size = len(train_dataset) - train_size 80 | 81 | seed = torch.Generator().manual_seed(42) 82 | train_dataset, val_dataset = torch.utils.data.random_split( 83 | train_dataset, [train_size, val_size], generator=seed 84 | ) 85 | 86 | test_dataset = MNIST( 87 | "./", download=True, train=False, transform=transforms.ToTensor() 88 | ) 89 | 90 | train_dataloader = DataLoader(train_dataset, batch_size=32) 91 | val_dataloader = DataLoader(val_dataset, batch_size=32) 92 | test_dataloader = DataLoader(test_dataset, batch_size=32) 93 | 94 | return train_dataloader, val_dataloader, test_dataloader 95 | 96 | 97 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 98 | 99 | model = LitConvClassifier() 100 | 101 | # Tensor Processing Unit (TPU) is an AI accelerator application-specific integrated circuit (ASIC) developed by 102 | # Google specifically for neural networks. 103 | 104 | # A TPU has 8 cores where each core is optimized for 128x128 matrix multiplies. 105 | # In general, a single TPU is about as fast as 5 V100 GPUs! 106 | 107 | # A TPU pod hosts many TPUs on it. Currently, TPU v3 Pod has up to 2048 TPU cores and 32 TiB of memory! 108 | # You can request a full pod from Google cloud or a “slice” which gives you some subset of those 2048 cores. 109 | 110 | # run on as many TPUs as available by default 111 | trainer = pl.Trainer( 112 | max_epochs=5, 113 | default_root_dir="experiments/", 114 | accelerator="auto", 115 | devices="auto", 116 | strategy="auto", 117 | ) 118 | # equivalent to 119 | trainer = pl.Trainer() 120 | 121 | # run on one TPU core 122 | trainer = pl.Trainer( 123 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices=1 124 | ) 125 | 126 | # run on multiple TPU cores 127 | trainer = pl.Trainer( 128 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices=8 129 | ) 130 | 131 | # run on the 5th core 132 | trainer = pl.Trainer( 133 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices=[5] 134 | ) 135 | 136 | # choose the number of cores automatically 137 | trainer = pl.Trainer( 138 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices="auto" 139 | ) 140 | 141 | trainer.fit(model, train_dataloader, val_dataloader) 142 | -------------------------------------------------------------------------------- /src/basic/level_06_debugging_model/02_model_summary.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/debug/debugging_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | 9 | # Used for child modules in the model summary 10 | from lightning.pytorch.callbacks import ModelSummary 11 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | # We have updated the model to use nn.Sequential() and named the blocks of layers. 20 | # This will help us understand the Model Summary output. 
21 | class LitConvClassifier(pl.LightningModule): 22 | def __init__(self, learning_rate=1e-3): 23 | super().__init__() 24 | self.save_hyperparameters() 25 | 26 | # Another debugging tool is to display the intermediate input- and output sizes of 27 | # all your layers by setting the example_input_array attribute in your LightningModule. 28 | self.example_input_array = torch.rand(1, 1, 28, 28) 29 | 30 | self.learning_rate = learning_rate 31 | 32 | # Define blocks of layers as submodules 33 | self.conv_block1 = nn.Sequential( 34 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 35 | ) 36 | 37 | self.conv_block2 = nn.Sequential( 38 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 39 | ) 40 | 41 | self.fc_block = nn.Sequential( 42 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 43 | ) 44 | 45 | def forward(self, x): 46 | x = self.conv_block1(x) 47 | x = self.conv_block2(x) 48 | x = x.view(x.size(0), -1) 49 | x = self.fc_block(x) 50 | return x 51 | 52 | def training_step(self, batch, batch_idx): 53 | x, y = batch 54 | y_hat = self(x) 55 | loss = F.cross_entropy(y_hat, y) 56 | return loss 57 | 58 | def validation_step(self, batch, batch_idx): 59 | x, y = batch 60 | y_hat = self(x) 61 | loss = F.cross_entropy(y_hat, y) 62 | self.log("val_loss", loss) 63 | return loss 64 | 65 | def test_step(self, batch, batch_idx): 66 | x, y = batch 67 | y_hat = self(x) 68 | loss = F.cross_entropy(y_hat, y) 69 | return loss 70 | 71 | def configure_optimizers(self): 72 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 73 | return optimizer 74 | 75 | 76 | def prepare_dataloaders(): 77 | train_dataset = MNIST( 78 | "./", download=True, train=True, transform=transforms.ToTensor() 79 | ) 80 | 81 | train_size = int(0.8 * len(train_dataset)) 82 | val_size = len(train_dataset) - train_size 83 | 84 | seed = torch.Generator().manual_seed(42) 85 | train_dataset, val_dataset = torch.utils.data.random_split( 86 | train_dataset, [train_size, val_size], generator=seed 87 | ) 88 | 89 | test_dataset = MNIST( 90 | "./", download=True, train=False, transform=transforms.ToTensor() 91 | ) 92 | 93 | train_dataloader = DataLoader(train_dataset, batch_size=32) 94 | val_dataloader = DataLoader(val_dataset, batch_size=32) 95 | test_dataloader = DataLoader(test_dataset, batch_size=32) 96 | 97 | return train_dataloader, val_dataloader, test_dataloader 98 | 99 | 100 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 101 | 102 | model = LitConvClassifier() 103 | 104 | # Default 105 | # Whenever the .fit() function gets called, 106 | # the Trainer will print the weights summary for the LightningModule. 
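# Because example_input_array is set on the model, the summary also reports the input and
# output sizes of every layer.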
107 | print("\n----------------------------------") 108 | print("Default Model Summary") 109 | print("----------------------------------") 110 | trainer = pl.Trainer( 111 | max_epochs=1, 112 | default_root_dir="experiments/", 113 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")], 114 | ) 115 | trainer.fit(model, train_dataloader, val_dataloader) 116 | 117 | # Child Modules 118 | print("\n----------------------------------") 119 | print("Child Modules Model Summary") 120 | print("----------------------------------") 121 | trainer = pl.Trainer( 122 | max_epochs=1, 123 | default_root_dir="experiments/", 124 | callbacks=[ 125 | EarlyStopping(monitor="val_loss", mode="min"), 126 | ModelSummary(max_depth=-1), 127 | ], 128 | ) 129 | trainer.fit(model, train_dataloader, val_dataloader) 130 | 131 | # Turn off model summary 132 | print("\n----------------------------------") 133 | print("Turn off Model Summary") 134 | print("----------------------------------") 135 | trainer = pl.Trainer( 136 | max_epochs=1, 137 | default_root_dir="experiments/", 138 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")], 139 | enable_model_summary=False, 140 | ) 141 | trainer.fit(model, train_dataloader, val_dataloader) 142 | -------------------------------------------------------------------------------- /src/intermediate/level_13_profiler/02_profile_pytorch_operations.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/tuning/profiler_intermediate.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ( 9 | DeviceStatsMonitor, 10 | ModelSummary, 11 | StochasticWeightAveraging, 12 | ) 13 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 14 | from lightning.pytorch.profilers import PyTorchProfiler 15 | from torch import nn 16 | from torch.nn import functional as F 17 | from torch.utils.data import DataLoader, random_split 18 | from torchvision import transforms 19 | from torchvision.datasets import MNIST 20 | 21 | 22 | class MNISTDataModule(pl.LightningDataModule): 23 | def __init__(self, data_dir: str = "./"): 24 | super().__init__() 25 | self.data_dir = data_dir 26 | self.transform = transforms.Compose( 27 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 28 | ) 29 | 30 | def prepare_data(self): 31 | # download 32 | MNIST(self.data_dir, train=True, download=True) 33 | MNIST(self.data_dir, train=False, download=True) 34 | 35 | def setup(self, stage: str): 36 | # Assign train/val datasets for use in dataloaders 37 | if stage == "fit": 38 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 39 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 40 | 41 | # Assign test dataset for use in dataloader(s) 42 | if stage == "test": 43 | self.mnist_test = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | if stage == "predict": 48 | self.mnist_predict = MNIST( 49 | self.data_dir, train=False, transform=self.transform 50 | ) 51 | 52 | def train_dataloader(self): 53 | return DataLoader(self.mnist_train, batch_size=32) 54 | 55 | def val_dataloader(self): 56 | return DataLoader(self.mnist_val, batch_size=32) 57 | 58 | def test_dataloader(self): 59 | return DataLoader(self.mnist_test, batch_size=32) 60 | 61 | def predict_dataloader(self): 62 | return DataLoader(self.mnist_predict, batch_size=32) 63 | 64 | 65 | class 
LitConvClassifier(pl.LightningModule): 66 | def __init__(self, learning_rate=1e-3): 67 | super().__init__() 68 | self.save_hyperparameters() 69 | self.example_input_array = torch.rand(1, 1, 28, 28) 70 | 71 | self.learning_rate = learning_rate 72 | 73 | # Define blocks of layers as submodules 74 | self.conv_block1 = nn.Sequential( 75 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 76 | ) 77 | 78 | self.conv_block2 = nn.Sequential( 79 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 80 | ) 81 | 82 | self.fc_block = nn.Sequential( 83 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 84 | ) 85 | 86 | def forward(self, x): 87 | x = self.conv_block1(x) 88 | x = self.conv_block2(x) 89 | x = x.view(x.size(0), -1) 90 | x = self.fc_block(x) 91 | return x 92 | 93 | def training_step(self, batch, batch_idx): 94 | x, y = batch 95 | y_hat = self(x) 96 | loss = F.cross_entropy(y_hat, y) 97 | return loss 98 | 99 | def validation_step(self, batch, batch_idx): 100 | x, y = batch 101 | y_hat = self(x) 102 | loss = F.cross_entropy(y_hat, y) 103 | self.log("val_loss", loss) 104 | return loss 105 | 106 | def test_step(self, batch, batch_idx): 107 | x, y = batch 108 | y_hat = self(x) 109 | loss = F.cross_entropy(y_hat, y) 110 | return loss 111 | 112 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 113 | x, _ = batch 114 | return self(x) 115 | 116 | def configure_optimizers(self): 117 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 118 | return optimizer 119 | 120 | 121 | data_module = MNISTDataModule() 122 | model = LitConvClassifier() 123 | 124 | # To understand the cost of each PyTorch operation, 125 | # use the PyTorchProfiler built on top of the PyTorch profiler. 126 | trainer = pl.Trainer( 127 | max_epochs=1, 128 | default_root_dir="experiments/", 129 | callbacks=[ 130 | EarlyStopping(monitor="val_loss", mode="min"), 131 | ModelSummary(max_depth=-1), 132 | StochasticWeightAveraging(swa_lrs=1e-2), 133 | DeviceStatsMonitor(), 134 | ], 135 | precision="16-mixed", 136 | profiler=PyTorchProfiler(), 137 | limit_train_batches=0.1, 138 | limit_val_batches=0.01, 139 | ) 140 | 141 | trainer.fit(model, data_module) 142 | 143 | # Get Predictions 144 | predictions = trainer.predict(model, data_module) 145 | print(len(predictions)) 146 | -------------------------------------------------------------------------------- /src/intermediate/level_11_scaling_techniques/04_stochastic_weight_averaging.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary, StochasticWeightAveraging 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class MNISTDataModule(pl.LightningDataModule): 18 | def __init__(self, data_dir: str = "./"): 19 | super().__init__() 20 | self.data_dir = data_dir 21 | self.transform = transforms.Compose( 22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 23 | ) 24 | 25 | def prepare_data(self): 26 | # download 27 | MNIST(self.data_dir, train=True, download=True) 28 | MNIST(self.data_dir, 
train=False, download=True) 29 | 30 | def setup(self, stage: str): 31 | # Assign train/val datasets for use in dataloaders 32 | if stage == "fit": 33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 35 | 36 | # Assign test dataset for use in dataloader(s) 37 | if stage == "test": 38 | self.mnist_test = MNIST( 39 | self.data_dir, train=False, transform=self.transform 40 | ) 41 | 42 | if stage == "predict": 43 | self.mnist_predict = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | def train_dataloader(self): 48 | return DataLoader(self.mnist_train, batch_size=32) 49 | 50 | def val_dataloader(self): 51 | return DataLoader(self.mnist_val, batch_size=32) 52 | 53 | def test_dataloader(self): 54 | return DataLoader(self.mnist_test, batch_size=32) 55 | 56 | def predict_dataloader(self): 57 | return DataLoader(self.mnist_predict, batch_size=32) 58 | 59 | 60 | class LitConvClassifier(pl.LightningModule): 61 | def __init__(self, learning_rate=1e-3): 62 | super().__init__() 63 | self.save_hyperparameters() 64 | self.example_input_array = torch.rand(1, 1, 28, 28) 65 | 66 | self.learning_rate = learning_rate 67 | 68 | # Define blocks of layers as submodules 69 | self.conv_block1 = nn.Sequential( 70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 71 | ) 72 | 73 | self.conv_block2 = nn.Sequential( 74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.fc_block = nn.Sequential( 78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv_block1(x) 83 | x = self.conv_block2(x) 84 | x = x.view(x.size(0), -1) 85 | x = self.fc_block(x) 86 | return x 87 | 88 | def training_step(self, batch, batch_idx): 89 | x, y = batch 90 | y_hat = self(x) 91 | loss = F.cross_entropy(y_hat, y) 92 | return loss 93 | 94 | def validation_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | self.log("val_loss", loss) 99 | return loss 100 | 101 | def test_step(self, batch, batch_idx): 102 | x, y = batch 103 | y_hat = self(x) 104 | loss = F.cross_entropy(y_hat, y) 105 | return loss 106 | 107 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 108 | x, _ = batch 109 | return self(x) 110 | 111 | def configure_optimizers(self): 112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 113 | return optimizer 114 | 115 | 116 | data_module = MNISTDataModule() 117 | model = LitConvClassifier() 118 | 119 | # Stochastic Weight Averaging (SWA) can make your models generalize better at virtually no additional cost. 120 | # This can be used with both non-trained and trained models. 121 | # The SWA procedure smooths the loss landscape thus making it harder to end up in a local minimum during optimization. 
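# Conceptually, once the SWA phase starts the callback keeps a running average of the model weights,
# roughly (a sketch of the idea, not Lightning's internal code):
#
#   w_swa <- (w_swa * n_averaged + w_current) / (n_averaged + 1)   # updated once per epoch in the SWA phase
#
# Besides swa_lrs, the callback exposes knobs for when averaging starts and how the learning rate is
# annealed. The argument names below follow the Lightning docs; check them against your version:
#
#   StochasticWeightAveraging(swa_lrs=1e-2, swa_epoch_start=0.8, annealing_epochs=10, annealing_strategy="cos")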
122 | 123 | trainer = pl.Trainer( 124 | max_epochs=1, 125 | default_root_dir="experiments/", 126 | callbacks=[ 127 | EarlyStopping(monitor="val_loss", mode="min"), 128 | ModelSummary(max_depth=-1), 129 | StochasticWeightAveraging( 130 | swa_lrs=1e-2 131 | ), # Enable Stochastic Weight Averaging using the callback 132 | ], 133 | precision="16-mixed", 134 | ) 135 | 136 | trainer.fit(model, data_module) 137 | 138 | # Get Predictions 139 | predictions = trainer.predict(model, data_module) 140 | print(len(predictions)) 141 | -------------------------------------------------------------------------------- /src/intermediate/level_09_modularize/01_lightning_datamodule.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/levels/intermediate_level_9.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | # The LightningDataModule is a convenient way to manage data in PyTorch Lightning. 17 | # It encapsulates training, validation, testing, and prediction dataloaders, 18 | # as well as any necessary steps for data processing, downloads, and transformations. 19 | # By using a LightningDataModule, you can easily develop dataset-agnostic models, hot-swap different datasets, 20 | # and share data splits and transformations across projects. 21 | 22 | 23 | class MNISTDataModule(pl.LightningDataModule): 24 | def __init__(self, data_dir: str = "./"): 25 | super().__init__() 26 | self.data_dir = data_dir 27 | self.transform = transforms.Compose( 28 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 29 | ) 30 | 31 | def prepare_data(self): 32 | # download 33 | MNIST(self.data_dir, train=True, download=True) 34 | MNIST(self.data_dir, train=False, download=True) 35 | 36 | def setup(self, stage: str): 37 | # Assign train/val datasets for use in dataloaders 38 | if stage == "fit": 39 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 40 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 41 | 42 | # Assign test dataset for use in dataloader(s) 43 | if stage == "test": 44 | self.mnist_test = MNIST( 45 | self.data_dir, train=False, transform=self.transform 46 | ) 47 | 48 | if stage == "predict": 49 | self.mnist_predict = MNIST( 50 | self.data_dir, train=False, transform=self.transform 51 | ) 52 | 53 | def train_dataloader(self): 54 | return DataLoader(self.mnist_train, batch_size=32) 55 | 56 | def val_dataloader(self): 57 | return DataLoader(self.mnist_val, batch_size=32) 58 | 59 | def test_dataloader(self): 60 | return DataLoader(self.mnist_test, batch_size=32) 61 | 62 | def predict_dataloader(self): 63 | return DataLoader(self.mnist_predict, batch_size=32) 64 | 65 | 66 | class LitConvClassifier(pl.LightningModule): 67 | def __init__(self, learning_rate=1e-3): 68 | super().__init__() 69 | self.save_hyperparameters() 70 | self.example_input_array = torch.rand(1, 1, 28, 28) 71 | 72 | self.learning_rate = learning_rate 73 | 74 | # Define blocks of layers as submodules 75 | self.conv_block1 = nn.Sequential( 76 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 77 | 
) 78 | 79 | self.conv_block2 = nn.Sequential( 80 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 81 | ) 82 | 83 | self.fc_block = nn.Sequential( 84 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 85 | ) 86 | 87 | def forward(self, x): 88 | x = self.conv_block1(x) 89 | x = self.conv_block2(x) 90 | x = x.view(x.size(0), -1) 91 | x = self.fc_block(x) 92 | return x 93 | 94 | def training_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | return loss 99 | 100 | def validation_step(self, batch, batch_idx): 101 | x, y = batch 102 | y_hat = self(x) 103 | loss = F.cross_entropy(y_hat, y) 104 | self.log("val_loss", loss) 105 | return loss 106 | 107 | def test_step(self, batch, batch_idx): 108 | x, y = batch 109 | y_hat = self(x) 110 | loss = F.cross_entropy(y_hat, y) 111 | return loss 112 | 113 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 114 | x, _ = batch 115 | return self(x) 116 | 117 | def configure_optimizers(self): 118 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 119 | return optimizer 120 | 121 | 122 | data_module = MNISTDataModule() 123 | model = LitConvClassifier() 124 | 125 | trainer = pl.Trainer( 126 | max_epochs=1, 127 | default_root_dir="experiments/", 128 | callbacks=[ 129 | EarlyStopping(monitor="val_loss", mode="min"), 130 | ModelSummary(max_depth=-1), 131 | ], 132 | ) 133 | 134 | # Train Model 135 | # We can pass the data module directly to the trainer 136 | trainer.fit(model, data_module) 137 | 138 | # Get Predictions 139 | predictions = trainer.predict(model, data_module) 140 | print(len(predictions)) 141 | -------------------------------------------------------------------------------- /src/intermediate/level_11_scaling_techniques/02_gradient_accumulation.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class MNISTDataModule(pl.LightningDataModule): 18 | def __init__(self, data_dir: str = "./"): 19 | super().__init__() 20 | self.data_dir = data_dir 21 | self.transform = transforms.Compose( 22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 23 | ) 24 | 25 | def prepare_data(self): 26 | # download 27 | MNIST(self.data_dir, train=True, download=True) 28 | MNIST(self.data_dir, train=False, download=True) 29 | 30 | def setup(self, stage: str): 31 | # Assign train/val datasets for use in dataloaders 32 | if stage == "fit": 33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 35 | 36 | # Assign test dataset for use in dataloader(s) 37 | if stage == "test": 38 | self.mnist_test = MNIST( 39 | self.data_dir, train=False, transform=self.transform 40 | ) 41 | 42 | if stage == "predict": 43 | self.mnist_predict = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | def train_dataloader(self): 48 | return DataLoader(self.mnist_train, 
batch_size=32) 49 | 50 | def val_dataloader(self): 51 | return DataLoader(self.mnist_val, batch_size=32) 52 | 53 | def test_dataloader(self): 54 | return DataLoader(self.mnist_test, batch_size=32) 55 | 56 | def predict_dataloader(self): 57 | return DataLoader(self.mnist_predict, batch_size=32) 58 | 59 | 60 | class LitConvClassifier(pl.LightningModule): 61 | def __init__(self, learning_rate=1e-3): 62 | super().__init__() 63 | self.save_hyperparameters() 64 | self.example_input_array = torch.rand(1, 1, 28, 28) 65 | 66 | self.learning_rate = learning_rate 67 | 68 | # Define blocks of layers as submodules 69 | self.conv_block1 = nn.Sequential( 70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 71 | ) 72 | 73 | self.conv_block2 = nn.Sequential( 74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.fc_block = nn.Sequential( 78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv_block1(x) 83 | x = self.conv_block2(x) 84 | x = x.view(x.size(0), -1) 85 | x = self.fc_block(x) 86 | return x 87 | 88 | def training_step(self, batch, batch_idx): 89 | x, y = batch 90 | y_hat = self(x) 91 | loss = F.cross_entropy(y_hat, y) 92 | return loss 93 | 94 | def validation_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | self.log("val_loss", loss) 99 | return loss 100 | 101 | def test_step(self, batch, batch_idx): 102 | x, y = batch 103 | y_hat = self(x) 104 | loss = F.cross_entropy(y_hat, y) 105 | return loss 106 | 107 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 108 | x, _ = batch 109 | return self(x) 110 | 111 | def configure_optimizers(self): 112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 113 | return optimizer 114 | 115 | 116 | data_module = MNISTDataModule() 117 | model = LitConvClassifier() 118 | 119 | # Accumulated gradients run K small batches of size N before doing a backward pass. 120 | # The effect is a large effective batch size of size KxN, where N is the batch size. 121 | # Internally it doesn’t stack up the batches and do a forward pass rather 122 | # it accumulates the gradients for K batches and then do an optimizer.step 123 | # to make sure the effective batch size is increased but there is no memory overhead. 
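# For intuition, accumulate_grad_batches=K behaves roughly like the hand-written loop sketched in the
# comments below (plain PyTorch with placeholder names such as model, train_loader and optimizer; kept
# as comments so it does not execute as part of this script):
#
#   K = 7
#   for i, (x, y) in enumerate(train_loader):
#       loss = F.cross_entropy(model(x), y) / K    # scale so the summed gradients match one large batch
#       loss.backward()                            # gradients accumulate in .grad across the K micro-batches
#       if (i + 1) % K == 0:
#           optimizer.step()                       # one optimizer update per K micro-batches
#           optimizer.zero_grad()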
124 | 125 | trainer = pl.Trainer( 126 | max_epochs=1, 127 | default_root_dir="experiments/", 128 | callbacks=[ 129 | EarlyStopping(monitor="val_loss", mode="min"), 130 | ModelSummary(max_depth=-1), 131 | ], 132 | precision="16-mixed", 133 | accumulate_grad_batches=7, # Accumulate gradients for 7 batches 134 | ) 135 | 136 | trainer.fit(model, data_module) 137 | 138 | # Get Predictions 139 | predictions = trainer.predict(model, data_module) 140 | print(len(predictions)) 141 | -------------------------------------------------------------------------------- /src/intermediate/level_11_scaling_techniques/03_gradient_clipping.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader, random_split 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class MNISTDataModule(pl.LightningDataModule): 18 | def __init__(self, data_dir: str = "./"): 19 | super().__init__() 20 | self.data_dir = data_dir 21 | self.transform = transforms.Compose( 22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 23 | ) 24 | 25 | def prepare_data(self): 26 | # download 27 | MNIST(self.data_dir, train=True, download=True) 28 | MNIST(self.data_dir, train=False, download=True) 29 | 30 | def setup(self, stage: str): 31 | # Assign train/val datasets for use in dataloaders 32 | if stage == "fit": 33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 35 | 36 | # Assign test dataset for use in dataloader(s) 37 | if stage == "test": 38 | self.mnist_test = MNIST( 39 | self.data_dir, train=False, transform=self.transform 40 | ) 41 | 42 | if stage == "predict": 43 | self.mnist_predict = MNIST( 44 | self.data_dir, train=False, transform=self.transform 45 | ) 46 | 47 | def train_dataloader(self): 48 | return DataLoader(self.mnist_train, batch_size=32) 49 | 50 | def val_dataloader(self): 51 | return DataLoader(self.mnist_val, batch_size=32) 52 | 53 | def test_dataloader(self): 54 | return DataLoader(self.mnist_test, batch_size=32) 55 | 56 | def predict_dataloader(self): 57 | return DataLoader(self.mnist_predict, batch_size=32) 58 | 59 | 60 | class LitConvClassifier(pl.LightningModule): 61 | def __init__(self, learning_rate=1e-3): 62 | super().__init__() 63 | self.save_hyperparameters() 64 | self.example_input_array = torch.rand(1, 1, 28, 28) 65 | 66 | self.learning_rate = learning_rate 67 | 68 | # Define blocks of layers as submodules 69 | self.conv_block1 = nn.Sequential( 70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 71 | ) 72 | 73 | self.conv_block2 = nn.Sequential( 74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.fc_block = nn.Sequential( 78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv_block1(x) 83 | x = self.conv_block2(x) 84 | x = x.view(x.size(0), -1) 85 | x = self.fc_block(x) 86 | return x 87 | 88 | def training_step(self, batch, batch_idx): 89 | x, y = batch 90 | y_hat = self(x) 
91 | loss = F.cross_entropy(y_hat, y) 92 | return loss 93 | 94 | def validation_step(self, batch, batch_idx): 95 | x, y = batch 96 | y_hat = self(x) 97 | loss = F.cross_entropy(y_hat, y) 98 | self.log("val_loss", loss) 99 | return loss 100 | 101 | def test_step(self, batch, batch_idx): 102 | x, y = batch 103 | y_hat = self(x) 104 | loss = F.cross_entropy(y_hat, y) 105 | return loss 106 | 107 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 108 | x, _ = batch 109 | return self(x) 110 | 111 | def configure_optimizers(self): 112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 113 | return optimizer 114 | 115 | 116 | data_module = MNISTDataModule() 117 | model = LitConvClassifier() 118 | 119 | # Gradient clipping can be enabled to avoid exploding gradients. 120 | # By default, this will clip the gradient norm by calling torch.nn.utils.clip_grad_norm_() 121 | # computed over all model parameters together. If the Trainer’s gradient_clip_algorithm is set to 122 | # 'value' ('norm' by default), this will use instead torch.nn.utils.clip_grad_value_() for each parameter instead. 123 | 124 | trainer = pl.Trainer( 125 | max_epochs=1, 126 | default_root_dir="experiments/", 127 | callbacks=[ 128 | EarlyStopping(monitor="val_loss", mode="min"), 129 | ModelSummary(max_depth=-1), 130 | ], 131 | precision="16-mixed", 132 | gradient_clip_val=0.5, # clip gradients' global norm to <=0.5 using gradient_clip_algorithm='norm' by default 133 | ) 134 | 135 | trainer.fit(model, data_module) 136 | 137 | # Get Predictions 138 | predictions = trainer.predict(model, data_module) 139 | print(len(predictions)) 140 | -------------------------------------------------------------------------------- /src/advanced/level_15_modify_trainer/01_create_callbacks.py: -------------------------------------------------------------------------------- 1 | # Documentaion Link 2 | # https://lightning.ai/docs/pytorch/stable/extensions/callbacks.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import Callback, ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader, random_split 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class MNISTDataModule(pl.LightningDataModule): 16 | def __init__(self, data_dir: str = "./"): 17 | super().__init__() 18 | self.data_dir = data_dir 19 | self.transform = transforms.Compose( 20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 21 | ) 22 | 23 | def prepare_data(self): 24 | # download 25 | MNIST(self.data_dir, train=True, download=True) 26 | MNIST(self.data_dir, train=False, download=True) 27 | 28 | def setup(self, stage: str): 29 | # Assign train/val datasets for use in dataloaders 30 | if stage == "fit": 31 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 32 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 33 | 34 | # Assign test dataset for use in dataloader(s) 35 | if stage == "test": 36 | self.mnist_test = MNIST( 37 | self.data_dir, train=False, transform=self.transform 38 | ) 39 | 40 | if stage == "predict": 41 | self.mnist_predict = MNIST( 42 | self.data_dir, train=False, transform=self.transform 43 | ) 44 | 45 | def train_dataloader(self): 46 | return DataLoader(self.mnist_train, batch_size=32) 47 | 48 | def val_dataloader(self): 49 | return 
DataLoader(self.mnist_val, batch_size=32) 50 | 51 | def test_dataloader(self): 52 | return DataLoader(self.mnist_test, batch_size=32) 53 | 54 | def predict_dataloader(self): 55 | return DataLoader(self.mnist_predict, batch_size=32) 56 | 57 | 58 | class LitConvClassifier(pl.LightningModule): 59 | def __init__(self, learning_rate=1e-3): 60 | super().__init__() 61 | self.save_hyperparameters() 62 | self.example_input_array = torch.rand(1, 1, 28, 28) 63 | 64 | self.learning_rate = learning_rate 65 | 66 | # Define blocks of layers as submodules 67 | self.conv_block1 = nn.Sequential( 68 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 69 | ) 70 | 71 | self.conv_block2 = nn.Sequential( 72 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 73 | ) 74 | 75 | self.fc_block = nn.Sequential( 76 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 77 | ) 78 | 79 | def forward(self, x): 80 | x = self.conv_block1(x) 81 | x = self.conv_block2(x) 82 | x = x.view(x.size(0), -1) 83 | x = self.fc_block(x) 84 | return x 85 | 86 | def training_step(self, batch, batch_idx): 87 | x, y = batch 88 | y_hat = self(x) 89 | loss = F.cross_entropy(y_hat, y) 90 | return loss 91 | 92 | def validation_step(self, batch, batch_idx): 93 | x, y = batch 94 | y_hat = self(x) 95 | loss = F.cross_entropy(y_hat, y) 96 | self.log("val_loss", loss) 97 | return loss 98 | 99 | def test_step(self, batch, batch_idx): 100 | x, y = batch 101 | y_hat = self(x) 102 | loss = F.cross_entropy(y_hat, y) 103 | return loss 104 | 105 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 106 | x, _ = batch 107 | return self(x) 108 | 109 | def configure_optimizers(self): 110 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 111 | return optimizer 112 | 113 | 114 | data_module = MNISTDataModule() 115 | model = LitConvClassifier() 116 | 117 | 118 | class CustomValidationCallback(Callback): 119 | """ 120 | This is a simple demonstration of creating a Custom Callback and 121 | passing it to the trainer. 122 | 123 | The Callback is used everytime the Validation starts and ends. 124 | You should be able to see it in the logs! 
125 | """ 126 | 127 | def on_validation_start(self, trainer, pl_module): 128 | print("Validation is starting.") 129 | 130 | def on_validation_end(self, trainer, pl_module): 131 | print("Validation has ended.") 132 | 133 | 134 | trainer = pl.Trainer( 135 | max_epochs=1, 136 | default_root_dir="experiments/", 137 | callbacks=[ 138 | EarlyStopping(monitor="val_loss", mode="min"), 139 | ModelSummary(max_depth=-1), 140 | CustomValidationCallback(), # Pass the custom callback to the trainer 141 | ], 142 | precision="16-mixed", 143 | limit_train_batches=0.1, 144 | limit_val_batches=0.01, 145 | ) 146 | 147 | trainer.fit(model, data_module) 148 | 149 | # Get Predictions 150 | predictions = trainer.predict(model, data_module) 151 | print(len(predictions)) 152 | -------------------------------------------------------------------------------- /src/advanced/level_15_modify_trainer/02_customize_progress_bar.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/progress_bar.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | 7 | # In this example we will use the RichProgressBar 8 | # Lightning by default uses tqdm 9 | from lightning.pytorch.callbacks import ModelSummary, RichProgressBar 10 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 11 | from lightning.pytorch.callbacks.progress.rich_progress import RichProgressBarTheme 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader, random_split 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | class MNISTDataModule(pl.LightningDataModule): 20 | def __init__(self, data_dir: str = "./"): 21 | super().__init__() 22 | self.data_dir = data_dir 23 | self.transform = transforms.Compose( 24 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 25 | ) 26 | 27 | def prepare_data(self): 28 | # download 29 | MNIST(self.data_dir, train=True, download=True) 30 | MNIST(self.data_dir, train=False, download=True) 31 | 32 | def setup(self, stage: str): 33 | # Assign train/val datasets for use in dataloaders 34 | if stage == "fit": 35 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 36 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 37 | 38 | # Assign test dataset for use in dataloader(s) 39 | if stage == "test": 40 | self.mnist_test = MNIST( 41 | self.data_dir, train=False, transform=self.transform 42 | ) 43 | 44 | if stage == "predict": 45 | self.mnist_predict = MNIST( 46 | self.data_dir, train=False, transform=self.transform 47 | ) 48 | 49 | def train_dataloader(self): 50 | return DataLoader(self.mnist_train, batch_size=32) 51 | 52 | def val_dataloader(self): 53 | return DataLoader(self.mnist_val, batch_size=32) 54 | 55 | def test_dataloader(self): 56 | return DataLoader(self.mnist_test, batch_size=32) 57 | 58 | def predict_dataloader(self): 59 | return DataLoader(self.mnist_predict, batch_size=32) 60 | 61 | 62 | class LitConvClassifier(pl.LightningModule): 63 | def __init__(self, learning_rate=1e-3): 64 | super().__init__() 65 | self.save_hyperparameters() 66 | self.example_input_array = torch.rand(1, 1, 28, 28) 67 | 68 | self.learning_rate = learning_rate 69 | 70 | # Define blocks of layers as submodules 71 | self.conv_block1 = nn.Sequential( 72 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 73 | ) 74 | 75 | self.conv_block2 = 
nn.Sequential( 76 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 77 | ) 78 | 79 | self.fc_block = nn.Sequential( 80 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 81 | ) 82 | 83 | def forward(self, x): 84 | x = self.conv_block1(x) 85 | x = self.conv_block2(x) 86 | x = x.view(x.size(0), -1) 87 | x = self.fc_block(x) 88 | return x 89 | 90 | def training_step(self, batch, batch_idx): 91 | x, y = batch 92 | y_hat = self(x) 93 | loss = F.cross_entropy(y_hat, y) 94 | return loss 95 | 96 | def validation_step(self, batch, batch_idx): 97 | x, y = batch 98 | y_hat = self(x) 99 | loss = F.cross_entropy(y_hat, y) 100 | self.log("val_loss", loss) 101 | return loss 102 | 103 | def test_step(self, batch, batch_idx): 104 | x, y = batch 105 | y_hat = self(x) 106 | loss = F.cross_entropy(y_hat, y) 107 | return loss 108 | 109 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 110 | x, _ = batch 111 | return self(x) 112 | 113 | def configure_optimizers(self): 114 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 115 | return optimizer 116 | 117 | 118 | data_module = MNISTDataModule() 119 | model = LitConvClassifier() 120 | 121 | # Define a custom theme for the RichProgressBar 122 | rich_progress_bar = RichProgressBar( 123 | theme=RichProgressBarTheme( 124 | description="green_yellow", 125 | progress_bar="green1", 126 | progress_bar_finished="green1", 127 | progress_bar_pulse="#6206E0", 128 | batch_progress="green_yellow", 129 | time="grey82", 130 | processing_speed="grey82", 131 | metrics="grey82", 132 | metrics_text_delimiter="\n", 133 | metrics_format=".3e", 134 | ) 135 | ) 136 | 137 | trainer = pl.Trainer( 138 | max_epochs=1, 139 | default_root_dir="experiments", 140 | callbacks=[ 141 | EarlyStopping(monitor="val_loss", mode="min"), 142 | ModelSummary(max_depth=-1), 143 | rich_progress_bar, # Pass the rich_progress_bar to the Trainer 144 | ], 145 | precision="16-mixed", 146 | limit_train_batches=0.1, 147 | limit_val_batches=0.01, 148 | ) 149 | 150 | trainer.fit(model, data_module) 151 | 152 | # Get Predictions 153 | predictions = trainer.predict(model, data_module) 154 | print(len(predictions)) 155 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zero-to-Lightning :zap:: Comprehensive PyTorch Lightning Tutorial Guide 2 | 3 | 4 | Open In Studio 5 | 6 | 7 | Welcome to the GitHub repository for Zero-to-Lightning! This project contains a collection of independent, executable scripts that showcase most of the available functionalities in PyTorch Lightning, each covering a new feature or technique. It's organized to help you smoothly progress from basic to advanced PyTorch Lightning concepts. 
8 | 9 | ## Project Demo 10 | 11 | https://github.com/ishandutta0098/zero-to-lightning/assets/47643789/a068e1d1-0ec8-4357-b4e2-d1c8090224fd 12 | 13 | 14 | ## Project Directory 15 | 16 | ``` 17 | zero-to-lightning 18 | |-src 19 | |-basic 20 | | |-level_01_lightning_module 21 | | |-level_02_validation_and_testing 22 | | |-level_03_checkpointing 23 | | |-level_04_early_stopping 24 | | |-level_05_pretrained_model 25 | | |-level_06_debugging_model 26 | | |-level_07_inference 27 | | 28 | |-intermediate 29 | | |-level_08_accelerated_hardware 30 | | |-level_09_modularize 31 | | |-level_11_scaling_techniques 32 | | |-level_12_deploying_models 33 | | |-level_13_profiler 34 | | 35 | |-advanced 36 | |-level_14_run_with_config_file 37 | |-level_15_modify_trainer 38 | |-level_16_enable_manual_optimization 39 | |-level_17_advanced_checkpointing 40 | |-level_18_ipu 41 | |-level_19_hpu 42 | 43 | ``` 44 | 45 | - **Basic**: 🏗 Foundational Lightning concepts like creating modules, validation and testing, checkpointing, early stopping, pretrained models, debugging, and inference. 46 | - **Intermediate**: 🚀 More specialized topics like accelerated hardware, modularization, scaling techniques, deployment, and profiling. 47 | - **Advanced**: 🔍 Deep dives into running with config files, modifying trainers, manual optimization, advanced checkpointing, IPUs, and HPUs. 48 | 49 | ## Overview 50 | 51 | Each sub-directory is designed to help users become familiar with a specific set of PyTorch Lightning functionalities and best practices. Whether you're just starting out or are an advanced user seeking to refine your techniques, the project provides structured guidance and practical examples. 52 | 53 | ## Features 54 | 55 | - **Compact, Executable Scripts**: 📦 Each script is designed to be concise, demonstrating how individual features, functions, or classes operate, making learning targeted and efficient. 56 | - **CPU-Friendly**: 🖥 Most scripts are optimized to run on standard CPUs, minimizing the need for specialized hardware. 57 | - **Quick Iteration**: ⏲ Each script executes in under a minute, enabling rapid testing, learning, and iteration. 58 | - **Official Documentation Links**: 📚 Every script is accompanied by relevant references to official Lightning documentation, helping you deepen your understanding. 59 | - **Independent Execution**: 🏃‍♂️ The scripts are modular, allowing you to explore features individually without needing to execute the entire project. 60 | - **Comprehensive Coverage**: 🌐 From basic modules and validation to advanced manual optimization and hardware-specific integrations, this guide ensures broad exposure to the various functionalities PyTorch Lightning offers. 61 | - **Step-by-Step Structure**: 🛠 Organized progressively, it enables users to gradually advance from foundational knowledge to more sophisticated techniques. 62 | 63 | 64 | 65 | ## Getting Started 66 | 67 | To get started with this project, clone the repository and follow the instructions below. 68 | 69 | ### Installation 70 | 71 | 1. Clone the repository: 72 | ```bash 73 | git clone https://github.com/ishandutta0098/zero-to-lightning.git 74 | ``` 75 | 76 | 2. Navigate to the project directory: 77 | ```bash 78 | cd zero-to-lightning 79 | ``` 80 | 81 | 3. 
Create the conda environment: 82 | ```bash 83 | # Create the conda environment 84 | conda env create -f conda.yml 85 | 86 | # Activate the environment 87 | conda activate lit-env 88 | ``` 89 | 90 | ### Usage 91 | You can run any script by passing its path directly as shown below. 92 | 93 | ```bash 94 | python <path-to-script> 95 | 96 | # Example 97 | python src/basic/level_01_lightning_module/lightning_module.py 98 | ``` 99 | 100 | Most of the scripts run directly. For one script, we use the LightningCLI. 101 | To run the script `src/advanced/level_14_run_with_config_file/run_with_yaml.py` follow the steps below 👇 102 | 103 | ```bash 104 | # There are 3 Steps to run this: 105 | # 1. Save the current configs in config.yaml 106 | python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --print_config > config.yaml 107 | 108 | # 2. Run the training using the config file 109 | python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml 110 | 111 | # 3. Modify the config file and run the training again 112 | # For example, try setting `max_epochs` to 3 in the config file and run the training again 113 | python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml 114 | ``` 115 | 116 | ## License 117 | This project is licensed under the MIT License - see the LICENSE file for details. 118 | 119 | -------------------------------------------------------------------------------- /src/intermediate/level_08_accelerated_hardware/01_gpu_training.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/accelerators/gpu_basic.html 3 | 4 | import os 5 | 6 | import lightning.pytorch as pl 7 | import torch 8 | from lightning.pytorch.callbacks import ModelSummary 9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from torch.utils.data import DataLoader 13 | from torchvision import transforms 14 | from torchvision.datasets import MNIST 15 | 16 | 17 | class LitConvClassifier(pl.LightningModule): 18 | def __init__(self, learning_rate=1e-3): 19 | super().__init__() 20 | self.save_hyperparameters() 21 | self.example_input_array = torch.rand(1, 1, 28, 28) 22 | 23 | self.learning_rate = learning_rate 24 | 25 | # Define blocks of layers as submodules 26 | self.conv_block1 = nn.Sequential( 27 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 28 | ) 29 | 30 | self.conv_block2 = nn.Sequential( 31 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 32 | ) 33 | 34 | self.fc_block = nn.Sequential( 35 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 36 | ) 37 | 38 | def forward(self, x): 39 | x = self.conv_block1(x) 40 | x = self.conv_block2(x) 41 | x = x.view(x.size(0), -1) 42 | x = self.fc_block(x) 43 | return x 44 | 45 | def training_step(self, batch, batch_idx): 46 | x, y = batch 47 | y_hat = self(x) 48 | loss = F.cross_entropy(y_hat, y) 49 | return loss 50 | 51 | def validation_step(self, batch, batch_idx): 52 | x, y = batch 53 | y_hat = self(x) 54 | loss = F.cross_entropy(y_hat, y) 55 | self.log("val_loss", loss) 56 | return loss 57 | 58 | def test_step(self, batch, batch_idx): 59 | x, y = batch 60 | y_hat = self(x) 61 | loss = F.cross_entropy(y_hat, y) 62 | return loss 63 | 64 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 65 | x, _ = batch 66 | return self(x) 67 | 68 | def configure_optimizers(self):

69 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 70 | return optimizer 71 | 72 | 73 | def prepare_dataloaders(): 74 | train_dataset = MNIST( 75 | "./", download=True, train=True, transform=transforms.ToTensor() 76 | ) 77 | 78 | train_size = int(0.8 * len(train_dataset)) 79 | val_size = len(train_dataset) - train_size 80 | 81 | seed = torch.Generator().manual_seed(42) 82 | train_dataset, val_dataset = torch.utils.data.random_split( 83 | train_dataset, [train_size, val_size], generator=seed 84 | ) 85 | 86 | test_dataset = MNIST( 87 | "./", download=True, train=False, transform=transforms.ToTensor() 88 | ) 89 | 90 | train_dataloader = DataLoader(train_dataset, batch_size=32) 91 | val_dataloader = DataLoader(val_dataset, batch_size=32) 92 | test_dataloader = DataLoader(test_dataset, batch_size=32) 93 | 94 | return train_dataloader, val_dataloader, test_dataloader 95 | 96 | 97 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders() 98 | 99 | model = LitConvClassifier() 100 | 101 | # The below code will train the model on the GPU if unavailable it will throw an error. 102 | # A Graphics Processing Unit (GPU), is a specialized hardware accelerator designed to speed up 103 | # mathematical computations used in gaming and deep learning. 104 | 105 | # Following are the different configuration examples which you can use to train your model on GPU 106 | # based on your hardware configuration. 107 | 108 | # run on as many GPUs as available by default 109 | trainer = pl.Trainer( 110 | max_epochs=5, 111 | default_root_dir="experiments/", 112 | accelerator="auto", 113 | devices="auto", 114 | strategy="auto", 115 | ) 116 | # equivalent to 117 | trainer = pl.Trainer(max_epochs=5, default_root_dir="experiments/") 118 | 119 | # run on one GPU 120 | trainer = pl.Trainer( 121 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=1 122 | ) 123 | 124 | # run on multiple GPUs 125 | trainer = pl.Trainer( 126 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=8 127 | ) 128 | 129 | # choose the number of devices automatically 130 | trainer = pl.Trainer( 131 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices="auto" 132 | ) 133 | 134 | # DEFAULT (int) specifies how many GPUs to use per node 135 | k = 2 136 | trainer = pl.Trainer( 137 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=k 138 | ) 139 | # equivalent to 140 | trainer = pl.Trainer( 141 | max_epochs=5, 142 | default_root_dir="experiments/", 143 | accelerator="gpu", 144 | devices=list(range(k)), 145 | ) 146 | 147 | # Specify which GPUs to use (don't use when running on cluster) 148 | trainer = pl.Trainer( 149 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=[0, 1] 150 | ) 151 | # equivalent to 152 | trainer = pl.Trainer( 153 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices="0, 1" 154 | ) 155 | 156 | # To use all available GPUs put -1 or '-1' 157 | # equivalent to `list(range(torch.cuda.device_count())) and `"auto"` 158 | trainer = pl.Trainer( 159 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=-1 160 | ) 161 | 162 | trainer.fit(model, train_dataloader, val_dataloader) 163 | -------------------------------------------------------------------------------- /src/advanced/level_14_run_with_config_file/run_with_yaml.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # 
https://lightning.ai/docs/pytorch/stable/levels/advanced_level_15.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | 9 | # We will use the LightningCLI to run the training 10 | from lightning.pytorch.cli import LightningCLI 11 | from torch import nn 12 | from torch.nn import functional as F 13 | from torch.utils.data import DataLoader, random_split 14 | from torchvision import transforms 15 | from torchvision.datasets import MNIST 16 | 17 | 18 | class MNISTDataModule(pl.LightningDataModule): 19 | def __init__(self, data_dir: str = "./"): 20 | super().__init__() 21 | self.data_dir = data_dir 22 | self.transform = transforms.Compose( 23 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 24 | ) 25 | 26 | def prepare_data(self): 27 | # download 28 | MNIST(self.data_dir, train=True, download=True) 29 | MNIST(self.data_dir, train=False, download=True) 30 | 31 | def setup(self, stage: str): 32 | # Assign train/val datasets for use in dataloaders 33 | if stage == "fit": 34 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 35 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 36 | 37 | # Assign test dataset for use in dataloader(s) 38 | if stage == "test": 39 | self.mnist_test = MNIST( 40 | self.data_dir, train=False, transform=self.transform 41 | ) 42 | 43 | if stage == "predict": 44 | self.mnist_predict = MNIST( 45 | self.data_dir, train=False, transform=self.transform 46 | ) 47 | 48 | def train_dataloader(self): 49 | return DataLoader(self.mnist_train, batch_size=32) 50 | 51 | def val_dataloader(self): 52 | return DataLoader(self.mnist_val, batch_size=32) 53 | 54 | def test_dataloader(self): 55 | return DataLoader(self.mnist_test, batch_size=32) 56 | 57 | def predict_dataloader(self): 58 | return DataLoader(self.mnist_predict, batch_size=32) 59 | 60 | 61 | class LitConvClassifier(pl.LightningModule): 62 | def __init__(self, learning_rate=1e-3): 63 | super().__init__() 64 | self.save_hyperparameters() 65 | self.example_input_array = torch.rand(1, 1, 28, 28) 66 | 67 | self.learning_rate = learning_rate 68 | 69 | # Define blocks of layers as submodules 70 | self.conv_block1 = nn.Sequential( 71 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 72 | ) 73 | 74 | self.conv_block2 = nn.Sequential( 75 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 76 | ) 77 | 78 | self.fc_block = nn.Sequential( 79 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 80 | ) 81 | 82 | def forward(self, x): 83 | x = self.conv_block1(x) 84 | x = self.conv_block2(x) 85 | x = x.view(x.size(0), -1) 86 | x = self.fc_block(x) 87 | return x 88 | 89 | def training_step(self, batch, batch_idx): 90 | x, y = batch 91 | y_hat = self(x) 92 | loss = F.cross_entropy(y_hat, y) 93 | return loss 94 | 95 | def validation_step(self, batch, batch_idx): 96 | x, y = batch 97 | y_hat = self(x) 98 | loss = F.cross_entropy(y_hat, y) 99 | self.log("val_loss", loss) 100 | return loss 101 | 102 | def test_step(self, batch, batch_idx): 103 | x, y = batch 104 | y_hat = self(x) 105 | loss = F.cross_entropy(y_hat, y) 106 | return loss 107 | 108 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 109 | x, _ = batch 110 | return self(x) 111 | 112 | def configure_optimizers(self): 113 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 114 | return optimizer 115 | 
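# For reference, the `--print_config` step described at the bottom of this file dumps a YAML file that
# looks roughly like the abridged sketch below (the real file lists every Trainer, model and data
# argument, and the exact keys depend on your Lightning version):
#
#   seed_everything: true
#   trainer:
#     max_epochs: 1
#     default_root_dir: experiments/
#     precision: 16-mixed
#   model:
#     learning_rate: 0.001
#   data:
#     data_dir: ./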
116 | 117 | data_module = MNISTDataModule 118 | model = LitConvClassifier 119 | 120 | # To use the CLI we do not initialize the trainer class separately 121 | # We pass the model, data module, trainer defaults to the LightningCLI function directly 122 | # As you can see here the trainer_defaults are the same as the ones we used in the previous example 123 | 124 | 125 | def cli_main(model, data_module): 126 | cli = LightningCLI( 127 | model_class=LitConvClassifier, 128 | datamodule_class=MNISTDataModule, 129 | trainer_class=pl.Trainer, 130 | trainer_defaults={ 131 | "max_epochs": 1, 132 | "default_root_dir": "experiments/", 133 | "callbacks": [ 134 | EarlyStopping(monitor="val_loss", mode="min"), 135 | ModelSummary(max_depth=-1), 136 | ], 137 | "precision": "16-mixed", 138 | "limit_train_batches": 0.1, 139 | "limit_val_batches": 0.01, 140 | }, 141 | ) 142 | 143 | 144 | if __name__ == "__main__": 145 | cli_main(model, data_module) 146 | 147 | # There are 3 Steps to run this: 148 | # 1. Save the current configs in config.yaml 149 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --print_config > config.yaml 150 | 151 | # 2. Run the training using the config file 152 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml 153 | 154 | # 3. Modify the config file and run the training again 155 | # Example, try making `max_epochs` as 3 in the config file and run the training again 156 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml 157 | -------------------------------------------------------------------------------- /src/advanced/level_18_ipu/ipu.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/integrations/ipu/basic.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader, random_split 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class MNISTDataModule(pl.LightningDataModule): 16 | def __init__(self, data_dir: str = "./"): 17 | super().__init__() 18 | self.data_dir = data_dir 19 | self.transform = transforms.Compose( 20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 21 | ) 22 | 23 | def prepare_data(self): 24 | MNIST(self.data_dir, train=True, download=True) 25 | MNIST(self.data_dir, train=False, download=True) 26 | 27 | def setup(self, stage: str): 28 | # Assign train/val datasets for use in dataloaders 29 | if stage == "fit": 30 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 31 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 32 | 33 | # Assign test dataset for use in dataloader(s) 34 | if stage == "test": 35 | self.mnist_test = MNIST( 36 | self.data_dir, train=False, transform=self.transform 37 | ) 38 | 39 | if stage == "predict": 40 | self.mnist_predict = MNIST( 41 | self.data_dir, train=False, transform=self.transform 42 | ) 43 | 44 | def train_dataloader(self): 45 | return DataLoader(self.mnist_train, batch_size=32) 46 | 47 | def val_dataloader(self): 48 | return DataLoader(self.mnist_val, batch_size=32) 49 | 50 | def test_dataloader(self): 51 | return DataLoader(self.mnist_test, batch_size=32) 52 | 53 
| def predict_dataloader(self): 54 | return DataLoader(self.mnist_predict, batch_size=32) 55 | 56 | 57 | class LitConvClassifier(pl.LightningModule): 58 | def __init__(self, learning_rate=1e-3): 59 | super().__init__() 60 | self.save_hyperparameters() 61 | self.example_input_array = torch.rand(1, 1, 28, 28) 62 | 63 | self.learning_rate = learning_rate 64 | 65 | # Define blocks of layers as submodules 66 | self.conv_block1 = nn.Sequential( 67 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 68 | ) 69 | 70 | self.conv_block2 = nn.Sequential( 71 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 72 | ) 73 | 74 | self.fc_block = nn.Sequential( 75 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 76 | ) 77 | 78 | def forward(self, x): 79 | x = self.conv_block1(x) 80 | x = self.conv_block2(x) 81 | x = x.view(x.size(0), -1) 82 | x = self.fc_block(x) 83 | return x 84 | 85 | def training_step(self, batch, batch_idx): 86 | x, y = batch 87 | y_hat = self(x) 88 | loss = F.cross_entropy(y_hat, y) 89 | return loss 90 | 91 | def validation_step(self, batch, batch_idx): 92 | x, y = batch 93 | y_hat = self(x) 94 | loss = F.cross_entropy(y_hat, y) 95 | self.log("val_loss", loss) 96 | return loss 97 | 98 | def test_step(self, batch, batch_idx): 99 | x, y = batch 100 | y_hat = self(x) 101 | loss = F.cross_entropy(y_hat, y) 102 | return loss 103 | 104 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 105 | x, _ = batch 106 | return self(x) 107 | 108 | def configure_optimizers(self): 109 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 110 | return optimizer 111 | 112 | 113 | data_module = MNISTDataModule() 114 | model = LitConvClassifier() 115 | 116 | # Run on as many IPUs as available by default 117 | trainer = pl.Trainer( 118 | max_epochs=3, 119 | default_root_dir="experiments", 120 | callbacks=[ 121 | EarlyStopping(monitor="val_loss", mode="min"), 122 | ModelSummary(max_depth=-1), 123 | ], 124 | precision="16-mixed", 125 | limit_train_batches=0.1, 126 | limit_val_batches=0.01, 127 | accelerator="auto", 128 | devices="auto", 129 | strategy="auto", 130 | ) 131 | 132 | # equivalent to 133 | trainer = pl.Trainer( 134 | max_epochs=3, 135 | default_root_dir="experiments", 136 | callbacks=[ 137 | EarlyStopping(monitor="val_loss", mode="min"), 138 | ModelSummary(max_depth=-1), 139 | ], 140 | precision="16-mixed", 141 | limit_train_batches=0.1, 142 | limit_val_batches=0.01, 143 | ) 144 | 145 | # Run on one IPU 146 | trainer = pl.Trainer( 147 | max_epochs=3, 148 | default_root_dir="experiments", 149 | callbacks=[ 150 | EarlyStopping(monitor="val_loss", mode="min"), 151 | ModelSummary(max_depth=-1), 152 | ], 153 | precision="16-mixed", 154 | limit_train_batches=0.1, 155 | limit_val_batches=0.01, 156 | accelerator="ipu", 157 | devices="1", 158 | ) 159 | 160 | # Run on multiple IPUs 161 | trainer = pl.Trainer( 162 | max_epochs=3, 163 | default_root_dir="experiments", 164 | callbacks=[ 165 | EarlyStopping(monitor="val_loss", mode="min"), 166 | ModelSummary(max_depth=-1), 167 | ], 168 | precision="16-mixed", 169 | limit_train_batches=0.1, 170 | limit_val_batches=0.01, 171 | accelerator="ipu", 172 | devices="8", 173 | ) 174 | 175 | trainer.fit(model, data_module) 176 | 177 | # Get Predictions 178 | predictions = trainer.predict(model, data_module) 179 | print(len(predictions)) 180 | -------------------------------------------------------------------------------- /src/advanced/level_17_advanced_checkpointing/checkpoint.py: 
-------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_advanced.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader, random_split 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class MNISTDataModule(pl.LightningDataModule): 16 | def __init__(self, data_dir: str = "./"): 17 | super().__init__() 18 | self.data_dir = data_dir 19 | self.transform = transforms.Compose( 20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 21 | ) 22 | 23 | def prepare_data(self): 24 | # download 25 | MNIST(self.data_dir, train=True, download=True) 26 | MNIST(self.data_dir, train=False, download=True) 27 | 28 | def setup(self, stage: str): 29 | # Assign train/val datasets for use in dataloaders 30 | if stage == "fit": 31 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 32 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 33 | 34 | # Assign test dataset for use in dataloader(s) 35 | if stage == "test": 36 | self.mnist_test = MNIST( 37 | self.data_dir, train=False, transform=self.transform 38 | ) 39 | 40 | if stage == "predict": 41 | self.mnist_predict = MNIST( 42 | self.data_dir, train=False, transform=self.transform 43 | ) 44 | 45 | def train_dataloader(self): 46 | return DataLoader(self.mnist_train, batch_size=32) 47 | 48 | def val_dataloader(self): 49 | return DataLoader(self.mnist_val, batch_size=32) 50 | 51 | def test_dataloader(self): 52 | return DataLoader(self.mnist_test, batch_size=32) 53 | 54 | def predict_dataloader(self): 55 | return DataLoader(self.mnist_predict, batch_size=32) 56 | 57 | 58 | # In this example we will learn how to modify a checkpoint 59 | # We create a custom attribute train_batches_processed and increment it in the training_step 60 | # We then modify the checkpoint to save this attribute 61 | class LitConvClassifier(pl.LightningModule): 62 | def __init__(self, learning_rate=1e-3): 63 | super().__init__() 64 | self.save_hyperparameters() 65 | self.example_input_array = torch.rand(1, 1, 28, 28) 66 | 67 | self.learning_rate = learning_rate 68 | 69 | # Custom attribute to keep track of training batches processed 70 | self.train_batches_processed = 0 71 | 72 | # Define blocks of layers as submodules 73 | self.conv_block1 = nn.Sequential( 74 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 75 | ) 76 | 77 | self.conv_block2 = nn.Sequential( 78 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 79 | ) 80 | 81 | self.fc_block = nn.Sequential( 82 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 83 | ) 84 | 85 | def forward(self, x): 86 | x = self.conv_block1(x) 87 | x = self.conv_block2(x) 88 | x = x.view(x.size(0), -1) 89 | x = self.fc_block(x) 90 | return x 91 | 92 | def training_step(self, batch, batch_idx): 93 | x, y = batch 94 | y_hat = self(x) 95 | loss = F.cross_entropy(y_hat, y) 96 | 97 | # Increment custom attribute train_batches_processed 98 | self.train_batches_processed += 1 99 | self.log("train_batches_processed", self.train_batches_processed) 100 | 101 | return loss 102 | 103 | def on_save_checkpoint(self, checkpoint): 104 | # Add the custom 
attribute to the checkpoint 105 | checkpoint["train_batches_processed"] = self.train_batches_processed 106 | 107 | def on_load_checkpoint(self, checkpoint): 108 | # Load the custom attribute from the checkpoint 109 | self.train_batches_processed = checkpoint.get("train_batches_processed", 0) 110 | 111 | def validation_step(self, batch, batch_idx): 112 | x, y = batch 113 | y_hat = self(x) 114 | loss = F.cross_entropy(y_hat, y) 115 | self.log("val_loss", loss) 116 | return loss 117 | 118 | def test_step(self, batch, batch_idx): 119 | x, y = batch 120 | y_hat = self(x) 121 | loss = F.cross_entropy(y_hat, y) 122 | return loss 123 | 124 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 125 | x, _ = batch 126 | return self(x) 127 | 128 | def configure_optimizers(self): 129 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 130 | return optimizer 131 | 132 | 133 | data_module = MNISTDataModule() 134 | model = LitConvClassifier() 135 | 136 | trainer = pl.Trainer( 137 | max_epochs=3, 138 | default_root_dir="experiments", 139 | callbacks=[ 140 | EarlyStopping(monitor="val_loss", mode="min"), 141 | ModelSummary(max_depth=-1), 142 | ], 143 | precision="16-mixed", 144 | limit_train_batches=0.1, 145 | limit_val_batches=0.01, 146 | ) 147 | 148 | trainer.fit(model, data_module) 149 | 150 | # Manually load the saved checkpoint 151 | checkpoint_path = trainer.checkpoint_callback.best_model_path 152 | print(f"\nLoading checkpoint from: {checkpoint_path}") 153 | 154 | # Load the model from the checkpoint 155 | loaded_model = LitConvClassifier.load_from_checkpoint(checkpoint_path) 156 | 157 | # Print the custom attribute stored in the checkpoint 158 | # This is to check if the custom attribute is stored and loaded correctly 159 | print( 160 | f"\nTrain batches processed (from checkpoint): {loaded_model.train_batches_processed}" 161 | ) 162 | 163 | # Get Predictions 164 | predictions = trainer.predict(model, data_module) 165 | print(len(predictions)) 166 | -------------------------------------------------------------------------------- /src/intermediate/level_13_profiler/01_advanced_profiler.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/tuning/profiler_basic.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ( 7 | DeviceStatsMonitor, 8 | ModelSummary, 9 | StochasticWeightAveraging, 10 | ) 11 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader, random_split 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | class MNISTDataModule(pl.LightningDataModule): 20 | def __init__(self, data_dir: str = "./"): 21 | super().__init__() 22 | self.data_dir = data_dir 23 | self.transform = transforms.Compose( 24 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 25 | ) 26 | 27 | def prepare_data(self): 28 | # download 29 | MNIST(self.data_dir, train=True, download=True) 30 | MNIST(self.data_dir, train=False, download=True) 31 | 32 | def setup(self, stage: str): 33 | # Assign train/val datasets for use in dataloaders 34 | if stage == "fit": 35 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 36 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 37 | 38 | # Assign test dataset for use 
in dataloader(s) 39 | if stage == "test": 40 | self.mnist_test = MNIST( 41 | self.data_dir, train=False, transform=self.transform 42 | ) 43 | 44 | if stage == "predict": 45 | self.mnist_predict = MNIST( 46 | self.data_dir, train=False, transform=self.transform 47 | ) 48 | 49 | def train_dataloader(self): 50 | return DataLoader(self.mnist_train, batch_size=32) 51 | 52 | def val_dataloader(self): 53 | return DataLoader(self.mnist_val, batch_size=32) 54 | 55 | def test_dataloader(self): 56 | return DataLoader(self.mnist_test, batch_size=32) 57 | 58 | def predict_dataloader(self): 59 | return DataLoader(self.mnist_predict, batch_size=32) 60 | 61 | 62 | class LitConvClassifier(pl.LightningModule): 63 | def __init__(self, learning_rate=1e-3): 64 | super().__init__() 65 | self.save_hyperparameters() 66 | self.example_input_array = torch.rand(1, 1, 28, 28) 67 | 68 | self.learning_rate = learning_rate 69 | 70 | # Define blocks of layers as submodules 71 | self.conv_block1 = nn.Sequential( 72 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 73 | ) 74 | 75 | self.conv_block2 = nn.Sequential( 76 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 77 | ) 78 | 79 | self.fc_block = nn.Sequential( 80 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 81 | ) 82 | 83 | def forward(self, x): 84 | x = self.conv_block1(x) 85 | x = self.conv_block2(x) 86 | x = x.view(x.size(0), -1) 87 | x = self.fc_block(x) 88 | return x 89 | 90 | def training_step(self, batch, batch_idx): 91 | x, y = batch 92 | y_hat = self(x) 93 | loss = F.cross_entropy(y_hat, y) 94 | return loss 95 | 96 | def validation_step(self, batch, batch_idx): 97 | x, y = batch 98 | y_hat = self(x) 99 | loss = F.cross_entropy(y_hat, y) 100 | self.log("val_loss", loss) 101 | return loss 102 | 103 | def test_step(self, batch, batch_idx): 104 | x, y = batch 105 | y_hat = self(x) 106 | loss = F.cross_entropy(y_hat, y) 107 | return loss 108 | 109 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 110 | x, _ = batch 111 | return self(x) 112 | 113 | def configure_optimizers(self): 114 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 115 | return optimizer 116 | 117 | 118 | data_module = MNISTDataModule() 119 | model = LitConvClassifier() 120 | 121 | 122 | # Profiling helps you find bottlenecks in your code by capturing 123 | # analytics such as how long a function takes or how much memory is used. 124 | 125 | # The most basic profile measures all the key methods across Callbacks, 126 | # DataModules and the LightningModule in the training loop. 127 | print("------------------------------") 128 | print("Basic Profiler") 129 | print("------------------------------") 130 | trainer = pl.Trainer( 131 | max_epochs=1, 132 | default_root_dir="experiments/", 133 | callbacks=[ 134 | EarlyStopping(monitor="val_loss", mode="min"), 135 | ModelSummary(max_depth=-1), 136 | StochasticWeightAveraging(swa_lrs=1e-2), 137 | ], 138 | precision="16-mixed", 139 | profiler="simple", 140 | limit_train_batches=0.1, 141 | limit_val_batches=0.01, 142 | ) 143 | 144 | trainer.fit(model, data_module) 145 | 146 | # To profile the time within every function, use the AdvancedProfiler built on top of Python’s cProfiler. 
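# A minimal, hedged sketch, assuming the Lightning 2.x profiler API: instead of
# the profiler="advanced" string shortcut used below, a configured AdvancedProfiler
# instance can be passed to the Trainer, e.g. to write the report to a file.
# The dirpath/filename values here are illustrative choices, not repository defaults.
from lightning.pytorch.profilers import AdvancedProfiler

file_profiler = AdvancedProfiler(dirpath="experiments/", filename="advanced_profile")
# Usage: pl.Trainer(profiler=file_profiler, ...) instead of profiler="advanced".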
147 | print("------------------------------") 148 | print("Advanced Profiler") 149 | print("------------------------------") 150 | trainer = pl.Trainer( 151 | max_epochs=1, 152 | default_root_dir="experiments/", 153 | callbacks=[ 154 | EarlyStopping(monitor="val_loss", mode="min"), 155 | ModelSummary(max_depth=-1), 156 | StochasticWeightAveraging(swa_lrs=1e-2), 157 | ], 158 | precision="16-mixed", 159 | profiler="advanced", 160 | limit_train_batches=0.1, 161 | limit_val_batches=0.01, 162 | ) 163 | 164 | trainer.fit(model, data_module) 165 | 166 | # Get Predictions 167 | predictions = trainer.predict(model, data_module) 168 | print(len(predictions)) 169 | 170 | # Another helpful technique to detect bottlenecks is to ensure that 171 | # you’re using the full capacity of your accelerator (GPU/TPU/IPU/HPU). 172 | # This can be measured with the DeviceStatsMonitor: 173 | print("------------------------------") 174 | print("Device Stats Monitor") 175 | print("------------------------------") 176 | trainer = pl.Trainer( 177 | max_epochs=1, 178 | default_root_dir="experiments/", 179 | callbacks=[ 180 | EarlyStopping(monitor="val_loss", mode="min"), 181 | ModelSummary(max_depth=-1), 182 | StochasticWeightAveraging(swa_lrs=1e-2), 183 | DeviceStatsMonitor(), 184 | ], 185 | precision="16-mixed", 186 | profiler="advanced", 187 | limit_train_batches=0.1, 188 | limit_val_batches=0.01, 189 | ) 190 | 191 | trainer.fit(model, data_module) 192 | -------------------------------------------------------------------------------- /src/advanced/level_19_hpu/hpu.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/integrations/hpu/basic.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | 9 | # Import the HPUAccelerator 10 | from lightning_habana.pytorch.accelerator import HPUAccelerator 11 | from lightning_habana.pytorch.strategies import HPUParallelStrategy 12 | from torch import nn 13 | from torch.nn import functional as F 14 | from torch.utils.data import DataLoader, random_split 15 | from torchvision import transforms 16 | from torchvision.datasets import MNIST 17 | 18 | 19 | class MNISTDataModule(pl.LightningDataModule): 20 | def __init__(self, data_dir: str = "./"): 21 | super().__init__() 22 | self.data_dir = data_dir 23 | self.transform = transforms.Compose( 24 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 25 | ) 26 | 27 | def prepare_data(self): 28 | MNIST(self.data_dir, train=True, download=True) 29 | MNIST(self.data_dir, train=False, download=True) 30 | 31 | def setup(self, stage: str): 32 | # Assign train/val datasets for use in dataloaders 33 | if stage == "fit": 34 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 35 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 36 | 37 | # Assign test dataset for use in dataloader(s) 38 | if stage == "test": 39 | self.mnist_test = MNIST( 40 | self.data_dir, train=False, transform=self.transform 41 | ) 42 | 43 | if stage == "predict": 44 | self.mnist_predict = MNIST( 45 | self.data_dir, train=False, transform=self.transform 46 | ) 47 | 48 | def train_dataloader(self): 49 | return DataLoader(self.mnist_train, batch_size=32) 50 | 51 | def val_dataloader(self): 52 | return DataLoader(self.mnist_val, batch_size=32) 53 | 54 | def test_dataloader(self): 
55 | return DataLoader(self.mnist_test, batch_size=32) 56 | 57 | def predict_dataloader(self): 58 | return DataLoader(self.mnist_predict, batch_size=32) 59 | 60 | 61 | class LitConvClassifier(pl.LightningModule): 62 | def __init__(self, learning_rate=1e-3): 63 | super().__init__() 64 | self.save_hyperparameters() 65 | self.example_input_array = torch.rand(1, 1, 28, 28) 66 | 67 | self.learning_rate = learning_rate 68 | 69 | # Define blocks of layers as submodules 70 | self.conv_block1 = nn.Sequential( 71 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 72 | ) 73 | 74 | self.conv_block2 = nn.Sequential( 75 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 76 | ) 77 | 78 | self.fc_block = nn.Sequential( 79 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 80 | ) 81 | 82 | def forward(self, x): 83 | x = self.conv_block1(x) 84 | x = self.conv_block2(x) 85 | x = x.view(x.size(0), -1) 86 | x = self.fc_block(x) 87 | return x 88 | 89 | def training_step(self, batch, batch_idx): 90 | x, y = batch 91 | y_hat = self(x) 92 | loss = F.cross_entropy(y_hat, y) 93 | return loss 94 | 95 | def validation_step(self, batch, batch_idx): 96 | x, y = batch 97 | y_hat = self(x) 98 | loss = F.cross_entropy(y_hat, y) 99 | self.log("val_loss", loss) 100 | return loss 101 | 102 | def test_step(self, batch, batch_idx): 103 | x, y = batch 104 | y_hat = self(x) 105 | loss = F.cross_entropy(y_hat, y) 106 | return loss 107 | 108 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 109 | x, _ = batch 110 | return self(x) 111 | 112 | def configure_optimizers(self): 113 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 114 | return optimizer 115 | 116 | 117 | data_module = MNISTDataModule() 118 | model = LitConvClassifier() 119 | 120 | # Run on as many Gaudi devices as available by default 121 | trainer = pl.Trainer( 122 | max_epochs=3, 123 | default_root_dir="experiments", 124 | callbacks=[ 125 | EarlyStopping(monitor="val_loss", mode="min"), 126 | ModelSummary(max_depth=-1), 127 | ], 128 | precision="bf16-mixed", 129 | limit_train_batches=0.1, 130 | limit_val_batches=0.01, 131 | accelerator="auto", 132 | devices="auto", 133 | strategy="auto", 134 | ) 135 | 136 | # equivalent to 137 | trainer = pl.Trainer( 138 | max_epochs=3, 139 | default_root_dir="experiments", 140 | callbacks=[ 141 | EarlyStopping(monitor="val_loss", mode="min"), 142 | ModelSummary(max_depth=-1), 143 | ], 144 | precision="bf16-mixed", 145 | limit_train_batches=0.1, 146 | limit_val_batches=0.01, 147 | ) 148 | 149 | # Run on one Gaudi device 150 | trainer = pl.Trainer( 151 | max_epochs=3, 152 | default_root_dir="experiments", 153 | callbacks=[ 154 | EarlyStopping(monitor="val_loss", mode="min"), 155 | ModelSummary(max_depth=-1), 156 | ], 157 | precision="bf16-mixed", 158 | limit_train_batches=0.1, 159 | limit_val_batches=0.01, 160 | accelerator=HPUAccelerator(), 161 | devices="1", 162 | ) 163 | 164 | # Run on multiple Gaudi devices 165 | trainer = pl.Trainer( 166 | max_epochs=3, 167 | default_root_dir="experiments", 168 | callbacks=[ 169 | EarlyStopping(monitor="val_loss", mode="min"), 170 | ModelSummary(max_depth=-1), 171 | ], 172 | precision="bf16-mixed", 173 | limit_train_batches=0.1, 174 | limit_val_batches=0.01, 175 | accelerator=HPUAccelerator(), 176 | devices="8", 177 | ) 178 | 179 | # To train a Lightning model using multiple HPU nodes, 180 | # set the num_nodes parameter with the available nodes in the Trainer class. 
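# A hedged sketch, assuming auto_device_count() is exposed by HPUAccelerator
# (it is part of the Lightning accelerator interface): the Gaudi device count
# used below could be detected at runtime instead of hard-coded.
detected_hpus = HPUAccelerator.auto_device_count()
print(f"Detected Gaudi devices: {detected_hpus}")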
181 | hpus = 8 182 | parallel_hpus = [torch.device("hpu")] * hpus 183 | 184 | trainer = pl.Trainer( 185 | max_epochs=3, 186 | default_root_dir="experiments", 187 | callbacks=[ 188 | EarlyStopping(monitor="val_loss", mode="min"), 189 | ModelSummary(max_depth=-1), 190 | ], 191 | precision="bf16-mixed", 192 | limit_train_batches=0.1, 193 | limit_val_batches=0.01, 194 | accelerator=HPUAccelerator(), 195 | devices=hpus, 196 | strategy=HPUParallelStrategy(parallel_devices=parallel_hpus), 197 | num_nodes=2, 198 | ) 199 | 200 | trainer.fit(model, data_module) 201 | 202 | # Get Predictions 203 | predictions = trainer.predict(model, data_module) 204 | print(len(predictions)) 205 | -------------------------------------------------------------------------------- /src/advanced/level_16_own_the_training_loop/01_enable_manual_optimization.py: -------------------------------------------------------------------------------- 1 | # Documentation Link 2 | # https://lightning.ai/docs/pytorch/stable/model/build_model_advanced.html 3 | 4 | import lightning.pytorch as pl 5 | import torch 6 | from lightning.pytorch.callbacks import ModelSummary 7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping 8 | from torch import nn 9 | from torch.nn import functional as F 10 | from torch.utils.data import DataLoader, random_split 11 | from torchvision import transforms 12 | from torchvision.datasets import MNIST 13 | 14 | 15 | class MNISTDataModule(pl.LightningDataModule): 16 | def __init__(self, data_dir: str = "./"): 17 | super().__init__() 18 | self.data_dir = data_dir 19 | self.transform = transforms.Compose( 20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] 21 | ) 22 | 23 | def prepare_data(self): 24 | # download 25 | MNIST(self.data_dir, train=True, download=True) 26 | MNIST(self.data_dir, train=False, download=True) 27 | 28 | def setup(self, stage: str): 29 | # Assign train/val datasets for use in dataloaders 30 | if stage == "fit": 31 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform) 32 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) 33 | 34 | # Assign test dataset for use in dataloader(s) 35 | if stage == "test": 36 | self.mnist_test = MNIST( 37 | self.data_dir, train=False, transform=self.transform 38 | ) 39 | 40 | if stage == "predict": 41 | self.mnist_predict = MNIST( 42 | self.data_dir, train=False, transform=self.transform 43 | ) 44 | 45 | def train_dataloader(self): 46 | return DataLoader(self.mnist_train, batch_size=32) 47 | 48 | def val_dataloader(self): 49 | return DataLoader(self.mnist_val, batch_size=32) 50 | 51 | def test_dataloader(self): 52 | return DataLoader(self.mnist_test, batch_size=32) 53 | 54 | def predict_dataloader(self): 55 | return DataLoader(self.mnist_predict, batch_size=32) 56 | 57 | 58 | # Steps to enable Manual Optimization 59 | # 1. Set `self.automatic_optimization=False`` in your LightningModule’s __init__. 60 | 61 | # 2. 
Use the following functions and call them manually: 62 | 63 | # 2.1 `self.optimizers()` to access your optimizers (one or multiple) 64 | 65 | # 2.2 `optimizer.zero_grad()` to clear the gradients from the previous training step 66 | 67 | # 2.3 `self.manual_backward(loss)` instead of loss.backward() 68 | 69 | # 2.4 `optimizer.step()` to update your model parameters 70 | 71 | # 2.5 `self.toggle_optimizer()` and `self.untoggle_optimizer()` if needed 72 | 73 | 74 | class LitConvClassifier(pl.LightningModule): 75 | def __init__(self, learning_rate=1e-3): 76 | super().__init__() 77 | self.save_hyperparameters() 78 | self.example_input_array = torch.rand(1, 1, 28, 28) 79 | 80 | self.learning_rate = learning_rate 81 | 82 | # Enable manual optimization 83 | self.automatic_optimization = False 84 | 85 | self.conv_block1 = nn.Sequential( 86 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 87 | ) 88 | 89 | self.conv_block2 = nn.Sequential( 90 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2) 91 | ) 92 | 93 | self.fc_block = nn.Sequential( 94 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10) 95 | ) 96 | 97 | def forward(self, x): 98 | x = self.conv_block1(x) 99 | x = self.conv_block2(x) 100 | x = x.view(x.size(0), -1) 101 | x = self.fc_block(x) 102 | return x 103 | 104 | # Define the compute_loss method 105 | def compute_loss(self, batch): 106 | x, y = batch 107 | logits = self(x) # Pass inputs through the model 108 | return F.cross_entropy(logits, y) # Calculate cross-entropy loss 109 | 110 | # Here are three examples of how to use manual optimization in Lightning 111 | # Uncomment one of the examples to try it out! 112 | 113 | # Example 1: Basic Manual Optimization 114 | def training_step(self, batch, batch_idx): 115 | opt = self.optimizers() 116 | opt.zero_grad() 117 | loss = self.compute_loss(batch) 118 | self.manual_backward(loss) 119 | opt.step() 120 | 121 | return loss 122 | 123 | # # Example 2: Gradient Accumulation 124 | # def training_step(self, batch, batch_idx, N=5): 125 | # opt = self.optimizers() 126 | 127 | # # scale losses by 1/N (for N batches of gradient accumulation) 128 | # loss = self.compute_loss(batch) / N 129 | # self.manual_backward(loss) 130 | 131 | # # accumulate gradients of N batches 132 | # if (batch_idx + 1) % N == 0: 133 | # opt.step() 134 | # opt.zero_grad() 135 | 136 | # return loss 137 | 138 | # Example 3: Gradient Clipping 139 | # def training_step(self, batch, batch_idx): 140 | # opt = self.optimizers() 141 | 142 | # # compute loss 143 | # loss = self.compute_loss(batch) 144 | 145 | # opt.zero_grad() 146 | # self.manual_backward(loss) 147 | 148 | # # clip gradients 149 | # self.clip_gradients(opt, gradient_clip_val=0.5, gradient_clip_algorithm="norm") 150 | 151 | # opt.step() 152 | 153 | # return loss 154 | 155 | def validation_step(self, batch, batch_idx): 156 | x, y = batch 157 | y_hat = self(x) 158 | loss = F.cross_entropy(y_hat, y) 159 | self.log("val_loss", loss) 160 | return loss 161 | 162 | def test_step(self, batch, batch_idx): 163 | x, y = batch 164 | y_hat = self(x) 165 | loss = F.cross_entropy(y_hat, y) 166 | return loss 167 | 168 | def predict_step(self, batch, batch_idx, dataloader_idx=None): 169 | x, _ = batch 170 | return self(x) 171 | 172 | def configure_optimizers(self): 173 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) 174 | 175 | return optimizer 176 | 177 | 178 | data_module = MNISTDataModule() 179 | model = LitConvClassifier() 180 | 181 | trainer = pl.Trainer( 182 | 
max_epochs=3, 183 | default_root_dir="experiments", 184 | callbacks=[ 185 | EarlyStopping(monitor="val_loss", mode="min"), 186 | ModelSummary(max_depth=-1), 187 | ], 188 | precision="16-mixed", 189 | limit_train_batches=0.1, 190 | limit_val_batches=0.01, 191 | ) 192 | 193 | trainer.fit(model, data_module) 194 | 195 | # Get Predictions 196 | predictions = trainer.predict(model, data_module) 197 | print(len(predictions)) 198 | --------------------------------------------------------------------------------
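The three training_step examples above never exercise step 2.5. As a hedged sketch (the class name and encoder/head split below are illustrative, not taken from this repository), toggle_optimizer() and untoggle_optimizer() are typically used when configure_optimizers returns more than one optimizer, so that each manual update only touches the parameters owned by the toggled optimizer:

import lightning.pytorch as pl
import torch
from torch.nn import functional as F


class TwoOptimizerClassifier(pl.LightningModule):
    def __init__(self, learning_rate=1e-3):
        super().__init__()
        self.automatic_optimization = False  # manual optimization, as in step 1
        self.learning_rate = learning_rate
        self.encoder = torch.nn.Linear(28 * 28, 64)
        self.head = torch.nn.Linear(64, 10)

    def forward(self, x):
        return self.head(torch.relu(self.encoder(x.view(x.size(0), -1))))

    def training_step(self, batch, batch_idx):
        x, y = batch
        opt_encoder, opt_head = self.optimizers()

        # Update only the encoder parameters: toggle_optimizer() disables
        # requires_grad on every parameter not owned by opt_encoder.
        self.toggle_optimizer(opt_encoder)
        loss = F.cross_entropy(self(x), y)
        opt_encoder.zero_grad()
        self.manual_backward(loss)
        opt_encoder.step()
        self.untoggle_optimizer(opt_encoder)

        # Then update only the classification head on a fresh forward pass.
        self.toggle_optimizer(opt_head)
        loss = F.cross_entropy(self(x), y)
        opt_head.zero_grad()
        self.manual_backward(loss)
        opt_head.step()
        self.untoggle_optimizer(opt_head)

        return loss

    def configure_optimizers(self):
        opt_encoder = torch.optim.Adam(self.encoder.parameters(), lr=self.learning_rate)
        opt_head = torch.optim.Adam(self.head.parameters(), lr=self.learning_rate)
        return opt_encoder, opt_head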