├── .gitignore
├── requirements.txt
├── LICENSE
├── conda.yml
├── src
│ ├── basic
│ │ ├── level_05_pretrained_model
│ │ │ └── pretrained_model.py
│ │ ├── level_01_lightning_module
│ │ │ └── lightning_module.py
│ │ ├── level_02_validation_and_testing
│ │ │ └── validate_and_test_model.py
│ │ ├── level_03_checkpointing
│ │ │ ├── 03_disabling_and_resuming_checkpoints.py
│ │ │ ├── 02_checkpoints_with_nn_module.py
│ │ │ └── 01_saving_and_loading_checkpoints.py
│ │ ├── level_07_inference
│ │ │ └── lightining_predict_step.py
│ │ ├── level_04_early_stopping
│ │ │ └── early_stopping.py
│ │ └── level_06_debugging_model
│ │   ├── 01_quick_run.py
│ │   └── 02_model_summary.py
│ ├── intermediate
│ │ ├── level_12_deploying_models
│ │ │ └── 01_onnx.py
│ │ ├── level_11_scaling_techniques
│ │ │ ├── 01_precision_training.py
│ │ │ ├── 04_stochastic_weight_averaging.py
│ │ │ ├── 02_gradient_accumulation.py
│ │ │ └── 03_gradient_clipping.py
│ │ ├── level_08_accelerated_hardware
│ │ │ ├── 02_tpu_traininig.py
│ │ │ └── 01_gpu_training.py
│ │ ├── level_13_profiler
│ │ │ ├── 02_profile_pytorch_operations.py
│ │ │ └── 01_advanced_profiler.py
│ │ └── level_09_modularize
│ │   └── 01_lightning_datamodule.py
│ └── advanced
│   ├── level_15_modify_trainer
│   │ ├── 01_create_callbacks.py
│   │ └── 02_customize_progress_bar.py
│   ├── level_14_run_with_config_file
│   │ └── run_with_yaml.py
│   ├── level_18_ipu
│   │ └── ipu.py
│   ├── level_17_advanced_checkpointing
│   │ └── checkpoint.py
│   ├── level_19_hpu
│   │ └── hpu.py
│   └── level_16_own_the_training_loop
│     └── 01_enable_manual_optimization.py
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | lightning_logs
2 | __pycache__
3 | MNIST
4 | .DS_STORE
5 | cifar*
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.9.5
2 | aiosignal==1.3.1
3 | async-timeout==4.0.3
4 | attrs==23.2.0
5 | black==24.4.2
6 | click==8.1.7
7 | coloredlogs==15.0.1
8 | docstring_parser==0.16
9 | filelock==3.14.0
10 | flatbuffers==24.3.25
11 | frozenlist==1.4.1
12 | fsspec==2024.3.1
13 | humanfriendly==10.0
14 | idna==3.7
15 | importlib_resources==6.4.0
16 | isort==5.13.2
17 | Jinja2==3.1.3
18 | jsonargparse==4.28.0
19 | lightning==2.2.4
20 | lightning-utilities==0.11.2
21 | markdown-it-py==3.0.0
22 | MarkupSafe==2.1.5
23 | mdurl==0.1.2
24 | mpmath==1.3.0
25 | multidict==6.0.5
26 | mypy-extensions==1.0.0
27 | networkx==3.3
28 | numpy==1.26.4
29 | onnx==1.16.0
30 | onnxruntime==1.17.3
31 | packaging==24.0
32 | pathspec==0.12.1
33 | pillow==10.3.0
34 | platformdirs==4.2.1
35 | protobuf==5.26.1
36 | psutil==5.9.8
37 | Pygments==2.18.0
38 | pytorch-lightning==2.2.4
39 | PyYAML==6.0.1
40 | rich==13.7.1
41 | sympy==1.12
42 | tomli==2.0.1
43 | torch==2.3.0
44 | torchmetrics==1.3.2
45 | torchvision==0.18.0
46 | tqdm==4.66.4
47 | typeshed_client==2.5.1
48 | typing_extensions==4.11.0
49 | yarl==1.9.4
50 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Ishan Dutta
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/conda.yml:
--------------------------------------------------------------------------------
1 | name: lit-env
2 | channels:
3 | - defaults
4 | dependencies:
5 | - bzip2=1.0.8
6 | - ca-certificates=2024.3.11
7 | - libffi=3.4.4
8 | - ncurses=6.4
9 | - openssl=3.0.13
10 | - pip=23.3.1
11 | - python=3.10.14
12 | - readline=8.2
13 | - setuptools=68.2.2
14 | - sqlite=3.45.3
15 | - tk=8.6.12
16 | - tzdata=2024a
17 | - wheel=0.43.0
18 | - xz=5.4.6
19 | - zlib=1.2.13
20 | - pip:
21 | - aiohttp==3.9.5
22 | - aiosignal==1.3.1
23 | - async-timeout==4.0.3
24 | - attrs==23.2.0
25 | - black==24.4.2
26 | - click==8.1.7
27 | - coloredlogs==15.0.1
28 | - docstring-parser==0.16
29 | - filelock==3.14.0
30 | - flatbuffers==24.3.25
31 | - frozenlist==1.4.1
32 | - fsspec==2024.3.1
33 | - humanfriendly==10.0
34 | - idna==3.7
35 | - importlib-resources==6.4.0
36 | - isort==5.13.2
37 | - jinja2==3.1.3
38 | - jsonargparse==4.28.0
39 | - lightning==2.2.4
40 | - lightning-utilities==0.11.2
41 | - markdown-it-py==3.0.0
42 | - markupsafe==2.1.5
43 | - mdurl==0.1.2
44 | - mpmath==1.3.0
45 | - multidict==6.0.5
46 | - mypy-extensions==1.0.0
47 | - networkx==3.3
48 | - numpy==1.26.4
49 | - onnx==1.16.0
50 | - onnxruntime==1.17.3
51 | - packaging==24.0
52 | - pathspec==0.12.1
53 | - pillow==10.3.0
54 | - platformdirs==4.2.1
55 | - protobuf==5.26.1
56 | - psutil==5.9.8
57 | - pygments==2.18.0
58 | - pytorch-lightning==2.2.4
59 | - pyyaml==6.0.1
60 | - rich==13.7.1
61 | - sympy==1.12
62 | - tomli==2.0.1
63 | - torch==2.3.0
64 | - torchmetrics==1.3.2
65 | - torchvision==0.18.0
66 | - tqdm==4.66.4
67 | - typeshed-client==2.5.1
68 | - typing-extensions==4.11.0
69 | - yarl==1.9.4
70 |
--------------------------------------------------------------------------------
/src/basic/level_05_pretrained_model/pretrained_model.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/advanced/transfer_learning.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | import torchvision.models as models
9 | from torch import nn
10 | from torch.nn import functional as F
11 | from torchvision import datasets, transforms
12 |
13 |
14 | # Define the Lightning Module
15 | class ImagenetTransferLearning(pl.LightningModule):
16 | def __init__(self, learning_rate=1e-3):
17 | super().__init__()
18 |
19 | # init a pretrained resnet
20 | backbone = models.resnet50(weights="DEFAULT")
21 | num_filters = backbone.fc.in_features
22 | layers = list(backbone.children())[:-1]
23 | self.feature_extractor = nn.Sequential(*layers)
24 |
25 | # use the pretrained model to classify cifar-10 (10 image classes)
26 | num_target_classes = 10
27 | self.classifier = nn.Linear(num_filters, num_target_classes)
28 | self.learning_rate = learning_rate
29 |
30 | def forward(self, x):
31 | self.feature_extractor.eval()
32 | with torch.no_grad():
33 | representations = self.feature_extractor(x).flatten(1)
34 | x = self.classifier(representations)
35 | return x
36 |
37 | def training_step(self, batch, batch_idx):
38 | x, y = batch
39 | y_hat = self(x)
40 | loss = F.cross_entropy(y_hat, y)
41 | return loss
42 |
43 | def configure_optimizers(self):
44 | return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
45 |
46 |
47 | # Data preparation
48 | transform = transforms.Compose(
49 | [
50 | transforms.Resize((224, 224)), # ResNet50 expects 224x224 input size
51 | transforms.ToTensor(),
52 | transforms.Normalize(
53 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
54 | ), # Normalization for Imagenet data
55 | ]
56 | )
57 |
58 | train_dataset = datasets.CIFAR10(
59 | root="./", train=True, transform=transform, download=True
60 | )
61 | # Use a subset of the training data for demonstration purposes
62 | train_dataset = torch.utils.data.Subset(train_dataset, indices=list(range(100)))
63 |
64 | train_dataloader = torch.utils.data.DataLoader(
65 | train_dataset, batch_size=32, shuffle=True
66 | )
67 |
68 | # Training
69 | model = ImagenetTransferLearning()
70 | trainer = pl.Trainer(max_epochs=1)
71 | trainer.fit(model, train_dataloader)
72 |
73 | # Save the model checkpoint
74 | trainer.save_checkpoint("example_model.ckpt")
75 |
76 | # Inference
77 | loaded_model = ImagenetTransferLearning.load_from_checkpoint("example_model.ckpt")
78 | loaded_model.freeze()
79 |
80 | # Load some CIFAR10 images for prediction (assuming you're using the same transform as above)
81 | test_dataset = datasets.CIFAR10(root="./", train=False, transform=transform)
82 | test_dataloader = torch.utils.data.DataLoader(
83 | test_dataset, batch_size=5
84 | ) # Loading 5 images for demonstration
85 | some_images_from_cifar10, _ = next(iter(test_dataloader))
86 |
87 | predictions = loaded_model(some_images_from_cifar10)
88 | print(predictions.argmax(dim=1))
89 |
--------------------------------------------------------------------------------
/src/basic/level_01_lightning_module/lightning_module.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/model/train_model_basic.html
3 |
4 | import lightning.pytorch as pl
5 | import torch
6 | from torch import nn
7 | from torch.nn import functional as F
8 | from torch.utils.data import DataLoader
9 | from torchvision import transforms
10 | from torchvision.datasets import MNIST
11 |
12 |
13 | # A simple convolution based classifier model for MNIST
14 | class LitConvClassifier(pl.LightningModule):
15 | def __init__(self):
16 | super().__init__()
17 | # Define the layers for the model architecture
18 |
19 | # Convolutional layer with 32 filters of size 3x3
20 | # ReLU activation function introduces non-linearity to the model, enabling it to learn more complex patterns
21 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1)
22 |
23 | # Second convolutional layer with 64 filters of size 3x3
24 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1)
25 |
26 | # Fully connected layers for classification
27 | # The input size 64*7*7 corresponds to the flattened output of the last convolutional layer
28 | self.fc1 = nn.Linear(64 * 7 * 7, 128)
29 | self.fc2 = nn.Linear(128, 10)
30 |
31 | def forward(self, x):
32 | # Define the forward pass through the network
33 | # Input shape: (batch_size, 1, 28, 28)
34 | x = F.relu(self.conv1(x)) # Shape: (batch_size, 32, 28, 28)
35 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 32, 14, 14)
36 | x = F.relu(self.conv2(x)) # Shape: (batch_size, 64, 14, 14)
37 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 64, 7, 7)
38 | x = x.view(x.size(0), -1) # Shape: (batch_size, 64*7*7)
39 | x = F.relu(self.fc1(x)) # Shape: (batch_size, 128)
40 | x = self.fc2(x) # Shape: (batch_size, 10)
41 | return x
42 |
43 | def training_step(self, batch, batch_idx):
44 | # Define the training step which includes
45 | # the forward pass, loss calculation and backpropagation
46 |
47 | x, y = batch # Unpack batch
48 | y_hat = self(x) # Forward pass, get predicted logits
49 |
50 | # Calculate loss using cross-entropy, which is suitable for multi-class classification
51 | loss = F.cross_entropy(y_hat, y)
52 | return loss
53 |
54 | def configure_optimizers(self):
55 | # Define the optimizer to use for training
56 | # Adam is a popular choice due to its adaptive learning rate and momentum
57 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
58 | return optimizer
59 |
60 |
61 | # Load Dataset
62 | # MNIST is a widely used dataset for handwritten digit recognition
63 | dataset = MNIST("./", download=True, transform=transforms.ToTensor())
64 |
65 | # Create a Dataloader with batch size of 32
66 | # Batch size is a hyperparameter that defines the number of
67 | # samples to work through before updating the model's weights
68 | train_dataloader = DataLoader(dataset, batch_size=32)
69 |
70 | # Initialise the model
71 | model = LitConvClassifier()
72 |
73 | # Initialise the trainer with 1 epoch
74 | # An epoch is a complete pass through the entire training dataset
75 | trainer = pl.Trainer(max_epochs=1)
76 |
77 | # Train the model
78 | trainer.fit(model, train_dataloader)
79 |
--------------------------------------------------------------------------------
/src/basic/level_02_validation_and_testing/validate_and_test_model.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/common/evaluation_basic.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from torch import nn
9 | from torch.nn import functional as F
10 | from torch.utils.data import DataLoader
11 | from torchvision import transforms
12 | from torchvision.datasets import MNIST
13 |
14 |
15 | class LitConvClassifier(pl.LightningModule):
16 | def __init__(self):
17 | super().__init__()
18 |
19 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1)
20 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1)
21 | self.fc1 = nn.Linear(64 * 7 * 7, 128)
22 | self.fc2 = nn.Linear(128, 10)
23 |
24 | def forward(self, x):
25 | # Define the forward pass through the network
26 | # Input shape: (batch_size, 1, 28, 28)
27 | x = F.relu(self.conv1(x)) # Shape: (batch_size, 32, 28, 28)
28 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 32, 14, 14)
29 | x = F.relu(self.conv2(x)) # Shape: (batch_size, 64, 14, 14)
30 | x = F.max_pool2d(x, 2) # Shape: (batch_size, 64, 7, 7)
31 | x = x.view(x.size(0), -1) # Shape: (batch_size, 64*7*7)
32 | x = F.relu(self.fc1(x)) # Shape: (batch_size, 128)
33 | x = self.fc2(x) # Shape: (batch_size, 10)
34 | return x
35 |
36 | def training_step(self, batch, batch_idx):
37 | x, y = batch
38 | y_hat = self(x)
39 |
40 | loss = F.cross_entropy(y_hat, y)
41 | return loss
42 |
43 | def validation_step(self, batch, batch_idx):
44 | # The validation step is performed once per batch of data from the validation set.
45 | # It's used to check the model's performance on the validation set during training.
46 | x, y = batch
47 | y_hat = self(x)
48 |
49 | loss = F.cross_entropy(y_hat, y)
50 | return loss
51 |
52 | def test_step(self, batch, batch_idx):
53 | # The test step is performed once per batch of data from the test set.
54 | # It's used to assess the model's performance on unseen data after training is complete.
55 | x, y = batch
56 | y_hat = self(x)
57 |
58 | loss = F.cross_entropy(y_hat, y)
59 | return loss
60 |
61 | def configure_optimizers(self):
62 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
63 | return optimizer
64 |
65 |
66 | train_dataset = MNIST(
67 | "./", download=True, train=True, transform=transforms.ToTensor()
68 | )
69 |
70 | # Calculate training and validation split
71 | # We will keep 80% data for training and 20% for validation
72 | train_size = int(0.8 * len(train_dataset))
73 | val_size = len(train_dataset) - train_size
74 |
75 | # Split the dataset into training and validation
76 | seed = torch.Generator().manual_seed(42)
77 | train_dataset, val_dataset = torch.utils.data.random_split(
78 | train_dataset, [train_size, val_size], generator=seed
79 | )
80 |
81 | test_dataset = MNIST(
82 | "./", download=True, train=False, transform=transforms.ToTensor()
83 | )
84 |
85 | # Create data loaders for loading the data in batches
86 | train_dataloader = DataLoader(train_dataset, batch_size=32)
87 | val_dataloader = DataLoader(val_dataset, batch_size=32)
88 | test_dataloader = DataLoader(test_dataset, batch_size=32)
89 |
90 | model = LitConvClassifier()
91 |
92 | trainer = pl.Trainer(max_epochs=1)
93 |
94 | trainer.fit(model, train_dataloader, val_dataloader)
95 |
96 | # Test the model on the test set after training is complete
97 | trainer.test(model, test_dataloader)
98 |
--------------------------------------------------------------------------------
/src/basic/level_03_checkpointing/03_disabling_and_resuming_checkpoints.py:
--------------------------------------------------------------------------------
1 | # Documentation Link:
2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#save-a-checkpoint
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from torch import nn
9 | from torch.nn import functional as F
10 | from torch.utils.data import DataLoader
11 | from torchvision import transforms
12 | from torchvision.datasets import MNIST
13 |
14 |
15 | class LitConvClassifier(pl.LightningModule):
16 | def __init__(self, learning_rate=1e-3):
17 | super().__init__()
18 | self.save_hyperparameters()
19 |
20 | self.learning_rate = learning_rate
21 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1)
22 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1)
23 | self.fc1 = nn.Linear(64 * 7 * 7, 128)
24 | self.fc2 = nn.Linear(128, 10)
25 |
26 | def forward(self, x):
27 | x = F.relu(self.conv1(x))
28 | x = F.max_pool2d(x, 2)
29 | x = F.relu(self.conv2(x))
30 | x = F.max_pool2d(x, 2)
31 | x = x.view(x.size(0), -1)
32 | x = F.relu(self.fc1(x))
33 | x = self.fc2(x)
34 | return x
35 |
36 | def training_step(self, batch, batch_idx):
37 | x, y = batch
38 | y_hat = self(x)
39 | loss = F.cross_entropy(y_hat, y)
40 | return loss
41 |
42 | def validation_step(self, batch, batch_idx):
43 | x, y = batch
44 | y_hat = self(x)
45 | loss = F.cross_entropy(y_hat, y)
46 | return loss
47 |
48 | def test_step(self, batch, batch_idx):
49 | x, y = batch
50 | y_hat = self(x)
51 | loss = F.cross_entropy(y_hat, y)
52 | return loss
53 |
54 | def configure_optimizers(self):
55 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
56 | return optimizer
57 |
58 |
59 | def prepare_dataloaders():
60 | train_dataset = MNIST(
61 | "./", download=True, train=True, transform=transforms.ToTensor()
62 | )
63 |
64 | train_size = int(0.8 * len(train_dataset))
65 | val_size = len(train_dataset) - train_size
66 |
67 | seed = torch.Generator().manual_seed(42)
68 | train_dataset, val_dataset = torch.utils.data.random_split(
69 | train_dataset, [train_size, val_size], generator=seed
70 | )
71 |
72 | test_dataset = MNIST(
73 | "./", download=True, train=False, transform=transforms.ToTensor()
74 | )
75 |
76 | train_dataloader = DataLoader(train_dataset, batch_size=32)
77 | val_dataloader = DataLoader(val_dataset, batch_size=32)
78 | test_dataloader = DataLoader(test_dataset, batch_size=32)
79 |
80 | return train_dataloader, val_dataloader, test_dataloader
81 |
82 |
83 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders()
84 |
85 | model = LitConvClassifier()
86 |
87 | # You can disable checkpointing by setting the Trainer's enable_checkpointing to False
88 | trainer = pl.Trainer(
89 | max_epochs=1, default_root_dir="experiments/", enable_checkpointing=False
90 | )
91 |
92 | trainer.fit(model, train_dataloader, val_dataloader)
93 |
94 | # To resume training from a checkpoint, pass it directly to the fit method
95 | checkpoint_path = (
96 | "experiments/lightning_logs/version_0/checkpoints/epoch=0-step=1500.ckpt"
97 | )
98 |
99 | model = LitConvClassifier()
100 |
101 | trainer = pl.Trainer(max_epochs=1, default_root_dir="experiments/")
102 |
103 | trainer.fit(model, train_dataloader, val_dataloader, ckpt_path=checkpoint_path)
104 |
--------------------------------------------------------------------------------
/src/basic/level_03_checkpointing/02_checkpoints_with_nn_module.py:
--------------------------------------------------------------------------------
1 | # Documentation Link:
2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#save-a-checkpoint
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from torch import nn
9 | from torch.nn import functional as F
10 | from torch.utils.data import DataLoader
11 | from torchvision import transforms
12 | from torchvision.datasets import MNIST
13 |
14 |
15 | class ConvClassifier(nn.Module):
16 | def __init__(self):
17 | super(ConvClassifier, self).__init__()
18 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1)
19 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1)
20 | self.fc1 = nn.Linear(64 * 7 * 7, 128)
21 | self.fc2 = nn.Linear(128, 10)
22 |
23 | def forward(self, x):
24 | x = F.relu(self.conv1(x))
25 | x = F.max_pool2d(x, 2)
26 | x = F.relu(self.conv2(x))
27 | x = F.max_pool2d(x, 2)
28 | x = x.view(x.size(0), -1)
29 | x = F.relu(self.fc1(x))
30 | x = self.fc2(x)
31 | return x
32 |
33 |
34 | class LitConvClassifier(pl.LightningModule):
35 | def __init__(self, learning_rate=1e-3):
36 | super().__init__()
37 | self.save_hyperparameters()
38 |
39 | self.learning_rate = learning_rate
40 | self.model = ConvClassifier()
41 |
42 | def forward(self, x):
43 | return self.model(x)
44 |
45 | def training_step(self, batch, batch_idx):
46 | x, y = batch
47 | y_hat = self(x)
48 | loss = F.cross_entropy(y_hat, y)
49 | return loss
50 |
51 | def validation_step(self, batch, batch_idx):
52 | x, y = batch
53 | y_hat = self(x)
54 | loss = F.cross_entropy(y_hat, y)
55 | return loss
56 |
57 | def test_step(self, batch, batch_idx):
58 | x, y = batch
59 | y_hat = self(x)
60 | loss = F.cross_entropy(y_hat, y)
61 | return loss
62 |
63 | def configure_optimizers(self):
64 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
65 | return optimizer
66 |
67 |
68 | def prepare_dataloaders():
69 | train_dataset = MNIST(
70 | "./", download=True, train=True, transform=transforms.ToTensor()
71 | )
72 |
73 | train_size = int(0.8 * len(train_dataset))
74 | val_size = len(train_dataset) - train_size
75 |
76 | seed = torch.Generator().manual_seed(42)
77 | train_dataset, val_dataset = torch.utils.data.random_split(
78 | train_dataset, [train_size, val_size], generator=seed
79 | )
80 |
81 | test_dataset = MNIST(
82 | "./", download=True, train=False, transform=transforms.ToTensor()
83 | )
84 |
85 | train_dataloader = DataLoader(train_dataset, batch_size=32)
86 | val_dataloader = DataLoader(val_dataset, batch_size=32)
87 | test_dataloader = DataLoader(test_dataset, batch_size=32)
88 |
89 | return train_dataloader, val_dataloader, test_dataloader
90 |
91 |
92 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders()
93 |
94 | # Train the Model
95 | model = LitConvClassifier()
96 |
97 | trainer = pl.Trainer(max_epochs=1, default_root_dir="experiments/")
98 |
99 | trainer.fit(model, train_dataloader, val_dataloader)
100 |
101 | # Load the module using the state dict
102 |
103 | checkpoint_path = (
104 | "experiments/lightning_logs/version_0/checkpoints/epoch=0-step=1500.ckpt"
105 | )
106 |
107 | checkpoint = torch.load(checkpoint_path)
108 |
109 | # The LightningModule wraps ConvClassifier as self.model, so the checkpoint keys are
110 | # prefixed with "model."; strip that prefix before loading into the plain nn.Module.
111 | state_dict = {
112 |     k.removeprefix("model."): v for k, v in checkpoint["state_dict"].items()
113 | }
114 |
115 | classifier_model = ConvClassifier()
116 | classifier_model.load_state_dict(state_dict)
117 |
--------------------------------------------------------------------------------
/src/basic/level_07_inference/lightining_predict_step.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/levels/core_level_6.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from lightning.pytorch.callbacks import ModelSummary
9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
10 | from torch import nn
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader
13 | from torchvision import transforms
14 | from torchvision.datasets import MNIST
15 |
16 |
17 | class LitConvClassifier(pl.LightningModule):
18 | def __init__(self, learning_rate=1e-3):
19 | super().__init__()
20 | self.save_hyperparameters()
21 | self.example_input_array = torch.rand(1, 1, 28, 28)
22 |
23 | self.learning_rate = learning_rate
24 |
25 | # Define blocks of layers as submodules
26 | self.conv_block1 = nn.Sequential(
27 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
28 | )
29 |
30 | self.conv_block2 = nn.Sequential(
31 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
32 | )
33 |
34 | self.fc_block = nn.Sequential(
35 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
36 | )
37 |
38 | def forward(self, x):
39 | x = self.conv_block1(x)
40 | x = self.conv_block2(x)
41 | x = x.view(x.size(0), -1)
42 | x = self.fc_block(x)
43 | return x
44 |
45 | def training_step(self, batch, batch_idx):
46 | x, y = batch
47 | y_hat = self(x)
48 | loss = F.cross_entropy(y_hat, y)
49 | return loss
50 |
51 | def validation_step(self, batch, batch_idx):
52 | x, y = batch
53 | y_hat = self(x)
54 | loss = F.cross_entropy(y_hat, y)
55 | self.log("val_loss", loss)
56 | return loss
57 |
58 | def test_step(self, batch, batch_idx):
59 | x, y = batch
60 | y_hat = self(x)
61 | loss = F.cross_entropy(y_hat, y)
62 | return loss
63 |
64 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
65 | x, _ = batch
66 | return self(x)
67 |
68 | def configure_optimizers(self):
69 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
70 | return optimizer
71 |
72 |
73 | def prepare_dataloaders():
74 | train_dataset = MNIST(
75 | "./", download=True, train=True, transform=transforms.ToTensor()
76 | )
77 |
78 | train_size = int(0.8 * len(train_dataset))
79 | val_size = len(train_dataset) - train_size
80 |
81 | seed = torch.Generator().manual_seed(42)
82 | train_dataset, val_dataset = torch.utils.data.random_split(
83 | train_dataset, [train_size, val_size], generator=seed
84 | )
85 |
86 | test_dataset = MNIST(
87 | "./", download=True, train=False, transform=transforms.ToTensor()
88 | )
89 |
90 | train_dataloader = DataLoader(train_dataset, batch_size=32)
91 | val_dataloader = DataLoader(val_dataset, batch_size=32)
92 | test_dataloader = DataLoader(test_dataset, batch_size=32)
93 |
94 | return train_dataloader, val_dataloader, test_dataloader
95 |
96 |
97 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders()
98 |
99 | model = LitConvClassifier()
100 |
101 | trainer = pl.Trainer(
102 | max_epochs=1,
103 | default_root_dir="experiments/",
104 | callbacks=[
105 | EarlyStopping(monitor="val_loss", mode="min"),
106 | ModelSummary(max_depth=-1),
107 | ],
108 | )
109 | trainer.fit(model, train_dataloader, val_dataloader)
110 |
111 | # Here we use the test_dataloader to get predictions for the test set.
112 | # trainer.predict() calls the model's predict_step() on each batch of the given dataloader.
113 | predictions = trainer.predict(model, test_dataloader)
114 | print(len(predictions))
115 |
--------------------------------------------------------------------------------
/src/basic/level_03_checkpointing/01_saving_and_loading_checkpoints.py:
--------------------------------------------------------------------------------
1 | # Documentation Link:
2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_basic.html#save-a-checkpoint
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from torch import nn
9 | from torch.nn import functional as F
10 | from torch.utils.data import DataLoader
11 | from torchvision import transforms
12 | from torchvision.datasets import MNIST
13 |
14 |
15 | class LitConvClassifier(pl.LightningModule):
16 | def __init__(self, learning_rate=1e-3):
17 | super().__init__()
18 |
19 | # You can save the hyperparameters passed to the __init__ method
20 | # by calling self.save_hyperparameters() inside it.
21 | # Here we save the learning_rate hyperparameter.
22 | self.save_hyperparameters()
23 |
24 | self.learning_rate = learning_rate
25 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1)
26 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1)
27 | self.fc1 = nn.Linear(64 * 7 * 7, 128)
28 | self.fc2 = nn.Linear(128, 10)
29 |
30 | def forward(self, x):
31 | x = F.relu(self.conv1(x))
32 | x = F.max_pool2d(x, 2)
33 | x = F.relu(self.conv2(x))
34 | x = F.max_pool2d(x, 2)
35 | x = x.view(x.size(0), -1)
36 | x = F.relu(self.fc1(x))
37 | x = self.fc2(x)
38 | return x
39 |
40 | def training_step(self, batch, batch_idx):
41 | x, y = batch
42 | y_hat = self(x)
43 | loss = F.cross_entropy(y_hat, y)
44 | return loss
45 |
46 | def validation_step(self, batch, batch_idx):
47 | x, y = batch
48 | y_hat = self(x)
49 | loss = F.cross_entropy(y_hat, y)
50 | return loss
51 |
52 | def test_step(self, batch, batch_idx):
53 | x, y = batch
54 | y_hat = self(x)
55 | loss = F.cross_entropy(y_hat, y)
56 | return loss
57 |
58 | def configure_optimizers(self):
59 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
60 | return optimizer
61 |
62 |
63 | def prepare_dataloaders():
64 | train_dataset = MNIST(
65 | "./", download=True, train=True, transform=transforms.ToTensor()
66 | )
67 |
68 | train_size = int(0.8 * len(train_dataset))
69 | val_size = len(train_dataset) - train_size
70 |
71 | seed = torch.Generator().manual_seed(42)
72 | train_dataset, val_dataset = torch.utils.data.random_split(
73 | train_dataset, [train_size, val_size], generator=seed
74 | )
75 |
76 | test_dataset = MNIST(
77 | "./", download=True, train=False, transform=transforms.ToTensor()
78 | )
79 |
80 | train_dataloader = DataLoader(train_dataset, batch_size=32)
81 | val_dataloader = DataLoader(val_dataset, batch_size=32)
82 | test_dataloader = DataLoader(test_dataset, batch_size=32)
83 |
84 | return train_dataloader, val_dataloader, test_dataloader
85 |
86 |
87 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders()
88 |
89 | model = LitConvClassifier()
90 |
91 | # Lightning automatically saves a checkpoint for you in your current working directory,
92 | # with the state of your last training epoch.
93 | # You can change where checkpoints are saved via the Trainer's default_root_dir argument.
94 | trainer = pl.Trainer(max_epochs=1, default_root_dir="experiments/")
95 |
96 | trainer.fit(model, train_dataloader, val_dataloader)
97 |
98 | # Load the checkpoint from the path
99 | # You can modify the path to the checkpoint file you want to load
100 | checkpoint_path = (
101 | "experiments/lightning_logs/version_0/checkpoints/epoch=0-step=1500.ckpt"
102 | )
103 |
104 | # By default, the checkpoint loads the model with the same parameters as the original model
105 | model = LitConvClassifier.load_from_checkpoint(checkpoint_path)
106 | print(f"Original Model Learning Rate: {model.learning_rate}") # prints 0.001
107 |
108 | # You can also load the checkpoint with different parameters
109 | model = LitConvClassifier.load_from_checkpoint(checkpoint_path, learning_rate=0.01)
110 | print(f"Updated Model Learning Rate: {model.learning_rate}") # prints 0.01
111 |
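112 | # You can also save a checkpoint manually at any point with trainer.save_checkpoint().
113 | # A minimal sketch (the filename below is illustrative):
114 | trainer.save_checkpoint("experiments/manual_checkpoint.ckpt")
115 |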
--------------------------------------------------------------------------------
/src/basic/level_04_early_stopping/early_stopping.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/common/early_stopping.html
3 |
4 |
5 | import os
6 |
7 | import lightning.pytorch as pl
8 | import torch
9 |
10 | # Import the early stopping callback
11 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
12 | from torch import nn
13 | from torch.nn import functional as F
14 | from torch.utils.data import DataLoader
15 | from torchvision import transforms
16 | from torchvision.datasets import MNIST
17 |
18 |
19 | class LitConvClassifier(pl.LightningModule):
20 | def __init__(self, learning_rate=1e-3):
21 | super().__init__()
22 | self.save_hyperparameters()
23 |
24 | self.learning_rate = learning_rate
25 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1)
26 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1)
27 | self.fc1 = nn.Linear(64 * 7 * 7, 128)
28 | self.fc2 = nn.Linear(128, 10)
29 |
30 | def forward(self, x):
31 | x = F.relu(self.conv1(x))
32 | x = F.max_pool2d(x, 2)
33 | x = F.relu(self.conv2(x))
34 | x = F.max_pool2d(x, 2)
35 | x = x.view(x.size(0), -1)
36 | x = F.relu(self.fc1(x))
37 | x = self.fc2(x)
38 | return x
39 |
40 | def training_step(self, batch, batch_idx):
41 | x, y = batch
42 | y_hat = self(x)
43 | loss = F.cross_entropy(y_hat, y)
44 | return loss
45 |
46 | def validation_step(self, batch, batch_idx):
47 | x, y = batch
48 | y_hat = self(x)
49 | loss = F.cross_entropy(y_hat, y)
50 |
51 | # First we log the loss of interest
52 | self.log("val_loss", loss)
53 | return loss
54 |
55 | def test_step(self, batch, batch_idx):
56 | x, y = batch
57 | y_hat = self(x)
58 | loss = F.cross_entropy(y_hat, y)
59 | return loss
60 |
61 | def configure_optimizers(self):
62 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
63 | return optimizer
64 |
65 |
66 | def prepare_dataloaders():
67 | train_dataset = MNIST(
68 | "./", download=True, train=True, transform=transforms.ToTensor()
69 | )
70 |
71 | train_size = int(0.8 * len(train_dataset))
72 | val_size = len(train_dataset) - train_size
73 |
74 | seed = torch.Generator().manual_seed(42)
75 | train_dataset, val_dataset = torch.utils.data.random_split(
76 | train_dataset, [train_size, val_size], generator=seed
77 | )
78 |
79 | test_dataset = MNIST(
80 | "./", download=True, train=False, transform=transforms.ToTensor()
81 | )
82 |
83 | train_dataloader = DataLoader(train_dataset, batch_size=32)
84 | val_dataloader = DataLoader(val_dataset, batch_size=32)
85 | test_dataloader = DataLoader(test_dataset, batch_size=32)
86 |
87 | return train_dataloader, val_dataloader, test_dataloader
88 |
89 |
90 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders()
91 |
92 | model = LitConvClassifier()
93 |
94 | # Then pass the callback to the trainer
95 | trainer = pl.Trainer(
96 | max_epochs=3,
97 | default_root_dir="experiments/",
98 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
99 | )
100 | trainer.fit(model, train_dataloader, val_dataloader)
101 |
102 | # Or customize the early stopping callback and pass it to the trainer
103 | early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=3, verbose=True)
104 | trainer = pl.Trainer(
105 | max_epochs=3, default_root_dir="experiments/", callbacks=[early_stopping]
106 | )
107 |
108 | trainer.fit(model, train_dataloader, val_dataloader)
109 |
110 | # Additional parameters that stop training at extreme points:
111 | # --> stopping_threshold: Stops training immediately once the monitored quantity reaches this threshold.
112 | # It is useful when we know that going beyond a certain optimal value does not further benefit us.
113 |
114 | # --> divergence_threshold: Stops training as soon as the monitored quantity becomes worse than this threshold.
115 | # When reaching a value this bad, we believe the model cannot recover anymore
116 | # and it is better to stop early and run with different initial conditions.
117 |
118 | # --> check_finite: When turned on, it stops training if the monitored metric becomes NaN or infinite.
119 |
120 | # --> check_on_train_epoch_end: When turned on, it checks the metric at the end of a training epoch.
121 | # Use this only when you are monitoring a metric logged within training-specific
122 | # hooks at the epoch level. A combined sketch using these arguments follows below.
123 |
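124 | # A minimal sketch combining these extra arguments on a single callback;
125 | # the threshold values below are illustrative, not tuned:
126 | early_stopping_extreme = EarlyStopping(
127 |     monitor="val_loss",
128 |     mode="min",
129 |     stopping_threshold=0.05,  # stop once val_loss is already good enough
130 |     divergence_threshold=5.0,  # give up if val_loss becomes this bad
131 |     check_finite=True,  # stop if val_loss becomes NaN or infinite
132 | )
133 | trainer = pl.Trainer(
134 |     max_epochs=3, default_root_dir="experiments/", callbacks=[early_stopping_extreme]
135 | )
136 | trainer.fit(model, train_dataloader, val_dataloader)
137 |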
--------------------------------------------------------------------------------
/src/intermediate/level_12_deploying_models/01_onnx.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/deploy/production_advanced.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import onnxruntime
8 | import torch
9 | from lightning.pytorch.callbacks import ModelSummary, StochasticWeightAveraging
10 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
11 | from torch import nn
12 | from torch.nn import functional as F
13 | from torch.utils.data import DataLoader, random_split
14 | from torchvision import transforms
15 | from torchvision.datasets import MNIST
16 |
17 |
18 | class MNISTDataModule(pl.LightningDataModule):
19 | def __init__(self, data_dir: str = "./"):
20 | super().__init__()
21 | self.data_dir = data_dir
22 | self.transform = transforms.Compose(
23 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
24 | )
25 |
26 | def prepare_data(self):
27 | # download
28 | MNIST(self.data_dir, train=True, download=True)
29 | MNIST(self.data_dir, train=False, download=True)
30 |
31 | def setup(self, stage: str):
32 | # Assign train/val datasets for use in dataloaders
33 | if stage == "fit":
34 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
35 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
36 |
37 | # Assign test dataset for use in dataloader(s)
38 | if stage == "test":
39 | self.mnist_test = MNIST(
40 | self.data_dir, train=False, transform=self.transform
41 | )
42 |
43 | if stage == "predict":
44 | self.mnist_predict = MNIST(
45 | self.data_dir, train=False, transform=self.transform
46 | )
47 |
48 | def train_dataloader(self):
49 | return DataLoader(self.mnist_train, batch_size=32)
50 |
51 | def val_dataloader(self):
52 | return DataLoader(self.mnist_val, batch_size=32)
53 |
54 | def test_dataloader(self):
55 | return DataLoader(self.mnist_test, batch_size=32)
56 |
57 | def predict_dataloader(self):
58 | return DataLoader(self.mnist_predict, batch_size=32)
59 |
60 |
61 | class LitConvClassifier(pl.LightningModule):
62 | def __init__(self, learning_rate=1e-3):
63 | super().__init__()
64 | self.save_hyperparameters()
65 | self.example_input_array = torch.rand(1, 1, 28, 28)
66 |
67 | self.learning_rate = learning_rate
68 |
69 | # Define blocks of layers as submodules
70 | self.conv_block1 = nn.Sequential(
71 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
72 | )
73 |
74 | self.conv_block2 = nn.Sequential(
75 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
76 | )
77 |
78 | self.fc_block = nn.Sequential(
79 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
80 | )
81 |
82 | def forward(self, x):
83 | x = self.conv_block1(x)
84 | x = self.conv_block2(x)
85 | x = x.view(x.size(0), -1)
86 | x = self.fc_block(x)
87 | return x
88 |
89 | def training_step(self, batch, batch_idx):
90 | x, y = batch
91 | y_hat = self(x)
92 | loss = F.cross_entropy(y_hat, y)
93 | return loss
94 |
95 | def validation_step(self, batch, batch_idx):
96 | x, y = batch
97 | y_hat = self(x)
98 | loss = F.cross_entropy(y_hat, y)
99 | self.log("val_loss", loss)
100 | return loss
101 |
102 | def test_step(self, batch, batch_idx):
103 | x, y = batch
104 | y_hat = self(x)
105 | loss = F.cross_entropy(y_hat, y)
106 | return loss
107 |
108 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
109 | x, _ = batch
110 | return self(x)
111 |
112 | def configure_optimizers(self):
113 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
114 | return optimizer
115 |
116 |
117 | data_module = MNISTDataModule()
118 | model = LitConvClassifier()
119 |
120 | # ONNX (Open Neural Network Exchange) is an open format for machine learning models.
121 | # Exporting to ONNX makes the model independent of PyTorch so it can run on any ONNX Runtime.
122 | filepath = "model.onnx"
123 | model.to_onnx(filepath, export_params=True)
124 |
125 | # Once you have the exported model, you can run it on your ONNX runtime in the following way:
126 | ort_session = onnxruntime.InferenceSession(filepath)
127 | input_name = ort_session.get_inputs()[0].name
128 | ort_inputs = {input_name: torch.rand(1, 1, 28, 28).numpy()}
129 | ort_outs = ort_session.run(None, ort_inputs)
130 |
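131 | # ort_outs is a list of numpy arrays, one per model output; here ort_outs[0]
132 | # holds the logits for the single random input image.
133 | print(ort_outs[0].shape)  # expected shape: (1, 10)
134 |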
--------------------------------------------------------------------------------
/src/basic/level_06_debugging_model/01_quick_run.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/debug/debugging_basic.html
3 |
4 | import os
5 | import time
6 |
7 | import lightning.pytorch as pl
8 | import torch
9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
10 | from torch import nn
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader
13 | from torchvision import transforms
14 | from torchvision.datasets import MNIST
15 |
16 |
17 | class LitConvClassifier(pl.LightningModule):
18 | def __init__(self, learning_rate=1e-3):
19 | super().__init__()
20 | self.save_hyperparameters()
21 |
22 | self.learning_rate = learning_rate
23 | self.conv1 = nn.Conv2d(1, 32, 3, stride=1, padding=1)
24 | self.conv2 = nn.Conv2d(32, 64, 3, stride=1, padding=1)
25 | self.fc1 = nn.Linear(64 * 7 * 7, 128)
26 | self.fc2 = nn.Linear(128, 10)
27 |
28 | def forward(self, x):
29 | x = F.relu(self.conv1(x))
30 | x = F.max_pool2d(x, 2)
31 | x = F.relu(self.conv2(x))
32 | x = F.max_pool2d(x, 2)
33 | x = x.view(x.size(0), -1)
34 | x = F.relu(self.fc1(x))
35 | x = self.fc2(x)
36 | return x
37 |
38 | def training_step(self, batch, batch_idx):
39 | x, y = batch
40 | y_hat = self(x)
41 | loss = F.cross_entropy(y_hat, y)
42 | return loss
43 |
44 | def validation_step(self, batch, batch_idx):
45 | x, y = batch
46 | y_hat = self(x)
47 | loss = F.cross_entropy(y_hat, y)
48 | self.log("val_loss", loss)
49 | return loss
50 |
51 | def test_step(self, batch, batch_idx):
52 | x, y = batch
53 | y_hat = self(x)
54 | loss = F.cross_entropy(y_hat, y)
55 | return loss
56 |
57 | def configure_optimizers(self):
58 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
59 | return optimizer
60 |
61 |
62 | def prepare_dataloaders():
63 | train_dataset = MNIST(
64 | "./", download=True, train=True, transform=transforms.ToTensor()
65 | )
66 |
67 | train_size = int(0.8 * len(train_dataset))
68 | val_size = len(train_dataset) - train_size
69 |
70 | seed = torch.Generator().manual_seed(42)
71 | train_dataset, val_dataset = torch.utils.data.random_split(
72 | train_dataset, [train_size, val_size], generator=seed
73 | )
74 |
75 | test_dataset = MNIST(
76 | "./", download=True, train=False, transform=transforms.ToTensor()
77 | )
78 |
79 | train_dataloader = DataLoader(train_dataset, batch_size=32)
80 | val_dataloader = DataLoader(val_dataset, batch_size=32)
81 | test_dataloader = DataLoader(test_dataset, batch_size=32)
82 |
83 | return train_dataloader, val_dataloader, test_dataloader
84 |
85 |
86 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders()
87 |
88 | model = LitConvClassifier()
89 |
90 | # Default
91 | start = time.time()
92 | trainer = pl.Trainer(
93 | max_epochs=1,
94 | default_root_dir="experiments/",
95 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
96 | )
97 | trainer.fit(model, train_dataloader, val_dataloader)
98 | end = time.time()
99 | print(f"\nDefault Training time: {end - start}")
100 |
101 | # fast_dev_run
102 | # The fast_dev_run argument in the trainer runs a small number of batches of training,
103 | # validation, test and prediction data through your trainer to see if there are any bugs
104 | # (1 batch of each when set to True). To change how many batches to use, set the argument to an integer.
105 | # This argument will disable tuner, checkpoint callbacks, early stopping callbacks,
106 | # loggers and logger callbacks like LearningRateMonitor and DeviceStatsMonitor.
107 | start = time.time()
108 | trainer = pl.Trainer(fast_dev_run=True)
109 | trainer.fit(model, train_dataloader, val_dataloader)
110 | end = time.time()
111 | print(f"\nFast Dev Run Training time: {end - start}")
112 |
113 | # Shorten Epoch Length
114 | # Here we use only 10% of the training data and 10% of the validation data
115 | # You can also specify the number of batches as integers (see the sketch at the end of this file)
116 | start = time.time()
117 | trainer = pl.Trainer(max_epochs=1, limit_train_batches=0.1, limit_val_batches=0.1)
118 | trainer.fit(model, train_dataloader, val_dataloader)
119 | end = time.time()
120 | print(f"S\nhortened Epoch Training time: {end - start}")
121 |
122 | # Sanity Check
123 | # Lightning runs 2 steps of validation in the beginning of training.
124 | # This avoids crashing in the validation loop sometime deep into a lengthy training loop.
125 | start = time.time()
126 | trainer = pl.Trainer(max_epochs=1, num_sanity_val_steps=2)
127 | trainer.fit(model, train_dataloader, val_dataloader)
128 | end = time.time()
129 | print(f"\nSanity Check Training time: {end - start}")
130 |
--------------------------------------------------------------------------------
/src/intermediate/level_11_scaling_techniques/01_precision_training.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/common/precision_basic.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from lightning.pytorch.callbacks import ModelSummary
9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
10 | from torch import nn
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader, random_split
13 | from torchvision import transforms
14 | from torchvision.datasets import MNIST
15 |
16 |
17 | class MNISTDataModule(pl.LightningDataModule):
18 | def __init__(self, data_dir: str = "./"):
19 | super().__init__()
20 | self.data_dir = data_dir
21 | self.transform = transforms.Compose(
22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
23 | )
24 |
25 | def prepare_data(self):
26 | # download
27 | MNIST(self.data_dir, train=True, download=True)
28 | MNIST(self.data_dir, train=False, download=True)
29 |
30 | def setup(self, stage: str):
31 | # Assign train/val datasets for use in dataloaders
32 | if stage == "fit":
33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
35 |
36 | # Assign test dataset for use in dataloader(s)
37 | if stage == "test":
38 | self.mnist_test = MNIST(
39 | self.data_dir, train=False, transform=self.transform
40 | )
41 |
42 | if stage == "predict":
43 | self.mnist_predict = MNIST(
44 | self.data_dir, train=False, transform=self.transform
45 | )
46 |
47 | def train_dataloader(self):
48 | return DataLoader(self.mnist_train, batch_size=32)
49 |
50 | def val_dataloader(self):
51 | return DataLoader(self.mnist_val, batch_size=32)
52 |
53 | def test_dataloader(self):
54 | return DataLoader(self.mnist_test, batch_size=32)
55 |
56 | def predict_dataloader(self):
57 | return DataLoader(self.mnist_predict, batch_size=32)
58 |
59 |
60 | class LitConvClassifier(pl.LightningModule):
61 | def __init__(self, learning_rate=1e-3):
62 | super().__init__()
63 | self.save_hyperparameters()
64 | self.example_input_array = torch.rand(1, 1, 28, 28)
65 |
66 | self.learning_rate = learning_rate
67 |
68 | # Define blocks of layers as submodules
69 | self.conv_block1 = nn.Sequential(
70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
71 | )
72 |
73 | self.conv_block2 = nn.Sequential(
74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
75 | )
76 |
77 | self.fc_block = nn.Sequential(
78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
79 | )
80 |
81 | def forward(self, x):
82 | x = self.conv_block1(x)
83 | x = self.conv_block2(x)
84 | x = x.view(x.size(0), -1)
85 | x = self.fc_block(x)
86 | return x
87 |
88 | def training_step(self, batch, batch_idx):
89 | x, y = batch
90 | y_hat = self(x)
91 | loss = F.cross_entropy(y_hat, y)
92 | return loss
93 |
94 | def validation_step(self, batch, batch_idx):
95 | x, y = batch
96 | y_hat = self(x)
97 | loss = F.cross_entropy(y_hat, y)
98 | self.log("val_loss", loss)
99 | return loss
100 |
101 | def test_step(self, batch, batch_idx):
102 | x, y = batch
103 | y_hat = self(x)
104 | loss = F.cross_entropy(y_hat, y)
105 | return loss
106 |
107 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
108 | x, _ = batch
109 | return self(x)
110 |
111 | def configure_optimizers(self):
112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
113 | return optimizer
114 |
115 |
116 | data_module = MNISTDataModule()
117 | model = LitConvClassifier()
118 |
119 | # Lower precision, such as 16-bit floating point, requires less memory and
120 | # enables training and deploying larger models. Higher precision, such as 64-bit
121 | # floating point, can be used for highly sensitive use cases (other precision flags are sketched at the end of this file).
122 |
123 | trainer = pl.Trainer(
124 | max_epochs=1,
125 | default_root_dir="experiments/",
126 | callbacks=[
127 | EarlyStopping(monitor="val_loss", mode="min"),
128 | ModelSummary(max_depth=-1),
129 | ],
130 | precision="16-mixed",
131 | )
132 |
133 | trainer.fit(model, data_module)
134 |
135 | # Get Predictions
136 | predictions = trainer.predict(model, data_module)
137 | print(len(predictions))
138 |
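139 | # Other precision settings accepted by the Trainer, assuming supporting hardware
140 | # (shown as a sketch without re-running training):
141 | # trainer = pl.Trainer(precision="bf16-mixed")  # bfloat16 mixed precision
142 | # trainer = pl.Trainer(precision="32-true")  # default full precision
143 | # trainer = pl.Trainer(precision="64-true")  # double precision
144 |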
--------------------------------------------------------------------------------
/src/intermediate/level_08_accelerated_hardware/02_tpu_traininig.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/accelerators/tpu_basic.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from lightning.pytorch.callbacks import ModelSummary
9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
10 | from torch import nn
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader
13 | from torchvision import transforms
14 | from torchvision.datasets import MNIST
15 |
16 |
17 | class LitConvClassifier(pl.LightningModule):
18 | def __init__(self, learning_rate=1e-3):
19 | super().__init__()
20 | self.save_hyperparameters()
21 | self.example_input_array = torch.rand(1, 1, 28, 28)
22 |
23 | self.learning_rate = learning_rate
24 |
25 | # Define blocks of layers as submodules
26 | self.conv_block1 = nn.Sequential(
27 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
28 | )
29 |
30 | self.conv_block2 = nn.Sequential(
31 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
32 | )
33 |
34 | self.fc_block = nn.Sequential(
35 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
36 | )
37 |
38 | def forward(self, x):
39 | x = self.conv_block1(x)
40 | x = self.conv_block2(x)
41 | x = x.view(x.size(0), -1)
42 | x = self.fc_block(x)
43 | return x
44 |
45 | def training_step(self, batch, batch_idx):
46 | x, y = batch
47 | y_hat = self(x)
48 | loss = F.cross_entropy(y_hat, y)
49 | return loss
50 |
51 | def validation_step(self, batch, batch_idx):
52 | x, y = batch
53 | y_hat = self(x)
54 | loss = F.cross_entropy(y_hat, y)
55 | self.log("val_loss", loss)
56 | return loss
57 |
58 | def test_step(self, batch, batch_idx):
59 | x, y = batch
60 | y_hat = self(x)
61 | loss = F.cross_entropy(y_hat, y)
62 | return loss
63 |
64 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
65 | x, _ = batch
66 | return self(x)
67 |
68 | def configure_optimizers(self):
69 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
70 | return optimizer
71 |
72 |
73 | def prepare_dataloaders():
74 | train_dataset = MNIST(
75 | "./", download=True, train=True, transform=transforms.ToTensor()
76 | )
77 |
78 | train_size = int(0.8 * len(train_dataset))
79 | val_size = len(train_dataset) - train_size
80 |
81 | seed = torch.Generator().manual_seed(42)
82 | train_dataset, val_dataset = torch.utils.data.random_split(
83 | train_dataset, [train_size, val_size], generator=seed
84 | )
85 |
86 | test_dataset = MNIST(
87 | "./", download=True, train=False, transform=transforms.ToTensor()
88 | )
89 |
90 | train_dataloader = DataLoader(train_dataset, batch_size=32)
91 | val_dataloader = DataLoader(val_dataset, batch_size=32)
92 | test_dataloader = DataLoader(test_dataset, batch_size=32)
93 |
94 | return train_dataloader, val_dataloader, test_dataloader
95 |
96 |
97 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders()
98 |
99 | model = LitConvClassifier()
100 |
101 | # Tensor Processing Unit (TPU) is an AI accelerator application-specific integrated circuit (ASIC) developed by
102 | # Google specifically for neural networks.
103 |
104 | # A TPU has 8 cores where each core is optimized for 128x128 matrix multiplies.
105 | # In general, a single TPU is about as fast as 5 V100 GPUs!
106 |
107 | # A TPU pod hosts many TPUs on it. Currently, TPU v3 Pod has up to 2048 TPU cores and 32 TiB of memory!
108 | # You can request a full pod from Google cloud or a “slice” which gives you some subset of those 2048 cores.
109 |
110 | # run on as many TPUs as available by default
111 | trainer = pl.Trainer(
112 | max_epochs=5,
113 | default_root_dir="experiments/",
114 | accelerator="auto",
115 | devices="auto",
116 | strategy="auto",
117 | )
118 | # equivalent to
119 | trainer = pl.Trainer()
120 |
121 | # run on one TPU core
122 | trainer = pl.Trainer(
123 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices=1
124 | )
125 |
126 | # run on multiple TPU cores
127 | trainer = pl.Trainer(
128 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices=8
129 | )
130 |
131 | # run on the 5th core
132 | trainer = pl.Trainer(
133 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices=[5]
134 | )
135 |
136 | # choose the number of cores automatically
137 | trainer = pl.Trainer(
138 | max_epochs=5, default_root_dir="experiments/", accelerator="tpu", devices="auto"
139 | )
140 |
141 | trainer.fit(model, train_dataloader, val_dataloader)
142 |
--------------------------------------------------------------------------------
/src/basic/level_06_debugging_model/02_model_summary.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/debug/debugging_basic.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 |
9 | # Used for child modules in the model summary
10 | from lightning.pytorch.callbacks import ModelSummary
11 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
12 | from torch import nn
13 | from torch.nn import functional as F
14 | from torch.utils.data import DataLoader
15 | from torchvision import transforms
16 | from torchvision.datasets import MNIST
17 |
18 |
19 | # We have updated the model to use nn.Sequential() and named the blocks of layers.
20 | # This will help us understand the Model Summary output.
21 | class LitConvClassifier(pl.LightningModule):
22 | def __init__(self, learning_rate=1e-3):
23 | super().__init__()
24 | self.save_hyperparameters()
25 |
26 | # Another debugging tool is to display the intermediate input- and output sizes of
27 | # all your layers by setting the example_input_array attribute in your LightningModule.
28 | self.example_input_array = torch.rand(1, 1, 28, 28)
29 |
30 | self.learning_rate = learning_rate
31 |
32 | # Define blocks of layers as submodules
33 | self.conv_block1 = nn.Sequential(
34 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
35 | )
36 |
37 | self.conv_block2 = nn.Sequential(
38 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
39 | )
40 |
41 | self.fc_block = nn.Sequential(
42 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
43 | )
44 |
45 | def forward(self, x):
46 | x = self.conv_block1(x)
47 | x = self.conv_block2(x)
48 | x = x.view(x.size(0), -1)
49 | x = self.fc_block(x)
50 | return x
51 |
52 | def training_step(self, batch, batch_idx):
53 | x, y = batch
54 | y_hat = self(x)
55 | loss = F.cross_entropy(y_hat, y)
56 | return loss
57 |
58 | def validation_step(self, batch, batch_idx):
59 | x, y = batch
60 | y_hat = self(x)
61 | loss = F.cross_entropy(y_hat, y)
62 | self.log("val_loss", loss)
63 | return loss
64 |
65 | def test_step(self, batch, batch_idx):
66 | x, y = batch
67 | y_hat = self(x)
68 | loss = F.cross_entropy(y_hat, y)
69 | return loss
70 |
71 | def configure_optimizers(self):
72 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
73 | return optimizer
74 |
75 |
76 | def prepare_dataloaders():
77 | train_dataset = MNIST(
78 | "./", download=True, train=True, transform=transforms.ToTensor()
79 | )
80 |
81 | train_size = int(0.8 * len(train_dataset))
82 | val_size = len(train_dataset) - train_size
83 |
84 | seed = torch.Generator().manual_seed(42)
85 | train_dataset, val_dataset = torch.utils.data.random_split(
86 | train_dataset, [train_size, val_size], generator=seed
87 | )
88 |
89 | test_dataset = MNIST(
90 | "./", download=True, train=False, transform=transforms.ToTensor()
91 | )
92 |
93 | train_dataloader = DataLoader(train_dataset, batch_size=32)
94 | val_dataloader = DataLoader(val_dataset, batch_size=32)
95 | test_dataloader = DataLoader(test_dataset, batch_size=32)
96 |
97 | return train_dataloader, val_dataloader, test_dataloader
98 |
99 |
100 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders()
101 |
102 | model = LitConvClassifier()
103 |
104 | # Default
105 | # Whenever the .fit() function gets called,
106 | # the Trainer will print the weights summary for the LightningModule.
107 | print("\n----------------------------------")
108 | print("Default Model Summary")
109 | print("----------------------------------")
110 | trainer = pl.Trainer(
111 | max_epochs=1,
112 | default_root_dir="experiments/",
113 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
114 | )
115 | trainer.fit(model, train_dataloader, val_dataloader)
116 |
117 | # Child Modules
118 | print("\n----------------------------------")
119 | print("Child Modules Model Summary")
120 | print("----------------------------------")
121 | trainer = pl.Trainer(
122 | max_epochs=1,
123 | default_root_dir="experiments/",
124 | callbacks=[
125 | EarlyStopping(monitor="val_loss", mode="min"),
126 | ModelSummary(max_depth=-1),
127 | ],
128 | )
129 | trainer.fit(model, train_dataloader, val_dataloader)
130 |
131 | # Turn off model summary
132 | print("\n----------------------------------")
133 | print("Turn off Model Summary")
134 | print("----------------------------------")
135 | trainer = pl.Trainer(
136 | max_epochs=1,
137 | default_root_dir="experiments/",
138 | callbacks=[EarlyStopping(monitor="val_loss", mode="min")],
139 | enable_model_summary=False,
140 | )
141 | trainer.fit(model, train_dataloader, val_dataloader)
142 |
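143 | # Note (added): because example_input_array is set in __init__, the summaries above also
144 | # report "In sizes" and "Out sizes" for every layer; without it, only the layer types and
145 | # parameter counts are shown.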
--------------------------------------------------------------------------------
/src/intermediate/level_13_profiler/02_profile_pytorch_operations.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/tuning/profiler_intermediate.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from lightning.pytorch.callbacks import (
9 | DeviceStatsMonitor,
10 | ModelSummary,
11 | StochasticWeightAveraging,
12 | )
13 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
14 | from lightning.pytorch.profilers import PyTorchProfiler
15 | from torch import nn
16 | from torch.nn import functional as F
17 | from torch.utils.data import DataLoader, random_split
18 | from torchvision import transforms
19 | from torchvision.datasets import MNIST
20 |
21 |
22 | class MNISTDataModule(pl.LightningDataModule):
23 | def __init__(self, data_dir: str = "./"):
24 | super().__init__()
25 | self.data_dir = data_dir
26 | self.transform = transforms.Compose(
27 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
28 | )
29 |
30 | def prepare_data(self):
31 | # download
32 | MNIST(self.data_dir, train=True, download=True)
33 | MNIST(self.data_dir, train=False, download=True)
34 |
35 | def setup(self, stage: str):
36 | # Assign train/val datasets for use in dataloaders
37 | if stage == "fit":
38 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
39 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
40 |
41 | # Assign test dataset for use in dataloader(s)
42 | if stage == "test":
43 | self.mnist_test = MNIST(
44 | self.data_dir, train=False, transform=self.transform
45 | )
46 |
47 | if stage == "predict":
48 | self.mnist_predict = MNIST(
49 | self.data_dir, train=False, transform=self.transform
50 | )
51 |
52 | def train_dataloader(self):
53 | return DataLoader(self.mnist_train, batch_size=32)
54 |
55 | def val_dataloader(self):
56 | return DataLoader(self.mnist_val, batch_size=32)
57 |
58 | def test_dataloader(self):
59 | return DataLoader(self.mnist_test, batch_size=32)
60 |
61 | def predict_dataloader(self):
62 | return DataLoader(self.mnist_predict, batch_size=32)
63 |
64 |
65 | class LitConvClassifier(pl.LightningModule):
66 | def __init__(self, learning_rate=1e-3):
67 | super().__init__()
68 | self.save_hyperparameters()
69 | self.example_input_array = torch.rand(1, 1, 28, 28)
70 |
71 | self.learning_rate = learning_rate
72 |
73 | # Define blocks of layers as submodules
74 | self.conv_block1 = nn.Sequential(
75 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
76 | )
77 |
78 | self.conv_block2 = nn.Sequential(
79 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
80 | )
81 |
82 | self.fc_block = nn.Sequential(
83 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
84 | )
85 |
86 | def forward(self, x):
87 | x = self.conv_block1(x)
88 | x = self.conv_block2(x)
89 | x = x.view(x.size(0), -1)
90 | x = self.fc_block(x)
91 | return x
92 |
93 | def training_step(self, batch, batch_idx):
94 | x, y = batch
95 | y_hat = self(x)
96 | loss = F.cross_entropy(y_hat, y)
97 | return loss
98 |
99 | def validation_step(self, batch, batch_idx):
100 | x, y = batch
101 | y_hat = self(x)
102 | loss = F.cross_entropy(y_hat, y)
103 | self.log("val_loss", loss)
104 | return loss
105 |
106 | def test_step(self, batch, batch_idx):
107 | x, y = batch
108 | y_hat = self(x)
109 | loss = F.cross_entropy(y_hat, y)
110 | return loss
111 |
112 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
113 | x, _ = batch
114 | return self(x)
115 |
116 | def configure_optimizers(self):
117 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
118 | return optimizer
119 |
120 |
121 | data_module = MNISTDataModule()
122 | model = LitConvClassifier()
123 |
124 | # To understand the cost of each PyTorch operation,
125 | # use the PyTorchProfiler built on top of the PyTorch profiler.
126 | trainer = pl.Trainer(
127 | max_epochs=1,
128 | default_root_dir="experiments/",
129 | callbacks=[
130 | EarlyStopping(monitor="val_loss", mode="min"),
131 | ModelSummary(max_depth=-1),
132 | StochasticWeightAveraging(swa_lrs=1e-2),
133 | DeviceStatsMonitor(),
134 | ],
135 | precision="16-mixed",
136 | profiler=PyTorchProfiler(),
137 | limit_train_batches=0.1,
138 | limit_val_batches=0.01,
139 | )
140 |
141 | trainer.fit(model, data_module)
142 |
143 | # Get Predictions
144 | predictions = trainer.predict(model, data_module)
145 | print(len(predictions))
146 |
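147 | # Illustrative variant (hypothetical paths): to write the profiling report to disk
148 | # instead of printing it to stdout, pass e.g.
149 | # profiler=PyTorchProfiler(dirpath="experiments/profiler", filename="perf_logs")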
--------------------------------------------------------------------------------
/src/intermediate/level_11_scaling_techniques/04_stochastic_weight_averaging.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from lightning.pytorch.callbacks import ModelSummary, StochasticWeightAveraging
9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
10 | from torch import nn
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader, random_split
13 | from torchvision import transforms
14 | from torchvision.datasets import MNIST
15 |
16 |
17 | class MNISTDataModule(pl.LightningDataModule):
18 | def __init__(self, data_dir: str = "./"):
19 | super().__init__()
20 | self.data_dir = data_dir
21 | self.transform = transforms.Compose(
22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
23 | )
24 |
25 | def prepare_data(self):
26 | # download
27 | MNIST(self.data_dir, train=True, download=True)
28 | MNIST(self.data_dir, train=False, download=True)
29 |
30 | def setup(self, stage: str):
31 | # Assign train/val datasets for use in dataloaders
32 | if stage == "fit":
33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
35 |
36 | # Assign test dataset for use in dataloader(s)
37 | if stage == "test":
38 | self.mnist_test = MNIST(
39 | self.data_dir, train=False, transform=self.transform
40 | )
41 |
42 | if stage == "predict":
43 | self.mnist_predict = MNIST(
44 | self.data_dir, train=False, transform=self.transform
45 | )
46 |
47 | def train_dataloader(self):
48 | return DataLoader(self.mnist_train, batch_size=32)
49 |
50 | def val_dataloader(self):
51 | return DataLoader(self.mnist_val, batch_size=32)
52 |
53 | def test_dataloader(self):
54 | return DataLoader(self.mnist_test, batch_size=32)
55 |
56 | def predict_dataloader(self):
57 | return DataLoader(self.mnist_predict, batch_size=32)
58 |
59 |
60 | class LitConvClassifier(pl.LightningModule):
61 | def __init__(self, learning_rate=1e-3):
62 | super().__init__()
63 | self.save_hyperparameters()
64 | self.example_input_array = torch.rand(1, 1, 28, 28)
65 |
66 | self.learning_rate = learning_rate
67 |
68 | # Define blocks of layers as submodules
69 | self.conv_block1 = nn.Sequential(
70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
71 | )
72 |
73 | self.conv_block2 = nn.Sequential(
74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
75 | )
76 |
77 | self.fc_block = nn.Sequential(
78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
79 | )
80 |
81 | def forward(self, x):
82 | x = self.conv_block1(x)
83 | x = self.conv_block2(x)
84 | x = x.view(x.size(0), -1)
85 | x = self.fc_block(x)
86 | return x
87 |
88 | def training_step(self, batch, batch_idx):
89 | x, y = batch
90 | y_hat = self(x)
91 | loss = F.cross_entropy(y_hat, y)
92 | return loss
93 |
94 | def validation_step(self, batch, batch_idx):
95 | x, y = batch
96 | y_hat = self(x)
97 | loss = F.cross_entropy(y_hat, y)
98 | self.log("val_loss", loss)
99 | return loss
100 |
101 | def test_step(self, batch, batch_idx):
102 | x, y = batch
103 | y_hat = self(x)
104 | loss = F.cross_entropy(y_hat, y)
105 | return loss
106 |
107 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
108 | x, _ = batch
109 | return self(x)
110 |
111 | def configure_optimizers(self):
112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
113 | return optimizer
114 |
115 |
116 | data_module = MNISTDataModule()
117 | model = LitConvClassifier()
118 |
119 | # Stochastic Weight Averaging (SWA) can make your models generalize better at virtually no additional cost.
120 | # It can be used with both untrained and already-trained models.
121 | # The SWA procedure smooths the loss landscape, making it harder to end up in a local minimum during optimization.
122 |
123 | trainer = pl.Trainer(
124 | max_epochs=1,
125 | default_root_dir="experiments/",
126 | callbacks=[
127 | EarlyStopping(monitor="val_loss", mode="min"),
128 | ModelSummary(max_depth=-1),
129 | StochasticWeightAveraging(
130 | swa_lrs=1e-2
131 | ), # Enable Stochastic Weight Averaging using the callback
132 | ],
133 | precision="16-mixed",
134 | )
135 |
136 | trainer.fit(model, data_module)
137 |
138 | # Get Predictions
139 | predictions = trainer.predict(model, data_module)
140 | print(len(predictions))
141 |
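142 | # Illustrative variant (hypothetical values): SWA can also be started later in training
143 | # and annealed over several epochs, e.g.
144 | # StochasticWeightAveraging(swa_lrs=1e-2, swa_epoch_start=0.8, annealing_epochs=10)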
--------------------------------------------------------------------------------
/src/intermediate/level_09_modularize/01_lightning_datamodule.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/levels/intermediate_level_9.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from lightning.pytorch.callbacks import ModelSummary
9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
10 | from torch import nn
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader, random_split
13 | from torchvision import transforms
14 | from torchvision.datasets import MNIST
15 |
16 | # The LightningDataModule is a convenient way to manage data in PyTorch Lightning.
17 | # It encapsulates training, validation, testing, and prediction dataloaders,
18 | # as well as any necessary steps for data processing, downloads, and transformations.
19 | # By using a LightningDataModule, you can easily develop dataset-agnostic models, hot-swap different datasets,
20 | # and share data splits and transformations across projects.
21 |
22 |
23 | class MNISTDataModule(pl.LightningDataModule):
24 | def __init__(self, data_dir: str = "./"):
25 | super().__init__()
26 | self.data_dir = data_dir
27 | self.transform = transforms.Compose(
28 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
29 | )
30 |
31 | def prepare_data(self):
32 | # download
33 | MNIST(self.data_dir, train=True, download=True)
34 | MNIST(self.data_dir, train=False, download=True)
35 |
36 | def setup(self, stage: str):
37 | # Assign train/val datasets for use in dataloaders
38 | if stage == "fit":
39 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
40 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
41 |
42 | # Assign test dataset for use in dataloader(s)
43 | if stage == "test":
44 | self.mnist_test = MNIST(
45 | self.data_dir, train=False, transform=self.transform
46 | )
47 |
48 | if stage == "predict":
49 | self.mnist_predict = MNIST(
50 | self.data_dir, train=False, transform=self.transform
51 | )
52 |
53 | def train_dataloader(self):
54 | return DataLoader(self.mnist_train, batch_size=32)
55 |
56 | def val_dataloader(self):
57 | return DataLoader(self.mnist_val, batch_size=32)
58 |
59 | def test_dataloader(self):
60 | return DataLoader(self.mnist_test, batch_size=32)
61 |
62 | def predict_dataloader(self):
63 | return DataLoader(self.mnist_predict, batch_size=32)
64 |
65 |
66 | class LitConvClassifier(pl.LightningModule):
67 | def __init__(self, learning_rate=1e-3):
68 | super().__init__()
69 | self.save_hyperparameters()
70 | self.example_input_array = torch.rand(1, 1, 28, 28)
71 |
72 | self.learning_rate = learning_rate
73 |
74 | # Define blocks of layers as submodules
75 | self.conv_block1 = nn.Sequential(
76 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
77 | )
78 |
79 | self.conv_block2 = nn.Sequential(
80 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
81 | )
82 |
83 | self.fc_block = nn.Sequential(
84 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
85 | )
86 |
87 | def forward(self, x):
88 | x = self.conv_block1(x)
89 | x = self.conv_block2(x)
90 | x = x.view(x.size(0), -1)
91 | x = self.fc_block(x)
92 | return x
93 |
94 | def training_step(self, batch, batch_idx):
95 | x, y = batch
96 | y_hat = self(x)
97 | loss = F.cross_entropy(y_hat, y)
98 | return loss
99 |
100 | def validation_step(self, batch, batch_idx):
101 | x, y = batch
102 | y_hat = self(x)
103 | loss = F.cross_entropy(y_hat, y)
104 | self.log("val_loss", loss)
105 | return loss
106 |
107 | def test_step(self, batch, batch_idx):
108 | x, y = batch
109 | y_hat = self(x)
110 | loss = F.cross_entropy(y_hat, y)
111 | return loss
112 |
113 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
114 | x, _ = batch
115 | return self(x)
116 |
117 | def configure_optimizers(self):
118 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
119 | return optimizer
120 |
121 |
122 | data_module = MNISTDataModule()
123 | model = LitConvClassifier()
124 |
125 | trainer = pl.Trainer(
126 | max_epochs=1,
127 | default_root_dir="experiments/",
128 | callbacks=[
129 | EarlyStopping(monitor="val_loss", mode="min"),
130 | ModelSummary(max_depth=-1),
131 | ],
132 | )
133 |
134 | # Train Model
135 | # We can pass the data module directly to the trainer
136 | trainer.fit(model, data_module)
137 |
138 | # Get Predictions
139 | predictions = trainer.predict(model, data_module)
140 | print(len(predictions))
141 |
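142 | # Note (added, illustrative): the same DataModule also serves the test and predict stages,
143 | # e.g. trainer.test(model, data_module) uses MNISTDataModule.test_dataloader().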
--------------------------------------------------------------------------------
/src/intermediate/level_11_scaling_techniques/02_gradient_accumulation.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from lightning.pytorch.callbacks import ModelSummary
9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
10 | from torch import nn
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader, random_split
13 | from torchvision import transforms
14 | from torchvision.datasets import MNIST
15 |
16 |
17 | class MNISTDataModule(pl.LightningDataModule):
18 | def __init__(self, data_dir: str = "./"):
19 | super().__init__()
20 | self.data_dir = data_dir
21 | self.transform = transforms.Compose(
22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
23 | )
24 |
25 | def prepare_data(self):
26 | # download
27 | MNIST(self.data_dir, train=True, download=True)
28 | MNIST(self.data_dir, train=False, download=True)
29 |
30 | def setup(self, stage: str):
31 | # Assign train/val datasets for use in dataloaders
32 | if stage == "fit":
33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
35 |
36 | # Assign test dataset for use in dataloader(s)
37 | if stage == "test":
38 | self.mnist_test = MNIST(
39 | self.data_dir, train=False, transform=self.transform
40 | )
41 |
42 | if stage == "predict":
43 | self.mnist_predict = MNIST(
44 | self.data_dir, train=False, transform=self.transform
45 | )
46 |
47 | def train_dataloader(self):
48 | return DataLoader(self.mnist_train, batch_size=32)
49 |
50 | def val_dataloader(self):
51 | return DataLoader(self.mnist_val, batch_size=32)
52 |
53 | def test_dataloader(self):
54 | return DataLoader(self.mnist_test, batch_size=32)
55 |
56 | def predict_dataloader(self):
57 | return DataLoader(self.mnist_predict, batch_size=32)
58 |
59 |
60 | class LitConvClassifier(pl.LightningModule):
61 | def __init__(self, learning_rate=1e-3):
62 | super().__init__()
63 | self.save_hyperparameters()
64 | self.example_input_array = torch.rand(1, 1, 28, 28)
65 |
66 | self.learning_rate = learning_rate
67 |
68 | # Define blocks of layers as submodules
69 | self.conv_block1 = nn.Sequential(
70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
71 | )
72 |
73 | self.conv_block2 = nn.Sequential(
74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
75 | )
76 |
77 | self.fc_block = nn.Sequential(
78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
79 | )
80 |
81 | def forward(self, x):
82 | x = self.conv_block1(x)
83 | x = self.conv_block2(x)
84 | x = x.view(x.size(0), -1)
85 | x = self.fc_block(x)
86 | return x
87 |
88 | def training_step(self, batch, batch_idx):
89 | x, y = batch
90 | y_hat = self(x)
91 | loss = F.cross_entropy(y_hat, y)
92 | return loss
93 |
94 | def validation_step(self, batch, batch_idx):
95 | x, y = batch
96 | y_hat = self(x)
97 | loss = F.cross_entropy(y_hat, y)
98 | self.log("val_loss", loss)
99 | return loss
100 |
101 | def test_step(self, batch, batch_idx):
102 | x, y = batch
103 | y_hat = self(x)
104 | loss = F.cross_entropy(y_hat, y)
105 | return loss
106 |
107 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
108 | x, _ = batch
109 | return self(x)
110 |
111 | def configure_optimizers(self):
112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
113 | return optimizer
114 |
115 |
116 | data_module = MNISTDataModule()
117 | model = LitConvClassifier()
118 |
119 | # Gradient accumulation runs K small batches of size N before performing an optimizer step,
120 | # giving an effective batch size of K x N (see the worked example at the end of this file).
121 | # Internally the K batches are not stacked into one large forward pass; instead their
122 | # gradients are accumulated and optimizer.step() is called once every K batches,
123 | # so the effective batch size grows without any additional memory overhead.
124 |
125 | trainer = pl.Trainer(
126 | max_epochs=1,
127 | default_root_dir="experiments/",
128 | callbacks=[
129 | EarlyStopping(monitor="val_loss", mode="min"),
130 | ModelSummary(max_depth=-1),
131 | ],
132 | precision="16-mixed",
133 | accumulate_grad_batches=7, # Accumulate gradients for 7 batches
134 | )
135 |
136 | trainer.fit(model, data_module)
137 |
138 | # Get Predictions
139 | predictions = trainer.predict(model, data_module)
140 | print(len(predictions))
141 |
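142 | # Worked example (using the values in this script): with batch_size=32 and
143 | # accumulate_grad_batches=7, each optimizer.step() sees an effective batch of
144 | # 32 * 7 = 224 samples while only one 32-sample batch is held in memory at a time.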
--------------------------------------------------------------------------------
/src/intermediate/level_11_scaling_techniques/03_gradient_clipping.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from lightning.pytorch.callbacks import ModelSummary
9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
10 | from torch import nn
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader, random_split
13 | from torchvision import transforms
14 | from torchvision.datasets import MNIST
15 |
16 |
17 | class MNISTDataModule(pl.LightningDataModule):
18 | def __init__(self, data_dir: str = "./"):
19 | super().__init__()
20 | self.data_dir = data_dir
21 | self.transform = transforms.Compose(
22 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
23 | )
24 |
25 | def prepare_data(self):
26 | # download
27 | MNIST(self.data_dir, train=True, download=True)
28 | MNIST(self.data_dir, train=False, download=True)
29 |
30 | def setup(self, stage: str):
31 | # Assign train/val datasets for use in dataloaders
32 | if stage == "fit":
33 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
34 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
35 |
36 | # Assign test dataset for use in dataloader(s)
37 | if stage == "test":
38 | self.mnist_test = MNIST(
39 | self.data_dir, train=False, transform=self.transform
40 | )
41 |
42 | if stage == "predict":
43 | self.mnist_predict = MNIST(
44 | self.data_dir, train=False, transform=self.transform
45 | )
46 |
47 | def train_dataloader(self):
48 | return DataLoader(self.mnist_train, batch_size=32)
49 |
50 | def val_dataloader(self):
51 | return DataLoader(self.mnist_val, batch_size=32)
52 |
53 | def test_dataloader(self):
54 | return DataLoader(self.mnist_test, batch_size=32)
55 |
56 | def predict_dataloader(self):
57 | return DataLoader(self.mnist_predict, batch_size=32)
58 |
59 |
60 | class LitConvClassifier(pl.LightningModule):
61 | def __init__(self, learning_rate=1e-3):
62 | super().__init__()
63 | self.save_hyperparameters()
64 | self.example_input_array = torch.rand(1, 1, 28, 28)
65 |
66 | self.learning_rate = learning_rate
67 |
68 | # Define blocks of layers as submodules
69 | self.conv_block1 = nn.Sequential(
70 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
71 | )
72 |
73 | self.conv_block2 = nn.Sequential(
74 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
75 | )
76 |
77 | self.fc_block = nn.Sequential(
78 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
79 | )
80 |
81 | def forward(self, x):
82 | x = self.conv_block1(x)
83 | x = self.conv_block2(x)
84 | x = x.view(x.size(0), -1)
85 | x = self.fc_block(x)
86 | return x
87 |
88 | def training_step(self, batch, batch_idx):
89 | x, y = batch
90 | y_hat = self(x)
91 | loss = F.cross_entropy(y_hat, y)
92 | return loss
93 |
94 | def validation_step(self, batch, batch_idx):
95 | x, y = batch
96 | y_hat = self(x)
97 | loss = F.cross_entropy(y_hat, y)
98 | self.log("val_loss", loss)
99 | return loss
100 |
101 | def test_step(self, batch, batch_idx):
102 | x, y = batch
103 | y_hat = self(x)
104 | loss = F.cross_entropy(y_hat, y)
105 | return loss
106 |
107 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
108 | x, _ = batch
109 | return self(x)
110 |
111 | def configure_optimizers(self):
112 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
113 | return optimizer
114 |
115 |
116 | data_module = MNISTDataModule()
117 | model = LitConvClassifier()
118 |
119 | # Gradient clipping can be enabled to avoid exploding gradients.
120 | # By default, this clips the gradient norm by calling torch.nn.utils.clip_grad_norm_()
121 | # computed over all model parameters together. If the Trainer's gradient_clip_algorithm is set to
122 | # 'value' ('norm' by default), torch.nn.utils.clip_grad_value_() is used for each parameter instead.
123 |
124 | trainer = pl.Trainer(
125 | max_epochs=1,
126 | default_root_dir="experiments/",
127 | callbacks=[
128 | EarlyStopping(monitor="val_loss", mode="min"),
129 | ModelSummary(max_depth=-1),
130 | ],
131 | precision="16-mixed",
132 | gradient_clip_val=0.5, # clip gradients' global norm to <=0.5 using gradient_clip_algorithm='norm' by default
133 | )
134 |
135 | trainer.fit(model, data_module)
136 |
137 | # Get Predictions
138 | predictions = trainer.predict(model, data_module)
139 | print(len(predictions))
140 |
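141 | # Illustrative variant (hypothetical value): to clip each gradient element individually
142 | # instead of the global norm, also pass gradient_clip_algorithm="value", e.g.
143 | # pl.Trainer(..., gradient_clip_val=0.5, gradient_clip_algorithm="value")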
--------------------------------------------------------------------------------
/src/advanced/level_15_modify_trainer/01_create_callbacks.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/extensions/callbacks.html
3 |
4 | import lightning.pytorch as pl
5 | import torch
6 | from lightning.pytorch.callbacks import Callback, ModelSummary
7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
8 | from torch import nn
9 | from torch.nn import functional as F
10 | from torch.utils.data import DataLoader, random_split
11 | from torchvision import transforms
12 | from torchvision.datasets import MNIST
13 |
14 |
15 | class MNISTDataModule(pl.LightningDataModule):
16 | def __init__(self, data_dir: str = "./"):
17 | super().__init__()
18 | self.data_dir = data_dir
19 | self.transform = transforms.Compose(
20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
21 | )
22 |
23 | def prepare_data(self):
24 | # download
25 | MNIST(self.data_dir, train=True, download=True)
26 | MNIST(self.data_dir, train=False, download=True)
27 |
28 | def setup(self, stage: str):
29 | # Assign train/val datasets for use in dataloaders
30 | if stage == "fit":
31 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
32 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
33 |
34 | # Assign test dataset for use in dataloader(s)
35 | if stage == "test":
36 | self.mnist_test = MNIST(
37 | self.data_dir, train=False, transform=self.transform
38 | )
39 |
40 | if stage == "predict":
41 | self.mnist_predict = MNIST(
42 | self.data_dir, train=False, transform=self.transform
43 | )
44 |
45 | def train_dataloader(self):
46 | return DataLoader(self.mnist_train, batch_size=32)
47 |
48 | def val_dataloader(self):
49 | return DataLoader(self.mnist_val, batch_size=32)
50 |
51 | def test_dataloader(self):
52 | return DataLoader(self.mnist_test, batch_size=32)
53 |
54 | def predict_dataloader(self):
55 | return DataLoader(self.mnist_predict, batch_size=32)
56 |
57 |
58 | class LitConvClassifier(pl.LightningModule):
59 | def __init__(self, learning_rate=1e-3):
60 | super().__init__()
61 | self.save_hyperparameters()
62 | self.example_input_array = torch.rand(1, 1, 28, 28)
63 |
64 | self.learning_rate = learning_rate
65 |
66 | # Define blocks of layers as submodules
67 | self.conv_block1 = nn.Sequential(
68 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
69 | )
70 |
71 | self.conv_block2 = nn.Sequential(
72 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
73 | )
74 |
75 | self.fc_block = nn.Sequential(
76 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
77 | )
78 |
79 | def forward(self, x):
80 | x = self.conv_block1(x)
81 | x = self.conv_block2(x)
82 | x = x.view(x.size(0), -1)
83 | x = self.fc_block(x)
84 | return x
85 |
86 | def training_step(self, batch, batch_idx):
87 | x, y = batch
88 | y_hat = self(x)
89 | loss = F.cross_entropy(y_hat, y)
90 | return loss
91 |
92 | def validation_step(self, batch, batch_idx):
93 | x, y = batch
94 | y_hat = self(x)
95 | loss = F.cross_entropy(y_hat, y)
96 | self.log("val_loss", loss)
97 | return loss
98 |
99 | def test_step(self, batch, batch_idx):
100 | x, y = batch
101 | y_hat = self(x)
102 | loss = F.cross_entropy(y_hat, y)
103 | return loss
104 |
105 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
106 | x, _ = batch
107 | return self(x)
108 |
109 | def configure_optimizers(self):
110 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
111 | return optimizer
112 |
113 |
114 | data_module = MNISTDataModule()
115 | model = LitConvClassifier()
116 |
117 |
118 | class CustomValidationCallback(Callback):
119 | """
120 | This is a simple demonstration of creating a Custom Callback and
121 | passing it to the trainer.
122 |
123 | The Callback is used everytime the Validation starts and ends.
124 | You should be able to see it in the logs!
125 | """
126 |
127 | def on_validation_start(self, trainer, pl_module):
128 | print("Validation is starting.")
129 |
130 | def on_validation_end(self, trainer, pl_module):
131 | print("Validation has ended.")
132 |
133 |
134 | trainer = pl.Trainer(
135 | max_epochs=1,
136 | default_root_dir="experiments/",
137 | callbacks=[
138 | EarlyStopping(monitor="val_loss", mode="min"),
139 | ModelSummary(max_depth=-1),
140 | CustomValidationCallback(), # Pass the custom callback to the trainer
141 | ],
142 | precision="16-mixed",
143 | limit_train_batches=0.1,
144 | limit_val_batches=0.01,
145 | )
146 |
147 | trainer.fit(model, data_module)
148 |
149 | # Get Predictions
150 | predictions = trainer.predict(model, data_module)
151 | print(len(predictions))
152 |
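153 | # Sketch (added, illustrative): a callback can hook into many other points of the loop.
154 | # For example, adding the following method to CustomValidationCallback would also log epochs:
155 | #     def on_train_epoch_end(self, trainer, pl_module):
156 | #         print(f"Finished epoch {trainer.current_epoch}.")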
--------------------------------------------------------------------------------
/src/advanced/level_15_modify_trainer/02_customize_progress_bar.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/common/progress_bar.html
3 |
4 | import lightning.pytorch as pl
5 | import torch
6 |
7 | # In this example we will use the RichProgressBar
8 | # Lightning by default uses tqdm
9 | from lightning.pytorch.callbacks import ModelSummary, RichProgressBar
10 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
11 | from lightning.pytorch.callbacks.progress.rich_progress import RichProgressBarTheme
12 | from torch import nn
13 | from torch.nn import functional as F
14 | from torch.utils.data import DataLoader, random_split
15 | from torchvision import transforms
16 | from torchvision.datasets import MNIST
17 |
18 |
19 | class MNISTDataModule(pl.LightningDataModule):
20 | def __init__(self, data_dir: str = "./"):
21 | super().__init__()
22 | self.data_dir = data_dir
23 | self.transform = transforms.Compose(
24 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
25 | )
26 |
27 | def prepare_data(self):
28 | # download
29 | MNIST(self.data_dir, train=True, download=True)
30 | MNIST(self.data_dir, train=False, download=True)
31 |
32 | def setup(self, stage: str):
33 | # Assign train/val datasets for use in dataloaders
34 | if stage == "fit":
35 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
36 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
37 |
38 | # Assign test dataset for use in dataloader(s)
39 | if stage == "test":
40 | self.mnist_test = MNIST(
41 | self.data_dir, train=False, transform=self.transform
42 | )
43 |
44 | if stage == "predict":
45 | self.mnist_predict = MNIST(
46 | self.data_dir, train=False, transform=self.transform
47 | )
48 |
49 | def train_dataloader(self):
50 | return DataLoader(self.mnist_train, batch_size=32)
51 |
52 | def val_dataloader(self):
53 | return DataLoader(self.mnist_val, batch_size=32)
54 |
55 | def test_dataloader(self):
56 | return DataLoader(self.mnist_test, batch_size=32)
57 |
58 | def predict_dataloader(self):
59 | return DataLoader(self.mnist_predict, batch_size=32)
60 |
61 |
62 | class LitConvClassifier(pl.LightningModule):
63 | def __init__(self, learning_rate=1e-3):
64 | super().__init__()
65 | self.save_hyperparameters()
66 | self.example_input_array = torch.rand(1, 1, 28, 28)
67 |
68 | self.learning_rate = learning_rate
69 |
70 | # Define blocks of layers as submodules
71 | self.conv_block1 = nn.Sequential(
72 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
73 | )
74 |
75 | self.conv_block2 = nn.Sequential(
76 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
77 | )
78 |
79 | self.fc_block = nn.Sequential(
80 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
81 | )
82 |
83 | def forward(self, x):
84 | x = self.conv_block1(x)
85 | x = self.conv_block2(x)
86 | x = x.view(x.size(0), -1)
87 | x = self.fc_block(x)
88 | return x
89 |
90 | def training_step(self, batch, batch_idx):
91 | x, y = batch
92 | y_hat = self(x)
93 | loss = F.cross_entropy(y_hat, y)
94 | return loss
95 |
96 | def validation_step(self, batch, batch_idx):
97 | x, y = batch
98 | y_hat = self(x)
99 | loss = F.cross_entropy(y_hat, y)
100 | self.log("val_loss", loss)
101 | return loss
102 |
103 | def test_step(self, batch, batch_idx):
104 | x, y = batch
105 | y_hat = self(x)
106 | loss = F.cross_entropy(y_hat, y)
107 | return loss
108 |
109 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
110 | x, _ = batch
111 | return self(x)
112 |
113 | def configure_optimizers(self):
114 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
115 | return optimizer
116 |
117 |
118 | data_module = MNISTDataModule()
119 | model = LitConvClassifier()
120 |
121 | # Define a custom theme for the RichProgressBar
122 | rich_progress_bar = RichProgressBar(
123 | theme=RichProgressBarTheme(
124 | description="green_yellow",
125 | progress_bar="green1",
126 | progress_bar_finished="green1",
127 | progress_bar_pulse="#6206E0",
128 | batch_progress="green_yellow",
129 | time="grey82",
130 | processing_speed="grey82",
131 | metrics="grey82",
132 | metrics_text_delimiter="\n",
133 | metrics_format=".3e",
134 | )
135 | )
136 |
137 | trainer = pl.Trainer(
138 | max_epochs=1,
139 | default_root_dir="experiments",
140 | callbacks=[
141 | EarlyStopping(monitor="val_loss", mode="min"),
142 | ModelSummary(max_depth=-1),
143 | rich_progress_bar, # Pass the rich_progress_bar to the Trainer
144 | ],
145 | precision="16-mixed",
146 | limit_train_batches=0.1,
147 | limit_val_batches=0.01,
148 | )
149 |
150 | trainer.fit(model, data_module)
151 |
152 | # Get Predictions
153 | predictions = trainer.predict(model, data_module)
154 | print(len(predictions))
155 |
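156 | # Note (added): to disable the progress bar entirely instead of restyling it,
157 | # pass enable_progress_bar=False to the Trainer.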
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Zero-to-Lightning :zap:: Comprehensive PyTorch Lightning Tutorial Guide
2 |
3 |
4 |
5 |
6 |
7 | Welcome to the GitHub repository for Zero-to-Lightning! This project contains a collection of independent, executable scripts that showcase most of the available functionalities in PyTorch Lightning, each covering a new feature or technique. It's organized to help you smoothly progress from basic to advanced PyTorch Lightning concepts.
8 |
9 | ## Project Demo
10 |
11 | https://github.com/ishandutta0098/zero-to-lightning/assets/47643789/a068e1d1-0ec8-4357-b4e2-d1c8090224fd
12 |
13 |
14 | ## Project Directory
15 |
16 | ```
17 | zero-to-lightning
18 | |-src
19 | |-basic
20 | | |-level_01_lightning_module
21 | | |-level_02_validation_and_testing
22 | | |-level_03_checkpointing
23 | | |-level_04_early_stopping
24 | | |-level_05_pretrained_model
25 | | |-level_06_debugging_model
26 | | |-level_07_inference
27 | |
28 | |-intermediate
29 | | |-level_08_accelerated_hardware
30 | | |-level_09_modularize
31 | | |-level_11_scaling_techniques
32 | | |-level_12_deploying_models
33 | | |-level_13_profiler
34 | |
35 | |-advanced
36 | |-level_14_run_with_config_file
37 | |-level_15_modify_trainer
38 | |-level_16_enable_manual_optimization
39 | |-level_17_advanced_checkpointing
40 | |-level_18_ipu
41 | |-level_19_hpu
42 |
43 | ```
44 |
45 | - **Basic**: 🏗 Foundational Lightning concepts like creating modules, validation and testing, checkpointing, early stopping, pretrained models, debugging, and inference.
46 | - **Intermediate**: 🚀 More specialized topics like accelerated hardware, modularization, scaling techniques, deployment, and profiling.
47 | - **Advanced**: 🔍 Deep dives into running with config files, modifying trainers, manual optimization, advanced checkpointing, IPUs, and HPUs.
48 |
49 | ## Overview
50 |
51 | Each sub-directory is designed to help users become familiar with a specific set of PyTorch Lightning functionalities and best practices. Whether you're just starting out or are an advanced user seeking to refine your techniques, the project provides structured guidance and practical examples.
52 |
53 | ## Features
54 |
55 | - **Compact, Executable Scripts**: 📦 Each script is designed to be concise, demonstrating how individual features, functions, or classes operate, making learning targeted and efficient.
56 | - **CPU-Friendly**: 🖥 Most scripts are optimized to run on standard CPUs, minimizing the need for specialized hardware.
57 | - **Quick Iteration**: ⏲ Each script executes in under a minute, enabling rapid testing, learning, and iteration.
58 | - **Official Documentation Links**: 📚 Every script is accompanied by relevant references to official Lightning documentation, helping you deepen your understanding.
59 | - **Independent Execution**: 🏃♂️ The scripts are modular, allowing you to explore features individually without needing to execute the entire project.
60 | - **Comprehensive Coverage**: 🌐 From basic modules and validation to advanced manual optimization and hardware-specific integrations, this guide ensures broad exposure to the various functionalities PyTorch Lightning offers.
61 | - **Step-by-Step Structure**: 🛠 Organized progressively, it enables users to gradually advance from foundational knowledge to more sophisticated techniques.
62 |
63 |
64 |
65 | ## Getting Started
66 |
67 | To get started with this project, clone the repository and follow the instructions below.
68 |
69 | ### Installation
70 |
71 | 1. Clone the repository:
72 | ```bash
73 | git clone https://github.com/ishandutta0098/zero-to-lightning.git
74 | ```
75 |
76 | 2. Navigate to the project directory:
77 | ```bash
78 | cd zero-to-lightning
79 | ```
80 |
81 | 3. Create the conda environment:
82 | ```bash
83 | # Create the conda environment
84 | conda env create -f conda.yml
85 |
86 | # Activate the environment
87 | conda activate lit-env
88 | ```
89 |
90 | ### Usage
91 | You can run any script by passing its path directly as shown below.
92 |
93 | ```bash
94 | python <path-to-script>
95 |
96 | # Example
97 | python src/basic/level_01_lightning_module/lightning_module.py
98 | ```
99 |
100 | Most of the scripts run directly. One script uses the LightningCLI.
101 | To run the script `src/advanced/level_14_run_with_config_file/run_with_yaml.py`, follow the steps below 👇
102 |
103 | ```bash
104 | # There are 3 Steps to run this:
105 | # 1. Save the current configs in config.yaml
106 | python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --print_config > config.yaml
107 |
108 | # 2. Run the training using the config file
109 | python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml
110 |
111 | # 3. Modify the config file and run the training again
112 | # Example, try making `max_epochs` as 3 in the config file and run the training again
113 | python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml
114 | ```
115 |
116 | ## License
117 | This project is licensed under the MIT License - see the LICENSE file for details.
118 |
119 |
--------------------------------------------------------------------------------
/src/intermediate/level_08_accelerated_hardware/01_gpu_training.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/accelerators/gpu_basic.html
3 |
4 | import os
5 |
6 | import lightning.pytorch as pl
7 | import torch
8 | from lightning.pytorch.callbacks import ModelSummary
9 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
10 | from torch import nn
11 | from torch.nn import functional as F
12 | from torch.utils.data import DataLoader
13 | from torchvision import transforms
14 | from torchvision.datasets import MNIST
15 |
16 |
17 | class LitConvClassifier(pl.LightningModule):
18 | def __init__(self, learning_rate=1e-3):
19 | super().__init__()
20 | self.save_hyperparameters()
21 | self.example_input_array = torch.rand(1, 1, 28, 28)
22 |
23 | self.learning_rate = learning_rate
24 |
25 | # Define blocks of layers as submodules
26 | self.conv_block1 = nn.Sequential(
27 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
28 | )
29 |
30 | self.conv_block2 = nn.Sequential(
31 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
32 | )
33 |
34 | self.fc_block = nn.Sequential(
35 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
36 | )
37 |
38 | def forward(self, x):
39 | x = self.conv_block1(x)
40 | x = self.conv_block2(x)
41 | x = x.view(x.size(0), -1)
42 | x = self.fc_block(x)
43 | return x
44 |
45 | def training_step(self, batch, batch_idx):
46 | x, y = batch
47 | y_hat = self(x)
48 | loss = F.cross_entropy(y_hat, y)
49 | return loss
50 |
51 | def validation_step(self, batch, batch_idx):
52 | x, y = batch
53 | y_hat = self(x)
54 | loss = F.cross_entropy(y_hat, y)
55 | self.log("val_loss", loss)
56 | return loss
57 |
58 | def test_step(self, batch, batch_idx):
59 | x, y = batch
60 | y_hat = self(x)
61 | loss = F.cross_entropy(y_hat, y)
62 | return loss
63 |
64 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
65 | x, _ = batch
66 | return self(x)
67 |
68 | def configure_optimizers(self):
69 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
70 | return optimizer
71 |
72 |
73 | def prepare_dataloaders():
74 | train_dataset = MNIST(
75 | "./", download=True, train=True, transform=transforms.ToTensor()
76 | )
77 |
78 | train_size = int(0.8 * len(train_dataset))
79 | val_size = len(train_dataset) - train_size
80 |
81 | seed = torch.Generator().manual_seed(42)
82 | train_dataset, val_dataset = torch.utils.data.random_split(
83 | train_dataset, [train_size, val_size], generator=seed
84 | )
85 |
86 | test_dataset = MNIST(
87 | "./", download=True, train=False, transform=transforms.ToTensor()
88 | )
89 |
90 | train_dataloader = DataLoader(train_dataset, batch_size=32)
91 | val_dataloader = DataLoader(val_dataset, batch_size=32)
92 | test_dataloader = DataLoader(test_dataset, batch_size=32)
93 |
94 | return train_dataloader, val_dataloader, test_dataloader
95 |
96 |
97 | train_dataloader, val_dataloader, test_dataloader = prepare_dataloaders()
98 |
99 | model = LitConvClassifier()
100 |
101 | # The code below trains the model on the GPU; if no GPU is available it will throw an error.
102 | # A Graphics Processing Unit (GPU) is a specialized hardware accelerator designed to speed up
103 | # mathematical computations used in gaming and deep learning.
104 |
105 | # The following configuration examples show how to train your model on GPUs,
106 | # depending on your hardware setup.
107 |
108 | # run on as many GPUs as available by default
109 | trainer = pl.Trainer(
110 | max_epochs=5,
111 | default_root_dir="experiments/",
112 | accelerator="auto",
113 | devices="auto",
114 | strategy="auto",
115 | )
116 | # equivalent to
117 | trainer = pl.Trainer(max_epochs=5, default_root_dir="experiments/")
118 |
119 | # run on one GPU
120 | trainer = pl.Trainer(
121 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=1
122 | )
123 |
124 | # run on multiple GPUs
125 | trainer = pl.Trainer(
126 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=8
127 | )
128 |
129 | # choose the number of devices automatically
130 | trainer = pl.Trainer(
131 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices="auto"
132 | )
133 |
134 | # DEFAULT (int) specifies how many GPUs to use per node
135 | k = 2
136 | trainer = pl.Trainer(
137 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=k
138 | )
139 | # equivalent to
140 | trainer = pl.Trainer(
141 | max_epochs=5,
142 | default_root_dir="experiments/",
143 | accelerator="gpu",
144 | devices=list(range(k)),
145 | )
146 |
147 | # Specify which GPUs to use (don't use this when running on a cluster)
148 | trainer = pl.Trainer(
149 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=[0, 1]
150 | )
151 | # equivalent to
152 | trainer = pl.Trainer(
153 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices="0, 1"
154 | )
155 |
156 | # To use all available GPUs put -1 or '-1'
157 | # equivalent to `list(range(torch.cuda.device_count()))` and `"auto"`
158 | trainer = pl.Trainer(
159 | max_epochs=5, default_root_dir="experiments/", accelerator="gpu", devices=-1
160 | )
161 |
162 | trainer.fit(model, train_dataloader, val_dataloader)
163 |
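164 | # Note (added, illustrative): with Lightning you do not call .to(device) or .cuda() yourself;
165 | # the Trainer moves the model and each batch to the selected GPUs. New tensors created inside
166 | # a step should use the module's device, e.g. torch.zeros(3, device=self.device).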
--------------------------------------------------------------------------------
/src/advanced/level_14_run_with_config_file/run_with_yaml.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/levels/advanced_level_15.html
3 |
4 | import lightning.pytorch as pl
5 | import torch
6 | from lightning.pytorch.callbacks import ModelSummary
7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
8 |
9 | # We will use the LightningCLI to run the training
10 | from lightning.pytorch.cli import LightningCLI
11 | from torch import nn
12 | from torch.nn import functional as F
13 | from torch.utils.data import DataLoader, random_split
14 | from torchvision import transforms
15 | from torchvision.datasets import MNIST
16 |
17 |
18 | class MNISTDataModule(pl.LightningDataModule):
19 | def __init__(self, data_dir: str = "./"):
20 | super().__init__()
21 | self.data_dir = data_dir
22 | self.transform = transforms.Compose(
23 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
24 | )
25 |
26 | def prepare_data(self):
27 | # download
28 | MNIST(self.data_dir, train=True, download=True)
29 | MNIST(self.data_dir, train=False, download=True)
30 |
31 | def setup(self, stage: str):
32 | # Assign train/val datasets for use in dataloaders
33 | if stage == "fit":
34 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
35 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
36 |
37 | # Assign test dataset for use in dataloader(s)
38 | if stage == "test":
39 | self.mnist_test = MNIST(
40 | self.data_dir, train=False, transform=self.transform
41 | )
42 |
43 | if stage == "predict":
44 | self.mnist_predict = MNIST(
45 | self.data_dir, train=False, transform=self.transform
46 | )
47 |
48 | def train_dataloader(self):
49 | return DataLoader(self.mnist_train, batch_size=32)
50 |
51 | def val_dataloader(self):
52 | return DataLoader(self.mnist_val, batch_size=32)
53 |
54 | def test_dataloader(self):
55 | return DataLoader(self.mnist_test, batch_size=32)
56 |
57 | def predict_dataloader(self):
58 | return DataLoader(self.mnist_predict, batch_size=32)
59 |
60 |
61 | class LitConvClassifier(pl.LightningModule):
62 | def __init__(self, learning_rate=1e-3):
63 | super().__init__()
64 | self.save_hyperparameters()
65 | self.example_input_array = torch.rand(1, 1, 28, 28)
66 |
67 | self.learning_rate = learning_rate
68 |
69 | # Define blocks of layers as submodules
70 | self.conv_block1 = nn.Sequential(
71 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
72 | )
73 |
74 | self.conv_block2 = nn.Sequential(
75 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
76 | )
77 |
78 | self.fc_block = nn.Sequential(
79 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
80 | )
81 |
82 | def forward(self, x):
83 | x = self.conv_block1(x)
84 | x = self.conv_block2(x)
85 | x = x.view(x.size(0), -1)
86 | x = self.fc_block(x)
87 | return x
88 |
89 | def training_step(self, batch, batch_idx):
90 | x, y = batch
91 | y_hat = self(x)
92 | loss = F.cross_entropy(y_hat, y)
93 | return loss
94 |
95 | def validation_step(self, batch, batch_idx):
96 | x, y = batch
97 | y_hat = self(x)
98 | loss = F.cross_entropy(y_hat, y)
99 | self.log("val_loss", loss)
100 | return loss
101 |
102 | def test_step(self, batch, batch_idx):
103 | x, y = batch
104 | y_hat = self(x)
105 | loss = F.cross_entropy(y_hat, y)
106 | return loss
107 |
108 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
109 | x, _ = batch
110 | return self(x)
111 |
112 | def configure_optimizers(self):
113 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
114 | return optimizer
115 |
116 |
117 | data_module = MNISTDataModule
118 | model = LitConvClassifier
119 |
120 | # To use the CLI we do not initialize the Trainer separately.
121 | # Instead, we pass the model class, data module class and trainer defaults to LightningCLI directly.
122 | # The trainer_defaults here are the same as the ones we used in the previous example.
123 |
124 |
125 | def cli_main(model, data_module):
126 | cli = LightningCLI(
127 | model_class=LitConvClassifier,
128 | datamodule_class=MNISTDataModule,
129 | trainer_class=pl.Trainer,
130 | trainer_defaults={
131 | "max_epochs": 1,
132 | "default_root_dir": "experiments/",
133 | "callbacks": [
134 | EarlyStopping(monitor="val_loss", mode="min"),
135 | ModelSummary(max_depth=-1),
136 | ],
137 | "precision": "16-mixed",
138 | "limit_train_batches": 0.1,
139 | "limit_val_batches": 0.01,
140 | },
141 | )
142 |
143 |
144 | if __name__ == "__main__":
145 | cli_main(model, data_module)
146 |
147 | # There are 3 Steps to run this:
148 | # 1. Save the current configs in config.yaml
149 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --print_config > config.yaml
150 |
151 | # 2. Run the training using the config file
152 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml
153 |
154 | # 3. Modify the config file and run the training again
155 | # Example, try making `max_epochs` as 3 in the config file and run the training again
156 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --config config.yaml
157 |
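158 | # Note (added, illustrative): individual values can also be overridden from the command line
159 | # without editing config.yaml, e.g.
160 | # python src/advanced/level_14_run_with_config_file/run_with_yaml.py fit --trainer.max_epochs=3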
--------------------------------------------------------------------------------
/src/advanced/level_18_ipu/ipu.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/integrations/ipu/basic.html
3 |
4 | import lightning.pytorch as pl
5 | import torch
6 | from lightning.pytorch.callbacks import ModelSummary
7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
8 | from torch import nn
9 | from torch.nn import functional as F
10 | from torch.utils.data import DataLoader, random_split
11 | from torchvision import transforms
12 | from torchvision.datasets import MNIST
13 |
14 |
15 | class MNISTDataModule(pl.LightningDataModule):
16 | def __init__(self, data_dir: str = "./"):
17 | super().__init__()
18 | self.data_dir = data_dir
19 | self.transform = transforms.Compose(
20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
21 | )
22 |
23 | def prepare_data(self):
24 | MNIST(self.data_dir, train=True, download=True)
25 | MNIST(self.data_dir, train=False, download=True)
26 |
27 | def setup(self, stage: str):
28 | # Assign train/val datasets for use in dataloaders
29 | if stage == "fit":
30 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
31 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
32 |
33 | # Assign test dataset for use in dataloader(s)
34 | if stage == "test":
35 | self.mnist_test = MNIST(
36 | self.data_dir, train=False, transform=self.transform
37 | )
38 |
39 | if stage == "predict":
40 | self.mnist_predict = MNIST(
41 | self.data_dir, train=False, transform=self.transform
42 | )
43 |
44 | def train_dataloader(self):
45 | return DataLoader(self.mnist_train, batch_size=32)
46 |
47 | def val_dataloader(self):
48 | return DataLoader(self.mnist_val, batch_size=32)
49 |
50 | def test_dataloader(self):
51 | return DataLoader(self.mnist_test, batch_size=32)
52 |
53 | def predict_dataloader(self):
54 | return DataLoader(self.mnist_predict, batch_size=32)
55 |
56 |
57 | class LitConvClassifier(pl.LightningModule):
58 | def __init__(self, learning_rate=1e-3):
59 | super().__init__()
60 | self.save_hyperparameters()
61 | self.example_input_array = torch.rand(1, 1, 28, 28)
62 |
63 | self.learning_rate = learning_rate
64 |
65 | # Define blocks of layers as submodules
66 | self.conv_block1 = nn.Sequential(
67 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
68 | )
69 |
70 | self.conv_block2 = nn.Sequential(
71 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
72 | )
73 |
74 | self.fc_block = nn.Sequential(
75 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
76 | )
77 |
78 | def forward(self, x):
79 | x = self.conv_block1(x)
80 | x = self.conv_block2(x)
81 | x = x.view(x.size(0), -1)
82 | x = self.fc_block(x)
83 | return x
84 |
85 | def training_step(self, batch, batch_idx):
86 | x, y = batch
87 | y_hat = self(x)
88 | loss = F.cross_entropy(y_hat, y)
89 | return loss
90 |
91 | def validation_step(self, batch, batch_idx):
92 | x, y = batch
93 | y_hat = self(x)
94 | loss = F.cross_entropy(y_hat, y)
95 | self.log("val_loss", loss)
96 | return loss
97 |
98 | def test_step(self, batch, batch_idx):
99 | x, y = batch
100 | y_hat = self(x)
101 | loss = F.cross_entropy(y_hat, y)
102 | return loss
103 |
104 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
105 | x, _ = batch
106 | return self(x)
107 |
108 | def configure_optimizers(self):
109 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
110 | return optimizer
111 |
112 |
113 | data_module = MNISTDataModule()
114 | model = LitConvClassifier()
115 |
116 | # Run on as many IPUs as available by default
117 | trainer = pl.Trainer(
118 | max_epochs=3,
119 | default_root_dir="experiments",
120 | callbacks=[
121 | EarlyStopping(monitor="val_loss", mode="min"),
122 | ModelSummary(max_depth=-1),
123 | ],
124 | precision="16-mixed",
125 | limit_train_batches=0.1,
126 | limit_val_batches=0.01,
127 | accelerator="auto",
128 | devices="auto",
129 | strategy="auto",
130 | )
131 |
132 | # equivalent to
133 | trainer = pl.Trainer(
134 | max_epochs=3,
135 | default_root_dir="experiments",
136 | callbacks=[
137 | EarlyStopping(monitor="val_loss", mode="min"),
138 | ModelSummary(max_depth=-1),
139 | ],
140 | precision="16-mixed",
141 | limit_train_batches=0.1,
142 | limit_val_batches=0.01,
143 | )
144 |
145 | # Run on one IPU
146 | trainer = pl.Trainer(
147 | max_epochs=3,
148 | default_root_dir="experiments",
149 | callbacks=[
150 | EarlyStopping(monitor="val_loss", mode="min"),
151 | ModelSummary(max_depth=-1),
152 | ],
153 | precision="16-mixed",
154 | limit_train_batches=0.1,
155 | limit_val_batches=0.01,
156 | accelerator="ipu",
157 | devices="1",
158 | )
159 |
160 | # Run on multiple IPUs
161 | trainer = pl.Trainer(
162 | max_epochs=3,
163 | default_root_dir="experiments",
164 | callbacks=[
165 | EarlyStopping(monitor="val_loss", mode="min"),
166 | ModelSummary(max_depth=-1),
167 | ],
168 | precision="16-mixed",
169 | limit_train_batches=0.1,
170 | limit_val_batches=0.01,
171 | accelerator="ipu",
172 | devices="8",
173 | )
174 |
175 | trainer.fit(model, data_module)
176 |
177 | # Get Predictions
178 | predictions = trainer.predict(model, data_module)
179 | print(len(predictions))
180 |
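181 | # Note (added): the "ipu" accelerator requires Graphcore IPU hardware and its software stack
182 | # (PopTorch); on machines without IPUs these Trainer configurations will raise an error.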
--------------------------------------------------------------------------------
/src/advanced/level_17_advanced_checkpointing/checkpoint.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/common/checkpointing_advanced.html
3 |
4 | import lightning.pytorch as pl
5 | import torch
6 | from lightning.pytorch.callbacks import ModelSummary
7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
8 | from torch import nn
9 | from torch.nn import functional as F
10 | from torch.utils.data import DataLoader, random_split
11 | from torchvision import transforms
12 | from torchvision.datasets import MNIST
13 |
14 |
15 | class MNISTDataModule(pl.LightningDataModule):
16 | def __init__(self, data_dir: str = "./"):
17 | super().__init__()
18 | self.data_dir = data_dir
19 | self.transform = transforms.Compose(
20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
21 | )
22 |
23 | def prepare_data(self):
24 | # download
25 | MNIST(self.data_dir, train=True, download=True)
26 | MNIST(self.data_dir, train=False, download=True)
27 |
28 | def setup(self, stage: str):
29 | # Assign train/val datasets for use in dataloaders
30 | if stage == "fit":
31 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
32 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
33 |
34 | # Assign test dataset for use in dataloader(s)
35 | if stage == "test":
36 | self.mnist_test = MNIST(
37 | self.data_dir, train=False, transform=self.transform
38 | )
39 |
40 | if stage == "predict":
41 | self.mnist_predict = MNIST(
42 | self.data_dir, train=False, transform=self.transform
43 | )
44 |
45 | def train_dataloader(self):
46 | return DataLoader(self.mnist_train, batch_size=32)
47 |
48 | def val_dataloader(self):
49 | return DataLoader(self.mnist_val, batch_size=32)
50 |
51 | def test_dataloader(self):
52 | return DataLoader(self.mnist_test, batch_size=32)
53 |
54 | def predict_dataloader(self):
55 | return DataLoader(self.mnist_predict, batch_size=32)
56 |
57 |
58 | # In this example we learn how to store custom state in a checkpoint.
59 | # We create a custom attribute, train_batches_processed, and increment it in training_step.
60 | # The on_save_checkpoint / on_load_checkpoint hooks then write it to, and restore it from, the checkpoint.
61 | class LitConvClassifier(pl.LightningModule):
62 | def __init__(self, learning_rate=1e-3):
63 | super().__init__()
64 | self.save_hyperparameters()
65 | self.example_input_array = torch.rand(1, 1, 28, 28)
66 |
67 | self.learning_rate = learning_rate
68 |
69 | # Custom attribute to keep track of training batches processed
70 | self.train_batches_processed = 0
71 |
72 | # Define blocks of layers as submodules
73 | self.conv_block1 = nn.Sequential(
74 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
75 | )
76 |
77 | self.conv_block2 = nn.Sequential(
78 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
79 | )
80 |
81 | self.fc_block = nn.Sequential(
82 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
83 | )
84 |
85 | def forward(self, x):
86 | x = self.conv_block1(x)
87 | x = self.conv_block2(x)
88 | x = x.view(x.size(0), -1)
89 | x = self.fc_block(x)
90 | return x
91 |
92 | def training_step(self, batch, batch_idx):
93 | x, y = batch
94 | y_hat = self(x)
95 | loss = F.cross_entropy(y_hat, y)
96 |
97 | # Increment custom attribute train_batches_processed
98 | self.train_batches_processed += 1
99 | self.log("train_batches_processed", self.train_batches_processed)
100 |
101 | return loss
102 |
103 | def on_save_checkpoint(self, checkpoint):
104 | # Add the custom attribute to the checkpoint
105 | checkpoint["train_batches_processed"] = self.train_batches_processed
106 |
107 | def on_load_checkpoint(self, checkpoint):
108 | # Load the custom attribute from the checkpoint
109 | self.train_batches_processed = checkpoint.get("train_batches_processed", 0)
110 |
111 | def validation_step(self, batch, batch_idx):
112 | x, y = batch
113 | y_hat = self(x)
114 | loss = F.cross_entropy(y_hat, y)
115 | self.log("val_loss", loss)
116 | return loss
117 |
118 | def test_step(self, batch, batch_idx):
119 | x, y = batch
120 | y_hat = self(x)
121 | loss = F.cross_entropy(y_hat, y)
122 | return loss
123 |
124 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
125 | x, _ = batch
126 | return self(x)
127 |
128 | def configure_optimizers(self):
129 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
130 | return optimizer
131 |
132 |
133 | data_module = MNISTDataModule()
134 | model = LitConvClassifier()
135 |
136 | trainer = pl.Trainer(
137 | max_epochs=3,
138 | default_root_dir="experiments",
139 | callbacks=[
140 | EarlyStopping(monitor="val_loss", mode="min"),
141 | ModelSummary(max_depth=-1),
142 | ],
143 | precision="16-mixed",
144 | limit_train_batches=0.1,
145 | limit_val_batches=0.01,
146 | )
147 |
148 | trainer.fit(model, data_module)
149 |
150 | # Manually load the saved checkpoint
151 | checkpoint_path = trainer.checkpoint_callback.best_model_path
152 | print(f"\nLoading checkpoint from: {checkpoint_path}")
153 |
154 | # Load the model from the checkpoint
155 | loaded_model = LitConvClassifier.load_from_checkpoint(checkpoint_path)
156 |
157 | # Print the custom attribute stored in the checkpoint
158 | # This is to check if the custom attribute is stored and loaded correctly
159 | print(
160 | f"\nTrain batches processed (from checkpoint): {loaded_model.train_batches_processed}"
161 | )
162 |
163 | # Get Predictions
164 | predictions = trainer.predict(model, data_module)
165 | print(len(predictions))
166 |
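167 | # The custom key sits at the top level of the checkpoint dictionary, next to
168 | # Lightning's own entries such as "state_dict" and "epoch". A minimal sketch
169 | # of inspecting the raw file with torch.load to confirm it was written:
170 | raw_checkpoint = torch.load(checkpoint_path, map_location="cpu")
171 | print(sorted(raw_checkpoint.keys()))
172 | print(raw_checkpoint["train_batches_processed"])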
--------------------------------------------------------------------------------
/src/intermediate/level_13_profiler/01_advanced_profiler.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/tuning/profiler_basic.html
3 |
4 | import lightning.pytorch as pl
5 | import torch
6 | from lightning.pytorch.callbacks import (
7 | DeviceStatsMonitor,
8 | ModelSummary,
9 | StochasticWeightAveraging,
10 | )
11 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
12 | from torch import nn
13 | from torch.nn import functional as F
14 | from torch.utils.data import DataLoader, random_split
15 | from torchvision import transforms
16 | from torchvision.datasets import MNIST
17 |
18 |
19 | class MNISTDataModule(pl.LightningDataModule):
20 | def __init__(self, data_dir: str = "./"):
21 | super().__init__()
22 | self.data_dir = data_dir
23 | self.transform = transforms.Compose(
24 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
25 | )
26 |
27 | def prepare_data(self):
28 | # download
29 | MNIST(self.data_dir, train=True, download=True)
30 | MNIST(self.data_dir, train=False, download=True)
31 |
32 | def setup(self, stage: str):
33 | # Assign train/val datasets for use in dataloaders
34 | if stage == "fit":
35 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
36 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
37 |
38 | # Assign test dataset for use in dataloader(s)
39 | if stage == "test":
40 | self.mnist_test = MNIST(
41 | self.data_dir, train=False, transform=self.transform
42 | )
43 |
44 | if stage == "predict":
45 | self.mnist_predict = MNIST(
46 | self.data_dir, train=False, transform=self.transform
47 | )
48 |
49 | def train_dataloader(self):
50 | return DataLoader(self.mnist_train, batch_size=32)
51 |
52 | def val_dataloader(self):
53 | return DataLoader(self.mnist_val, batch_size=32)
54 |
55 | def test_dataloader(self):
56 | return DataLoader(self.mnist_test, batch_size=32)
57 |
58 | def predict_dataloader(self):
59 | return DataLoader(self.mnist_predict, batch_size=32)
60 |
61 |
62 | class LitConvClassifier(pl.LightningModule):
63 | def __init__(self, learning_rate=1e-3):
64 | super().__init__()
65 | self.save_hyperparameters()
66 | self.example_input_array = torch.rand(1, 1, 28, 28)
67 |
68 | self.learning_rate = learning_rate
69 |
70 | # Define blocks of layers as submodules
71 | self.conv_block1 = nn.Sequential(
72 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
73 | )
74 |
75 | self.conv_block2 = nn.Sequential(
76 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
77 | )
78 |
79 | self.fc_block = nn.Sequential(
80 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
81 | )
82 |
83 | def forward(self, x):
84 | x = self.conv_block1(x)
85 | x = self.conv_block2(x)
86 | x = x.view(x.size(0), -1)
87 | x = self.fc_block(x)
88 | return x
89 |
90 | def training_step(self, batch, batch_idx):
91 | x, y = batch
92 | y_hat = self(x)
93 | loss = F.cross_entropy(y_hat, y)
94 | return loss
95 |
96 | def validation_step(self, batch, batch_idx):
97 | x, y = batch
98 | y_hat = self(x)
99 | loss = F.cross_entropy(y_hat, y)
100 | self.log("val_loss", loss)
101 | return loss
102 |
103 | def test_step(self, batch, batch_idx):
104 | x, y = batch
105 | y_hat = self(x)
106 | loss = F.cross_entropy(y_hat, y)
107 | return loss
108 |
109 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
110 | x, _ = batch
111 | return self(x)
112 |
113 | def configure_optimizers(self):
114 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
115 | return optimizer
116 |
117 |
118 | data_module = MNISTDataModule()
119 | model = LitConvClassifier()
120 |
121 |
122 | # Profiling helps you find bottlenecks in your code by capturing
123 | # analytics such as how long a function takes or how much memory is used.
124 |
125 | # The most basic profile measures all the key methods across Callbacks,
126 | # DataModules and the LightningModule in the training loop.
127 | print("------------------------------")
128 | print("Basic Profiler")
129 | print("------------------------------")
130 | trainer = pl.Trainer(
131 | max_epochs=1,
132 | default_root_dir="experiments/",
133 | callbacks=[
134 | EarlyStopping(monitor="val_loss", mode="min"),
135 | ModelSummary(max_depth=-1),
136 | StochasticWeightAveraging(swa_lrs=1e-2),
137 | ],
138 | precision="16-mixed",
139 | profiler="simple",
140 | limit_train_batches=0.1,
141 | limit_val_batches=0.01,
142 | )
143 |
144 | trainer.fit(model, data_module)
145 |
146 | # To profile the time within every function, use the AdvancedProfiler built on top of Python’s cProfile.
147 | print("------------------------------")
148 | print("Advanced Profiler")
149 | print("------------------------------")
150 | trainer = pl.Trainer(
151 | max_epochs=1,
152 | default_root_dir="experiments/",
153 | callbacks=[
154 | EarlyStopping(monitor="val_loss", mode="min"),
155 | ModelSummary(max_depth=-1),
156 | StochasticWeightAveraging(swa_lrs=1e-2),
157 | ],
158 | precision="16-mixed",
159 | profiler="advanced",
160 | limit_train_batches=0.1,
161 | limit_val_batches=0.01,
162 | )
163 |
164 | trainer.fit(model, data_module)
165 |
166 | # Get Predictions
167 | predictions = trainer.predict(model, data_module)
168 | print(len(predictions))
169 |
170 | # Another helpful technique to detect bottlenecks is to ensure that
171 | # you’re using the full capacity of your accelerator (GPU/TPU/IPU/HPU).
172 | # This can be measured with the DeviceStatsMonitor:
173 | print("------------------------------")
174 | print("Device Stats Monitor")
175 | print("------------------------------")
176 | trainer = pl.Trainer(
177 | max_epochs=1,
178 | default_root_dir="experiments/",
179 | callbacks=[
180 | EarlyStopping(monitor="val_loss", mode="min"),
181 | ModelSummary(max_depth=-1),
182 | StochasticWeightAveraging(swa_lrs=1e-2),
183 | DeviceStatsMonitor(),
184 | ],
185 | precision="16-mixed",
186 | profiler="advanced",
187 | limit_train_batches=0.1,
188 | limit_val_batches=0.01,
189 | )
190 |
191 | trainer.fit(model, data_module)
192 |
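193 | # The "simple" and "advanced" string shortcuts above print their report to the
194 | # terminal when fit() finishes. A minimal sketch of passing a configured profiler
195 | # object instead, so the cProfile report is written to a file (the dirpath and
196 | # filename values here are only illustrative):
197 | from lightning.pytorch.profilers import AdvancedProfiler
198 |
199 | file_profiler = AdvancedProfiler(dirpath="experiments", filename="perf_logs")
200 | trainer = pl.Trainer(
201 |     max_epochs=1,
202 |     default_root_dir="experiments/",
203 |     profiler=file_profiler,
204 |     limit_train_batches=0.1,
205 |     limit_val_batches=0.01,
206 | )
207 | trainer.fit(model, data_module)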
--------------------------------------------------------------------------------
/src/advanced/level_19_hpu/hpu.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/integrations/hpu/basic.html
3 |
4 | import lightning.pytorch as pl
5 | import torch
6 | from lightning.pytorch.callbacks import ModelSummary
7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
8 |
9 | # Import the HPUAccelerator
10 | from lightning_habana.pytorch.accelerator import HPUAccelerator
11 | from lightning_habana.pytorch.strategies import HPUParallelStrategy
12 | from torch import nn
13 | from torch.nn import functional as F
14 | from torch.utils.data import DataLoader, random_split
15 | from torchvision import transforms
16 | from torchvision.datasets import MNIST
17 |
18 |
19 | class MNISTDataModule(pl.LightningDataModule):
20 | def __init__(self, data_dir: str = "./"):
21 | super().__init__()
22 | self.data_dir = data_dir
23 | self.transform = transforms.Compose(
24 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
25 | )
26 |
27 | def prepare_data(self):
28 | MNIST(self.data_dir, train=True, download=True)
29 | MNIST(self.data_dir, train=False, download=True)
30 |
31 | def setup(self, stage: str):
32 | # Assign train/val datasets for use in dataloaders
33 | if stage == "fit":
34 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
35 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
36 |
37 | # Assign test dataset for use in dataloader(s)
38 | if stage == "test":
39 | self.mnist_test = MNIST(
40 | self.data_dir, train=False, transform=self.transform
41 | )
42 |
43 | if stage == "predict":
44 | self.mnist_predict = MNIST(
45 | self.data_dir, train=False, transform=self.transform
46 | )
47 |
48 | def train_dataloader(self):
49 | return DataLoader(self.mnist_train, batch_size=32)
50 |
51 | def val_dataloader(self):
52 | return DataLoader(self.mnist_val, batch_size=32)
53 |
54 | def test_dataloader(self):
55 | return DataLoader(self.mnist_test, batch_size=32)
56 |
57 | def predict_dataloader(self):
58 | return DataLoader(self.mnist_predict, batch_size=32)
59 |
60 |
61 | class LitConvClassifier(pl.LightningModule):
62 | def __init__(self, learning_rate=1e-3):
63 | super().__init__()
64 | self.save_hyperparameters()
65 | self.example_input_array = torch.rand(1, 1, 28, 28)
66 |
67 | self.learning_rate = learning_rate
68 |
69 | # Define blocks of layers as submodules
70 | self.conv_block1 = nn.Sequential(
71 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
72 | )
73 |
74 | self.conv_block2 = nn.Sequential(
75 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
76 | )
77 |
78 | self.fc_block = nn.Sequential(
79 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
80 | )
81 |
82 | def forward(self, x):
83 | x = self.conv_block1(x)
84 | x = self.conv_block2(x)
85 | x = x.view(x.size(0), -1)
86 | x = self.fc_block(x)
87 | return x
88 |
89 | def training_step(self, batch, batch_idx):
90 | x, y = batch
91 | y_hat = self(x)
92 | loss = F.cross_entropy(y_hat, y)
93 | return loss
94 |
95 | def validation_step(self, batch, batch_idx):
96 | x, y = batch
97 | y_hat = self(x)
98 | loss = F.cross_entropy(y_hat, y)
99 | self.log("val_loss", loss)
100 | return loss
101 |
102 | def test_step(self, batch, batch_idx):
103 | x, y = batch
104 | y_hat = self(x)
105 | loss = F.cross_entropy(y_hat, y)
106 | return loss
107 |
108 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
109 | x, _ = batch
110 | return self(x)
111 |
112 | def configure_optimizers(self):
113 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
114 | return optimizer
115 |
116 |
117 | data_module = MNISTDataModule()
118 | model = LitConvClassifier()
119 |
120 | # Run on as many Gaudi devices as available by default ("auto" lets Lightning pick the accelerator, device count and strategy)
121 | trainer = pl.Trainer(
122 | max_epochs=3,
123 | default_root_dir="experiments",
124 | callbacks=[
125 | EarlyStopping(monitor="val_loss", mode="min"),
126 | ModelSummary(max_depth=-1),
127 | ],
128 | precision="bf16-mixed",
129 | limit_train_batches=0.1,
130 | limit_val_batches=0.01,
131 | accelerator="auto",
132 | devices="auto",
133 | strategy="auto",
134 | )
135 |
136 | # equivalent to
137 | trainer = pl.Trainer(
138 | max_epochs=3,
139 | default_root_dir="experiments",
140 | callbacks=[
141 | EarlyStopping(monitor="val_loss", mode="min"),
142 | ModelSummary(max_depth=-1),
143 | ],
144 | precision="bf16-mixed",
145 | limit_train_batches=0.1,
146 | limit_val_batches=0.01,
147 | )
148 |
149 | # Run on one Gaudi device
150 | trainer = pl.Trainer(
151 | max_epochs=3,
152 | default_root_dir="experiments",
153 | callbacks=[
154 | EarlyStopping(monitor="val_loss", mode="min"),
155 | ModelSummary(max_depth=-1),
156 | ],
157 | precision="bf16-mixed",
158 | limit_train_batches=0.1,
159 | limit_val_batches=0.01,
160 | accelerator=HPUAccelerator(),
161 | devices="1",
162 | )
163 |
164 | # Run on multiple Gaudi devices
165 | trainer = pl.Trainer(
166 | max_epochs=3,
167 | default_root_dir="experiments",
168 | callbacks=[
169 | EarlyStopping(monitor="val_loss", mode="min"),
170 | ModelSummary(max_depth=-1),
171 | ],
172 | precision="bf16-mixed",
173 | limit_train_batches=0.1,
174 | limit_val_batches=0.01,
175 | accelerator=HPUAccelerator(),
176 | devices="8",
177 | )
178 |
179 | # To train a Lightning model across multiple HPU nodes,
180 | # set the Trainer's num_nodes parameter to the number of available nodes.
181 | hpus = 8
182 | parallel_hpus = [torch.device("hpu")] * hpus
183 |
184 | trainer = pl.Trainer(
185 | max_epochs=3,
186 | default_root_dir="experiments",
187 | callbacks=[
188 | EarlyStopping(monitor="val_loss", mode="min"),
189 | ModelSummary(max_depth=-1),
190 | ],
191 | precision="bf16-mixed",
192 | limit_train_batches=0.1,
193 | limit_val_batches=0.01,
194 | accelerator=HPUAccelerator(),
195 | devices=hpus,
196 | strategy=HPUParallelStrategy(parallel_devices=parallel_hpus),
197 | num_nodes=2,
198 | )
199 |
200 | trainer.fit(model, data_module)
201 |
202 | # Get Predictions
203 | predictions = trainer.predict(model, data_module)
204 | print(len(predictions))
205 |
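206 | # A minimal sketch of checking what is visible before choosing a device count,
207 | # assuming HPUAccelerator exposes the standard Lightning accelerator hooks
208 | # is_available() and auto_device_count():
209 | if HPUAccelerator.is_available():
210 |     print(f"Gaudi devices visible: {HPUAccelerator.auto_device_count()}")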
--------------------------------------------------------------------------------
/src/advanced/level_16_own_the_training_loop/01_enable_manual_optimization.py:
--------------------------------------------------------------------------------
1 | # Documentation Link
2 | # https://lightning.ai/docs/pytorch/stable/model/build_model_advanced.html
3 |
4 | import lightning.pytorch as pl
5 | import torch
6 | from lightning.pytorch.callbacks import ModelSummary
7 | from lightning.pytorch.callbacks.early_stopping import EarlyStopping
8 | from torch import nn
9 | from torch.nn import functional as F
10 | from torch.utils.data import DataLoader, random_split
11 | from torchvision import transforms
12 | from torchvision.datasets import MNIST
13 |
14 |
15 | class MNISTDataModule(pl.LightningDataModule):
16 | def __init__(self, data_dir: str = "./"):
17 | super().__init__()
18 | self.data_dir = data_dir
19 | self.transform = transforms.Compose(
20 | [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
21 | )
22 |
23 | def prepare_data(self):
24 | # download
25 | MNIST(self.data_dir, train=True, download=True)
26 | MNIST(self.data_dir, train=False, download=True)
27 |
28 | def setup(self, stage: str):
29 | # Assign train/val datasets for use in dataloaders
30 | if stage == "fit":
31 | mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
32 | self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
33 |
34 | # Assign test dataset for use in dataloader(s)
35 | if stage == "test":
36 | self.mnist_test = MNIST(
37 | self.data_dir, train=False, transform=self.transform
38 | )
39 |
40 | if stage == "predict":
41 | self.mnist_predict = MNIST(
42 | self.data_dir, train=False, transform=self.transform
43 | )
44 |
45 | def train_dataloader(self):
46 | return DataLoader(self.mnist_train, batch_size=32)
47 |
48 | def val_dataloader(self):
49 | return DataLoader(self.mnist_val, batch_size=32)
50 |
51 | def test_dataloader(self):
52 | return DataLoader(self.mnist_test, batch_size=32)
53 |
54 | def predict_dataloader(self):
55 | return DataLoader(self.mnist_predict, batch_size=32)
56 |
57 |
58 | # Steps to enable Manual Optimization
59 | # 1. Set `self.automatic_optimization = False` in your LightningModule’s __init__.
60 |
61 | # 2. Use the following functions and call them manually:
62 |
63 | # 2.1 `self.optimizers()` to access your optimizers (one or multiple)
64 |
65 | # 2.2 `optimizer.zero_grad()` to clear the gradients from the previous training step
66 |
67 | # 2.3 `self.manual_backward(loss)` instead of loss.backward()
68 |
69 | # 2.4 `optimizer.step()` to update your model parameters
70 |
71 | # 2.5 `self.toggle_optimizer()` and `self.untoggle_optimizer()` when training with multiple optimizers (see the commented sketch at the end of this file)
72 |
73 |
74 | class LitConvClassifier(pl.LightningModule):
75 | def __init__(self, learning_rate=1e-3):
76 | super().__init__()
77 | self.save_hyperparameters()
78 | self.example_input_array = torch.rand(1, 1, 28, 28)
79 |
80 | self.learning_rate = learning_rate
81 |
82 | # Enable manual optimization
83 | self.automatic_optimization = False
84 |
85 | self.conv_block1 = nn.Sequential(
86 | nn.Conv2d(1, 32, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
87 | )
88 |
89 | self.conv_block2 = nn.Sequential(
90 | nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2)
91 | )
92 |
93 | self.fc_block = nn.Sequential(
94 | nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Linear(128, 10)
95 | )
96 |
97 | def forward(self, x):
98 | x = self.conv_block1(x)
99 | x = self.conv_block2(x)
100 | x = x.view(x.size(0), -1)
101 | x = self.fc_block(x)
102 | return x
103 |
104 | # Define the compute_loss method
105 | def compute_loss(self, batch):
106 | x, y = batch
107 | logits = self(x) # Pass inputs through the model
108 | return F.cross_entropy(logits, y) # Calculate cross-entropy loss
109 |
110 | # Here are three examples of how to use manual optimization in Lightning.
111 | # Example 1 is active below; to try Example 2 or 3, comment out Example 1 and uncomment the one you want.
112 |
113 | # Example 1: Basic Manual Optimization
114 | def training_step(self, batch, batch_idx):
115 | opt = self.optimizers()
116 | opt.zero_grad()
117 | loss = self.compute_loss(batch)
118 | self.manual_backward(loss)
119 | opt.step()
120 |
121 | return loss
122 |
123 | # # Example 2: Gradient Accumulation
124 | # def training_step(self, batch, batch_idx, N=5):
125 | # opt = self.optimizers()
126 |
127 | # # scale losses by 1/N (for N batches of gradient accumulation)
128 | # loss = self.compute_loss(batch) / N
129 | # self.manual_backward(loss)
130 |
131 | # # accumulate gradients of N batches
132 | # if (batch_idx + 1) % N == 0:
133 | # opt.step()
134 | # opt.zero_grad()
135 |
136 | # return loss
137 |
138 | # # Example 3: Gradient Clipping
139 | # def training_step(self, batch, batch_idx):
140 | # opt = self.optimizers()
141 |
142 | # # compute loss
143 | # loss = self.compute_loss(batch)
144 |
145 | # opt.zero_grad()
146 | # self.manual_backward(loss)
147 |
148 | # # clip gradients
149 | # self.clip_gradients(opt, gradient_clip_val=0.5, gradient_clip_algorithm="norm")
150 |
151 | # opt.step()
152 |
153 | # return loss
154 |
155 | def validation_step(self, batch, batch_idx):
156 | x, y = batch
157 | y_hat = self(x)
158 | loss = F.cross_entropy(y_hat, y)
159 | self.log("val_loss", loss)
160 | return loss
161 |
162 | def test_step(self, batch, batch_idx):
163 | x, y = batch
164 | y_hat = self(x)
165 | loss = F.cross_entropy(y_hat, y)
166 | return loss
167 |
168 | def predict_step(self, batch, batch_idx, dataloader_idx=None):
169 | x, _ = batch
170 | return self(x)
171 |
172 | def configure_optimizers(self):
173 | optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
174 |
175 | return optimizer
176 |
177 |
178 | data_module = MNISTDataModule()
179 | model = LitConvClassifier()
180 |
181 | trainer = pl.Trainer(
182 | max_epochs=3,
183 | default_root_dir="experiments",
184 | callbacks=[
185 | EarlyStopping(monitor="val_loss", mode="min"),
186 | ModelSummary(max_depth=-1),
187 | ],
188 | precision="16-mixed",
189 | limit_train_batches=0.1,
190 | limit_val_batches=0.01,
191 | )
192 |
193 | trainer.fit(model, data_module)
194 |
195 | # Get Predictions
196 | predictions = trainer.predict(model, data_module)
197 | print(len(predictions))
198 |
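199 | # Step 2.5 above (toggle_optimizer / untoggle_optimizer) only comes into play
200 | # once configure_optimizers returns more than one optimizer. A minimal sketch of
201 | # a two-optimizer training_step, kept commented out because this model defines a
202 | # single optimizer; compute_generator_loss / compute_discriminator_loss are
203 | # hypothetical helpers:
204 | #
205 | # def training_step(self, batch, batch_idx):
206 | #     opt_g, opt_d = self.optimizers()
207 | #
208 | #     # train the first model: freeze params owned by the other optimizer
209 | #     self.toggle_optimizer(opt_g)
210 | #     g_loss = self.compute_generator_loss(batch)
211 | #     opt_g.zero_grad()
212 | #     self.manual_backward(g_loss)
213 | #     opt_g.step()
214 | #     self.untoggle_optimizer(opt_g)
215 | #
216 | #     # then train the second model the same way
217 | #     self.toggle_optimizer(opt_d)
218 | #     d_loss = self.compute_discriminator_loss(batch)
219 | #     opt_d.zero_grad()
220 | #     self.manual_backward(d_loss)
221 | #     opt_d.step()
222 | #     self.untoggle_optimizer(opt_d)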
--------------------------------------------------------------------------------