├── .gitignore
├── 00_pytorch-vit-random-init.py
├── 01_pytorch-vit.py
├── 02_pytorch-vit-compile.py
├── 03_fabric-vit.py
├── 04_fabric-vit-mixed-precision.py
├── 05_fabric-vit-mixed-ddp.py
├── 06_fabric-vit-mixed-fsdp.py
├── 07_fabric-vit-mixed-fsdp-with-scheduler.py
├── 08_saving-and-loading
    ├── 08-1-train.py
    ├── 08-2-load.py
    └── README.md
├── LICENSE.txt
├── README.md
├── cvpr-talk__slides.pdf
├── local_utilities.py
├── logs.md
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
  1 | .DS_Store
  2 | # Byte-compiled / optimized / DLL files
  3 | __pycache__/
  4 | *.py[cod]
  5 | *$py.class
  6 | 
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | cover/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | db.sqlite3-journal
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | .pybuilder/
 77 | target/
 78 | 
 79 | # Jupyter Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # IPython
 83 | profile_default/
 84 | ipython_config.py
 85 | 
 86 | # pyenv
 87 | #   For a library or package, you might want to ignore these files since the code is
 88 | #   intended to run in multiple environments; otherwise, check them in:
 89 | # .python-version
 90 | 
 91 | # pipenv
 92 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 93 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 94 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 95 | #   install all needed dependencies.
 96 | #Pipfile.lock
 97 | 
 98 | # poetry
 99 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
101 | #   commonly ignored for libraries.
102 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103 | #poetry.lock
104 | 
105 | # pdm
106 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107 | #pdm.lock
108 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109 | #   in version control.
110 | #   https://pdm.fming.dev/#use-with-ide
111 | .pdm.toml
112 | 
113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114 | __pypackages__/
115 | 
116 | # Celery stuff
117 | celerybeat-schedule
118 | celerybeat.pid
119 | 
120 | # SageMath parsed files
121 | *.sage.py
122 | 
123 | # Environments
124 | .env
125 | .venv
126 | env/
127 | venv/
128 | ENV/
129 | env.bak/
130 | venv.bak/
131 | 
132 | # Spyder project settings
133 | .spyderproject
134 | .spyproject
135 | 
136 | # Rope project settings
137 | .ropeproject
138 | 
139 | # mkdocs documentation
140 | /site
141 | 
142 | # mypy
143 | .mypy_cache/
144 | .dmypy.json
145 | dmypy.json
146 | 
147 | # Pyre type checker
148 | .pyre/
149 | 
150 | # pytype static type analyzer
151 | .pytype/
152 | 
153 | # Cython debug symbols
154 | cython_debug/
155 | 
156 | # PyCharm
157 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
160 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
161 | #.idea/
162 | 


--------------------------------------------------------------------------------
/00_pytorch-vit-random-init.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import lightning as L
  4 | import torch
  5 | import torch.nn.functional as F
  6 | import torchmetrics
  7 | from torchvision import transforms
  8 | from torchvision.models import vit_b_16
  9 | from torchvision.models import ViT_B_16_Weights
 10 | from watermark import watermark
 11 | 
 12 | from local_utilities import get_dataloaders_cifar10
 13 | 
 14 | 
 15 | def train(num_epochs, model, optimizer, train_loader, val_loader, device):
 16 | 
 17 |     for epoch in range(num_epochs):
 18 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
 19 | 
 20 |         model.train()
 21 |         for batch_idx, (features, targets) in enumerate(train_loader):
 22 |             model.train()
 23 | 
 24 |             features = features.to(device)
 25 |             targets = targets.to(device)
 26 |             
 27 |             ### FORWARD AND BACK PROP   
 28 |             logits = model(features)
 29 |             loss = F.cross_entropy(logits, targets)
 30 |             
 31 |             optimizer.zero_grad()
 32 |             loss.backward()
 33 | 
 34 |             ### UPDATE MODEL PARAMETERS
 35 |             optimizer.step()
 36 | 
 37 |             ### LOGGING
 38 |             if not batch_idx % 300:
 39 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {loss:.4f}")
 40 | 
 41 |             model.eval()
 42 |             with torch.no_grad():
 43 |                 predicted_labels = torch.argmax(logits, 1)
 44 |                 train_acc.update(predicted_labels, targets)
 45 | 
 46 |         ### MORE LOGGING
 47 |         model.eval()
 48 |         with torch.no_grad():
 49 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
 50 | 
 51 |             for (features, targets) in val_loader:
 52 |                 features = features.to(device)
 53 |                 targets = targets.to(device)
 54 |                 outputs = model(features)
 55 |                 predicted_labels = torch.argmax(outputs, 1)
 56 |                 val_acc.update(predicted_labels, targets)
 57 | 
 58 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 59 |             train_acc.reset(), val_acc.reset()
 60 | 
 61 | 
 62 | if __name__ == "__main__":
 63 | 
 64 |     print(watermark(packages="torch,lightning", python=True))
 65 |     print("Torch CUDA available?", torch.cuda.is_available())
 66 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 67 | 
 68 |     L.seed_everything(123)
 69 | 
 70 |     ##########################
 71 |     ### 1 Loading the Dataset
 72 |     ##########################
 73 |     train_transforms = transforms.Compose([transforms.Resize((224, 224)),
 74 |                                            #transforms.RandomCrop((224, 224)),
 75 |                                            transforms.ToTensor()])
 76 |     
 77 |     test_transforms = transforms.Compose([transforms.Resize((224, 224)),
 78 |                                           #transforms.CenterCrop((224, 224)),
 79 |                                           transforms.ToTensor()])
 80 |     
 81 |     train_loader, val_loader, test_loader = get_dataloaders_cifar10(
 82 |         batch_size=16, 
 83 |         num_workers=4, 
 84 |         train_transforms=train_transforms,
 85 |         test_transforms=test_transforms,
 86 |         validation_fraction=0.1)
 87 | 
 88 | 
 89 |     #########################################
 90 |     ### 2 Initializing the Model
 91 |     #########################################
 92 | 
 93 |     model = vit_b_16(weights=None)
 94 | 
 95 |     # replace output layer
 96 |     model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
 97 | 
 98 |     model.to(device)
 99 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
100 | 
101 |     #########################################
102 |     ### 3 Finetuning
103 |     #########################################
104 | 
105 |     start = time.time()
106 |     train(
107 |         num_epochs=10,
108 |         model=model,
109 |         optimizer=optimizer,
110 |         train_loader=train_loader,
111 |         val_loader=val_loader,
112 |         device=device
113 |     )
114 | 
115 |     end = time.time()
116 |     elapsed = end-start
117 |     print(f"Time elapsed {elapsed/60:.2f} min")
118 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
119 | 
120 |     #########################################
121 |     ### 4 Evaluation
122 |     #########################################
123 |     
124 |     with torch.no_grad():
125 |         model.eval()
126 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
127 | 
128 |         for (features, targets) in test_loader:
129 |             features = features.to(device)
130 |             targets = targets.to(device)
131 |             outputs = model(features)
132 |             predicted_labels = torch.argmax(outputs, 1)
133 |             test_acc.update(predicted_labels, targets)
134 | 
135 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/01_pytorch-vit.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import lightning as L
  4 | import torch
  5 | import torch.nn.functional as F
  6 | import torchmetrics
  7 | from torchvision import transforms
  8 | from torchvision.models import vit_b_16
  9 | from torchvision.models import ViT_B_16_Weights
 10 | from watermark import watermark
 11 | 
 12 | from local_utilities import get_dataloaders_cifar10
 13 | 
 14 | 
 15 | def train(num_epochs, model, optimizer, train_loader, val_loader, device):
 16 | 
 17 |     for epoch in range(num_epochs):
 18 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
 19 | 
 20 |         model.train()
 21 |         for batch_idx, (features, targets) in enumerate(train_loader):
 22 |             model.train()
 23 | 
 24 |             features = features.to(device)
 25 |             targets = targets.to(device)
 26 |             
 27 |             ### FORWARD AND BACK PROP   
 28 |             logits = model(features)
 29 |             loss = F.cross_entropy(logits, targets)
 30 |             
 31 |             optimizer.zero_grad()
 32 |             loss.backward()
 33 | 
 34 |             ### UPDATE MODEL PARAMETERS
 35 |             optimizer.step()
 36 | 
 37 |             ### LOGGING
 38 |             if not batch_idx % 300:
 39 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {loss:.4f}")
 40 | 
 41 |             model.eval()
 42 |             with torch.no_grad():
 43 |                 predicted_labels = torch.argmax(logits, 1)
 44 |                 train_acc.update(predicted_labels, targets)
 45 | 
 46 |         ### MORE LOGGING
 47 |         model.eval()
 48 |         with torch.no_grad():
 49 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
 50 | 
 51 |             for (features, targets) in val_loader:
 52 |                 features = features.to(device)
 53 |                 targets = targets.to(device)
 54 |                 outputs = model(features)
 55 |                 predicted_labels = torch.argmax(outputs, 1)
 56 |                 val_acc.update(predicted_labels, targets)
 57 | 
 58 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 59 |             train_acc.reset(), val_acc.reset()
 60 | 
 61 | 
 62 | if __name__ == "__main__":
 63 | 
 64 |     print(watermark(packages="torch,lightning", python=True))
 65 |     print("Torch CUDA available?", torch.cuda.is_available())
 66 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 67 | 
 68 |     L.seed_everything(123)
 69 | 
 70 |     ##########################
 71 |     ### 1 Loading the Dataset
 72 |     ##########################
 73 |     train_transforms = transforms.Compose([transforms.Resize((224, 224)),
 74 |                                            #transforms.RandomCrop((224, 224)),
 75 |                                            transforms.ToTensor()])
 76 |     
 77 |     test_transforms = transforms.Compose([transforms.Resize((224, 224)),
 78 |                                           #transforms.CenterCrop((224, 224)),
 79 |                                           transforms.ToTensor()])
 80 |     
 81 |     train_loader, val_loader, test_loader = get_dataloaders_cifar10(
 82 |         batch_size=16, 
 83 |         num_workers=4, 
 84 |         train_transforms=train_transforms,
 85 |         test_transforms=test_transforms,
 86 |         validation_fraction=0.1)
 87 | 
 88 | 
 89 |     #########################################
 90 |     ### 2 Initializing the Model
 91 |     #########################################
 92 | 
 93 |     model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
 94 | 
 95 |     # replace output layer
 96 |     model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
 97 | 
 98 |     model.to(device)
 99 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
100 | 
101 |     #########################################
102 |     ### 3 Finetuning
103 |     #########################################
104 | 
105 |     start = time.time()
106 |     train(
107 |         num_epochs=3,
108 |         model=model,
109 |         optimizer=optimizer,
110 |         train_loader=train_loader,
111 |         val_loader=val_loader,
112 |         device=device
113 |     )
114 | 
115 |     end = time.time()
116 |     elapsed = end-start
117 |     print(f"Time elapsed {elapsed/60:.2f} min")
118 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
119 | 
120 |     #########################################
121 |     ### 4 Evaluation
122 |     #########################################
123 |     
124 |     with torch.no_grad():
125 |         model.eval()
126 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
127 | 
128 |         for (features, targets) in test_loader:
129 |             features = features.to(device)
130 |             targets = targets.to(device)
131 |             outputs = model(features)
132 |             predicted_labels = torch.argmax(outputs, 1)
133 |             test_acc.update(predicted_labels, targets)
134 | 
135 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/02_pytorch-vit-compile.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import lightning as L
  4 | import torch
  5 | import torch.nn.functional as F
  6 | import torchmetrics
  7 | from torchvision import transforms
  8 | from torchvision.models import vit_b_16
  9 | from torchvision.models import ViT_B_16_Weights
 10 | from watermark import watermark
 11 | 
 12 | from local_utilities import get_dataloaders_cifar10
 13 | 
 14 | 
 15 | def train(num_epochs, model, optimizer, train_loader, val_loader, device):
 16 | 
 17 |     for epoch in range(num_epochs):
 18 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
 19 | 
 20 |         model.train()
 21 |         for batch_idx, (features, targets) in enumerate(train_loader):
 22 |             model.train()
 23 | 
 24 |             features = features.to(device)
 25 |             targets = targets.to(device)
 26 |             
 27 |             ### FORWARD AND BACK PROP   
 28 |             logits = model(features)
 29 |             loss = F.cross_entropy(logits, targets)
 30 |             
 31 |             optimizer.zero_grad()
 32 |             loss.backward()
 33 | 
 34 |             ### UPDATE MODEL PARAMETERS
 35 |             optimizer.step()
 36 | 
 37 |             ### LOGGING
 38 |             if not batch_idx % 300:
 39 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {loss:.4f}")
 40 | 
 41 |             model.eval()
 42 |             with torch.no_grad():
 43 |                 predicted_labels = torch.argmax(logits, 1)
 44 |                 train_acc.update(predicted_labels, targets)
 45 | 
 46 |         ### MORE LOGGING
 47 |         model.eval()
 48 |         with torch.no_grad():
 49 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
 50 | 
 51 |             for (features, targets) in val_loader:
 52 |                 features = features.to(device)
 53 |                 targets = targets.to(device)
 54 |                 outputs = model(features)
 55 |                 predicted_labels = torch.argmax(outputs, 1)
 56 |                 val_acc.update(predicted_labels, targets)
 57 | 
 58 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 59 |             train_acc.reset(), val_acc.reset()
 60 | 
 61 | 
 62 | if __name__ == "__main__":
 63 | 
 64 |     print(watermark(packages="torch,lightning", python=True))
 65 |     print("Torch CUDA available?", torch.cuda.is_available())
 66 |     device = "cuda" if torch.cuda.is_available() else "cpu"
 67 | 
 68 |     L.seed_everything(123)
 69 | 
 70 |     ##########################
 71 |     ### 1 Loading the Dataset
 72 |     ##########################
 73 |     train_transforms = transforms.Compose([transforms.Resize((224, 224)),
 74 |                                            #transforms.RandomCrop((224, 224)),
 75 |                                            transforms.ToTensor()])
 76 |     
 77 |     test_transforms = transforms.Compose([transforms.Resize((224, 224)),
 78 |                                           #transforms.CenterCrop((224, 224)),
 79 |                                           transforms.ToTensor()])
 80 |     
 81 |     train_loader, val_loader, test_loader = get_dataloaders_cifar10(
 82 |         batch_size=16, 
 83 |         num_workers=4, 
 84 |         train_transforms=train_transforms,
 85 |         test_transforms=test_transforms,
 86 |         validation_fraction=0.1)
 87 | 
 88 | 
 89 |     #########################################
 90 |     ### 2 Initializing the Model
 91 |     #########################################
 92 | 
 93 |     model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
 94 |     model = torch.compile(model)
 95 | 
 96 |     # replace output layer
 97 |     model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
 98 | 
 99 |     model.to(device)
100 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
101 | 
102 |     #########################################
103 |     ### 3 Finetuning
104 |     #########################################
105 | 
106 |     start = time.time()
107 |     train(
108 |         num_epochs=3,
109 |         model=model,
110 |         optimizer=optimizer,
111 |         train_loader=train_loader,
112 |         val_loader=val_loader,
113 |         device=device
114 |     )
115 | 
116 |     end = time.time()
117 |     elapsed = end-start
118 |     print(f"Time elapsed {elapsed/60:.2f} min")
119 |     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
120 | 
121 |     #########################################
122 |     ### 4 Evaluation
123 |     #########################################
124 |     
125 |     with torch.no_grad():
126 |         model.eval()
127 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)
128 | 
129 |         for (features, targets) in test_loader:
130 |             features = features.to(device)
131 |             targets = targets.to(device)
132 |             outputs = model(features)
133 |             predicted_labels = torch.argmax(outputs, 1)
134 |             test_acc.update(predicted_labels, targets)
135 | 
136 |     print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/03_fabric-vit.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import lightning as L
  4 | from lightning import Fabric
  5 | import torch
  6 | import torch.nn.functional as F
  7 | import torchmetrics
  8 | from torchvision import transforms
  9 | from torchvision.models import vit_b_16
 10 | from torchvision.models import ViT_B_16_Weights
 11 | from watermark import watermark
 12 | 
 13 | from local_utilities import get_dataloaders_cifar10
 14 | 
 15 | 
 16 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 17 | 
 18 |     for epoch in range(num_epochs):
 19 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 20 | 
 21 |         model.train()
 22 |         for batch_idx, (features, targets) in enumerate(train_loader):
 23 |             model.train()
 24 | 
 25 |             ### FORWARD AND BACK PROP
 26 |             logits = model(features)
 27 |             loss = F.cross_entropy(logits, targets)
 28 | 
 29 |             optimizer.zero_grad()
 30 |             fabric.backward(loss)
 31 | 
 32 |             ### UPDATE MODEL PARAMETERS
 33 |             optimizer.step()
 34 | 
 35 |             ### LOGGING
 36 |             if not batch_idx % 300:
 37 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {loss:.4f}")
 38 | 
 39 |             model.eval()
 40 |             with torch.no_grad():
 41 |                 predicted_labels = torch.argmax(logits, 1)
 42 |                 train_acc.update(predicted_labels, targets)
 43 | 
 44 |         ### MORE LOGGING
 45 |         model.eval()
 46 |         with torch.no_grad():
 47 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 48 | 
 49 |             for (features, targets) in val_loader:
 50 |                 outputs = model(features)
 51 |                 predicted_labels = torch.argmax(outputs, 1)
 52 |                 val_acc.update(predicted_labels, targets)
 53 | 
 54 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 55 |             train_acc.reset(), val_acc.reset()
 56 | 
 57 | 
 58 | if __name__ == "__main__":
 59 | 
 60 |     print(watermark(packages="torch,lightning", python=True))
 61 |     print("Torch CUDA available?", torch.cuda.is_available())
 62 |     torch.set_float32_matmul_precision("medium")
 63 | 
 64 |     L.seed_everything(123)
 65 | 
 66 |     ##########################
 67 |     ### 1 Loading the Dataset
 68 |     ##########################
 69 |     train_transforms = transforms.Compose([transforms.Resize((224, 224)),
 70 |                                            #transforms.RandomCrop((224, 224)),
 71 |                                            transforms.ToTensor()])
 72 | 
 73 |     test_transforms = transforms.Compose([transforms.Resize((224, 224)),
 74 |                                           #transforms.CenterCrop((224, 224)),
 75 |                                           transforms.ToTensor()])
 76 | 
 77 |     train_loader, val_loader, test_loader = get_dataloaders_cifar10(
 78 |         batch_size=16,
 79 |         num_workers=4,
 80 |         train_transforms=train_transforms,
 81 |         test_transforms=test_transforms,
 82 |         validation_fraction=0.1,
 83 |         download=True
 84 |     )
 85 | 
 86 |     #########################################
 87 |     ### 2 Initializing the Model
 88 |     #########################################
 89 | 
 90 |     model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
 91 | 
 92 |     # replace output layer
 93 |     model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
 94 | 
 95 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
 96 | 
 97 | 
 98 |     #########################################
 99 |     ### 3 Launch Fabric
100 |     #########################################
101 | 
102 |     fabric = Fabric(accelerator="cuda", devices=1)
103 |     fabric.launch()
104 | 
105 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(
106 |         train_loader, val_loader, test_loader)
107 | 
108 |     model, optimizer = fabric.setup(model, optimizer)
109 | 
110 |     #########################################
111 |     ### 4 Finetuning
112 |     #########################################
113 | 
114 |     start = time.time()
115 |     train(
116 |         num_epochs=3,
117 |         model=model,
118 |         optimizer=optimizer,
119 |         train_loader=train_loader,
120 |         val_loader=val_loader,
121 |         fabric=fabric,
122 |     )
123 | 
124 |     end = time.time()
125 |     elapsed = end-start
126 |     fabric.print(f"Time elapsed {elapsed/60:.2f} min")
127 |     fabric.print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
128 | 
129 |     #########################################
130 |     ### 5 Evaluation
131 |     #########################################
132 | 
133 |     with torch.no_grad():
134 |         model.eval()
135 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
136 | 
137 |         for (features, targets) in test_loader:
138 |             outputs = model(features)
139 |             predicted_labels = torch.argmax(outputs, 1)
140 |             test_acc.update(predicted_labels, targets)
141 | 
142 |     fabric.print(f"Test accuracy {test_acc.compute()*100:.2f}%")
143 | 


--------------------------------------------------------------------------------
/04_fabric-vit-mixed-precision.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import lightning as L
  4 | from lightning import Fabric
  5 | import torch
  6 | import torch.nn.functional as F
  7 | import torchmetrics
  8 | from torchvision import transforms
  9 | from torchvision.models import vit_b_16
 10 | from torchvision.models import ViT_B_16_Weights
 11 | from watermark import watermark
 12 | 
 13 | from local_utilities import get_dataloaders_cifar10
 14 | 
 15 | 
 16 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 17 | 
 18 |     for epoch in range(num_epochs):
 19 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 20 | 
 21 |         model.train()
 22 |         for batch_idx, (features, targets) in enumerate(train_loader):
 23 |             model.train()
 24 | 
 25 |             ### FORWARD AND BACK PROP
 26 |             logits = model(features)
 27 |             loss = F.cross_entropy(logits, targets)
 28 | 
 29 |             optimizer.zero_grad()
 30 |             fabric.backward(loss)
 31 | 
 32 |             ### UPDATE MODEL PARAMETERS
 33 |             optimizer.step()
 34 | 
 35 |             ### LOGGING
 36 |             if not batch_idx % 300:
 37 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {loss:.4f}")
 38 | 
 39 |             model.eval()
 40 |             with torch.no_grad():
 41 |                 predicted_labels = torch.argmax(logits, 1)
 42 |                 train_acc.update(predicted_labels, targets)
 43 | 
 44 |         ### MORE LOGGING
 45 |         model.eval()
 46 |         with torch.no_grad():
 47 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 48 | 
 49 |             for (features, targets) in val_loader:
 50 |                 outputs = model(features)
 51 |                 predicted_labels = torch.argmax(outputs, 1)
 52 |                 val_acc.update(predicted_labels, targets)
 53 | 
 54 |             print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 55 |             train_acc.reset(), val_acc.reset()
 56 | 
 57 | 
 58 | if __name__ == "__main__":
 59 | 
 60 |     print(watermark(packages="torch,lightning", python=True))
 61 |     print("Torch CUDA available?", torch.cuda.is_available())
 62 |     torch.set_float32_matmul_precision("medium")
 63 | 
 64 |     L.seed_everything(123)
 65 | 
 66 |     ##########################
 67 |     ### 1 Loading the Dataset
 68 |     ##########################
 69 |     train_transforms = transforms.Compose([transforms.Resize((224, 224)),
 70 |                                            #transforms.RandomCrop((224, 224)),
 71 |                                            transforms.ToTensor()])
 72 | 
 73 |     test_transforms = transforms.Compose([transforms.Resize((224, 224)),
 74 |                                           #transforms.CenterCrop((224, 224)),
 75 |                                           transforms.ToTensor()])
 76 | 
 77 |     train_loader, val_loader, test_loader = get_dataloaders_cifar10(
 78 |         batch_size=16,
 79 |         num_workers=4,
 80 |         train_transforms=train_transforms,
 81 |         test_transforms=test_transforms,
 82 |         validation_fraction=0.1,
 83 |         download=True
 84 |     )
 85 | 
 86 |     #########################################
 87 |     ### 2 Initializing the Model
 88 |     #########################################
 89 | 
 90 |     model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
 91 | 
 92 |     # replace output layer
 93 |     model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
 94 | 
 95 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
 96 | 
 97 | 
 98 |     #########################################
 99 |     ### 3 Launch Fabric
100 |     #########################################
101 | 
102 |     fabric = Fabric(accelerator="cuda", devices=1, precision="bf16-mixed")
103 |     fabric.launch()
104 | 
105 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(
106 |         train_loader, val_loader, test_loader)
107 | 
108 |     model, optimizer = fabric.setup(model, optimizer)
109 | 
110 |     #########################################
111 |     ### 4 Finetuning
112 |     #########################################
113 | 
114 |     start = time.time()
115 |     train(
116 |         num_epochs=3,
117 |         model=model,
118 |         optimizer=optimizer,
119 |         train_loader=train_loader,
120 |         val_loader=val_loader,
121 |         fabric=fabric,
122 |     )
123 | 
124 |     end = time.time()
125 |     elapsed = end-start
126 |     fabric.print(f"Time elapsed {elapsed/60:.2f} min")
127 |     fabric.print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
128 | 
129 |     #########################################
130 |     ### 5 Evaluation
131 |     #########################################
132 | 
133 |     with torch.no_grad():
134 |         model.eval()
135 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
136 | 
137 |         for (features, targets) in test_loader:
138 |             outputs = model(features)
139 |             predicted_labels = torch.argmax(outputs, 1)
140 |             test_acc.update(predicted_labels, targets)
141 | 
142 |     fabric.print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/05_fabric-vit-mixed-ddp.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import lightning as L
  4 | from lightning import Fabric
  5 | import torch
  6 | import torch.nn.functional as F
  7 | import torchmetrics
  8 | from torchvision import transforms
  9 | from torchvision.models import vit_b_16
 10 | from torchvision.models import ViT_B_16_Weights
 11 | from watermark import watermark
 12 | 
 13 | from local_utilities import get_dataloaders_cifar10
 14 | 
 15 | 
 16 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 17 | 
 18 |     for epoch in range(num_epochs):
 19 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 20 | 
 21 |         model.train()
 22 |         for batch_idx, (features, targets) in enumerate(train_loader):
 23 |             model.train()
 24 | 
 25 |             ### FORWARD AND BACK PROP
 26 |             logits = model(features)
 27 |             loss = F.cross_entropy(logits, targets)
 28 | 
 29 |             optimizer.zero_grad()
 30 |             fabric.backward(loss)
 31 | 
 32 |             ### UPDATE MODEL PARAMETERS
 33 |             optimizer.step()
 34 | 
 35 |             ### LOGGING
 36 |             if not batch_idx % 300:
 37 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {loss:.4f}")
 38 | 
 39 |             model.eval()
 40 |             with torch.no_grad():
 41 |                 predicted_labels = torch.argmax(logits, 1)
 42 |                 train_acc.update(predicted_labels, targets)
 43 | 
 44 |         ### MORE LOGGING
 45 |         model.eval()
 46 |         with torch.no_grad():
 47 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 48 | 
 49 |             for (features, targets) in val_loader:
 50 |                 outputs = model(features)
 51 |                 predicted_labels = torch.argmax(outputs, 1)
 52 |                 val_acc.update(predicted_labels, targets)
 53 | 
 54 |             fabric.print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 55 |             train_acc.reset(), val_acc.reset()
 56 | 
 57 | 
 58 | if __name__ == "__main__":
 59 | 
 60 |     print(watermark(packages="torch,lightning", python=True))
 61 |     print("Torch CUDA available?", torch.cuda.is_available())
 62 |     torch.set_float32_matmul_precision("medium")
 63 | 
 64 |     L.seed_everything(123)
 65 | 
 66 |     ##########################
 67 |     ### 1 Loading the Dataset
 68 |     ##########################
 69 |     train_transforms = transforms.Compose([transforms.Resize((224, 224)),
 70 |                                            #transforms.RandomCrop((224, 224)),
 71 |                                            transforms.ToTensor()])
 72 | 
 73 |     test_transforms = transforms.Compose([transforms.Resize((224, 224)),
 74 |                                           #transforms.CenterCrop((224, 224)),
 75 |                                           transforms.ToTensor()])
 76 | 
 77 |     train_loader, val_loader, test_loader = get_dataloaders_cifar10(
 78 |         batch_size=16,
 79 |         num_workers=4,
 80 |         train_transforms=train_transforms,
 81 |         test_transforms=test_transforms,
 82 |         validation_fraction=0.1,
 83 |         download=True
 84 |     )
 85 | 
 86 |     #########################################
 87 |     ### 2 Initializing the Model
 88 |     #########################################
 89 | 
 90 |     model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
 91 | 
 92 |     # replace output layer
 93 |     model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
 94 | 
 95 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
 96 | 
 97 | 
 98 |     #########################################
 99 |     ### 3 Launch Fabric
100 |     #########################################
101 | 
102 |     fabric = Fabric(accelerator="cuda", precision="bf16-mixed", devices=4, strategy="ddp")
103 |     fabric.launch()
104 | 
105 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(
106 |         train_loader, val_loader, test_loader)
107 | 
108 |     model, optimizer = fabric.setup(model, optimizer)
109 | 
110 |     #########################################
111 |     ### 4 Finetuning
112 |     #########################################
113 | 
114 |     start = time.time()
115 |     train(
116 |         num_epochs=3,
117 |         model=model,
118 |         optimizer=optimizer,
119 |         train_loader=train_loader,
120 |         val_loader=val_loader,
121 |         fabric=fabric,
122 |     )
123 | 
124 |     end = time.time()
125 |     elapsed = end-start
126 |     fabric.print(f"Time elapsed {elapsed/60:.2f} min")
127 |     fabric.print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
128 | 
129 |     #########################################
130 |     ### 5 Evaluation
131 |     #########################################
132 | 
133 |     with torch.no_grad():
134 |         model.eval()
135 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
136 | 
137 |         for (features, targets) in test_loader:
138 |             outputs = model(features)
139 |             predicted_labels = torch.argmax(outputs, 1)
140 |             test_acc.update(predicted_labels, targets)
141 | 
142 |     fabric.print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/06_fabric-vit-mixed-fsdp.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import lightning as L
  4 | from lightning import Fabric
  5 | import torch
  6 | import torch.nn.functional as F
  7 | import torchmetrics
  8 | from torchvision import transforms
  9 | from torchvision.models import vit_b_16
 10 | from torchvision.models import ViT_B_16_Weights
 11 | from watermark import watermark
 12 | 
 13 | from local_utilities import get_dataloaders_cifar10
 14 | 
 15 | 
 16 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric):
 17 | 
 18 |     for epoch in range(num_epochs):
 19 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 20 | 
 21 |         model.train()
 22 |         for batch_idx, (features, targets) in enumerate(train_loader):
 23 |             model.train()
 24 | 
 25 |             ### FORWARD AND BACK PROP
 26 |             logits = model(features)
 27 |             loss = F.cross_entropy(logits, targets)
 28 | 
 29 |             optimizer.zero_grad()
 30 |             fabric.backward(loss)
 31 | 
 32 |             ### UPDATE MODEL PARAMETERS
 33 |             optimizer.step()
 34 | 
 35 |             ### LOGGING
 36 |             if not batch_idx % 300:
 37 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {loss:.4f}")
 38 | 
 39 |             model.eval()
 40 |             with torch.no_grad():
 41 |                 predicted_labels = torch.argmax(logits, 1)
 42 |                 train_acc.update(predicted_labels, targets)
 43 | 
 44 |         ### MORE LOGGING
 45 |         model.eval()
 46 |         with torch.no_grad():
 47 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 48 | 
 49 |             for (features, targets) in val_loader:
 50 |                 outputs = model(features)
 51 |                 predicted_labels = torch.argmax(outputs, 1)
 52 |                 val_acc.update(predicted_labels, targets)
 53 | 
 54 |             fabric.print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 55 |             train_acc.reset(), val_acc.reset()
 56 | 
 57 | 
 58 | if __name__ == "__main__":
 59 | 
 60 |     print(watermark(packages="torch,lightning", python=True))
 61 |     print("Torch CUDA available?", torch.cuda.is_available())
 62 |     torch.set_float32_matmul_precision("medium")
 63 | 
 64 |     L.seed_everything(123)
 65 | 
 66 |     ##########################
 67 |     ### 1 Loading the Dataset
 68 |     ##########################
 69 |     train_transforms = transforms.Compose([transforms.Resize((224, 224)),
 70 |                                            #transforms.RandomCrop((224, 224)),
 71 |                                            transforms.ToTensor()])
 72 | 
 73 |     test_transforms = transforms.Compose([transforms.Resize((224, 224)),
 74 |                                           #transforms.CenterCrop((224, 224)),
 75 |                                           transforms.ToTensor()])
 76 | 
 77 |     train_loader, val_loader, test_loader = get_dataloaders_cifar10(
 78 |         batch_size=16,
 79 |         num_workers=4,
 80 |         train_transforms=train_transforms,
 81 |         test_transforms=test_transforms,
 82 |         validation_fraction=0.1,
 83 |         download=True
 84 |     )
 85 | 
 86 |     #########################################
 87 |     ### 2 Initializing the Model
 88 |     #########################################
 89 | 
 90 |     model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
 91 | 
 92 |     # replace output layer
 93 |     model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
 94 | 
 95 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
 96 | 
 97 | 
 98 |     #########################################
 99 |     ### 3 Launch Fabric
100 |     #########################################
101 | 
102 |     fabric = Fabric(accelerator="cuda", precision="bf16-mixed", devices=4, strategy="fsdp")
103 |     fabric.launch()
104 | 
105 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(
106 |         train_loader, val_loader, test_loader)
107 | 
108 |     model, optimizer = fabric.setup(model, optimizer)
109 | 
110 |     #########################################
111 |     ### 4 Finetuning
112 |     #########################################
113 | 
114 |     start = time.time()
115 |     train(
116 |         num_epochs=3,
117 |         model=model,
118 |         optimizer=optimizer,
119 |         train_loader=train_loader,
120 |         val_loader=val_loader,
121 |         fabric=fabric,
122 |     )
123 | 
124 |     end = time.time()
125 |     elapsed = end-start
126 |     fabric.print(f"Time elapsed {elapsed/60:.2f} min")
127 |     fabric.print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
128 | 
129 |     #########################################
130 |     ### 5 Evaluation
131 |     #########################################
132 | 
133 |     with torch.no_grad():
134 |         model.eval()
135 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
136 | 
137 |         for (features, targets) in test_loader:
138 |             outputs = model(features)
139 |             predicted_labels = torch.argmax(outputs, 1)
140 |             test_acc.update(predicted_labels, targets)
141 | 
142 |     fabric.print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/07_fabric-vit-mixed-fsdp-with-scheduler.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import lightning as L
  4 | from lightning import Fabric
  5 | import torch
  6 | import torch.nn.functional as F
  7 | from torch.optim.lr_scheduler import ExponentialLR
  8 | import torchmetrics
  9 | from torchvision import transforms
 10 | from torchvision.models import vit_b_16
 11 | from torchvision.models import ViT_B_16_Weights
 12 | from watermark import watermark
 13 | 
 14 | from local_utilities import get_dataloaders_cifar10
 15 | 
 16 | 
 17 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric, scheduler):
 18 | 
 19 |     for epoch in range(num_epochs):
 20 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 21 | 
 22 |         model.train()
 23 |         for batch_idx, (features, targets) in enumerate(train_loader):
 24 |             model.train()
 25 | 
 26 |             ### FORWARD AND BACK PROP
 27 |             logits = model(features)
 28 |             loss = F.cross_entropy(logits, targets)
 29 | 
 30 |             optimizer.zero_grad()
 31 |             fabric.backward(loss)
 32 | 
 33 |             ### UPDATE MODEL PARAMETERS
 34 |             optimizer.step()
 35 | 
 36 |             ### LOGGING
 37 |             if not batch_idx % 300:
 38 |                 fabric.print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {loss:.4f}")
 39 | 
 40 |             model.eval()
 41 |             with torch.no_grad():
 42 |                 predicted_labels = torch.argmax(logits, 1)
 43 |                 train_acc.update(predicted_labels, targets)
 44 |         scheduler.step()
 45 | 
 46 |         ### MORE LOGGING
 47 |         model.eval()
 48 |         with torch.no_grad():
 49 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 50 | 
 51 |             for (features, targets) in val_loader:
 52 |                 outputs = model(features)
 53 |                 predicted_labels = torch.argmax(outputs, 1)
 54 |                 val_acc.update(predicted_labels, targets)
 55 | 
 56 |             fabric.print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 57 |             train_acc.reset(), val_acc.reset()
 58 | 
 59 | 
 60 | if __name__ == "__main__":
 61 | 
 62 |     torch.set_float32_matmul_precision("medium")
 63 | 
 64 |     L.seed_everything(123)
 65 | 
 66 |     ##########################
 67 |     ### 1 Loading the Dataset
 68 |     ##########################
 69 |     train_transforms = transforms.Compose([transforms.Resize((224, 224)),
 70 |                                            #transforms.RandomCrop((224, 224)),
 71 |                                            transforms.ToTensor()])
 72 | 
 73 |     test_transforms = transforms.Compose([transforms.Resize((224, 224)),
 74 |                                           #transforms.CenterCrop((224, 224)),
 75 |                                           transforms.ToTensor()])
 76 | 
 77 |     train_loader, val_loader, test_loader = get_dataloaders_cifar10(
 78 |         batch_size=16,
 79 |         num_workers=4,
 80 |         train_transforms=train_transforms,
 81 |         test_transforms=test_transforms,
 82 |         validation_fraction=0.1,
 83 |         download=True
 84 |     )
 85 | 
 86 |     #########################################
 87 |     ### 2 Initializing the Model
 88 |     #########################################
 89 | 
 90 |     model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
 91 | 
 92 |     # replace output layer
 93 |     model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
 94 | 
 95 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
 96 | 
 97 | 
 98 |     #########################################
 99 |     ### 3 Launch Fabric
100 |     #########################################
101 | 
102 |     fabric = Fabric(accelerator="cuda", precision="bf16-mixed", devices=4, strategy="fsdp")
103 |     fabric.launch()
104 | 
105 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(
106 |         train_loader, val_loader, test_loader)
107 |     scheduler = ExponentialLR(optimizer, gamma=0.9)
108 |     model, optimizer = fabric.setup(model, optimizer)
109 | 
110 |     #########################################
111 |     ### 4 Finetuning
112 |     #########################################
113 | 
114 |     start = time.time()
115 |     train(
116 |         num_epochs=3,
117 |         model=model,
118 |         optimizer=optimizer,
119 |         train_loader=train_loader,
120 |         val_loader=val_loader,
121 |         fabric=fabric,
122 |         scheduler=scheduler
123 |     )
124 | 
125 |     end = time.time()
126 |     elapsed = end-start
127 |     fabric.print(f"Time elapsed {elapsed/60:.2f} min")
128 |     fabric.print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
129 | 
130 |     #########################################
131 |     ### 5 Evaluation
132 |     #########################################
133 | 
134 |     with torch.no_grad():
135 |         model.eval()
136 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
137 | 
138 |         for (features, targets) in test_loader:
139 |             outputs = model(features)
140 |             predicted_labels = torch.argmax(outputs, 1)
141 |             test_acc.update(predicted_labels, targets)
142 | 
143 |     fabric.print(f"Test accuracy {test_acc.compute()*100:.2f}%")


--------------------------------------------------------------------------------
/08_saving-and-loading/08-1-train.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os
  3 | import time
  4 | 
  5 | import lightning as L
  6 | from lightning import Fabric
  7 | from lightning.fabric.strategies import FSDPStrategy
  8 | import torch
  9 | import torch.nn.functional as F
 10 | from torch.optim.lr_scheduler import ExponentialLR
 11 | import torchmetrics
 12 | from torchvision import transforms
 13 | from torchvision.models import vit_b_16
 14 | from torchvision.models import ViT_B_16_Weights
 15 | from watermark import watermark
 16 | 
 17 | # Get the parent directory of the current script
 18 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 19 | # Add the parent directory to sys.path
 20 | sys.path.append(parent_dir)
 21 | from local_utilities import get_dataloaders_cifar10
 22 | 
 23 | 
 24 | def train(num_epochs, model, optimizer, train_loader, val_loader, fabric, scheduler):
 25 | 
 26 |     for epoch in range(num_epochs):
 27 |         train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 28 | 
 29 |         model.train()
 30 |         for batch_idx, (features, targets) in enumerate(train_loader):
 31 |             model.train()
 32 | 
 33 |             ### FORWARD AND BACK PROP
 34 |             logits = model(features)
 35 |             loss = F.cross_entropy(logits, targets)
 36 | 
 37 |             optimizer.zero_grad()
 38 |             fabric.backward(loss)
 39 | 
 40 |             ### UPDATE MODEL PARAMETERS
 41 |             optimizer.step()
 42 | 
 43 |             ### LOGGING
 44 |             if not batch_idx % 300:
 45 |                 print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Batch {batch_idx:04d}/{len(train_loader):04d} | Loss: {loss:.4f}")
 46 | 
 47 |             model.eval()
 48 |             with torch.no_grad():
 49 |                 predicted_labels = torch.argmax(logits, 1)
 50 |                 train_acc.update(predicted_labels, targets)
 51 |             break ## TODO: REMOVE
 52 |         scheduler.step()
 53 | 
 54 |         ### MORE LOGGING
 55 |         model.eval()
 56 |         with torch.no_grad():
 57 |             val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
 58 | 
 59 |             for (features, targets) in val_loader:
 60 |                 outputs = model(features)
 61 |                 predicted_labels = torch.argmax(outputs, 1)
 62 |                 val_acc.update(predicted_labels, targets)
 63 | 
 64 |             fabric.print(f"Epoch: {epoch+1:04d}/{num_epochs:04d} | Train acc.: {train_acc.compute()*100:.2f}% | Val acc.: {val_acc.compute()*100:.2f}%")
 65 |             train_acc.reset(), val_acc.reset()
 66 | 
 67 | 
 68 | if __name__ == "__main__":
 69 | 
 70 |     print(watermark(packages="torch,lightning", python=True))
 71 |     print("Torch CUDA available?", torch.cuda.is_available())
 72 |     torch.set_float32_matmul_precision("medium")
 73 | 
 74 |     L.seed_everything(123)
 75 | 
 76 |     ##########################
 77 |     ### 1 Loading the Dataset
 78 |     ##########################
 79 |     train_transforms = transforms.Compose([transforms.Resize((224, 224)),
 80 |                                            #transforms.RandomCrop((224, 224)),
 81 |                                            transforms.ToTensor()])
 82 | 
 83 |     test_transforms = transforms.Compose([transforms.Resize((224, 224)),
 84 |                                           #transforms.CenterCrop((224, 224)),
 85 |                                           transforms.ToTensor()])
 86 | 
 87 |     train_loader, val_loader, test_loader = get_dataloaders_cifar10(
 88 |         batch_size=16,
 89 |         num_workers=4,
 90 |         train_transforms=train_transforms,
 91 |         test_transforms=test_transforms,
 92 |         validation_fraction=0.1,
 93 |         download=True
 94 |     )
 95 | 
 96 |     #########################################
 97 |     ### 2 Initializing the Model
 98 |     #########################################
 99 | 
100 |     model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
101 | 
102 |     # replace output layer
103 |     model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
104 | 
105 |     optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
106 |     scheduler = ExponentialLR(optimizer, gamma=0.9)
107 | 
108 | 
109 |     #########################################
110 |     ### 3 Launch Fabric
111 |     #########################################
112 | 
113 |     strategy = FSDPStrategy(state_dict_type="full")
114 |     fabric = Fabric(accelerator="cuda", precision="bf16-mixed", devices=4, strategy=strategy)
115 |     fabric.launch()
116 | 
117 |     train_loader, val_loader, test_loader = fabric.setup_dataloaders(
118 |         train_loader, val_loader, test_loader)
119 | 
120 |     model, optimizer = fabric.setup(model, optimizer)
121 | 
122 |     #########################################
123 |     ### 4 Finetuning
124 |     #########################################
125 | 
126 |     start = time.time()
127 |     train(
128 |         num_epochs=3,
129 |         model=model,
130 |         optimizer=optimizer,
131 |         train_loader=train_loader,
132 |         val_loader=val_loader,
133 |         fabric=fabric,
134 |         scheduler=scheduler
135 |     )
136 | 
137 |     end = time.time()
138 |     elapsed = end-start
139 |     fabric.print(f"Time elapsed {elapsed/60:.2f} min")
140 |     fabric.print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
141 | 
142 |     #########################################
143 |     ### 5 Evaluation
144 |     #########################################
145 | 
146 |     with torch.no_grad():
147 |         model.eval()
148 |         test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
149 | 
150 |         for (features, targets) in test_loader:
151 |             outputs = model(features)
152 |             predicted_labels = torch.argmax(outputs, 1)
153 |             test_acc.update(predicted_labels, targets)
154 | 
155 |     fabric.print(f"Test accuracy {test_acc.compute()*100:.2f}%")
156 | 
157 | 
158 |     #########################################
159 |     ### 6 Save the model
160 |     #########################################
161 | 
162 | 
163 |     state = {
164 |         "model": model,
165 |         "optimizer": optimizer,
166 |         "anything-else-you-want-to-save": 123
167 |     }
168 | 
169 |     fabric.save("checkpoint.ckpt", state)


--------------------------------------------------------------------------------
/08_saving-and-loading/08-2-load.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | 
 4 | from lightning import Fabric
 5 | import torch
 6 | import torchmetrics
 7 | from torchvision import transforms
 8 | from torchvision.models import vit_b_16
 9 | 
10 | # Get the parent directory of the current script
11 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
12 | # Add the parent directory to sys.path
13 | sys.path.append(parent_dir)
14 | from local_utilities import get_dataloaders_cifar10
15 | 
16 | 
17 | test_transforms = transforms.Compose(
18 |     [transforms.Resize((224, 224)), transforms.ToTensor()])
19 | 
20 | _, _, test_loader = get_dataloaders_cifar10(
21 |     batch_size=16,
22 |     num_workers=4,
23 |     train_transforms=None,
24 |     test_transforms=test_transforms,
25 |     validation_fraction=0.1,
26 |     download=True
27 | )
28 | 
29 | model = vit_b_16(weights=None)
30 | model.heads.head = torch.nn.Linear(in_features=768, out_features=10)
31 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
32 | 
33 | fabric = Fabric(accelerator="cuda", precision="bf16-mixed", devices=1)
34 | fabric.launch()
35 | test_loader = fabric.setup_dataloaders(test_loader)
36 | model, optimizer = fabric.setup(model, optimizer)
37 | 
38 | state = {
39 |     "model": model,
40 |     "optimizer": optimizer,
41 |     "anything-else-you-want-to-save": None
42 | }
43 | 
44 | fabric.load("checkpoint.ckpt", state)
45 | 
46 | additional_info = state["anything-else-you-want-to-save"]
47 | print("anything-else-you-want-to-save:", additional_info)
48 | 
49 | with torch.no_grad():
50 |     model.eval()
51 |     test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(fabric.device)
52 | 
53 |     for (features, targets) in test_loader:
54 |         outputs = model(features)
55 |         predicted_labels = torch.argmax(outputs, 1)
56 |         test_acc.update(predicted_labels, targets)
57 | 
58 | fabric.print(f"Test accuracy {test_acc.compute()*100:.2f}%")
59 | 


--------------------------------------------------------------------------------
/08_saving-and-loading/README.md:
--------------------------------------------------------------------------------
  1 | # Saving and Loading Models in Fabric
  2 | 
  3 | 
  4 | 
  5 | There are several ways to save and load modules in Fabric. 
  6 | 
  7 | 
  8 | 
  9 | The simplest way to load an existing, saved PyTorch model is to use the `load_raw` method:
 10 | 
 11 | ```python
 12 | fabric = Fabric()
 13 | model = MyModel()
 14 | 
 15 | # model.pt is a regular PyTorch state_dict saved via torch.save
 16 | fabric.load_raw("path/to/model.pt", model)
 17 | ```
 18 | 
 19 | However, Fabric also has `save` and `load` functions that can store and restore additional information in a checkpoint file, for example, the optimizer state, which makes them incredibly useful.
 20 | 
 21 | Below, I will illustrate 3 common scenarios for saving and loading models with Fabric:
 22 | 
 23 | 
 24 | &nbsp;
 25 | ## 1) Train on 1 GPU, load on 1 GPU
 26 | 
 27 | Suppose you saved a model after training as follows:
 28 | 
 29 | ```python
 30 | # training.py file
 31 | 
 32 | fabric = Fabric(accelerator="cuda", precision="bf16-mixed", devices=1)
 33 | fabric.launch()
 34 | # ...
 35 | state = {
 36 |     "model": model,
 37 |     "optimizer": optimizer,
 38 |     "anything-else-you-want-to-save": 123
 39 | }
 40 | fabric.save("checkpoint.ckpt", state)
 41 | ```
 42 | 
 43 | Then you can load it in a separate file as follows:
 44 | 
 45 | ```python
 46 | # loading.py file
 47 | fabric = Fabric(accelerator="cuda", precision="bf16-mixed", devices=1)
 48 | fabric.launch()
 49 | model, optimizer = fabric.setup(model, optimizer)
 50 | 
 51 | state = {
 52 |     "model": model,
 53 |     "optimizer": optimizer,
 54 |     "anything-else-you-want-to-save": None
 55 | }
 56 | 
 57 | fabric.load("checkpoint.ckpt", state)
 58 | ```
 59 | 
 60 | 
 61 | &nbsp;
 62 | ## 2) Train on 4 GPUs, load on 4 GPUs
 63 | 
 64 | Suppose you trained a model in a distributed fashion and saved it as follows:
 65 | 
 66 | ```python
 67 | # training.py file
 68 | 
 69 | fabric = Fabric(
 70 |     accelerator="cuda", precision="bf16-mixed",
 71 |     devices=4, strategy="fsdp"
 72 | )
 73 | fabric.launch()
 74 | # ...
 75 | state = {
 76 |     "model": model,
 77 |     "optimizer": optimizer,
 78 |     "anything-else-you-want-to-save": 123
 79 | }
 80 | fabric.save("checkpoint.ckpt", state)
 81 | ```
 82 | 
 83 | This will automatically shard the checkpoint file as well; here it will create a checkpoint folder with 4 smaller checkpoint chunks since we used 4 devices.
 84 | 
 85 | Then you can load it as follows:
 86 | 
 87 | ```python
 88 | # loading.py file
 89 | fabric = Fabric(
 90 |     accelerator="cuda", precision="bf16-mixed",
 91 |     devices=4, strategy="fsdp"
 92 | )
 93 | fabric.launch()
 94 | model, optimizer = fabric.setup(model, optimizer)
 95 | 
 96 | state = {
 97 |     "model": model,
 98 |     "optimizer": optimizer,
 99 |     "anything-else-you-want-to-save": None
100 | }
101 | 
102 | fabric.load("checkpoint.ckpt", state)
103 | ```
104 | 
105 | &nbsp;
106 | ## 3) Train on 4 GPUs, load on 1 GPU
107 | 
108 | Perhaps the most common scenario is to train a model on multiple GPUs, and then use the model on a single GPU for inference later. As mentioned above, Fabric saves distributed checkpoint chunks by default. You can change this behavior and create a single checkpoint file via the `state_dict_type="full"` setting shown below:
109 | 
110 | 
111 | 
112 | ```python
113 | # training.py file
114 | strategy = FSDPStrategy(state_dict_type="full")
115 | fabric = Fabric(
116 |     accelerator="cuda", precision="bf16-mixed",
117 |     devices=4, strategy=strategy
118 | )
119 | fabric.launch()
120 | # ...
121 | state = {
122 |     "model": model,
123 |     "optimizer": optimizer,
124 |     "anything-else-you-want-to-save": 123
125 | }
126 | fabric.save("checkpoint.ckpt", state)
127 | ```
128 | 
129 | Then you can load the checkpoint as follows:
130 | 
131 | ```python
132 | # loading.py file
133 | fabric = Fabric(
134 |     accelerator="cuda", precision="bf16-mixed", devices=1)
135 | fabric.launch()
136 | model, optimizer = fabric.setup(model, optimizer)
137 | 
138 | state = {
139 |     "model": model,
140 |     "optimizer": optimizer,
141 |     "anything-else-you-want-to-save": None
142 | }
143 | 
144 | fabric.load("checkpoint.ckpt", state)
145 | ```
146 | 
147 | If you want to give this a try, this third way is implemented in the [08-1-train.py](08-1-train.py) and [08-2-load.py](08-2-load.py) scripts.


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2023 Sebastian Raschka
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Experiments for CVPR 2023 Talk
2 | 
3 | ## Scaling PyTorch Model Training With Minimal Code Changes
4 | 
5 | In this short tutorial, I'll show you how to accelerate the training of LLMs and Vision Transformers with minimal code changes using open-source libraries.


--------------------------------------------------------------------------------
/cvpr-talk__slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rasbt/cvpr2023/6d0753f02f0f471c2e59e77ae8ddfb9f17d0a2cf/cvpr-talk__slides.pdf


--------------------------------------------------------------------------------
/local_utilities.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.utils.data import sampler
 3 | from torchvision import datasets
 4 | from torch.utils.data import DataLoader
 5 | from torch.utils.data import SubsetRandomSampler
 6 | from torchvision import transforms
 7 | 
 8 | 
 9 | def get_dataloaders_cifar10(batch_size, num_workers=0,
10 |                             validation_fraction=None,
11 |                             train_transforms=None,
12 |                             test_transforms=None,
13 |                             download=True):
14 | 
15 |     if train_transforms is None:
16 |         train_transforms = transforms.ToTensor()
17 | 
18 |     if test_transforms is None:
19 |         test_transforms = transforms.ToTensor()
20 | 
21 |     train_dataset = datasets.CIFAR10(root='data',
22 |                                      train=True,
23 |                                      transform=train_transforms,
24 |                                      download=download)
25 | 
26 |     valid_dataset = datasets.CIFAR10(root='data',
27 |                                      train=True,
28 |                                      transform=test_transforms)
29 | 
30 |     test_dataset = datasets.CIFAR10(root='data',
31 |                                     train=False,
32 |                                     transform=test_transforms)
33 | 
34 |     if validation_fraction is not None:
35 |         num = int(validation_fraction * 50000)
36 |         train_indices = range(0, 50000 - num)
37 |         valid_indices = range(50000 - num, 50000)
38 | 
39 |         train_sampler = SubsetRandomSampler(train_indices)
40 |         valid_sampler = SubsetRandomSampler(valid_indices)
41 | 
42 |         valid_loader = DataLoader(dataset=valid_dataset,
43 |                                   batch_size=batch_size,
44 |                                   num_workers=num_workers,
45 |                                   sampler=valid_sampler)
46 | 
47 |         train_loader = DataLoader(dataset=train_dataset,
48 |                                   batch_size=batch_size,
49 |                                   num_workers=num_workers,
50 |                                   drop_last=True,
51 |                                   sampler=train_sampler)
52 | 
53 |     else:
54 |         train_loader = DataLoader(dataset=train_dataset,
55 |                                   batch_size=batch_size,
56 |                                   num_workers=num_workers,
57 |                                   drop_last=True,
58 |                                   shuffle=True)
59 | 
60 |     test_loader = DataLoader(dataset=test_dataset,
61 |                              batch_size=batch_size,
62 |                              num_workers=num_workers,
63 |                              shuffle=False)
64 | 
65 |     if validation_fraction is None:
66 |         return train_loader, test_loader
67 |     else:
68 |         return train_loader, valid_loader, test_loader
69 | 


--------------------------------------------------------------------------------
/logs.md:
--------------------------------------------------------------------------------
  1 | # 00_pytorch-vit-random-init.py (training from scratch, not pretrained)
  2 | 
  3 | Train a plain Vision transformer from scratch.
  4 | 
  5 | 
  6 | ```
  7 | Python version       : 3.9.16
  8 | IPython version      : 8.12.0
  9 | 
 10 | torch    : 2.1.0.dev20230612+cu118
 11 | lightning: 2.1.0.dev0
 12 | 
 13 | Epoch: 0001/0010 | Batch 0000/2812 | Loss: 2.5971
 14 | Epoch: 0001/0010 | Batch 0300/2812 | Loss: 1.9497
 15 | Epoch: 0001/0010 | Batch 0600/2812 | Loss: 1.6241
 16 | Epoch: 0001/0010 | Batch 0900/2812 | Loss: 2.0443
 17 | Epoch: 0001/0010 | Batch 1200/2812 | Loss: 1.9395
 18 | Epoch: 0001/0010 | Batch 1500/2812 | Loss: 1.6568
 19 | Epoch: 0001/0010 | Batch 1800/2812 | Loss: 1.7207
 20 | Epoch: 0001/0010 | Batch 2100/2812 | Loss: 1.6199
 21 | Epoch: 0001/0010 | Batch 2400/2812 | Loss: 1.1766
 22 | Epoch: 0001/0010 | Batch 2700/2812 | Loss: 1.5693
 23 | Epoch: 0001/0010 | Train acc.: 34.81% | Val acc.: 47.22%
 24 | Epoch: 0002/0010 | Batch 0000/2812 | Loss: 1.0045
 25 | Epoch: 0002/0010 | Batch 0300/2812 | Loss: 1.5517
 26 | Epoch: 0002/0010 | Batch 0600/2812 | Loss: 1.5892
 27 | Epoch: 0002/0010 | Batch 0900/2812 | Loss: 1.3893
 28 | Epoch: 0002/0010 | Batch 1200/2812 | Loss: 1.2300
 29 | Epoch: 0002/0010 | Batch 1500/2812 | Loss: 0.9359
 30 | Epoch: 0002/0010 | Batch 1800/2812 | Loss: 1.4835
 31 | Epoch: 0002/0010 | Batch 2100/2812 | Loss: 1.7854
 32 | Epoch: 0002/0010 | Batch 2400/2812 | Loss: 0.9467
 33 | Epoch: 0002/0010 | Batch 2700/2812 | Loss: 1.7041
 34 | Epoch: 0002/0010 | Train acc.: 50.24% | Val acc.: 52.48%
 35 | Epoch: 0003/0010 | Batch 0000/2812 | Loss: 1.3514
 36 | Epoch: 0003/0010 | Batch 0300/2812 | Loss: 1.2390
 37 | Epoch: 0003/0010 | Batch 0600/2812 | Loss: 1.0620
 38 | Epoch: 0003/0010 | Batch 0900/2812 | Loss: 1.3221
 39 | Epoch: 0003/0010 | Batch 1200/2812 | Loss: 0.8957
 40 | Epoch: 0003/0010 | Batch 1500/2812 | Loss: 0.7106
 41 | Epoch: 0003/0010 | Batch 1800/2812 | Loss: 1.0991
 42 | Epoch: 0003/0010 | Batch 2100/2812 | Loss: 1.1103
 43 | Epoch: 0003/0010 | Batch 2400/2812 | Loss: 1.1114
 44 | Epoch: 0003/0010 | Batch 2700/2812 | Loss: 1.2009
 45 | Epoch: 0003/0010 | Train acc.: 55.98% | Val acc.: 55.98%
 46 | Epoch: 0004/0010 | Batch 0000/2812 | Loss: 1.3007
 47 | Epoch: 0004/0010 | Batch 0300/2812 | Loss: 1.0196
 48 | Epoch: 0004/0010 | Batch 0600/2812 | Loss: 0.9170
 49 | Epoch: 0004/0010 | Batch 0900/2812 | Loss: 1.1119
 50 | Epoch: 0004/0010 | Batch 1200/2812 | Loss: 1.4091
 51 | Epoch: 0004/0010 | Batch 1500/2812 | Loss: 0.9497
 52 | Epoch: 0004/0010 | Batch 1800/2812 | Loss: 1.1372
 53 | Epoch: 0004/0010 | Batch 2100/2812 | Loss: 1.5214
 54 | Epoch: 0004/0010 | Batch 2400/2812 | Loss: 1.1381
 55 | Epoch: 0004/0010 | Batch 2700/2812 | Loss: 1.1540
 56 | Epoch: 0004/0010 | Train acc.: 59.33% | Val acc.: 59.14%
 57 | Epoch: 0005/0010 | Batch 0000/2812 | Loss: 1.3431
 58 | Epoch: 0005/0010 | Batch 0300/2812 | Loss: 1.5235
 59 | Epoch: 0005/0010 | Batch 0600/2812 | Loss: 0.6558
 60 | Epoch: 0005/0010 | Batch 0900/2812 | Loss: 1.3207
 61 | Epoch: 0005/0010 | Batch 1200/2812 | Loss: 1.1548
 62 | Epoch: 0005/0010 | Batch 1500/2812 | Loss: 0.7654
 63 | Epoch: 0005/0010 | Batch 1800/2812 | Loss: 0.7152
 64 | Epoch: 0005/0010 | Batch 2100/2812 | Loss: 1.2607
 65 | Epoch: 0005/0010 | Batch 2400/2812 | Loss: 0.9752
 66 | Epoch: 0005/0010 | Batch 2700/2812 | Loss: 1.2946
 67 | Epoch: 0005/0010 | Train acc.: 62.04% | Val acc.: 60.06%
 68 | Epoch: 0006/0010 | Batch 0000/2812 | Loss: 0.9951
 69 | Epoch: 0006/0010 | Batch 0300/2812 | Loss: 1.1221
 70 | Epoch: 0006/0010 | Batch 0600/2812 | Loss: 0.4458
 71 | Epoch: 0006/0010 | Batch 0900/2812 | Loss: 1.2516
 72 | Epoch: 0006/0010 | Batch 1200/2812 | Loss: 0.6722
 73 | Epoch: 0006/0010 | Batch 1500/2812 | Loss: 1.0663
 74 | Epoch: 0006/0010 | Batch 1800/2812 | Loss: 0.9296
 75 | Epoch: 0006/0010 | Batch 2100/2812 | Loss: 0.7549
 76 | Epoch: 0006/0010 | Batch 2400/2812 | Loss: 0.6679
 77 | Epoch: 0006/0010 | Batch 2700/2812 | Loss: 1.2472
 78 | Epoch: 0006/0010 | Train acc.: 64.22% | Val acc.: 61.80%
 79 | Epoch: 0007/0010 | Batch 0000/2812 | Loss: 1.2649
 80 | Epoch: 0007/0010 | Batch 0300/2812 | Loss: 0.5699
 81 | Epoch: 0007/0010 | Batch 0600/2812 | Loss: 1.1111
 82 | Epoch: 0007/0010 | Batch 0900/2812 | Loss: 1.4958
 83 | Epoch: 0007/0010 | Batch 1200/2812 | Loss: 0.9095
 84 | Epoch: 0007/0010 | Batch 1500/2812 | Loss: 0.6518
 85 | Epoch: 0007/0010 | Batch 1800/2812 | Loss: 1.2235
 86 | Epoch: 0007/0010 | Batch 2100/2812 | Loss: 0.9390
 87 | Epoch: 0007/0010 | Batch 2400/2812 | Loss: 1.3117
 88 | Epoch: 0007/0010 | Batch 2700/2812 | Loss: 0.7095
 89 | Epoch: 0007/0010 | Train acc.: 66.12% | Val acc.: 61.14%
 90 | Epoch: 0008/0010 | Batch 0000/2812 | Loss: 0.5717
 91 | Epoch: 0008/0010 | Batch 0300/2812 | Loss: 0.9590
 92 | Epoch: 0008/0010 | Batch 0600/2812 | Loss: 1.2407
 93 | Epoch: 0008/0010 | Batch 0900/2812 | Loss: 0.6916
 94 | Epoch: 0008/0010 | Batch 1200/2812 | Loss: 0.6023
 95 | Epoch: 0008/0010 | Batch 1500/2812 | Loss: 0.6515
 96 | Epoch: 0008/0010 | Batch 1800/2812 | Loss: 0.6961
 97 | Epoch: 0008/0010 | Batch 2100/2812 | Loss: 0.5924
 98 | Epoch: 0008/0010 | Batch 2400/2812 | Loss: 0.7415
 99 | Epoch: 0008/0010 | Batch 2700/2812 | Loss: 0.9263
100 | Epoch: 0008/0010 | Train acc.: 68.04% | Val acc.: 63.60%
101 | Epoch: 0009/0010 | Batch 0000/2812 | Loss: 0.9797
102 | Epoch: 0009/0010 | Batch 0300/2812 | Loss: 0.5473
103 | Epoch: 0009/0010 | Batch 0600/2812 | Loss: 0.7215
104 | Epoch: 0009/0010 | Batch 0900/2812 | Loss: 1.2585
105 | Epoch: 0009/0010 | Batch 1200/2812 | Loss: 0.6315
106 | Epoch: 0009/0010 | Batch 1500/2812 | Loss: 0.5185
107 | Epoch: 0009/0010 | Batch 1800/2812 | Loss: 0.7371
108 | Epoch: 0009/0010 | Batch 2100/2812 | Loss: 1.3996
109 | Epoch: 0009/0010 | Batch 2400/2812 | Loss: 0.5328
110 | Epoch: 0009/0010 | Batch 2700/2812 | Loss: 1.0833
111 | Epoch: 0009/0010 | Train acc.: 69.50% | Val acc.: 65.32%
112 | Epoch: 0010/0010 | Batch 0000/2812 | Loss: 0.6748
113 | Epoch: 0010/0010 | Batch 0300/2812 | Loss: 0.7661
114 | Epoch: 0010/0010 | Batch 0600/2812 | Loss: 0.8363
115 | Epoch: 0010/0010 | Batch 0900/2812 | Loss: 0.4878
116 | Epoch: 0010/0010 | Batch 1200/2812 | Loss: 1.4964
117 | Epoch: 0010/0010 | Batch 1500/2812 | Loss: 1.4705
118 | Epoch: 0010/0010 | Batch 1800/2812 | Loss: 1.1050
119 | Epoch: 0010/0010 | Batch 2100/2812 | Loss: 0.4973
120 | Epoch: 0010/0010 | Batch 2400/2812 | Loss: 0.5641
121 | Epoch: 0010/0010 | Batch 2700/2812 | Loss: 0.6933
122 | Epoch: 0010/0010 | Train acc.: 71.16% | Val acc.: 62.80%
123 | Time elapsed 61.48 min
124 | Memory used: 3.71 GB
125 | Test accuracy 62.85%
126 | ```
127 | 
128 | # 01_pytorch-vit.py (finetuning pretrained)
129 | 
130 | Like above but using a pretrained vision transformer.
131 | 
132 | ```
133 | Epoch: 0001/0003 | Batch 0000/2812 | Loss: 2.4934
134 | Epoch: 0001/0003 | Batch 0300/2812 | Loss: 0.0954
135 | Epoch: 0001/0003 | Batch 0600/2812 | Loss: 0.0981
136 | Epoch: 0001/0003 | Batch 0900/2812 | Loss: 0.2078
137 | Epoch: 0001/0003 | Batch 1200/2812 | Loss: 0.3588
138 | Epoch: 0001/0003 | Batch 1500/2812 | Loss: 0.0104
139 | Epoch: 0001/0003 | Batch 1800/2812 | Loss: 0.1560
140 | Epoch: 0001/0003 | Batch 2100/2812 | Loss: 0.0474
141 | Epoch: 0001/0003 | Batch 2400/2812 | Loss: 0.4250
142 | Epoch: 0001/0003 | Batch 2700/2812 | Loss: 0.4414
143 | Epoch: 0001/0003 | Train acc.: 92.40% | Val acc.: 94.12%
144 | Epoch: 0002/0003 | Batch 0000/2812 | Loss: 0.0912
145 | Epoch: 0002/0003 | Batch 0300/2812 | Loss: 0.0337
146 | Epoch: 0002/0003 | Batch 0600/2812 | Loss: 0.1545
147 | Epoch: 0002/0003 | Batch 0900/2812 | Loss: 0.0478
148 | Epoch: 0002/0003 | Batch 1200/2812 | Loss: 0.0697
149 | Epoch: 0002/0003 | Batch 1500/2812 | Loss: 0.1314
150 | Epoch: 0002/0003 | Batch 1800/2812 | Loss: 0.2215
151 | Epoch: 0002/0003 | Batch 2100/2812 | Loss: 0.4472
152 | Epoch: 0002/0003 | Batch 2400/2812 | Loss: 0.0322
153 | Epoch: 0002/0003 | Batch 2700/2812 | Loss: 0.1310
154 | Epoch: 0002/0003 | Train acc.: 96.28% | Val acc.: 94.50%
155 | Epoch: 0003/0003 | Batch 0000/2812 | Loss: 0.0902
156 | Epoch: 0003/0003 | Batch 0300/2812 | Loss: 0.1597
157 | Epoch: 0003/0003 | Batch 0600/2812 | Loss: 0.0106
158 | Epoch: 0003/0003 | Batch 0900/2812 | Loss: 0.0032
159 | Epoch: 0003/0003 | Batch 1200/2812 | Loss: 0.0147
160 | Epoch: 0003/0003 | Batch 1500/2812 | Loss: 0.0082
161 | Epoch: 0003/0003 | Batch 1800/2812 | Loss: 0.0078
162 | Epoch: 0003/0003 | Batch 2100/2812 | Loss: 0.0060
163 | Epoch: 0003/0003 | Batch 2400/2812 | Loss: 0.1395
164 | Epoch: 0003/0003 | Batch 2700/2812 | Loss: 0.1128
165 | Epoch: 0003/0003 | Train acc.: 97.24% | Val acc.: 95.74%
166 | Time elapsed 18.70 min
167 | Memory used: 3.71 GB
168 | Test accuracy 95.37%
169 | ```
170 | 
171 | # 02_pytorch-vit-compile.py
172 | 
173 | Like above but with `torch.compile`.
174 | 
175 | ```
176 | Epoch: 0001/0003 | Batch 0000/2812 | Loss: 2.4934
177 | Epoch: 0001/0003 | Batch 0300/2812 | Loss: 0.4464
178 | Epoch: 0001/0003 | Batch 0600/2812 | Loss: 0.1263
179 | Epoch: 0001/0003 | Batch 0900/2812 | Loss: 0.1233
180 | Epoch: 0001/0003 | Batch 1200/2812 | Loss: 0.4541
181 | Epoch: 0001/0003 | Batch 1500/2812 | Loss: 0.0186
182 | Epoch: 0001/0003 | Batch 1800/2812 | Loss: 0.0930
183 | Epoch: 0001/0003 | Batch 2100/2812 | Loss: 0.0396
184 | Epoch: 0001/0003 | Batch 2400/2812 | Loss: 0.2211
185 | Epoch: 0001/0003 | Batch 2700/2812 | Loss: 0.1570
186 | Epoch: 0002/0003 | Batch 0000/2812 | Loss: 0.0186
187 | Epoch: 0002/0003 | Batch 0300/2812 | Loss: 0.0337
188 | Epoch: 0002/0003 | Batch 0600/2812 | Loss: 0.1992
189 | Epoch: 0002/0003 | Batch 0900/2812 | Loss: 0.0275
190 | Epoch: 0002/0003 | Batch 1200/2812 | Loss: 0.0874
191 | Epoch: 0002/0003 | Batch 1500/2812 | Loss: 0.0739
192 | Epoch: 0002/0003 | Batch 1800/2812 | Loss: 0.0432
193 | Epoch: 0002/0003 | Batch 2100/2812 | Loss: 0.0564
194 | Epoch: 0002/0003 | Batch 2400/2812 | Loss: 0.0110
195 | Epoch: 0002/0003 | Batch 2700/2812 | Loss: 0.0948
196 | Epoch: 0002/0003 | Train acc.: 96.15% | Val acc.: 94.72%
197 | Epoch: 0003/0003 | Batch 0000/2812 | Loss: 0.0462
198 | Epoch: 0003/0003 | Batch 0300/2812 | Loss: 0.1742
199 | Epoch: 0003/0003 | Batch 0600/2812 | Loss: 0.0039
200 | Epoch: 0003/0003 | Batch 0900/2812 | Loss: 0.0113
201 | Epoch: 0003/0003 | Batch 1200/2812 | Loss: 0.0022
202 | Epoch: 0003/0003 | Batch 1500/2812 | Loss: 0.0047
203 | Epoch: 0003/0003 | Batch 1800/2812 | Loss: 0.0667
204 | Epoch: 0003/0003 | Batch 2100/2812 | Loss: 0.0145
205 | Epoch: 0003/0003 | Batch 2400/2812 | Loss: 0.0071
206 | Epoch: 0003/0003 | Batch 2700/2812 | Loss: 0.0085
207 | Epoch: 0003/0003 | Train acc.: 97.29% | Val acc.: 93.16%
208 | Time elapsed 18.01 min
209 | Memory used: 3.73 GB
210 | Test accuracy 93.02%
211 | ```
212 | 
213 | # 03_fabric-vit.py
214 | 
215 | Like `01_pytorch-vit.py` but using Fabric.
216 | 
217 | ```
218 | Epoch: 0001/0003 | Batch 0000/2812 | Loss: 2.4934
219 | Epoch: 0001/0003 | Batch 0300/2812 | Loss: 0.4444
220 | Epoch: 0001/0003 | Batch 0600/2812 | Loss: 0.2839
221 | Epoch: 0001/0003 | Batch 0900/2812 | Loss: 0.0866
222 | Epoch: 0001/0003 | Batch 1200/2812 | Loss: 0.5020
223 | Epoch: 0001/0003 | Batch 1500/2812 | Loss: 0.0164
224 | Epoch: 0001/0003 | Batch 1800/2812 | Loss: 0.0770
225 | Epoch: 0001/0003 | Batch 2100/2812 | Loss: 0.0157
226 | Epoch: 0001/0003 | Batch 2400/2812 | Loss: 0.3030
227 | Epoch: 0001/0003 | Batch 2700/2812 | Loss: 0.0797
228 | Epoch: 0001/0003 | Train acc.: 92.25% | Val acc.: 93.22%
229 | Epoch: 0002/0003 | Batch 0000/2812 | Loss: 0.0194
230 | Epoch: 0002/0003 | Batch 0300/2812 | Loss: 0.0443
231 | Epoch: 0002/0003 | Batch 0600/2812 | Loss: 0.1183
232 | Epoch: 0002/0003 | Batch 0900/2812 | Loss: 0.0122
233 | Epoch: 0002/0003 | Batch 1200/2812 | Loss: 0.0111
234 | Epoch: 0002/0003 | Batch 1500/2812 | Loss: 0.0069
235 | Epoch: 0002/0003 | Batch 1800/2812 | Loss: 0.0209
236 | Epoch: 0002/0003 | Batch 2100/2812 | Loss: 0.5524
237 | Epoch: 0002/0003 | Batch 2400/2812 | Loss: 0.2935
238 | Epoch: 0002/0003 | Batch 2700/2812 | Loss: 0.1671
239 | Epoch: 0002/0003 | Train acc.: 96.12% | Val acc.: 94.72%
240 | Epoch: 0003/0003 | Batch 0000/2812 | Loss: 0.0296
241 | Epoch: 0003/0003 | Batch 0300/2812 | Loss: 0.0707
242 | Epoch: 0003/0003 | Batch 0600/2812 | Loss: 0.0027
243 | Epoch: 0003/0003 | Batch 0900/2812 | Loss: 0.0081
244 | Epoch: 0003/0003 | Batch 1200/2812 | Loss: 0.0087
245 | Epoch: 0003/0003 | Batch 1500/2812 | Loss: 0.0014
246 | Epoch: 0003/0003 | Batch 1800/2812 | Loss: 0.2059
247 | Epoch: 0003/0003 | Batch 2100/2812 | Loss: 0.0106
248 | Epoch: 0003/0003 | Batch 2400/2812 | Loss: 0.0675
249 | Epoch: 0003/0003 | Batch 2700/2812 | Loss: 0.0527
250 | Epoch: 0003/0003 | Train acc.: 97.35% | Val acc.: 95.48%
251 | Time elapsed 18.49 min
252 | Memory used: 3.71 GB
253 | Test accuracy 95.78%
254 | ```
255 | 
256 | # 04_fabric-vit-mixed-precision.py
257 | 
258 | Like `03_fabric-vit.py` but with bfloat16 mixed precision training (`bf16-mixed`) -- if your GPU doesn't support it, you can change it to float16 mixed precision (`16-mixed`).
259 | 
260 | ```
261 | Epoch: 0001/0003 | Batch 0000/2812 | Loss: 2.4933
262 | Epoch: 0001/0003 | Batch 0300/2812 | Loss: 0.1566
263 | Epoch: 0001/0003 | Batch 0600/2812 | Loss: 0.1762
264 | Epoch: 0001/0003 | Batch 0900/2812 | Loss: 0.1912
265 | Epoch: 0001/0003 | Batch 1200/2812 | Loss: 0.2519
266 | Epoch: 0001/0003 | Batch 1500/2812 | Loss: 0.0415
267 | Epoch: 0001/0003 | Batch 1800/2812 | Loss: 0.0783
268 | Epoch: 0001/0003 | Batch 2100/2812 | Loss: 0.0447
269 | Epoch: 0001/0003 | Batch 2400/2812 | Loss: 0.5027
270 | Epoch: 0001/0003 | Batch 2700/2812 | Loss: 0.3819
271 | Epoch: 0001/0003 | Train acc.: 92.36% | Val acc.: 93.00%
272 | Epoch: 0002/0003 | Batch 0000/2812 | Loss: 0.1281
273 | Epoch: 0002/0003 | Batch 0300/2812 | Loss: 0.2139
274 | Epoch: 0002/0003 | Batch 0600/2812 | Loss: 0.1355
275 | Epoch: 0002/0003 | Batch 0900/2812 | Loss: 0.1130
276 | Epoch: 0002/0003 | Batch 1200/2812 | Loss: 0.1395
277 | Epoch: 0002/0003 | Batch 1500/2812 | Loss: 0.0121
278 | Epoch: 0002/0003 | Batch 1800/2812 | Loss: 0.0389
279 | Epoch: 0002/0003 | Batch 2100/2812 | Loss: 0.2634
280 | Epoch: 0002/0003 | Batch 2400/2812 | Loss: 0.0625
281 | Epoch: 0002/0003 | Batch 2700/2812 | Loss: 0.1037
282 | Epoch: 0002/0003 | Train acc.: 96.20% | Val acc.: 95.26%
283 | Epoch: 0003/0003 | Batch 0000/2812 | Loss: 0.0712
284 | Epoch: 0003/0003 | Batch 0300/2812 | Loss: 0.1453
285 | Epoch: 0003/0003 | Batch 0600/2812 | Loss: 0.0075
286 | Epoch: 0003/0003 | Batch 0900/2812 | Loss: 0.0663
287 | Epoch: 0003/0003 | Batch 1200/2812 | Loss: 0.0016
288 | Epoch: 0003/0003 | Batch 1500/2812 | Loss: 0.0029
289 | Epoch: 0003/0003 | Batch 1800/2812 | Loss: 0.0449
290 | Epoch: 0003/0003 | Batch 2100/2812 | Loss: 0.0020
291 | Epoch: 0003/0003 | Batch 2400/2812 | Loss: 0.2057
292 | Epoch: 0003/0003 | Batch 2700/2812 | Loss: 0.0410
293 | Epoch: 0003/0003 | Train acc.: 97.20% | Val acc.: 95.56%
294 | Time elapsed 6.36 min
295 | Memory used: 3.04 GB
296 | Test accuracy 95.24%
297 | ```
298 | 
299 | # 05_fabric-vit-mixed-ddp.py
300 | 
301 | Like `04_fabric-vit-mixed-precision.py` but using Distributed Data Parallelism (DDP).
302 | 
303 | ```
304 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.4056
305 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.2974
306 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.3568
307 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.4210
308 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.2212
309 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.1103
310 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.0332
311 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.1673
312 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.1595
313 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.2520
314 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.1233
315 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.0192
316 | Epoch: 0001/0003 | Train acc.: 93.58% | Val acc.: 95.96%
317 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0078
318 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0372
319 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0221
320 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0254
321 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.1037
322 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0042
323 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0218
324 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0519
325 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.0038
326 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.0028
327 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.0067
328 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.0076
329 | Epoch: 0002/0003 | Train acc.: 97.86% | Val acc.: 95.76%
330 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0073
331 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0024
332 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0173
333 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0155
334 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0133
335 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0012
336 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0024
337 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0185
338 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.1074
339 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.0068
340 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.0034
341 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.2559
342 | Epoch: 0003/0003 | Train acc.: 98.58% | Val acc.: 95.82%
343 | Time elapsed 2.19 min
344 | Memory used: 4.09 GB
345 | Test accuracy 95.73%
346 | ```
347 | 
348 | # 06_fabric-vit-mixed-fsdp.py
349 | 
350 | Like `05_fabric-vit-mixed-ddp.py` but using Fully Sharded Data Parallelism (FSDP) instead of DDP.
351 | 
352 | ```
353 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.4210
354 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.3568
355 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.4056
356 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.2974
357 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.0243
358 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.0704
359 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.0503
360 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.0400
361 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.2057
362 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.0352
363 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.0217
364 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.0306
365 | Epoch: 0001/0003 | Train acc.: 93.65% | Val acc.: 95.74%
366 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0102
367 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0124
368 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0907
369 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0597
370 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0201
371 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0815
372 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0039
373 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0473
374 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.1221
375 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.0371
376 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.1342
377 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.0045
378 | Epoch: 0002/0003 | Train acc.: 97.72% | Val acc.: 96.36%
379 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0131
380 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0122
381 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0182
382 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.1417
383 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0007
384 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0154
385 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0010
386 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0020
387 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.0134
388 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.4331
389 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.0044
390 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.2052
391 | Epoch: 0003/0003 | Train acc.: 98.64% | Val acc.: 96.28%
392 | Time elapsed 2.23 min
393 | Memory used: 2.83 GB
394 | Test accuracy 96.22%
395 | ```
396 | 
397 | 
398 | 
399 | # 07_fabric-vit-mixed-fsdp-with-scheduler.py
400 | 
401 | Like `06_fabric-vit-mixed-fsdp.py` above but using a learning rate scheduler.
402 | 
403 | 
404 | 
405 | ```
406 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.2974
407 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.3568
408 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.4056
409 | Epoch: 0001/0003 | Batch 0000/0703 | Loss: 2.4210
410 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.0243
411 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.0503
412 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.0400
413 | Epoch: 0001/0003 | Batch 0300/0703 | Loss: 0.0704
414 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.2057
415 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.0352
416 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.0217
417 | Epoch: 0001/0003 | Batch 0600/0703 | Loss: 0.0306
418 | Epoch: 0001/0003 | Train acc.: 93.65% | Val acc.: 95.74%
419 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0124
420 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0102
421 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0907
422 | Epoch: 0002/0003 | Batch 0000/0703 | Loss: 0.0597
423 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0815
424 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0201
425 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0039
426 | Epoch: 0002/0003 | Batch 0300/0703 | Loss: 0.0473
427 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.0371
428 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.1221
429 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.1342
430 | Epoch: 0002/0003 | Batch 0600/0703 | Loss: 0.0045
431 | Epoch: 0002/0003 | Train acc.: 97.72% | Val acc.: 96.36%
432 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0131
433 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0122
434 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.0182
435 | Epoch: 0003/0003 | Batch 0000/0703 | Loss: 0.1417
436 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0021
437 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0317
438 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0491
439 | Epoch: 0003/0003 | Batch 0300/0703 | Loss: 0.0174
440 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.0013
441 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.0062
442 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.0013
443 | Epoch: 0003/0003 | Batch 0600/0703 | Loss: 0.0303
444 | Epoch: 0003/0003 | Train acc.: 98.66% | Val acc.: 95.60%
445 | Time elapsed 2.26 min
446 | Memory used: 2.83 GB
447 | Test accuracy 96.43%
448 | ```
449 | 
450 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy >= 1.24.3
 2 | scipy >= 1.10.1
 3 | pandas >= 2.0.2
 4 | watermark >= 2.4.2
 5 | torch >= 2.1.0
 6 | torchvision >= 0.15.2
 7 | torchmetrics >= 0.11.4
 8 | transformers >= 4.30.2
 9 | lightning >= 2.1.0
10 | deepspeed >= 0.9.4


--------------------------------------------------------------------------------