├── .gitignore
├── LICENSE
├── README.md
├── dataset
│   └── README.md
├── imgs
│   ├── dataset-overview.jpg
│   ├── wandb-api-key.png
│   ├── wandb-checkpoint.png
│   ├── wandb-lightning.png
│   ├── wandb-login.png
│   ├── wandb-new-project.png
│   ├── wandb-report.png
│   ├── wandb-run-config.png
│   └── wandb-run-files.png
├── notebooks
│   ├── data-module.ipynb
│   ├── plain-pytorch-model.ipynb
│   ├── pytorch-lightning-model.ipynb
│   └── pytorch-lightning-training.ipynb
├── requirements.txt
└── src
    ├── README.md
    ├── inference.py
    ├── mnist.py
    ├── model.py
    └── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | #  Usually these files are written by a python script from a template
32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 | 
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 | 
68 | # Scrapy stuff:
69 | .scrapy
70 | 
71 | # Sphinx documentation
72 | docs/_build/
73 | 
74 | # PyBuilder
75 | target/
76 | 
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 | 
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 | 
84 | # pyenv
85 | .python-version
86 | 
87 | # pipenv
88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | #   install all needed dependencies.
92 | #Pipfile.lock
93 | 
94 | # PEP 582; used by e.g.
#   github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # VSCode Configuration Files
132 | .vscode/
133 | 
134 | # Dataset
135 | dataset/train/
136 | dataset/val/
137 | dataset/test/
138 | 
139 | # Logs
140 | wandb/
141 | lightning_logs/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 Álvaro Bartolomé
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # :detective::robot: Monitoring a PyTorch Lightning model with Weights & Biases
2 | 
3 | <div align="center">
4 |   <img src="imgs/wandb-lightning.png" alt="PyTorch Lightning + Weights & Biases"/>
5 | </div>
6 | 
7 | __Weights & Biases__ (wandb) provides tools for experiment tracking, improved model performance,
8 | and results collaboration. Weights & Biases helps you keep track of your machine learning
9 | projects, and it is both framework and environment agnostic. It integrates flexibly with any
10 | Python script: you just need the `wandb` Python library and a few lines of code.
11 | __PyTorch Lightning__ is the lightweight PyTorch wrapper for high-performance AI research.
12 | Throughout this project we will see how to define a PyTorch model, wrap it with PyTorch
13 | Lightning, and monitor its training with Weights & Biases.
14 | 
15 | :warning: __Disclaimer__. This project is the result of some experiments I ran using both PyTorch Lightning
16 | and `wandb`, and both tools offer many more functionalities than the ones presented along this project.
17 | So checking their websites for more information is highly recommended, even though you can still use this
18 | repository as an initial minimal example.
19 | 
20 | ---
21 | 
22 | ## :hammer_and_wrench: Requirements
23 | 
24 | First of all, you will need to install the requirements as follows, in order to reproduce
25 | all the content described along this project:
26 | 
27 | ```
28 | pip install -r requirements.txt
29 | ```
30 | 
31 | ---
32 | 
33 | :pushpin: __Note__. If you are using Jupyter Lab or Jupyter Notebook, either on a local environment or hosted on AWS, Azure or GCP, you will
34 | need to install the following Jupyter Lab extensions so as to see the training progress bar in your Notebook; otherwise
35 | you will just see a text similar to: `HBox(children=(FloatProgress(value=0.0, ...)`.
36 | 
37 | If you are using conda you will need to install nodejs first, and then proceed with the next steps. If you are not
38 | using conda, just skip this step.
39 | 
40 | ```
41 | conda install nodejs
42 | ```
43 | 
44 | Then install and activate the following Jupyter Lab widget so that you can see the tqdm progress bar properly
45 | in your Notebook while the PyTorch Lightning model is being trained.
46 | 
47 | ```
48 | jupyter labextension install @jupyter-widgets/jupyterlab-manager
49 | jupyter nbextension enable --py widgetsnbextension
50 | ```
51 | 
52 | ---
53 | 
54 | ## :open_file_folder: Dataset
55 | 
56 | The dataset that is going to be used to train the image classification model is
57 | "[The Simpsons Characters Data](https://www.kaggle.com/alexattia/the-simpsons-characters-dataset)",
58 | a Kaggle dataset that contains images of some of the main The Simpsons characters.
59 | 
60 | The original dataset contains 42 classes of The Simpsons characters, with an unbalanced number of samples per
61 | class, and a total of 20,935 training images and 990 test images in JPG format.
62 | 
63 | The modified version of the dataset, which is the one that has been used along this project, contains just the top-10
64 | classes, and the data is balanced to have 1,000 samples per class. This results in a total of 10,000 RGB images in JPG format,
65 | which will be split into train/validation/test with a 65/15/20 ratio. The images are then rescaled to 32x32 pixels,
66 | randomly flipped horizontally (on the training split only), and then normalized, as sketched below.
67 | 
68 | ![dataset-overview](imgs/dataset-overview.jpg)
69 | 
70 | Find all the information about the dataset in [dataset/README.md](https://github.com/alvarobartt/ml-monitoring-with-wandb/tree/master/dataset).
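
As a reference, the preprocessing just described maps to a small `torchvision` pipeline like the sketch below, which mirrors the transforms used in the notebooks of this repository (the normalization statistics are the usual ImageNet mean/std); the validation and test splits use the same pipeline without the random flip.

```python
from torchvision import transforms as T

# Preprocessing for the training split, as described above;
# val/test use the same pipeline without the random flip.
train_transform = T.Compose([
    T.Resize((32, 32)),        # rescale to 32x32 pixels
    T.RandomHorizontalFlip(),  # random horizontal flip
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
```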
72 | :pushpin: __Note__. If you want to use a similar dataset, I put together a MNIST-like The Simpsons dataset based on this
73 | one, which you can find at [alvarobartt/simpsons-mnist](https://github.com/alvarobartt/simpsons-mnist).
74 | 
75 | ---
76 | 
77 | ## :robot: Modelling
78 | 
79 | Along this tutorial we will be using PyTorch Lightning as the training interface for our PyTorch model. This means
80 | that we will first create the plain PyTorch model and its training loop, which will later be translated to PyTorch
81 | Lightning.
82 | 
83 | So, we will define a pretty simple CNN architecture for "The Simpsons Characters" dataset that we are using (the
84 | modified version), which consists of 32x32px RGB images, i.e. tensors of shape `(32, 32, 3)` in (H, W, C) format.
85 | 
86 | ```python
87 | import torch
88 | import torch.nn as nn
89 | import torch.nn.functional as F
90 | 
91 | class SimpsonsNet(nn.Module):
92 |     def __init__(self):
93 |         super(SimpsonsNet, self).__init__()
94 |         self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
95 |         self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
96 |         self.conv3 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
97 |         self.dropout = nn.Dropout(.2)
98 |         self.fc1 = nn.Linear(16*16*32, 128)
99 |         self.fc2 = nn.Linear(128, 10)
100 | 
101 |     def forward(self, x):
102 |         x = F.relu(self.conv1(x))
103 |         x = F.relu(self.conv2(x))
104 |         x = F.relu(self.conv3(x))
105 |         x = F.max_pool2d(x, 2)
106 |         x = self.dropout(x)
107 |         x = torch.flatten(x, 1)
108 |         x = F.relu(self.fc1(x))
109 |         x = self.dropout(x)
110 |         x = self.fc2(x)
111 |         return x
112 | ```
113 | 
114 | Once we create the plain PyTorch model, we need to instantiate the class and define both the loss function
115 | and the optimizer that we will use to train the net.
116 | 
117 | ```python
118 | import torch.optim as optim
119 | 
120 | model = SimpsonsNet()
121 | criterion = nn.CrossEntropyLoss()
122 | optimizer = optim.Adam(model.parameters())
123 | ```
124 | 
125 | And then we need to create the training loop, which looks like this (assuming that `device`, `train_loader`
126 | and `train_dataset` have already been defined):
127 | 
128 | ```python
129 | num_epochs = 10
130 | 
131 | for epoch in range(num_epochs):
132 |     print(f"\nEpoch {epoch}")
133 |     running_loss = .0
134 |     running_corrects = .0
135 |     model.train()
136 |     for inputs, labels in train_loader:
137 |         inputs, labels = inputs.to(device), labels.to(device)
138 | 
139 |         optimizer.zero_grad()
140 | 
141 |         outputs = model(inputs)
142 |         _, preds = torch.max(outputs, 1)
143 |         loss = criterion(outputs, labels)
144 |         loss.backward()
145 |         optimizer.step()
146 | 
147 |         running_loss += loss.item() * inputs.size(0)
148 |         running_corrects += torch.sum(preds == labels)
149 |     epoch_loss = running_loss / len(train_dataset)
150 |     epoch_acc = running_corrects.double() / len(train_dataset)
151 |     print(f"loss: {epoch_loss}, acc: {epoch_acc}")
152 | ```
153 | 
154 | That is just the training phase; we could also include the validation and test phases, but that would just add
155 | more complexity to the loop (a minimal validation pass is sketched below). This being said, we will proceed with the translation from plain PyTorch to PyTorch Lightning, so that we see how easy and structured it is to create a training interface for any model.
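
For completeness, such a validation pass could look like the following sketch, which reuses the `model`, `criterion` and `device` from above and assumes a `val_loader`/`val_dataset` built the same way as the training ones:

```python
model.eval()  # switch off dropout during evaluation
val_loss, val_corrects = .0, .0

with torch.no_grad():  # no gradients are needed to evaluate the model
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)
        val_loss += loss.item() * inputs.size(0)
        val_corrects += torch.sum(preds == labels)

val_loss = val_loss / len(val_dataset)
val_acc = val_corrects.double() / len(val_dataset)
print(f"val_loss: {val_loss}, val_acc: {val_acc}")
```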
156 | 
157 | ---
158 | 
159 | All the code above (both the net definition and the training loop) translated to PyTorch Lightning ends up being as easy as:
160 | 
161 | ```python
162 | import torch
163 | import torch.nn as nn
164 | from torch.nn import functional as F
165 | 
166 | from pytorch_lightning import LightningModule
167 | from pytorch_lightning.metrics.functional import accuracy
168 | 
169 | 
170 | class SimpsonsNet(LightningModule):
171 |     def __init__(self):
172 |         super(SimpsonsNet, self).__init__()
173 | 
174 |         self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
175 |         self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
176 |         self.conv3 = nn.Conv2d(32, 32, kernel_size=3, padding=1)
177 |         self.dropout = nn.Dropout(.5)
178 |         self.flatten = nn.Flatten()
179 |         self.fc1 = nn.Linear(16*16*32, 64)
180 |         self.fc2 = nn.Linear(64, 10)
181 | 
182 |     def forward(self, x):
183 |         x = F.relu(self.conv1(x))
184 |         x = F.relu(self.conv2(x))
185 |         x = F.relu(self.conv3(x))
186 |         x = F.max_pool2d(x, 2)
187 |         x = self.dropout(x)
188 |         x = self.flatten(x)
189 |         x = F.relu(self.fc1(x))
190 |         x = self.dropout(x)
191 |         x = F.log_softmax(self.fc2(x), dim=1)
192 |         return x
193 | 
194 |     def training_step(self, batch, batch_idx):
195 |         x, y = batch
196 |         logits = self(x)
197 |         loss = F.nll_loss(logits, y)
198 | 
199 |         preds = torch.argmax(logits, dim=1)
200 |         acc = accuracy(preds, y)
201 | 
202 |         self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True)
203 |         self.log('train_acc', acc, on_step=True, on_epoch=True, logger=True)
204 | 
205 |         return loss
206 | 
207 |     def validation_step(self, batch, batch_idx):
208 |         x, y = batch
209 |         logits = self(x)
210 |         loss = F.nll_loss(logits, y)
211 | 
212 |         preds = torch.argmax(logits, dim=1)
213 |         acc = accuracy(preds, y)
214 | 
215 |         self.log('val_loss', loss, prog_bar=True)
216 |         self.log('val_acc', acc, prog_bar=True)
217 | 
218 |         return loss
219 | 
220 |     def test_step(self, batch, batch_idx):
221 |         x, y = batch
222 |         logits = self(x)
223 |         loss = F.nll_loss(logits, y)
224 | 
225 |         preds = torch.argmax(logits, dim=1)
226 |         acc = accuracy(preds, y)
227 | 
228 |         self.log('test_loss', loss, prog_bar=True)
229 |         self.log('test_acc', acc, prog_bar=True)
230 | 
231 |         return loss
232 | 
233 |     def configure_optimizers(self):
234 |         return torch.optim.Adam(self.parameters())
235 | ```
236 | 
237 | Note that `forward()` now ends with a `log_softmax`, which combined with the `nll_loss` used in each step is
238 | equivalent to the `CrossEntropyLoss` used before. Then, once all the steps are defined, training the net is as simple as:
239 | 
240 | ```python
241 | import pytorch_lightning as pl
242 | 
243 | model = SimpsonsNet()
244 | trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=10, max_epochs=10)
245 | trainer.fit(model, train_loader, val_loader)
246 | ```
247 | 
248 | So this is the basic translation from plain PyTorch to PyTorch Lightning; in the following section we will
249 | see how easy it is to integrate any logging interface with the PyTorch Lightning Trainer.
250 | 
251 | ---
252 | 
253 | ## :detective: Monitoring
254 | 
255 | Before starting with the ML monitoring, you will need to set up your Weights & Biases account and install
256 | the required Python packages, so that you can dump the logs to your Weights & Biases project's page.
257 | 
258 | First you need to log in to Weights & Biases at https://wandb.ai/login, where the preferred option is to
259 | log in using your GitHub account, so that you can synchronize both GitHub and wandb.
260 | 
261 | ![wandb-login](imgs/wandb-login.png)
262 | 
263 | Once registered, you will see your main wandb page, where all your projects will be listed. If this is your first
264 | login you won't have any, so you should create one in order to have a proper project where you can dump the logs of
265 | your ML model training.
266 | 
267 | ![wandb-new-project](imgs/wandb-new-project.png)
268 | 
269 | Then, at the top of your Python file (.py, .ipynb) you will need to initialize wandb, so that you can properly link
270 | your Python file with your wandb account and project. In order to do so, you just need to reproduce the steps that
271 | wandb showed you when you created the project, which in this case should look like:
272 | 
273 | ```python
274 | import wandb
275 | 
276 | wandb.init(project='ml-monitoring-with-wandb', entity='alvarobartt')
277 | ```
278 | 
279 | The first time, it may ask you for an API Key, which you can find in your profile settings on the Weights & Biases site.
280 | The API Key section looks like the following (if you don't have any API Keys, create a new one):
281 | 
282 | ![wandb-api-key](imgs/wandb-api-key.png)
283 | 
284 | Then everything will be properly set up.
285 | 
286 | ---
287 | 
288 | As mentioned above, the PyTorch Lightning Trainer did not have any logging interface defined, so the metrics
289 | logged through `self.log()` in the LightningModule were just being tracked locally. But if we include a custom
290 | logger, those logs will be redirected to it.
291 | 
292 | So, to update the previous training code to include Weights & Biases (`wandb`) as the custom logging interface, we just
293 | need to replace the Trainer code block with:
294 | 
295 | ```python
296 | import pytorch_lightning as pl
297 | from pytorch_lightning.loggers import WandbLogger
298 | 
299 | trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=10, max_epochs=10, logger=WandbLogger())
300 | trainer.fit(model, train_loader, val_loader)
301 | ```
302 | 
303 | This will dump the logs to Weights & Biases: when fitting the model you will see the links to both the "Project Page"
304 | and the "Run Page", so you just need to click them in order to track your models at https://wandb.ai/site.
305 | 
306 | After some training runs of the same model with different configurations, the Weights & Biases project page looks like this:
307 | 
308 | ![wandb-report](imgs/wandb-report.png)
309 | 
310 | To see the configuration of each one of those runs, we just need to select the run that we want to check, and
311 | all the configuration previously set in the code will be available in the Overview tab of that run.
312 | 
313 | ![wandb-run-config](imgs/wandb-run-config.png)
314 | 
315 | All this information is really useful, as we can clearly keep track of all the experiments we run so as to keep the best
316 | model, depending on the metric we want to focus on.
317 | 
318 | Anyway, the training code is already managing which model will be stored in `wandb`, assuming that the best model is the
319 | one with the highest validation accuracy. Once the run is finished, all those files (in this case the model's `state_dict`
320 | as a .pth file) will be uploaded to the Files tab.
321 | 
322 | ![wandb-run-files](imgs/wandb-run-files.png)
323 | 
324 | :pushpin: __Note__. The last checkpoint, based on the highest validation accuracy, will also be uploaded automatically, as
325 | per the default behavior of the PyTorch Lightning Trainer, specifying both the epoch and the step at which it was saved.
326 | 
327 | ![wandb-checkpoint](imgs/wandb-checkpoint.png)
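
For reference, this behavior can also be configured explicitly. The following is just a sketch (reusing the `model` and the loaders defined above) that monitors the logged validation accuracy through a `ModelCheckpoint` callback and tells the `WandbLogger` to upload the resulting checkpoints:

```python
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

# Keep the checkpoint with the highest validation accuracy, as logged
# via `self.log('val_acc', ...)` in the LightningModule above
checkpoint_callback = ModelCheckpoint(monitor='val_acc', mode='max')

trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=10, max_epochs=10,
                     logger=WandbLogger(log_model=True),
                     callbacks=[checkpoint_callback])
trainer.fit(model, train_loader, val_loader)
```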
328 | 
329 | :pushpin: __Note__. Both the PyTorch Lightning and the Weights & Biases log directories are included in the `.gitignore` file, which means
330 | that the logs will not be uploaded to GitHub; feel free to remove those lines so that Git does not ignore these directories.
331 | Anyway, as you are using Weights & Biases, the logs will be stored there, so there's no need to store them locally.
332 | 
333 | ---
334 | 
335 | ## :computer: Credits
336 | 
337 | Credits to [Alexandre Attia](https://github.com/alexattia) for creating [The Simpsons Characters Dataset](https://www.kaggle.com/alexattia/the-simpsons-characters-dataset),
338 | as well as to the Kaggle community that made it possible, as they added a lot of manually curated images to the
339 | original dataset, scaling it from the original 20 characters up to 42.
340 | 
341 | Credits to [Lezwon Castelino](https://github.com/lezwon) for solving the PyTorch Lightning progress bar issue, as he
342 | kindly provided a solution in [this PyTorch Lightning issue](https://github.com/PyTorchLightning/pytorch-lightning/issues/1112)
343 | by sharing the following [StackOverflow post](https://stackoverflow.com/questions/60656978/display-tqdm-in-aws-sagemakers-jupyterlab).
344 | 
345 | Last but not least, credits to [Charles Frye](https://github.com/charlesfrye) for creating and explaining in detail the integration
346 | of Weights & Biases with PyTorch Lightning training in the [PyTorch Lightning + W&B example](https://github.com/wandb/examples/blob/master/colabs/pytorch-lightning/Supercharge_your_Training_with_Pytorch_Lightning_%2B_Weights_%26_Biases.ipynb).
347 | 
348 | ---
349 | 
350 | ## :crystal_ball: Future Tasks
351 | 
352 | - [ ] Use the wandb feature that shows the predicted labels per image (https://twitter.com/weights_biases/status/1364342536836288515)
353 | - [ ] Train the model with bigger images, since the model's inference results are not that good with images from the Internet
354 | - [ ] If possible, create a simple Streamlit application on Hugging Face Spaces
355 | 

--------------------------------------------------------------------------------
/dataset/README.md:
--------------------------------------------------------------------------------
1 | # :open_file_folder: The Simpsons Characters Dataset
2 | 
3 | The original dataset can be found at [Kaggle - The Simpsons Characters Data](https://www.kaggle.com/alexattia/the-simpsons-characters-dataset). Anyway, in this project a prepared version of the dataset has
4 | been used, which can be found at [alvarobartt/simpsons-mnist](https://github.com/alvarobartt/simpsons-mnist).
5 | 
6 | The prepared dataset contains, per class, 650 images to train, 150 images to validate and 200 images to test.
7 | All the images are RGB images in JPG and JPEG format, with different sizes, since no transformation
8 | has been applied to the raw data; the images are transformed while loading the dataset to train
9 | the CNN model. The images used correspond to the 10 most populated classes of the original
10 | dataset, which means that only the classes with at least 1,000 images have been used, and the rest of the
11 | classes have been discarded.
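
As a quick sanity check of those numbers, once the dataset has been downloaded and extracted as described below, you can count the images per split and class with a few lines of Python. This is just an illustrative sketch that assumes the `dataset/<split>/<class>/` layout used along this project:

```python
from pathlib import Path

def count_images(split_dir: str) -> dict:
    """Counts the JPG/JPEG images under each class folder of a split."""
    return {
        class_dir.name: sum(1 for f in class_dir.iterdir()
                            if f.suffix.lower() in {".jpg", ".jpeg"})
        for class_dir in sorted(Path(split_dir).iterdir())
        if class_dir.is_dir()
    }

for split in ("train", "val", "test"):
    # expected: 650, 150 and 200 images per class, respectively
    print(split, count_images(f"dataset/{split}"))
```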
12 | 
13 | ## :mechanical_arm: Train Dataset
14 | 
15 | The train dataset can be downloaded from the following Dropbox URL: https://www.dropbox.com/s/p4afak2ccgbsup3/train.zip?dl=0
16 | 
17 | ## :eyeglasses: Validation Dataset
18 | 
19 | The validation dataset can be downloaded from the following Dropbox URL: https://www.dropbox.com/s/q633blvy837q082/val.zip?dl=0
20 | 
21 | ## :test_tube: Test Dataset
22 | 
23 | The test dataset can be downloaded from the following Dropbox URL: https://www.dropbox.com/s/km80dr5hfziyf3k/test.zip?dl=0
24 | 
25 | ---
26 | 
27 | ## :blue_book: Information
28 | 
29 | To download and extract the train, val and test datasets from the terminal, just use the following commands:
30 | 
31 | ```
32 | mkdir dataset/
33 | cd dataset/
34 | wget --no-check-certificate https://www.dropbox.com/s/p4afak2ccgbsup3/train.zip -O train.zip
35 | unzip -q train.zip
36 | rm train.zip
37 | wget --no-check-certificate https://www.dropbox.com/s/q633blvy837q082/val.zip -O val.zip
38 | unzip -q val.zip
39 | rm val.zip
40 | wget --no-check-certificate https://www.dropbox.com/s/km80dr5hfziyf3k/test.zip -O test.zip
41 | unzip -q test.zip
42 | rm test.zip
43 | ```
44 | 
45 | ---
46 | 
47 | Additionally, if you are using Google Colab, just follow the next steps, including them as code cells in a
48 | Notebook, so as to download and extract the dataset under the `/content/` directory (the default one):
49 | 
50 | 1. Make sure that there are no directories with the same names under the same directory (you can
51 | rename them if they exist) and, if there are, just remove them:
52 | 
53 | ```
54 | !rm -r /content/train
55 | !rm -r /content/val
56 | !rm -r /content/test
57 | ```
58 | 
59 | 2. Then you need to download the train, validation and test ZIP files using `wget` as follows:
60 | 
61 | ```
62 | !wget --no-check-certificate \
63 |     https://www.dropbox.com/s/8u2k79tuqmwrwi8/train.zip \
64 |     -O /tmp/train.zip
65 | 
66 | !wget --no-check-certificate \
67 |     https://www.dropbox.com/s/q633blvy837q082/val.zip \
68 |     -O /tmp/val.zip
69 | 
70 | !wget --no-check-certificate \
71 |     https://www.dropbox.com/s/pnipjr7brjz1pm5/test.zip \
72 |     -O /tmp/test.zip
73 | ```
74 | 
75 | 3. Finally, you just need to use `ZipFile` to extract the ZIP files into the `/content/` directory for the train, validation and test sets:
76 | 
77 | ```python
78 | import zipfile
79 | 
80 | with zipfile.ZipFile("/tmp/train.zip", "r") as zip_ref:
81 |     zip_ref.extractall("/content/train")
82 | 
83 | with zipfile.ZipFile("/tmp/val.zip", "r") as zip_ref:
84 |     zip_ref.extractall("/content/val")
85 | 
86 | with zipfile.ZipFile("/tmp/test.zip", "r") as zip_ref:
87 |     zip_ref.extractall("/content/test")
88 | ```
89 | 
90 | __Note__: using `/tmp/` for the downloaded ZIP files is the recommendation, but if you plan to use this dataset
91 | frequently it's just better to store things under the `/content/` directory so as to keep them, as `/tmp` is temporary.
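
Once extracted, each split follows the `<split>/<class>/<image>` layout, so it can be loaded directly with `torchvision`'s `ImageFolder`. The following is a minimal sketch of that (the notebooks in this repository wrap the same idea in a `LightningDataModule`):

```python
from torch.utils.data import DataLoader
from torchvision import transforms as T
from torchvision.datasets import ImageFolder

# Same preprocessing used along the project (ImageNet mean/std)
transform = T.Compose([
    T.Resize((32, 32)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = ImageFolder(root="dataset/train", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
```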
95 | -------------------------------------------------------------------------------- /imgs/dataset-overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/ml-monitoring-with-wandb/1f9ae9951903baf402f8f8245167e55f215f7571/imgs/dataset-overview.jpg -------------------------------------------------------------------------------- /imgs/wandb-api-key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/ml-monitoring-with-wandb/1f9ae9951903baf402f8f8245167e55f215f7571/imgs/wandb-api-key.png -------------------------------------------------------------------------------- /imgs/wandb-checkpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/ml-monitoring-with-wandb/1f9ae9951903baf402f8f8245167e55f215f7571/imgs/wandb-checkpoint.png -------------------------------------------------------------------------------- /imgs/wandb-lightning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/ml-monitoring-with-wandb/1f9ae9951903baf402f8f8245167e55f215f7571/imgs/wandb-lightning.png -------------------------------------------------------------------------------- /imgs/wandb-login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/ml-monitoring-with-wandb/1f9ae9951903baf402f8f8245167e55f215f7571/imgs/wandb-login.png -------------------------------------------------------------------------------- /imgs/wandb-new-project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/ml-monitoring-with-wandb/1f9ae9951903baf402f8f8245167e55f215f7571/imgs/wandb-new-project.png -------------------------------------------------------------------------------- /imgs/wandb-report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/ml-monitoring-with-wandb/1f9ae9951903baf402f8f8245167e55f215f7571/imgs/wandb-report.png -------------------------------------------------------------------------------- /imgs/wandb-run-config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/ml-monitoring-with-wandb/1f9ae9951903baf402f8f8245167e55f215f7571/imgs/wandb-run-config.png -------------------------------------------------------------------------------- /imgs/wandb-run-files.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/ml-monitoring-with-wandb/1f9ae9951903baf402f8f8245167e55f215f7571/imgs/wandb-run-files.png -------------------------------------------------------------------------------- /notebooks/data-module.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from torchvision import transforms as T\n", 10 | "\n", 11 | "class SimpsonsTransforms(T.Compose):\n", 12 | " def __init__(self, phase):\n", 13 | " self.phase = phase\n", 14 | " self.transforms = {\n", 15 | " 'train': [\n", 16 | " T.Resize((32, 32)),\n", 17 | " T.RandomHorizontalFlip(),\n", 18 
| " T.ToTensor(),\n", 19 | " T.Normalize(\n", 20 | " mean=[0.485, 0.456, 0.406],\n", 21 | " std=[0.229, 0.224, 0.225]\n", 22 | " )\n", 23 | " ],\n", 24 | " 'val': [\n", 25 | " T.Resize((32, 32)),\n", 26 | " T.ToTensor(),\n", 27 | " T.Normalize(\n", 28 | " mean=[0.485, 0.456, 0.406],\n", 29 | " std=[0.229, 0.224, 0.225]\n", 30 | " )\n", 31 | " ],\n", 32 | " 'test': [\n", 33 | " T.Resize((32, 32)),\n", 34 | " T.ToTensor(),\n", 35 | " T.Normalize(\n", 36 | " mean=[0.485, 0.456, 0.406],\n", 37 | " std=[0.229, 0.224, 0.225]\n", 38 | " )\n", 39 | " ]\n", 40 | " }\n", 41 | " \n", 42 | " super().__init__(self.transforms[self.phase])" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from torchvision.datasets import ImageFolder\n", 52 | "\n", 53 | "class SimpsonsImageFolder(ImageFolder):\n", 54 | " def __init__(self, phase):\n", 55 | " super().__init__()\n", 56 | " self.root = f\"{root}/{phase}\"\n", 57 | " self.phase = phase\n", 58 | " self.transform = SimpsonsTransforms(phase=phase)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from torch.utils.data import DataLoader\n", 68 | "from pytorch_lightning import LightningDataModule\n", 69 | "\n", 70 | "class SimpsonsDataModule(LightningDataModule):\n", 71 | " def __init__(self, dataset_path, batch_size):\n", 72 | " super().__init__()\n", 73 | " self.dataset_path = dataset_path\n", 74 | " self.batch_size = batch_size\n", 75 | " \n", 76 | " def train_dataloader(self):\n", 77 | " self.train_imagefolder = SimpsonsImageFolder(root=self.dataset_path, \n", 78 | " phase='train')\n", 79 | " return DataLoader(dataset=self.train_imagefolder,\n", 80 | " batch_size=self.batch_size, \n", 81 | " num_workers=12, shuffle=True)\n", 82 | " \n", 83 | " def val_dataloader(self):\n", 84 | " self.val_imagefolder = SimpsonsImageFolder(root=self.dataset_path,\n", 85 | " phase='val')\n", 86 | " return DataLoader(dataset=self.val_imagefolder,\n", 87 | " batch_size=self.batch_size, \n", 88 | " num_workers=12)\n", 89 | " \n", 90 | " def test_dataloader(self):\n", 91 | " self.test_imagefolder = SimpsonsImageFolder(root=self.dataset_path,\n", 92 | " phase='test')\n", 93 | " return DataLoader(dataset=self.test_imagefolder,\n", 94 | " batch_size=self.batch_size,\n", 95 | " num_workers=12)" 96 | ] 97 | } 98 | ], 99 | "metadata": { 100 | "kernelspec": { 101 | "display_name": "Python 3", 102 | "language": "python", 103 | "name": "python3" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.8.5" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 4 120 | } 121 | -------------------------------------------------------------------------------- /notebooks/plain-pytorch-model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "data_path = '../dataset'" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": 
{}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "../dataset/test\n", 31 | "../dataset/train\n", 32 | "../dataset/val\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "for path in os.listdir(data_path):\n", 38 | " if os.path.isdir(os.path.join(data_path, path)):\n", 39 | " print(os.path.join(data_path, path))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "train_data_path = os.path.join(data_path, 'train')\n", 49 | "val_data_path = os.path.join(data_path, 'val')\n", 50 | "test_data_path = os.path.join(data_path, 'test')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "{0: 'bart_simpson',\n", 62 | " 1: 'charles_montgomery_burns',\n", 63 | " 2: 'homer_simpson',\n", 64 | " 3: 'krusty_the_clown',\n", 65 | " 4: 'lisa_simpson',\n", 66 | " 5: 'marge_simpson',\n", 67 | " 6: 'milhouse_van_houten',\n", 68 | " 7: 'moe_szyslak',\n", 69 | " 8: 'ned_flanders',\n", 70 | " 9: 'principal_skinner'}" 71 | ] 72 | }, 73 | "execution_count": 5, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "train_classes = dict()\n", 80 | "\n", 81 | "for path in sorted(os.listdir(train_data_path)):\n", 82 | " if os.path.isdir(os.path.join(train_data_path, path)):\n", 83 | " train_classes.setdefault(len(train_classes), path)\n", 84 | " \n", 85 | "train_classes" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 6, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "{0: 'bart_simpson',\n", 97 | " 1: 'charles_montgomery_burns',\n", 98 | " 2: 'homer_simpson',\n", 99 | " 3: 'krusty_the_clown',\n", 100 | " 4: 'lisa_simpson',\n", 101 | " 5: 'marge_simpson',\n", 102 | " 6: 'milhouse_van_houten',\n", 103 | " 7: 'moe_szyslak',\n", 104 | " 8: 'ned_flanders',\n", 105 | " 9: 'principal_skinner'}" 106 | ] 107 | }, 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "val_classes = dict()\n", 115 | "\n", 116 | "for path in sorted(os.listdir(val_data_path)):\n", 117 | " if os.path.isdir(os.path.join(val_data_path, path)):\n", 118 | " val_classes.setdefault(len(val_classes), path)\n", 119 | " \n", 120 | "val_classes" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 7, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "{0: 'bart_simpson',\n", 132 | " 1: 'charles_montgomery_burns',\n", 133 | " 2: 'homer_simpson',\n", 134 | " 3: 'krusty_the_clown',\n", 135 | " 4: 'lisa_simpson',\n", 136 | " 5: 'marge_simpson',\n", 137 | " 6: 'milhouse_van_houten',\n", 138 | " 7: 'moe_szyslak',\n", 139 | " 8: 'ned_flanders',\n", 140 | " 9: 'principal_skinner'}" 141 | ] 142 | }, 143 | "execution_count": 7, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "test_classes = dict()\n", 150 | "\n", 151 | "for path in sorted(os.listdir(test_data_path)):\n", 152 | " if os.path.isdir(os.path.join(test_data_path, path)):\n", 153 | " test_classes.setdefault(len(test_classes), path)\n", 154 | " \n", 155 | "test_classes" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "device(type='cuda')" 167 | ] 168 | 
}, 169 | "execution_count": 8, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "import torch\n", 176 | "\n", 177 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 178 | "device" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 9, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "'GeForce GTX 1070'" 190 | ] 191 | }, 192 | "execution_count": 9, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "torch.cuda.get_device_name(0)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 10, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "torch.cuda.empty_cache()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 11, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "from torchvision import transforms as T" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 12, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "train_transform = T.Compose([\n", 226 | " T.Resize((32,32)),\n", 227 | " T.ToTensor(),\n", 228 | " T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n", 229 | "])" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 13, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "val_transform = T.Compose([\n", 239 | " T.Resize((32,32)),\n", 240 | " T.ToTensor(),\n", 241 | " T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n", 242 | "])" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 14, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "test_transform = T.Compose([\n", 252 | " T.Resize((32,32)),\n", 253 | " T.ToTensor(),\n", 254 | " T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n", 255 | "])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 15, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "from torchvision.datasets import ImageFolder" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 16, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "train_dataset = ImageFolder(\n", 274 | " root=train_data_path,\n", 275 | " transform=train_transform\n", 276 | ")" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 17, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "Dataset ImageFolder\n", 288 | " Number of datapoints: 6500\n", 289 | " Root location: ../dataset/train\n", 290 | " StandardTransform\n", 291 | "Transform: Compose(\n", 292 | " Resize(size=(32, 32), interpolation=bilinear, max_size=None, antialias=None)\n", 293 | " ToTensor()\n", 294 | " Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n", 295 | " )" 296 | ] 297 | }, 298 | "execution_count": 17, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "train_dataset" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 18, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "val_dataset = ImageFolder(\n", 314 | " root=val_data_path,\n", 315 | " transform=val_transform\n", 316 | ")" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 19, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 
| "text/plain": [ 327 | "Dataset ImageFolder\n", 328 | " Number of datapoints: 1500\n", 329 | " Root location: ../dataset/val\n", 330 | " StandardTransform\n", 331 | "Transform: Compose(\n", 332 | " Resize(size=(32, 32), interpolation=bilinear, max_size=None, antialias=None)\n", 333 | " ToTensor()\n", 334 | " Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n", 335 | " )" 336 | ] 337 | }, 338 | "execution_count": 19, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "val_dataset" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 20, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "test_dataset = ImageFolder(\n", 354 | " root=test_data_path,\n", 355 | " transform=test_transform\n", 356 | ")" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 21, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "Dataset ImageFolder\n", 368 | " Number of datapoints: 2000\n", 369 | " Root location: ../dataset/test\n", 370 | " StandardTransform\n", 371 | "Transform: Compose(\n", 372 | " Resize(size=(32, 32), interpolation=bilinear, max_size=None, antialias=None)\n", 373 | " ToTensor()\n", 374 | " Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n", 375 | " )" 376 | ] 377 | }, 378 | "execution_count": 21, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "test_dataset" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 22, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "from torch.utils.data import DataLoader" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 23, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "BATCH_SIZE = 32" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 24, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "train_loader = DataLoader(\n", 412 | " train_dataset,\n", 413 | " batch_size=BATCH_SIZE,\n", 414 | " num_workers=0,\n", 415 | " shuffle=True\n", 416 | ")" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 25, 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "val_loader = DataLoader(\n", 426 | " val_dataset,\n", 427 | " batch_size=BATCH_SIZE,\n", 428 | " num_workers=0,\n", 429 | " shuffle=True\n", 430 | ")" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 26, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "test_loader = DataLoader(\n", 440 | " test_dataset,\n", 441 | " batch_size=BATCH_SIZE,\n", 442 | " num_workers=0,\n", 443 | " shuffle=True\n", 444 | ")" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 27, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "import torch\n", 454 | "import torch.nn as nn\n", 455 | "import torch.nn.functional as F" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 28, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "class SimpleNet(nn.Module):\n", 465 | " def __init__(self):\n", 466 | " super(SimpleNet, self).__init__()\n", 467 | " self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1) # 32 x 32\n", 468 | " self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1) # 32 x 32\n", 469 | " self.conv3 = nn.Conv2d(32, 32, kernel_size=3, padding=1) # 16 x 16\n", 470 | " self.dropout = nn.Dropout(.2)\n", 
471 | " self.fc1 = nn.Linear(16*16*32, 128)\n", 472 | " self.fc2 = nn.Linear(128, 10)\n", 473 | " \n", 474 | " def forward(self, x):\n", 475 | " x = F.relu(self.conv1(x))\n", 476 | " x = F.relu(self.conv2(x))\n", 477 | " x = F.relu(self.conv3(x))\n", 478 | " x = F.max_pool2d(x, 2)\n", 479 | " x = self.dropout(x)\n", 480 | " x = torch.flatten(x, 1)\n", 481 | " x = F.relu(self.fc1(x))\n", 482 | " x = self.dropout(x)\n", 483 | " x = self.fc2(x)\n", 484 | " return x" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 29, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "model = SimpleNet()" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 30, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "data": { 503 | "text/plain": [ 504 | "SimpleNet(\n", 505 | " (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 506 | " (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 507 | " (conv3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 508 | " (dropout): Dropout(p=0.2, inplace=False)\n", 509 | " (fc1): Linear(in_features=8192, out_features=128, bias=True)\n", 510 | " (fc2): Linear(in_features=128, out_features=10, bias=True)\n", 511 | ")" 512 | ] 513 | }, 514 | "execution_count": 30, 515 | "metadata": {}, 516 | "output_type": "execute_result" 517 | } 518 | ], 519 | "source": [ 520 | "model" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 31, 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "data": { 530 | "text/plain": [ 531 | "'cuda'" 532 | ] 533 | }, 534 | "execution_count": 31, 535 | "metadata": {}, 536 | "output_type": "execute_result" 537 | } 538 | ], 539 | "source": [ 540 | "device = 'cuda' if torch.cuda.is_available else 'cpu'\n", 541 | "device" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 32, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "data": { 551 | "text/plain": [ 552 | "SimpleNet(\n", 553 | " (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 554 | " (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 555 | " (conv3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 556 | " (dropout): Dropout(p=0.2, inplace=False)\n", 557 | " (fc1): Linear(in_features=8192, out_features=128, bias=True)\n", 558 | " (fc2): Linear(in_features=128, out_features=10, bias=True)\n", 559 | ")" 560 | ] 561 | }, 562 | "execution_count": 32, 563 | "metadata": {}, 564 | "output_type": "execute_result" 565 | } 566 | ], 567 | "source": [ 568 | "model.to(device)" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 33, 574 | "metadata": {}, 575 | "outputs": [], 576 | "source": [ 577 | "import torch.optim as optim" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 34, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "criterion = nn.CrossEntropyLoss()\n", 587 | "optimizer = optim.Adam(model.parameters())" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 35, 593 | "metadata": {}, 594 | "outputs": [ 595 | { 596 | "name": "stdout", 597 | "output_type": "stream", 598 | "text": [ 599 | "\n", 600 | "Epoch 0\n" 601 | ] 602 | }, 603 | { 604 | "name": "stderr", 605 | "output_type": "stream", 606 | "text": [ 607 | "/home/alvarobartt/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/nn/functional.py:718: UserWarning: Named 
tensors and all their associated APIs are an experimental feature and subject to change. Please do not use them for anything important until they are released as stable. (Triggered internally at /pytorch/c10/core/TensorImpl.h:1156.)\n", 608 | " return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)\n" 609 | ] 610 | }, 611 | { 612 | "name": "stdout", 613 | "output_type": "stream", 614 | "text": [ 615 | "Train Loss: 1.676144320854774, Train Acc: 0.43153846153846154\n", 616 | "\n", 617 | "Epoch 1\n", 618 | "Train Loss: 1.164538860467764, Train Acc: 0.6112307692307692\n", 619 | "\n", 620 | "Epoch 2\n", 621 | "Train Loss: 0.8820871773912357, Train Acc: 0.7158461538461539\n", 622 | "\n", 623 | "Epoch 3\n", 624 | "Train Loss: 0.630218602180481, Train Acc: 0.7883076923076924\n", 625 | "\n", 626 | "Epoch 4\n", 627 | "Train Loss: 0.4446859418021945, Train Acc: 0.8521538461538462\n", 628 | "\n", 629 | "Epoch 5\n", 630 | "Train Loss: 0.3111736298753665, Train Acc: 0.8949230769230769\n", 631 | "\n", 632 | "Epoch 6\n", 633 | "Train Loss: 0.23085217702732636, Train Acc: 0.9246153846153846\n", 634 | "\n", 635 | "Epoch 7\n", 636 | "Train Loss: 0.17592897058450258, Train Acc: 0.9404615384615385\n", 637 | "\n", 638 | "Epoch 8\n", 639 | "Train Loss: 0.15028060822761977, Train Acc: 0.9515384615384616\n", 640 | "\n", 641 | "Epoch 9\n", 642 | "Train Loss: 0.11530854218968978, Train Acc: 0.9603076923076923\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "for epoch in range(10):\n", 648 | " print(f\"\\nEpoch {epoch}\")\n", 649 | " running_loss = .0\n", 650 | " running_corrects = .0\n", 651 | " model.train()\n", 652 | " for inputs, labels in train_loader:\n", 653 | " inputs, labels = inputs.to(device), labels.to(device)\n", 654 | "\n", 655 | " optimizer.zero_grad()\n", 656 | " \n", 657 | " outputs = model(inputs)\n", 658 | " _, preds = torch.max(outputs, 1)\n", 659 | " loss = criterion(outputs, labels)\n", 660 | " loss.backward()\n", 661 | " optimizer.step()\n", 662 | "\n", 663 | " running_loss += loss.item() * inputs.size(0)\n", 664 | " res = torch.sum(preds == labels)\n", 665 | " running_corrects += res\n", 666 | " epoch_loss = running_loss / len(train_dataset)\n", 667 | " epoch_acc = running_corrects.double() / len(train_dataset)\n", 668 | " print(f\"Train Loss: {epoch_loss}, Train Acc: {epoch_acc}\")" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 36, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "data": { 678 | "text/plain": [ 679 | "SimpleNet(\n", 680 | " (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 681 | " (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 682 | " (conv3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 683 | " (dropout): Dropout(p=0.2, inplace=False)\n", 684 | " (fc1): Linear(in_features=8192, out_features=128, bias=True)\n", 685 | " (fc2): Linear(in_features=128, out_features=10, bias=True)\n", 686 | ")" 687 | ] 688 | }, 689 | "execution_count": 36, 690 | "metadata": {}, 691 | "output_type": "execute_result" 692 | } 693 | ], 694 | "source": [ 695 | "model" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": 37, 701 | "metadata": {}, 702 | "outputs": [ 703 | { 704 | "name": "stdout", 705 | "output_type": "stream", 706 | "text": [ 707 | "CPU times: user 3.78 s, sys: 32 ms, total: 3.81 s\n", 708 | "Wall time: 3.81 s\n" 709 | ] 710 | } 711 | ], 712 | "source": [ 713 | "%%time\n", 714 | "\n", 715 | "for inputs, labels 
in test_loader:\n", 716 | " inputs, labels = inputs.to(device), labels.to(device)\n", 717 | "\n", 718 | " with torch.no_grad():\n", 719 | " outputs = model(inputs)\n", 720 | " _, preds = torch.max(outputs, 1)\n", 721 | " loss = criterion(outputs, labels)\n", 722 | "\n", 723 | " running_loss += loss.item() * inputs.size(0)\n", 724 | " running_corrects += torch.sum(preds == labels)\n", 725 | " \n", 726 | "loss = running_loss / len(test_dataset)\n", 727 | "acc = running_corrects.double() / len(test_dataset)" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 38, 733 | "metadata": {}, 734 | "outputs": [ 735 | { 736 | "name": "stdout", 737 | "output_type": "stream", 738 | "text": [ 739 | "Test Loss: 0.11530854218968978, Test Accuracy: 0.9603076923076923\n" 740 | ] 741 | } 742 | ], 743 | "source": [ 744 | "print(f\"Test Loss: {epoch_loss}, Test Accuracy: {epoch_acc}\")" 745 | ] 746 | } 747 | ], 748 | "metadata": { 749 | "kernelspec": { 750 | "display_name": "Python 3", 751 | "language": "python", 752 | "name": "python3" 753 | }, 754 | "language_info": { 755 | "codemirror_mode": { 756 | "name": "ipython", 757 | "version": 3 758 | }, 759 | "file_extension": ".py", 760 | "mimetype": "text/x-python", 761 | "name": "python", 762 | "nbconvert_exporter": "python", 763 | "pygments_lexer": "ipython3", 764 | "version": "3.8.5" 765 | } 766 | }, 767 | "nbformat": 4, 768 | "nbformat_minor": 4 769 | } 770 | -------------------------------------------------------------------------------- /notebooks/pytorch-lightning-model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import torch.nn as nn\n", 11 | "from torch.nn import functional as F\n", 12 | "\n", 13 | "from pytorch_lightning import LightningModule\n", 14 | "from pytorch_lightning.metrics.functional import accuracy\n", 15 | "\n", 16 | "\n", 17 | "class SimpsonsNet(LightningModule):\n", 18 | " def __init__(self):\n", 19 | " super(SimpsonsNet, self).__init__()\n", 20 | " \n", 21 | " self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)\n", 22 | " self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)\n", 23 | " self.conv3 = nn.Conv2d(32, 32, kernel_size=3, padding=1)\n", 24 | " self.dropout = nn.Dropout(.2)\n", 25 | " self.fc1 = nn.Linear(16*16*32, 128)\n", 26 | " self.fc2 = nn.Linear(128, 10)\n", 27 | " \n", 28 | " def forward(self, x):\n", 29 | " x = F.relu(self.conv1(x))\n", 30 | " x = F.relu(self.conv2(x))\n", 31 | " x = F.relu(self.conv3(x))\n", 32 | " x = F.max_pool2d(x, 2)\n", 33 | " x = self.dropout(x)\n", 34 | " x = torch.flatten(x, 1)\n", 35 | " x = F.relu(self.fc1(x))\n", 36 | " x = self.dropout(x)\n", 37 | " x = self.fc2(x)\n", 38 | " return x\n", 39 | "\n", 40 | " def _evaluate(self, batch, batch_idx, stage):\n", 41 | " x, y = batch\n", 42 | " out = self.forward(x)\n", 43 | " # log_softmax + nll_loss = CrossEntropyLoss\n", 44 | " # https://discuss.pytorch.org/t/difference-between-nn-linear-nn-crossentropyloss-and-nn-logsoftmax-nn-nllloss/21634\n", 45 | " logits = F.log_softmax(out, dim=1)\n", 46 | " loss = F.nll_loss(logits, y)\n", 47 | " preds = torch.argmax(logits, dim=1)\n", 48 | " acc = accuracy(preds, y)\n", 49 | "\n", 50 | " self.log(f'{stage}_loss', loss, prog_bar=True)\n", 51 | " self.log(f'{stage}_acc', acc, prog_bar=True)\n", 52 | "\n", 53 | " return loss, acc\n", 54 | " \n", 55 | " def training_step(self, 
batch, batch_idx):\n", 56 | " loss, acc = self._evaluate(batch, batch_idx, 'train')\n", 57 | " return loss\n", 58 | "\n", 59 | " def validation_step(self, batch, batch_idx):\n", 60 | " self._evaluate(batch, batch_idx, 'val')\n", 61 | "\n", 62 | " def test_step(self, batch, batch_idx):\n", 63 | " self._evaluate(batch, batch_idx, 'test')\n", 64 | "\n", 65 | " def configure_optimizers(self):\n", 66 | " return torch.optim.Adam(self.parameters())" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 2, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "model = SimpsonsNet()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "SimpsonsNet(\n", 87 | " (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 88 | " (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 89 | " (conv3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", 90 | " (dropout): Dropout(p=0.2, inplace=False)\n", 91 | " (fc1): Linear(in_features=8192, out_features=128, bias=True)\n", 92 | " (fc2): Linear(in_features=128, out_features=10, bias=True)\n", 93 | ")" 94 | ] 95 | }, 96 | "execution_count": 3, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "model" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 4, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "x = torch.randn(1, 3, 32, 32)\n", 112 | "with torch.no_grad():\n", 113 | " y = model(x)" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.8.5" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | -------------------------------------------------------------------------------- /notebooks/pytorch-lightning-training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from torchvision import transforms as T\n", 10 | "\n", 11 | "class SimpsonsTransforms(T.Compose):\n", 12 | " def __init__(self, phase):\n", 13 | " self.phase = phase\n", 14 | " self.transforms = {\n", 15 | " 'train': [\n", 16 | " T.Resize((32, 32)),\n", 17 | " T.RandomHorizontalFlip(),\n", 18 | " T.ToTensor(),\n", 19 | " T.Normalize(\n", 20 | " mean=[0.485, 0.456, 0.406],\n", 21 | " std=[0.229, 0.224, 0.225]\n", 22 | " )\n", 23 | " ],\n", 24 | " 'val': [\n", 25 | " T.Resize((32, 32)),\n", 26 | " T.ToTensor(),\n", 27 | " T.Normalize(\n", 28 | " mean=[0.485, 0.456, 0.406],\n", 29 | " std=[0.229, 0.224, 0.225]\n", 30 | " )\n", 31 | " ],\n", 32 | " 'test': [\n", 33 | " T.Resize((32, 32)),\n", 34 | " T.ToTensor(),\n", 35 | " T.Normalize(\n", 36 | " mean=[0.485, 0.456, 0.406],\n", 37 | " std=[0.229, 0.224, 0.225]\n", 38 | " )\n", 39 | " ]\n", 40 | " }\n", 41 | " \n", 42 | " super().__init__(self.transforms[self.phase])" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": 
[], 50 | "source": [ 51 | "from torchvision.datasets import ImageFolder\n", 52 | "\n", 53 | "class SimpsonsImageFolder(ImageFolder):\n", 54 | " def __init__(self, root, phase):\n", 55 | " self.root = f\"{root}/{phase}\"\n", 56 | " self.phase = phase\n", 57 | " self.transform = SimpsonsTransforms(phase=phase)\n", 58 | " \n", 59 | " super().__init__(self.root, self.transform)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from pytorch_lightning import LightningDataModule\n", 69 | "\n", 70 | "from torch.utils.data import DataLoader\n", 71 | "\n", 72 | "class SimpsonsDataModule(LightningDataModule):\n", 73 | " def __init__(self, dataset_path, batch_size):\n", 74 | " super().__init__()\n", 75 | " self.dataset_path = dataset_path\n", 76 | " self.batch_size = batch_size\n", 77 | " \n", 78 | " def train_dataloader(self):\n", 79 | " self.train_imagefolder = SimpsonsImageFolder(root=self.dataset_path, \n", 80 | " phase='train')\n", 81 | " return DataLoader(dataset=self.train_imagefolder,\n", 82 | " batch_size=self.batch_size,\n", 83 | " num_workers=12, shuffle=True)\n", 84 | " \n", 85 | " def val_dataloader(self):\n", 86 | " self.val_imagefolder = SimpsonsImageFolder(root=self.dataset_path,\n", 87 | " phase='val')\n", 88 | " return DataLoader(dataset=self.val_imagefolder,\n", 89 | " batch_size=self.batch_size,\n", 90 | " num_workers=12)\n", 91 | " \n", 92 | " def test_dataloader(self):\n", 93 | " self.test_imagefolder = SimpsonsImageFolder(root=self.dataset_path,\n", 94 | " phase='test')\n", 95 | " return DataLoader(dataset=self.test_imagefolder,\n", 96 | " batch_size=self.batch_size,\n", 97 | " num_workers=12)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "import torch\n", 107 | "import torch.nn as nn\n", 108 | "from torch.nn import functional as F\n", 109 | "\n", 110 | "from pytorch_lightning import LightningModule\n", 111 | "from pytorch_lightning.metrics.functional import accuracy\n", 112 | "\n", 113 | "\n", 114 | "class SimpsonsNet(LightningModule):\n", 115 | " def __init__(self):\n", 116 | " super(SimpsonsNet, self).__init__()\n", 117 | " \n", 118 | " self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)\n", 119 | " self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)\n", 120 | " self.conv3 = nn.Conv2d(32, 32, kernel_size=3, padding=1)\n", 121 | " self.dropout = nn.Dropout(.2)\n", 122 | " self.fc1 = nn.Linear(16*16*32, 128)\n", 123 | " self.fc2 = nn.Linear(128, 10)\n", 124 | " \n", 125 | " def forward(self, x):\n", 126 | " x = F.relu(self.conv1(x))\n", 127 | " x = F.relu(self.conv2(x))\n", 128 | " x = F.relu(self.conv3(x))\n", 129 | " x = F.max_pool2d(x, 2)\n", 130 | " x = self.dropout(x)\n", 131 | " x = torch.flatten(x, 1)\n", 132 | " x = F.relu(self.fc1(x))\n", 133 | " x = self.dropout(x)\n", 134 | " x = self.fc2(x)\n", 135 | " return x\n", 136 | "\n", 137 | " def _evaluate(self, batch, batch_idx, stage):\n", 138 | " x, y = batch\n", 139 | " out = self.forward(x)\n", 140 | " logits = F.log_softmax(out, dim=1)\n", 141 | " loss = F.nll_loss(logits, y)\n", 142 | " preds = torch.argmax(logits, dim=1)\n", 143 | " acc = accuracy(preds, y)\n", 144 | "\n", 145 | " self.log(f'{stage}_loss', loss, prog_bar=True)\n", 146 | " self.log(f'{stage}_acc', acc, prog_bar=True)\n", 147 | "\n", 148 | " return loss, acc\n", 149 | " \n", 150 | " def training_step(self, batch, batch_idx):\n", 151 | " loss, acc = 
self._evaluate(batch, batch_idx, 'train')\n", 152 | " return loss\n", 153 | "\n", 154 | " def validation_step(self, batch, batch_idx):\n", 155 | " self._evaluate(batch, batch_idx, 'val')\n", 156 | "\n", 157 | " def test_step(self, batch, batch_idx):\n", 158 | " self._evaluate(batch, batch_idx, 'test')\n", 159 | "\n", 160 | " def configure_optimizers(self):\n", 161 | " return torch.optim.Adam(self.parameters())" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "data_module = SimpsonsDataModule(dataset_path=\"../dataset\", batch_size=32)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 6, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "train_loader = data_module.train_dataloader()\n", 180 | "val_loader = data_module.val_dataloader()\n", 181 | "test_loader = data_module.test_dataloader()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 7, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "model = SimpsonsNet()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 8, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stderr", 200 | "output_type": "stream", 201 | "text": [ 202 | "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33malvarobartt\u001b[0m (use `wandb login --relogin` to force relogin)\n" 203 | ] 204 | }, 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "True" 209 | ] 210 | }, 211 | "execution_count": 8, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "import wandb\n", 218 | "wandb.login(project=\"ml-monitoring-with-wandb\", entity=\"alvarobartt\")" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 9, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "from pytorch_lightning.loggers import WandbLogger\n", 228 | "\n", 229 | "wandb_logger = WandbLogger()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 10, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stderr", 239 | "output_type": "stream", 240 | "text": [ 241 | "GPU available: True, used: True\n", 242 | "TPU available: None, using: 0 TPU cores\n" 243 | ] 244 | }, 245 | { 246 | "data": { 247 | "text/html": [ 248 | "\n", 249 | " Tracking run with wandb version 0.10.21
\n", 250 | " Syncing run fine-cloud-15 to Weights & Biases (Documentation).
\n", 251 | " Project page: https://wandb.ai/alvarobartt/ml-monitoring-with-wandb-notebooks
\n", 252 | " Run page: https://wandb.ai/alvarobartt/ml-monitoring-with-wandb-notebooks/runs/1wtc1d1b
\n", 253 | " Run data is saved locally in /home/alvarobartt/Desktop/projects/ml-monitoring-with-wandb/notebooks/wandb/run-20210303_183341-1wtc1d1b

\n", 254 | " " 255 | ], 256 | "text/plain": [ 257 | "" 258 | ] 259 | }, 260 | "metadata": {}, 261 | "output_type": "display_data" 262 | }, 263 | { 264 | "name": "stderr", 265 | "output_type": "stream", 266 | "text": [ 267 | "\n", 268 | " | Name | Type | Params\n", 269 | "------------------------------------------\n", 270 | "0 | sequential | Sequential | 539 K \n", 271 | "------------------------------------------\n", 272 | "539 K Trainable params\n", 273 | "0 Non-trainable params\n", 274 | "539 K Total params\n", 275 | "2.157 Total estimated model params size (MB)\n" 276 | ] 277 | }, 278 | { 279 | "data": { 280 | "application/vnd.jupyter.widget-view+json": { 281 | "model_id": "7cf2435be0ef421597ea508c4b0d3ac2", 282 | "version_major": 2, 283 | "version_minor": 0 284 | }, 285 | "text/plain": [ 286 | "HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…" 287 | ] 288 | }, 289 | "metadata": {}, 290 | "output_type": "display_data" 291 | }, 292 | { 293 | "data": { 294 | "application/vnd.jupyter.widget-view+json": { 295 | "model_id": "1263fa12dc2145998396156000225bba", 296 | "version_major": 2, 297 | "version_minor": 0 298 | }, 299 | "text/plain": [ 300 | "HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…" 301 | ] 302 | }, 303 | "metadata": {}, 304 | "output_type": "display_data" 305 | }, 306 | { 307 | "data": { 308 | "application/vnd.jupyter.widget-view+json": { 309 | "model_id": "", 310 | "version_major": 2, 311 | "version_minor": 0 312 | }, 313 | "text/plain": [ 314 | "HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 315 | ] 316 | }, 317 | "metadata": {}, 318 | "output_type": "display_data" 319 | }, 320 | { 321 | "data": { 322 | "application/vnd.jupyter.widget-view+json": { 323 | "model_id": "", 324 | "version_major": 2, 325 | "version_minor": 0 326 | }, 327 | "text/plain": [ 328 | "HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 329 | ] 330 | }, 331 | "metadata": {}, 332 | "output_type": "display_data" 333 | }, 334 | { 335 | "data": { 336 | "application/vnd.jupyter.widget-view+json": { 337 | "model_id": "", 338 | "version_major": 2, 339 | "version_minor": 0 340 | }, 341 | "text/plain": [ 342 | "HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 343 | ] 344 | }, 345 | "metadata": {}, 346 | "output_type": "display_data" 347 | }, 348 | { 349 | "data": { 350 | "application/vnd.jupyter.widget-view+json": { 351 | "model_id": "", 352 | "version_major": 2, 353 | "version_minor": 0 354 | }, 355 | "text/plain": [ 356 | "HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 357 | ] 358 | }, 359 | "metadata": {}, 360 | "output_type": "display_data" 361 | }, 362 | { 363 | "data": { 364 | "application/vnd.jupyter.widget-view+json": { 365 | "model_id": "", 366 | "version_major": 2, 367 | "version_minor": 0 368 | }, 369 | "text/plain": [ 370 | "HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 371 | ] 372 | }, 373 | "metadata": {}, 374 | "output_type": "display_data" 375 | }, 376 | { 377 | "data": { 378 | "application/vnd.jupyter.widget-view+json": { 379 | "model_id": "", 380 | "version_major": 2, 381 | "version_minor": 0 382 | }, 383 | "text/plain": [ 384 | 
"HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 385 | ] 386 | }, 387 | "metadata": {}, 388 | "output_type": "display_data" 389 | }, 390 | { 391 | "data": { 392 | "application/vnd.jupyter.widget-view+json": { 393 | "model_id": "", 394 | "version_major": 2, 395 | "version_minor": 0 396 | }, 397 | "text/plain": [ 398 | "HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 399 | ] 400 | }, 401 | "metadata": {}, 402 | "output_type": "display_data" 403 | }, 404 | { 405 | "data": { 406 | "application/vnd.jupyter.widget-view+json": { 407 | "model_id": "", 408 | "version_major": 2, 409 | "version_minor": 0 410 | }, 411 | "text/plain": [ 412 | "HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 413 | ] 414 | }, 415 | "metadata": {}, 416 | "output_type": "display_data" 417 | }, 418 | { 419 | "data": { 420 | "application/vnd.jupyter.widget-view+json": { 421 | "model_id": "", 422 | "version_major": 2, 423 | "version_minor": 0 424 | }, 425 | "text/plain": [ 426 | "HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 427 | ] 428 | }, 429 | "metadata": {}, 430 | "output_type": "display_data" 431 | }, 432 | { 433 | "data": { 434 | "application/vnd.jupyter.widget-view+json": { 435 | "model_id": "", 436 | "version_major": 2, 437 | "version_minor": 0 438 | }, 439 | "text/plain": [ 440 | "HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…" 441 | ] 442 | }, 443 | "metadata": {}, 444 | "output_type": "display_data" 445 | }, 446 | { 447 | "name": "stdout", 448 | "output_type": "stream", 449 | "text": [ 450 | "\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "import pytorch_lightning as pl\n", 456 | "\n", 457 | "trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=10, max_epochs=10, logger=wandb_logger)\n", 458 | "trainer.fit(model, train_loader, val_loader);" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 11, 464 | "metadata": {}, 465 | "outputs": [ 466 | { 467 | "data": { 468 | "application/vnd.jupyter.widget-view+json": { 469 | "model_id": "9cf879b2e38b45069b0295d80dc28e20", 470 | "version_major": 2, 471 | "version_minor": 0 472 | }, 473 | "text/plain": [ 474 | "HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…" 475 | ] 476 | }, 477 | "metadata": {}, 478 | "output_type": "display_data" 479 | }, 480 | { 481 | "name": "stdout", 482 | "output_type": "stream", 483 | "text": [ 484 | "\n", 485 | "--------------------------------------------------------------------------------\n", 486 | "DATALOADER:0 TEST RESULTS\n", 487 | "{'test_acc': 0.7515000104904175, 'test_loss': 0.9986332654953003}\n", 488 | "--------------------------------------------------------------------------------\n" 489 | ] 490 | } 491 | ], 492 | "source": [ 493 | "trainer.test(model, test_loader);" 494 | ] 495 | } 496 | ], 497 | "metadata": { 498 | "kernelspec": { 499 | "display_name": "Python 3", 500 | "language": "python", 501 | "name": "python3" 502 | }, 503 | "language_info": { 504 | "codemirror_mode": { 505 | "name": "ipython", 506 | "version": 3 507 | }, 508 | "file_extension": ".py", 509 | "mimetype": "text/x-python", 510 | "name": "python", 511 | "nbconvert_exporter": "python", 512 | "pygments_lexer": "ipython3", 513 | "version": "3.8.5" 514 | 
}
515 | },
516 | "nbformat": 4,
517 | "nbformat_minor": 4
518 | }
519 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.9.0
2 | torchvision==0.10.0
3 | pytorch_lightning==1.3.7.post0
4 | torchmetrics==0.3.2
5 | wandb==0.10.32
--------------------------------------------------------------------------------
/src/README.md:
--------------------------------------------------------------------------------
1 | # :computer: Code
2 | 
3 | This directory contains all the source files required to train the model and reproduce
4 | what was explained in the [README.md](https://github.com/alvarobartt/ml-monitoring-with-wandb/blob/master/README.md),
5 | with just minor changes required.
6 | 
7 | All the code and files are pretty self-explanatory, so this section
8 | does not go into much detail.
9 | 
10 | To train the presented PyTorch Lightning model with its trainer and monitor it with Weights & Biases,
11 | just run the following command:
12 | 
13 | ```bash
14 | python train.py --batch-size 32 --epochs 10
15 | ```
16 | 
17 | :pushpin: __Note__. You can freely tweak those parameters or add new ones, since the code
18 | is simple and easy to adapt to your own scenario (see the short sketch at the end of this document).
--------------------------------------------------------------------------------
/src/inference.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Alvaro Bartolome, alvarobartt @ GitHub
2 | # See LICENSE for details.
3 | 
4 | from __future__ import absolute_import
5 | 
6 | import torch
7 | 
8 | from model import SimpsonsNet
9 | from mnist import SimpsonsMNISTDataModule, IDX2CLASS
10 | 
11 | 
12 | def test_model_inference():
13 |     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
14 | 
15 |     # `load_from_checkpoint` is a classmethod, so there is no need to instantiate the model first
16 |     model = SimpsonsNet.load_from_checkpoint('wandb/latest-run/files/ml-monitoring-with-wandb/2fzailcj/checkpoints/epoch=18-step=3875.ckpt')
17 |     model = model.to(device)
18 |     model.eval()
19 | 
20 |     data = SimpsonsMNISTDataModule(dataset_path="../dataset", batch_size=5)
21 |     test_data = data.test_dataloader()
22 |     x, target = next(iter(test_data))
23 |     x = x.to(device)
24 | 
25 |     with torch.no_grad():
26 |         y = model(x)
27 | 
28 |     # Print the predicted and the actual class names for the sampled batch
29 |     pred = torch.argmax(y, dim=1)
30 |     print([IDX2CLASS[idx] for idx in pred.cpu().numpy()])
31 |     print([IDX2CLASS[idx] for idx in target.numpy()])
32 | 
33 | 
34 | if __name__ == '__main__':
35 |     test_model_inference()
36 | 
--------------------------------------------------------------------------------
/src/mnist.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Alvaro Bartolome, alvarobartt @ GitHub
2 | # See LICENSE for details.
3 | 
4 | 
5 | from torchvision import transforms as T
6 | from torchvision.datasets import ImageFolder
7 | 
8 | from torch.utils.data import DataLoader
9 | 
10 | from pytorch_lightning import LightningDataModule
11 | 
12 | 
13 | class SimpsonsMNISTTransforms(T.Compose):
14 |     def __init__(self, phase):
15 |         self.phase = phase
16 |         self.transforms = {
17 |             'train': [
18 |                 T.Resize((32, 32)),
19 |                 T.RandomHorizontalFlip(),
20 |                 T.ToTensor(),
21 |                 T.Normalize(  # ImageNet mean/std statistics
22 |                     mean=[0.485, 0.456, 0.406],
23 |                     std=[0.229, 0.224, 0.225]
24 |                 )
25 |             ],
26 |             'val': [
27 |                 T.Resize((32, 32)),
28 |                 T.ToTensor(),
29 |                 T.Normalize(
30 |                     mean=[0.485, 0.456, 0.406],
31 |                     std=[0.229, 0.224, 0.225]
32 |                 )
33 |             ],
34 |             'test': [
35 |                 T.Resize((32, 32)),
36 |                 T.ToTensor(),
37 |                 T.Normalize(
38 |                     mean=[0.485, 0.456, 0.406],
39 |                     std=[0.229, 0.224, 0.225]
40 |                 )
41 |             ]
42 |         }
43 | 
44 |         super().__init__(self.transforms[self.phase])
45 | 
46 | 
47 | class SimpsonsMNISTImageFolder(ImageFolder):
48 |     def __init__(self, root, phase):
49 |         super().__init__(root=f"{root}/{phase}")
50 |         self.phase = phase
51 |         self.transform = SimpsonsMNISTTransforms(phase=phase)  # applied lazily in __getitem__, so assigning after super().__init__() is fine
52 | 
53 | 
54 | class SimpsonsMNISTDataModule(LightningDataModule):
55 |     def __init__(self, dataset_path, batch_size):
56 |         super().__init__()
57 |         self.dataset_path = dataset_path
58 |         self.batch_size = batch_size
59 | 
60 |     def train_dataloader(self):
61 |         self.train_image_folder = SimpsonsMNISTImageFolder(root=self.dataset_path, phase='train')
62 | 
63 |         return DataLoader(dataset=self.train_image_folder,
64 |                           batch_size=self.batch_size,
65 |                           num_workers=12, shuffle=True)
66 | 
67 |     def val_dataloader(self):
68 |         self.val_image_folder = SimpsonsMNISTImageFolder(root=self.dataset_path, phase='val')
69 | 
70 |         return DataLoader(dataset=self.val_image_folder,
71 |                           batch_size=self.batch_size,
72 |                           num_workers=12)
73 | 
74 |     def test_dataloader(self):
75 |         self.test_image_folder = SimpsonsMNISTImageFolder(root=self.dataset_path, phase='test')
76 | 
77 |         return DataLoader(dataset=self.test_image_folder,
78 |                           batch_size=self.batch_size,
79 |                           num_workers=12, shuffle=True)  # shuffled so that inference.py draws a varied batch
80 | 
81 | 
82 | CLASS2IDX = {
83 |     'bart_simpson': 0,
84 |     'charles_montgomery_burns': 1,
85 |     'homer_simpson': 2,
86 |     'krusty_the_clown': 3,
87 |     'lisa_simpson': 4,
88 |     'marge_simpson': 5,
89 |     'milhouse_van_houten': 6,
90 |     'moe_szyslak': 7,
91 |     'ned_flanders': 8,
92 |     'principal_skinner': 9
93 | }
94 | 
95 | IDX2CLASS = {v: k for k, v in CLASS2IDX.items()}
96 | 
--------------------------------------------------------------------------------
/src/model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Alvaro Bartolome, alvarobartt @ GitHub
2 | # See LICENSE for details.
3 | 4 | import torch 5 | import torch.nn as nn 6 | from torch.nn import functional as F 7 | 8 | from pytorch_lightning import LightningModule 9 | from torchmetrics.functional import accuracy # LightningDeprecationWarning: `pytorch_lightning.metrics.*` module has been renamed to `torchmetrics.*` 10 | 11 | 12 | class SimpsonsNet(LightningModule): 13 | def __init__(self, num_classes: int = 10): 14 | super(SimpsonsNet, self).__init__() 15 | 16 | self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1) 17 | self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1) 18 | self.conv3 = nn.Conv2d(32, 32, kernel_size=3, padding=1) 19 | 20 | self.maxpool = nn.MaxPool2d(2) 21 | self.dropout = nn.Dropout(.5) 22 | 23 | self.fc1 = nn.Linear(16*16*32, 64) 24 | self.fc2 = nn.Linear(64, num_classes) 25 | 26 | def forward(self, x): 27 | x = F.relu(self.conv1(x)) 28 | x = F.relu(self.conv2(x)) 29 | x = F.relu(self.conv3(x)) 30 | x = self.maxpool(x) 31 | x = x.view(x.size(0), -1) 32 | x = F.relu(self.fc1(x)) 33 | x = self.dropout(x) 34 | x = F.log_softmax(self.fc2(x), dim=1) 35 | return x 36 | 37 | def training_step(self, batch, batch_idx): 38 | x, y = batch 39 | logits = self(x) 40 | loss = F.nll_loss(logits, y) 41 | 42 | preds = torch.argmax(logits, dim=1) 43 | acc = accuracy(preds, y) 44 | 45 | self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True) 46 | self.log('train_acc', acc, on_step=True, on_epoch=True, logger=True) 47 | 48 | return loss 49 | 50 | def validation_step(self, batch, batch_idx): 51 | x, y = batch 52 | logits = self(x) 53 | loss = F.nll_loss(logits, y) 54 | 55 | preds = torch.argmax(logits, dim=1) 56 | acc = accuracy(preds, y) 57 | 58 | self.log('val_loss', loss, prog_bar=True) 59 | self.log('val_acc', acc, prog_bar=True) 60 | 61 | return loss 62 | 63 | def test_step(self, batch, batch_idx): 64 | x, y = batch 65 | logits = self(x) 66 | loss = F.nll_loss(logits, y) 67 | 68 | preds = torch.argmax(logits, dim=1) 69 | acc = accuracy(preds, y) 70 | 71 | self.log('test_loss', loss, prog_bar=True) 72 | self.log('test_acc', acc, prog_bar=True) 73 | 74 | return loss 75 | 76 | def configure_optimizers(self): 77 | return torch.optim.Adam(self.parameters()) 78 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Alvaro Bartolome, alvarobartt @ GitHub 2 | # See LICENSE for details. 
3 | 
4 | from __future__ import absolute_import
5 | 
6 | import click
7 | 
8 | import wandb
9 | 
10 | import pytorch_lightning as pl
11 | from pytorch_lightning.loggers import WandbLogger
12 | 
13 | import torch
14 | 
15 | from mnist import SimpsonsMNISTDataModule
16 | from model import SimpsonsNet
17 | 
18 | 
19 | @click.command()
20 | @click.option('-b', '--batch-size', required=True, type=int)
21 | @click.option('-e', '--epochs', required=True, type=int)
22 | def train(batch_size: int, epochs: int):
23 |     """Trains the PyTorch Lightning model and monitors it with Weights & Biases"""
24 | 
25 |     # Instantiate the model
26 |     model = SimpsonsNet()
27 | 
28 |     # Make sure that the Tensor shapes for the model's input/output work as expected
29 |     x = torch.randn(1, 3, 32, 32)
30 |     with torch.no_grad():
31 |         y = model(x)
32 |     assert y.shape == torch.Size([1, 10])
33 | 
34 |     # Instantiate the LightningDataModule
35 |     data_module = SimpsonsMNISTDataModule(dataset_path="../dataset", batch_size=batch_size)
36 | 
37 |     # Load the DataLoaders for both the train and validation datasets
38 |     train_loader = data_module.train_dataloader()
39 |     val_loader = data_module.val_dataloader()
40 | 
41 |     # Create the configuration of the current run
42 |     wandb_config = {
43 |         'batch_size': batch_size,
44 |         'epochs': epochs,
45 |         'layers': len(list(filter(lambda param: param.requires_grad and len(param.data.size()) > 1, model.parameters()))),
46 |         'parameters': sum(param.numel() for param in model.parameters() if param.requires_grad),
47 |         'train_batches': len(train_loader),
48 |         'val_batches': len(val_loader),
49 |         'dataset': 'Simpsons-MNIST',
50 |         'dataset_train_size': len(data_module.train_image_folder),
51 |         'dataset_val_size': len(data_module.val_image_folder),
52 |         'input_shape': '[3,32,32]',
53 |         'channels_last': False,
54 |         'criterion': 'CrossEntropyLoss',  # the model applies log_softmax + NLLLoss, which is equivalent
55 |         'optimizer': 'Adam'
56 |     }
57 | 
58 |     # Init the PyTorch Lightning WandbLogger (you need to `wandb login` first!)
59 |     wandb_logger = WandbLogger(project='ml-monitoring-with-wandb', job_type='train', config=wandb_config)
60 | 
61 |     # Instantiate the PyTorch Lightning Trainer and fit the model
62 |     trainer = pl.Trainer(gpus=1, progress_bar_refresh_rate=10, max_epochs=epochs, logger=wandb_logger)
63 |     trainer.fit(model, train_loader, val_loader)
64 | 
65 |     # Close wandb run
66 |     wandb.finish()
67 | 
68 | 
69 | if __name__ == '__main__':
70 |     train()
71 | 
--------------------------------------------------------------------------------
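Below is a minimal, hypothetical sketch of the kind of tweak suggested in the note of `src/README.md`: exposing the learning rate as a new `--lr` option in `train.py` and forwarding it to the optimizer. The `--lr` option, the `SimpsonsNetWithLR` variant, and its `lr` argument are illustrative assumptions, not part of the repository; `train.py` currently relies on Adam's default learning rate.

```python
# Hypothetical sketch, not part of the repository: it mirrors the click
# pattern of src/train.py while adding a learning-rate option.
import click
import torch

from pytorch_lightning import LightningModule


class SimpsonsNetWithLR(LightningModule):
    """Assumed variant of SimpsonsNet whose optimizer takes a configurable learning rate."""

    def __init__(self, lr: float = 1e-3):
        super().__init__()
        self.lr = lr
        # ... same convolutional/linear layers and steps as SimpsonsNet ...

    def configure_optimizers(self):
        # Adam now uses the CLI-provided learning rate instead of its default (1e-3)
        return torch.optim.Adam(self.parameters(), lr=self.lr)


@click.command()
@click.option('-b', '--batch-size', required=True, type=int)
@click.option('-e', '--epochs', required=True, type=int)
@click.option('--lr', default=1e-3, type=float, show_default=True)  # new, hypothetical option
def train(batch_size: int, epochs: int, lr: float):
    model = SimpsonsNetWithLR(lr=lr)
    # ... the rest would stay as in train.py; adding `lr` to `wandb_config`
    # also makes the value visible in the W&B run configuration.


if __name__ == '__main__':
    train()
```

Invoking it as `python train.py --batch-size 32 --epochs 10 --lr 0.0005` would then train with the smaller step size while keeping the original defaults otherwise.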