├── .gitignore ├── .notebooks-setup └── get-started.bash ├── LICENSE ├── README.md ├── day1 ├── 01-pytorch-test-setup.ipynb ├── 02-pytorch-mnist-mlp.ipynb ├── 03-pytorch-mnist-cnn.ipynb ├── 04a-pytorch-imdb-rnn.ipynb ├── 04b-pytorch-imdb-huggingface.ipynb ├── README.md └── solutions │ ├── pytorch-imdb-rnn-example-answer.py │ ├── pytorch-mnist-cnn-example-answer.py │ └── pytorch-mnist-mlp-example-answer.py ├── day2 ├── Exercise_5.md ├── Exercise_6.md ├── Exercise_7.md ├── Exercise_8.md ├── README.md ├── imgs │ ├── avp.png │ ├── dvc.png │ ├── gtsrb-montage.png │ └── traffic-signs.png ├── logs │ └── .gitignore ├── pytorch_20ng_bert.py ├── pytorch_20ng_cnn.py ├── pytorch_20ng_rnn.py ├── pytorch_dvc_cnn_pretrained.py ├── pytorch_dvc_cnn_pretrained_multigpu.py ├── pytorch_dvc_cnn_simple.py ├── pytorch_dvc_vit.py ├── pytorch_generate_gpt.ipynb ├── pytorch_gtsrb_cnn_pretrained.py ├── pytorch_gtsrb_cnn_simple.py ├── pytorch_gtsrb_vit.py ├── pytorch_imdb_gpt.py ├── pytorch_imdb_gpt_multigpu.py ├── pytorch_test.py ├── run-2gpus-torchrun.sh ├── run-2gpus.sh ├── run-8gpus.sh └── run.sh └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | day2/*.png 2 | day2/*.out 3 | day2/*.h5 4 | day2/*.pt 5 | .ipynb_checkpoints 6 | *~ 7 | data/ 8 | __pycache__/ 9 | day1/**/model.png 10 | day1/optional/pml_utils.py 11 | day2/mlruns/ 12 | -------------------------------------------------------------------------------- /.notebooks-setup/get-started.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## Script that downloads the code for doing the Deep Learning course exercises 3 | cd /home/jovyan 4 | 5 | # git reflog requires a name and email if user is not in passwd 6 | # even if you're only cloning 7 | export GIT_COMMITTER_NAME=anonymous 8 | export GIT_COMMITTER_EMAIL=anon@localhost 9 | 10 | #git clone -b vuokatti2021 --single-branch https://github.com/csc-training/intro-to-dl.git 11 | git clone https://github.com/csc-training/intro-to-dl 12 | 13 | rmdir work 14 | rm get-started.bash 15 | #pip install imageio h5py tqdm 16 | #pip install scikit-learn 17 | #pip install torchtext 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2024 CSC - IT Center for Science Ltd. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to deep learning 2 | 3 | This repository contains the exercise materials for the [CSC](https://www.csc.fi/) course [Practical deep learning](https://csc.fi/koulutuskalenteri/practical-deep-learning-5/). 4 | 5 | - [Exercises for day 1](day1/README.md) 6 | - [Exercises for day 2](day2/README.md) 7 | -------------------------------------------------------------------------------- /day1/01-pytorch-test-setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Notebook for testing the PyTorch setup\n", 11 | "\n", 12 | "This notebook is for testing the [PyTorch](http://pytorch.org/) setup. Below is a set of required imports. \n", 13 | "\n", 14 | "Run the cell, and no error messages should appear." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true, 24 | "jupyter": { 25 | "outputs_hidden": false 26 | } 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "%matplotlib inline\n", 31 | "\n", 32 | "import torch\n", 33 | "import torch.nn as nn\n", 34 | "import torchvision\n", 35 | "from torch.utils.data import DataLoader\n", 36 | "from torchvision import datasets\n", 37 | "import torchvision.transforms as transforms\n", 38 | "\n", 39 | "from datasets import load_dataset\n", 40 | "from tokenizers import Tokenizer\n", 41 | "from tokenizers import models, trainers, pre_tokenizers, normalizers, processors\n", 42 | "\n", 43 | "from packaging.version import Version as LV\n", 44 | "from tqdm import tqdm\n", 45 | "\n", 46 | "import numpy as np\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "import seaborn as sns\n", 49 | "sns.set()\n", 50 | "\n", 51 | "print('Using PyTorch version:', torch.__version__)\n", 52 | "assert(LV(torch.__version__) >= LV(\"2.0\"))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Let's check if we have GPU available." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "deletable": true, 67 | "editable": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "if torch.cuda.is_available():\n", 72 | " print('Using GPU, device name:', torch.cuda.get_device_name(0))\n", 73 | " device = torch.device('cuda')\n", 74 | "else:\n", 75 | " print('No GPU found, using CPU instead.') \n", 76 | " device = torch.device('cpu')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Tensors in PyTorch\n", 84 | "\n", 85 | "Tensors are data structures that contain vectors, matrices or higher-dimensional arrays. They are similar to NumPy's ndarrays, except that PyTorch tensors can also run on GPUs and other hardware accelerators. 
Also check the [PyTorch Tensors tutorial](https://pytorch.org/tutorials/beginner/basics/tensorqs_tutorial.html).\n", 86 | "\n", 87 | "Let's create some tensors and investigate their shapes and data types." 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "x = torch.ones(3, 4)\n", 97 | "print(type(x))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "print(\"x.shape =\",x.shape)\n", 107 | "print(\"x.dtype =\", x.dtype)\n", 108 | "print(\"x =\", x)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "data = [[1, 2, 3],[4, 5, 6]]\n", 118 | "y = torch.tensor(data, dtype=torch.float)\n", 119 | "\n", 120 | "print(\"y.shape =\", y.shape)\n", 121 | "print(\"y.dtype =\", y.dtype)\n", 122 | "print(\"y =\", y)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "### Operations on tensors\n", 130 | "\n", 131 | "There are a lot of built-in [operations that can be run on tensors](https://pytorch.org/docs/stable/torch.html). Let's try matrix multiplication:" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "# This computes the matrix product y x\n", 141 | "z = y.matmul(x)\n", 142 | "\n", 143 | "print(\"z.shape =\", z.shape)\n", 144 | "print(\"z.dtype =\", z.dtype)\n", 145 | "print(\"z =\", z)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### Devices\n", 153 | "\n", 154 | "We mentioned that PyTorch tensors can also be used on GPUs. We can check what device our tensors is on with `x.device`, we can move it to another device with `x.to(device)` where `device` can be defined dynamically based on if we have GPU available or not. We already did this above with code similar to this:\n", 155 | "\n", 156 | "```python\n", 157 | "if torch.cuda.is_available():\n", 158 | " device = torch.device('cuda')\n", 159 | "else:\n", 160 | " device = torch.device('cpu')\n", 161 | "```\n", 162 | "\n", 163 | "If we don't have a GPU the tensor will just stay on the CPU." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "print(\"(before) x.device =\", x.device)\n", 173 | "x = x.to(device)\n", 174 | "print(\"(after) x.device =\", x.device)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "If our tensors are now on the GPU, the matrix multiplication will also take place on the GPU and be much faster (of course not something we would notice in this trivial example)." 
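,
"\n",
"A rough way to actually see the difference is to time a larger multiplication on each device. The sketch below is purely illustrative and not part of the exercises; note that GPU operations run asynchronously, so we call `torch.cuda.synchronize()` before reading the clock:\n",
"\n",
"```python\n",
"import time\n",
"import torch\n",
"\n",
"a = torch.randn(2000, 2000)\n",
"t0 = time.time()\n",
"a.matmul(a)\n",
"print('CPU:', time.time() - t0, 'seconds')\n",
"\n",
"if torch.cuda.is_available():\n",
"    a = a.to('cuda')\n",
"    a.matmul(a)               # warm-up: the first CUDA call includes one-time setup costs\n",
"    torch.cuda.synchronize()  # wait for queued GPU work before starting the timer\n",
"    t0 = time.time()\n",
"    a.matmul(a)\n",
"    torch.cuda.synchronize()\n",
"    print('GPU:', time.time() - t0, 'seconds')\n",
"```"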
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "y = y.to(device)\n", 191 | "z = y.matmul(x)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "print(\"z.device =\", z.device)\n", 201 | "print(\"z =\", z)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [] 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3 (ipykernel)", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.10.12" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 4 233 | } 234 | -------------------------------------------------------------------------------- /day1/02-pytorch-mnist-mlp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digit classification with MLPs\n", 8 | "\n", 9 | "In this notebook, we'll train a multi-layer perceptron model to classify MNIST digits using **PyTorch**. \n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import torch\n", 23 | "import torch.nn as nn\n", 24 | "from torch.utils.data import DataLoader\n", 25 | "from torchvision import datasets\n", 26 | "from torchvision.transforms import ToTensor\n", 27 | "\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import os\n", 31 | "\n", 32 | "print('Using PyTorch version:', torch.__version__)\n", 33 | "if torch.cuda.is_available():\n", 34 | " print('Using GPU, device name:', torch.cuda.get_device_name(0))\n", 35 | " device = torch.device('cuda')\n", 36 | "else:\n", 37 | " print('No GPU found, using CPU instead.') \n", 38 | " device = torch.device('cpu')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Loading data\n", 46 | "\n", 47 | "PyTorch has two classes from [`torch.utils.data` to work with data](https://pytorch.org/docs/stable/data.html#module-torch.utils.data): \n", 48 | "- [Dataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) which represents the actual data items, such as images or pieces of text, and their labels\n", 49 | "- [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) which is used for processing the dataset in batches in an efficient manner.\n", 50 | "\n", 51 | "Here we will use TorchVision and `torchvision.datasets` which provides easy access to [many common visual datasets](https://pytorch.org/vision/stable/datasets.html). In this example we'll use the [MNIST class](https://pytorch.org/vision/stable/generated/torchvision.datasets.MNIST.html#torchvision.datasets.MNIST) for loading the [MNIST dataset](https://en.wikipedia.org/wiki/MNIST_database)." 
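,
"\n",
"As background, a `Dataset` is simply a class that implements `__len__` and `__getitem__`. A minimal hand-written example of the protocol (illustrative only; the exercises use the ready-made `MNIST` class below) could look like this:\n",
"\n",
"```python\n",
"from torch.utils.data import Dataset\n",
"\n",
"class PairDataset(Dataset):\n",
"    # a minimal Dataset wrapping two equal-length sequences\n",
"    def __init__(self, inputs, labels):\n",
"        self.inputs = inputs\n",
"        self.labels = labels\n",
"\n",
"    def __len__(self):\n",
"        return len(self.inputs)   # number of items in the dataset\n",
"\n",
"    def __getitem__(self, idx):\n",
"        return self.inputs[idx], self.labels[idx]   # one (data, label) pair\n",
"```"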
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "batch_size = 32\n", 61 | "\n", 62 | "slurm_project = os.getenv('SLURM_JOB_ACCOUNT')\n", 63 | "data_dir = os.path.join('/scratch', slurm_project, 'data') if slurm_project else './data'\n", 64 | "print('data_dir =', data_dir)\n", 65 | "\n", 66 | "train_dataset = datasets.MNIST(data_dir, train=True, download=True, transform=ToTensor())\n", 67 | "test_dataset = datasets.MNIST(data_dir, train=False, transform=ToTensor())\n", 68 | "\n", 69 | "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", 70 | "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The data loaders provide a way of iterating (making a loop over) the datasets, each time getting a new batch of data with the given batch size.\n", 78 | "\n", 79 | "The first element of the data batch (`data`) is a 4th-order tensor of size (`batch_size`, 1, 28, 28), i.e. it consists of a batch of images of size 1x28x28 pixels, where the first value is the number of color channels (only 1 in this case as it's gray scale).\n", 80 | "\n", 81 | "The second element of the batch (`target`) is a vector containing the correct (or \"target\") classes (\"0\", \"1\", ..., \"9\") for each training digit." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "for (data, target) in train_loader:\n", 91 | "    print('data:', data.size(), 'type:', data.type())\n", 92 | "    print('target:', target.size(), 'type:', target.type())\n", 93 | "    break" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Here are the first 10 training digits:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "pltsize=1\n", 110 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 111 | "\n", 112 | "for i in range(10):\n", 113 | "    plt.subplot(1,10,i+1)\n", 114 | "    plt.axis('off')\n", 115 | "    plt.imshow(data[i,:,:,:].numpy().reshape(28,28), cmap=\"gray_r\")\n", 116 | "    plt.title('Class: '+str(target[i].item()))" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Multi-layer perceptron (MLP) network\n", 124 | "\n", 125 | "In PyTorch, a neural network is defined as a Python class. It needs to have two methods:\n", 126 | "\n", 127 | "- `__init__()` which initializes the layers used in the network\n", 128 | "- `forward()` which defines how the network performs a forward pass\n", 129 | "\n", 130 | "PyTorch will then automatically generate a `backward()` method that computes the gradients based on the computation done in the forward pass.\n", 131 | "\n", 132 | "All the [neural network building blocks defined in PyTorch can be found in the torch.nn documentation](https://pytorch.org/docs/stable/nn.html).\n", 133 | "\n", 134 | "We use `nn.Sequential` to more easily create a simple sequential neural network:\n", 135 | "\n", 136 | "- First we need to \"flatten\" the 2D image into a vector with `nn.Flatten`\n", 137 | "\n", 138 | "- Next a fully-connected layer with 20 neurons is created with `nn.Linear`. Note that we need to specify the number of input and output connections. 
In this case there are 28x28=784 inputs, and 20 outputs\n", 139 | "\n", 140 | "- Next, a ReLU non-linear activation\n", 141 | "\n", 142 | "- Finally the output of the last layer needs to be a 10-dimensional vector to match the ground truth of ten classes (the ten digits).\n", 143 | "\n", 144 | "The output of the last layer should be normalized with softmax, but this is actually included implicitly in the loss function in PyTorch (see below)." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "class SimpleMLP(nn.Module):\n", 154 | " def __init__(self):\n", 155 | " super().__init__()\n", 156 | " self.layers = nn.Sequential(\n", 157 | " nn.Flatten(),\n", 158 | " nn.Linear(28*28, 20),\n", 159 | " nn.ReLU(),\n", 160 | " nn.Linear(20, 10)\n", 161 | " )\n", 162 | "\n", 163 | " def forward(self, x):\n", 164 | " return self.layers(x)\n", 165 | "\n", 166 | "model = SimpleMLP().to(device)\n", 167 | "print(model)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# Training the model\n", 175 | "\n", 176 | "In order to train the model we need to define a loss function and an optimizer.\n", 177 | "\n", 178 | "For a classification task we typically use the cross entropy loss. For this we can use the class [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html).\n", 179 | "\n", 180 | "**Note:** if you read the documentation of `CrossEntropyLoss` carefully you will see that it expects the unnormalized raw outputs of the model as softmax is included implicitly in PyTorch's implementation of `CrossEntropyLoss`. This is why we don't need to explicitly use softmax in the network definition above.\n", 181 | "\n", 182 | "Finally, we need to define an optimizer, which tells how to update the model parameters based on the computed gradients. There are [several different optimizer algorithms implemented in PyTorch](https://pytorch.org/docs/stable/optim.html#algorithms)." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "criterion = nn.CrossEntropyLoss()\n", 192 | "optimizer = torch.optim.Adam(model.parameters())" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "In PyTorch we have to write the training loop ourselves.\n", 200 | "\n", 201 | "The code below consists of two loops:\n", 202 | "\n", 203 | "- The outer loop goes over a number of *epochs*. An epoch is a single pass through the whole training data.\n", 204 | "- The inner loop goes over all the batches of the dataset. Here we have defined the batch size to be 32, so images are handled 32 at a time.\n", 205 | "\n", 206 | "For each batch we:\n", 207 | "\n", 208 | "- Copy the data to the GPU with the `.to(device)` method. If we don't have a GPU, these commands will not do anything.\n", 209 | "\n", 210 | "- Do a forward pass, which is as simple as: `output = model(data)`\n", 211 | "\n", 212 | "- Finally we calculate the loss - that is the error between the output of the network and the target we want to get - using the `criterion` function we defined earlier\n", 213 | "\n", 214 | "- The last lines do the backward propagation with `loss.backward()`, the weights are updated with `optimizer.step()` and finally we need to zero the gradient counters with `optimizer.zero_grad()`." 
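,
"\n",
"Stripped of the bookkeeping, the body of the inner loop boils down to this pattern (an illustrative fragment that reuses the same names as the full `train` function defined below):\n",
"\n",
"```python\n",
"output = model(data)              # forward pass\n",
"loss = criterion(output, target)  # error between output and target\n",
"loss.backward()                   # backpropagation: compute gradients\n",
"optimizer.step()                  # update the weights\n",
"optimizer.zero_grad()             # reset the gradient counters\n",
"```"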
215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "First, a helper function to calculate the number of correctly classified digits." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "def correct(output, target):\n", 231 | " predicted_digits = output.argmax(1) # pick digit with largest network output\n", 232 | " correct_ones = (predicted_digits == target).type(torch.float) # 1.0 for correct, 0.0 for incorrect\n", 233 | " return correct_ones.sum().item() # count number of correct ones\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Next a function for a single training epoch." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "def train(data_loader, model, criterion, optimizer):\n", 250 | " model.train()\n", 251 | "\n", 252 | " num_batches = len(data_loader)\n", 253 | " num_items = len(data_loader.dataset)\n", 254 | "\n", 255 | " total_loss = 0\n", 256 | " total_correct = 0\n", 257 | " for data, target in data_loader:\n", 258 | " # Copy data and targets to GPU\n", 259 | " data = data.to(device)\n", 260 | " target = target.to(device)\n", 261 | " \n", 262 | " # Do a forward pass\n", 263 | " output = model(data)\n", 264 | " \n", 265 | " # Calculate the loss\n", 266 | " loss = criterion(output, target)\n", 267 | " total_loss += loss\n", 268 | "\n", 269 | " # Count number of correct digits\n", 270 | " total_correct += correct(output, target)\n", 271 | " \n", 272 | " # Backpropagation\n", 273 | " loss.backward()\n", 274 | " optimizer.step()\n", 275 | " optimizer.zero_grad()\n", 276 | "\n", 277 | " train_loss = total_loss/num_batches\n", 278 | " accuracy = total_correct/num_items\n", 279 | " print(f\"Average loss: {train_loss:7f}, accuracy: {accuracy:.2%}\")\n" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "%%time\n", 289 | "\n", 290 | "epochs = 10\n", 291 | "for epoch in range(epochs):\n", 292 | " print(f\"Training epoch: {epoch+1}\")\n", 293 | " train(train_loader, model, criterion, optimizer)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Inference\n", 301 | "\n", 302 | "For a better measure of the quality of the model, let's see the model accuracy for the test data.\n", 303 | "\n", 304 | "The code is similar to the training code: we just loop over the whole testset, but no need to do backpropagation or calculate any gradients this time." 
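,
"\n",
"As a side note, the trained model can also be applied to a single image. A small sketch, assuming the model and datasets defined earlier in this notebook:\n",
"\n",
"```python\n",
"model.eval()\n",
"with torch.no_grad():\n",
"    image, label = test_dataset[0]                 # one (image, label) pair\n",
"    output = model(image.unsqueeze(0).to(device))  # unsqueeze adds a batch dimension of 1\n",
"    print('predicted:', output.argmax(1).item(), 'true:', label)\n",
"```"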
305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "def test(test_loader, model, criterion):\n", 314 | " model.eval()\n", 315 | "\n", 316 | " num_batches = len(test_loader)\n", 317 | " num_items = len(test_loader.dataset)\n", 318 | "\n", 319 | " test_loss = 0\n", 320 | " total_correct = 0\n", 321 | "\n", 322 | " with torch.no_grad():\n", 323 | " for data, target in test_loader:\n", 324 | " # Copy data and targets to GPU\n", 325 | " data = data.to(device)\n", 326 | " target = target.to(device)\n", 327 | " \n", 328 | " # Do a forward pass\n", 329 | " output = model(data)\n", 330 | " \n", 331 | " # Calculate the loss\n", 332 | " loss = criterion(output, target)\n", 333 | " test_loss += loss.item()\n", 334 | " \n", 335 | " # Count number of correct digits\n", 336 | " total_correct += correct(output, target)\n", 337 | "\n", 338 | " test_loss = test_loss/num_batches\n", 339 | " accuracy = total_correct/num_items\n", 340 | "\n", 341 | " print(f\"Testset accuracy: {100*accuracy:>0.1f}%, average loss: {test_loss:>7f}\")" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "test(test_loader, model, criterion)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Task 1: Model with two linear layers\n", 358 | "\n", 359 | "Your task is to try the same problem as above, but with a more complex model. The new model should have **two linear layers**, each with:\n", 360 | "\n", 361 | "- 50 units\n", 362 | "- ReLU activation\n", 363 | "- each followed by a dropout layer with a rate of 0.2 - hint: try [nn.Dropout](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html#torch.nn.Dropout)\n", 364 | "\n", 365 | "Dropout randomly sets a fraction of inputs to zero during training, which is one approach to regularization and can sometimes help to prevent overfitting.\n", 366 | "\n", 367 | "You can consult the [PyTorch documentation](https://pytorch.org/docs/stable/index.html), in particular all the [neural network building blocks can be found in the `torch.nn` documentation](https://pytorch.org/docs/stable/nn.html).\n", 368 | "\n", 369 | "The code below is missing the model definition. You can copy any suitable layers from the example above." 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "class TwoLayerMLP(nn.Module):\n", 379 | " def __init__(self):\n", 380 | " super().__init__()\n", 381 | " self.layers = nn.Sequential(\n", 382 | " # TASK 1: ADD LAYERS HERE\n", 383 | " )\n", 384 | "\n", 385 | " def forward(self, x):\n", 386 | " return self.layers(x)\n" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "If you want to see an example answer, change the type of the cell below to \"Code\" in the menu bar above and then execute the cell. Execute the cell again to run the example code.\n", 394 | "\n", 395 | "**Note:** in Google Colab you can [click here](https://github.com/csc-training/intro-to-dl/blob/master/day1/solutions/pytorch-mnist-mlp-example-answer.py) and copy the answer manually." 
396 | ] 397 | }, 398 | { 399 | "cell_type": "raw", 400 | "metadata": {}, 401 | "source": [ 402 | "%load solutions/pytorch-mnist-mlp-example-answer.py" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "ex1_model = TwoLayerMLP()\n", 412 | "print(ex1_model)\n", 413 | "\n", 414 | "assert len(ex1_model.layers) > 0, \"ERROR: You need to write the missing model definition above!\"\n", 415 | "ex1_model = ex1_model.to(device)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "ex1_criterion = nn.CrossEntropyLoss()\n", 425 | "ex1_optimizer = torch.optim.Adam(ex1_model.parameters())" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "%%time\n", 435 | "\n", 436 | "epochs = 10\n", 437 | "for epoch in range(epochs):\n", 438 | " print(f\"Epoch: {epoch+1} ...\")\n", 439 | " train(train_loader, ex1_model, ex1_criterion, ex1_optimizer)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "test(test_loader, ex1_model, ex1_criterion)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "## Task 2: Model tuning\n", 456 | "\n", 457 | "Modify the MLP model. Try to improve the classification accuracy, or experiment with the effects of different parameters. If you are interested in the state-of-the-art performance on permutation invariant MNIST, see e.g. [this paper](https://arxiv.org/abs/1507.02672) by Aalto University / The Curious AI Company researchers." 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "## Task 3: Fashion-MNIST\n", 472 | "\n", 473 | "MNIST can be replaced with Fashion-MNIST, which can be used as drop-in replacement for MNIST. Fashion-MNIST contains images of 10 fashion categories:\n", 474 | "\n", 475 | "Label|Description|Label|Description\n", 476 | "--- | --- |--- | ---\n", 477 | "0|T-shirt/top|5|Sandal\n", 478 | "1|Trouser|6|Shirt\n", 479 | "2|Pullover|7|Sneaker\n", 480 | "3|Dress|8|Bag\n", 481 | "4|Coat|9|Ankle boot\n", 482 | "\n", 483 | "Replace the loading of MNIST data with Fashion-MNIST in the beginning of this notebook and re-run the experiments. [Fashion-MNIST can be found with the dataset class `FashionMNIST`](https://pytorch.org/vision/stable/generated/torchvision.datasets.FashionMNIST.html#torchvision.datasets.FashionMNIST)." 
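,
"\n",
"As a sketch of the replacement (the rest of the notebook stays unchanged), the dataset loading cell would become:\n",
"\n",
"```python\n",
"train_dataset = datasets.FashionMNIST(data_dir, train=True, download=True, transform=ToTensor())\n",
"test_dataset = datasets.FashionMNIST(data_dir, train=False, transform=ToTensor())\n",
"```"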
484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [] 492 | } 493 | ], 494 | "metadata": { 495 | "kernelspec": { 496 | "display_name": "Python 3 (ipykernel)", 497 | "language": "python", 498 | "name": "python3" 499 | }, 500 | "language_info": { 501 | "codemirror_mode": { 502 | "name": "ipython", 503 | "version": 3 504 | }, 505 | "file_extension": ".py", 506 | "mimetype": "text/x-python", 507 | "name": "python", 508 | "nbconvert_exporter": "python", 509 | "pygments_lexer": "ipython3", 510 | "version": "3.10.12" 511 | } 512 | }, 513 | "nbformat": 4, 514 | "nbformat_minor": 4 515 | } 516 | -------------------------------------------------------------------------------- /day1/03-pytorch-mnist-cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digit classification with CNNs\n", 8 | "\n", 9 | "In this notebook, we'll train a convolutional neural network (CNN, ConvNet) to classify MNIST digits using **PyTorch**. \n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import torch\n", 23 | "import torch.nn as nn\n", 24 | "from torch.utils.data import DataLoader\n", 25 | "from torchvision import datasets\n", 26 | "from torchvision.transforms import ToTensor\n", 27 | "\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "from tqdm import tqdm\n", 31 | "import os\n", 32 | "\n", 33 | "print('Using PyTorch version:', torch.__version__)\n", 34 | "if torch.cuda.is_available():\n", 35 | " print('Using GPU, device name:', torch.cuda.get_device_name(0))\n", 36 | " device = torch.device('cuda')\n", 37 | "else:\n", 38 | " print('No GPU found, using CPU instead.') \n", 39 | " device = torch.device('cpu')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## MNIST data set" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "batch_size = 32\n", 56 | "\n", 57 | "slurm_project = os.getenv('SLURM_JOB_ACCOUNT')\n", 58 | "data_dir = os.path.join('/scratch', slurm_project, 'data') if slurm_project else './data'\n", 59 | "print('data_dir =', data_dir)\n", 60 | "\n", 61 | "train_dataset = datasets.MNIST(data_dir, train=True, download=True, transform=ToTensor())\n", 62 | "test_dataset = datasets.MNIST(data_dir, train=False, transform=ToTensor())\n", 63 | "\n", 64 | "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", 65 | "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Convolutional neural network\n", 73 | "\n", 74 | "Now we are ready to create a convolutional model. 
As before we use `nn.Sequential` to easily create a sequence of layers.\n", 75 | "\n", 76 | "Here we use:\n", 77 | "\n", 78 | "- [Conv2d](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html#torch.nn.Conv2d), which operates on 2D matrices so we input the digit images directly to the model (no need to \"flatten\" at this point),\n", 79 | "- [MaxPool2d](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d) reduces the spatial dimensions, that is, makes the image smaller,\n", 80 | "- Finally we flatten the image to a vector and add two linear layers.\n", 81 | "\n", 82 | "All the [neural network building blocks defined in PyTorch can be found in the torch.nn documentation](https://pytorch.org/docs/stable/nn.html).\n", 83 | "\n", 84 | "The output of the last layer should be normalized with softmax, but this is actually included implicitly in the loss function in PyTorch (see below)." 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "class SimpleCNN(nn.Module):\n", 94 | "    def __init__(self):\n", 95 | "        super().__init__()\n", 96 | "        self.layers = nn.Sequential(\n", 97 | "            nn.Conv2d(1, 32, kernel_size=3, padding='valid'),\n", 98 | "            nn.ReLU(),\n", 99 | "            nn.MaxPool2d(kernel_size=2),\n", 100 | "            nn.Flatten(),\n", 101 | "            nn.Linear(32*13*13, 128),\n", 102 | "            nn.ReLU(),\n", 103 | "            nn.Linear(128, 10)\n", 104 | "        )\n", 105 | "\n", 106 | "    def forward(self, x):\n", 107 | "        return self.layers(x)\n", 108 | "\n", 109 | "model = SimpleCNN().to(device)\n", 110 | "print(model)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "**Note:** one slightly tricky thing in the code above is that you have to know the input dimension for the first linear layer. This is the output of the `Conv2d` followed by the `MaxPool2d`. This can be reasoned as follows:\n", 118 | "\n", 119 | "- the input to `Conv2d` will be 1x28x28 as the images have a single color channel (gray scale) and have a width and height of 28x28\n", 120 | "- the output of `Conv2d` will be 32x26x26 as the color channels are replaced by the outputs of the 32 convolution kernels, and due to the valid padding and kernel size of 3x3 a border of 1 pixel will be excluded\n", 121 | "- `MaxPool2d` will remove every second pixel along each dimension, so we get 32x13x13=5408\n", 122 | "\n", 123 | "If you are lazy you can also just guess something and run the code. The error message will tell you what size it expected to have!" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Learning\n", 131 | "\n", 132 | "Now let's train the CNN model.\n", 133 | "\n", 134 | "First we'll define the same functions as in the previous exercise. 
We've made a few minor additions:\n", 135 | "- In the `train` function we added `tqdm` to print a nicer progress bar as the training will be a bit slower this time.\n", 136 | "- We return the loss and accuracy so we can do some plotting" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "def correct(output, target):\n", 146 | " predicted_digits = output.argmax(1) # pick digit with largest network output\n", 147 | " correct_ones = (predicted_digits == target).type(torch.float) # 1.0 for correct, 0.0 for incorrect\n", 148 | " return correct_ones.sum().item() # count number of correct ones\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def train(data_loader, model, criterion, optimizer):\n", 158 | " model.train()\n", 159 | "\n", 160 | " num_batches = len(data_loader)\n", 161 | " num_items = len(data_loader.dataset)\n", 162 | "\n", 163 | " total_loss = 0\n", 164 | " total_correct = 0\n", 165 | " for data, target in tqdm(data_loader, total=num_batches):\n", 166 | " # Copy data and targets to GPU\n", 167 | " data = data.to(device)\n", 168 | " target = target.to(device)\n", 169 | " \n", 170 | " # Do a forward pass\n", 171 | " output = model(data)\n", 172 | " \n", 173 | " # Calculate the loss\n", 174 | " loss = criterion(output, target)\n", 175 | " total_loss += loss\n", 176 | "\n", 177 | " # Count number of correct digits\n", 178 | " total_correct += correct(output, target)\n", 179 | " \n", 180 | " # Backpropagation\n", 181 | " loss.backward()\n", 182 | " optimizer.step()\n", 183 | " optimizer.zero_grad()\n", 184 | "\n", 185 | " train_loss = total_loss/num_batches\n", 186 | " accuracy = total_correct/num_items\n", 187 | " print(f\"Average loss: {train_loss:7f}, accuracy: {accuracy:.2%}\")\n", 188 | " return train_loss.item(), accuracy\n" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "criterion = nn.CrossEntropyLoss()\n", 198 | "optimizer = torch.optim.Adam(model.parameters())" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "This is a relatively complex model, so training is considerably slower than with MLPs. 
" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "%%time\n", 215 | "\n", 216 | "losses = []\n", 217 | "accuracies = []\n", 218 | "epochs = 5\n", 219 | "for epoch in range(epochs):\n", 220 | " print(f\"Training epoch: {epoch+1}\")\n", 221 | " loss, acc = train(train_loader, model, criterion, optimizer)\n", 222 | " losses.append(loss)\n", 223 | " accuracies.append(acc)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "Let's plot how the loss and accuracy change over the epochs" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "plt.subplot(2,1,1)\n", 240 | "plt.plot(losses)\n", 241 | "plt.ylabel(\"Loss\")\n", 242 | "plt.subplot(2,1,2)\n", 243 | "plt.plot(accuracies)\n", 244 | "plt.ylabel(\"Accuracy\");" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "### Inference\n", 252 | "\n", 253 | "Here we have the same `test` function as before." 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "def test(test_loader, model, criterion):\n", 263 | " model.eval()\n", 264 | "\n", 265 | " num_batches = len(test_loader)\n", 266 | " num_items = len(test_loader.dataset)\n", 267 | "\n", 268 | " test_loss = 0\n", 269 | " total_correct = 0\n", 270 | "\n", 271 | " with torch.no_grad():\n", 272 | " for data, target in test_loader:\n", 273 | " # Copy data and targets to GPU\n", 274 | " data = data.to(device)\n", 275 | " target = target.to(device)\n", 276 | " \n", 277 | " # Do a forward pass\n", 278 | " output = model(data)\n", 279 | " \n", 280 | " # Calculate the loss\n", 281 | " loss = criterion(output, target)\n", 282 | " test_loss += loss.item()\n", 283 | " \n", 284 | " # Count number of correct digits\n", 285 | " total_correct += correct(output, target)\n", 286 | "\n", 287 | " test_loss = test_loss/num_batches\n", 288 | " accuracy = total_correct/num_items\n", 289 | "\n", 290 | " print(f\"Testset accuracy: {100*accuracy:>0.1f}%, average loss: {test_loss:>7f}\")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "test(test_loader, model, criterion)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "Let's take a look at the convolution weights of the Conv2D layer." 
307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "for i, module in enumerate(model.modules()):\n", 316 | "    print(i, type(module))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "weights = [module for module in model.modules()][2].weight.data.cpu().numpy()  # .cpu() is needed if the model is on the GPU\n", 326 | "\n", 327 | "for i in range(weights.shape[0]):\n", 328 | "    plt.subplot(4, 8, i+1)\n", 329 | "    c = weights[i][0]\n", 330 | "    c = c - np.min(c)\n", 331 | "    c = c / np.max(c)\n", 332 | "    plt.imshow(c, cmap='gray', interpolation='nearest')\n", 333 | "    plt.axis('off')\n" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## Task 1: A more complex CNN model\n", 341 | "\n", 342 | "Your task is to try the same problem as above, but with two convolutional layers. The new model should have the following layers in order:\n", 343 | "\n", 344 | "- Convolutional (`Conv2d`) layer with 32 units and 3x3 kernels, valid padding + ReLU activation\n", 345 | "- Another identical convolutional layer + ReLU activation\n", 346 | "- Max pooling (`MaxPool2d`) layer with 2x2 pooling size\n", 347 | "- Dropout with 0.25 rate\n", 348 | "- Flatten\n", 349 | "- Dense layer with 128 units\n", 350 | "- Dropout with 0.5 rate\n", 351 | "- Dense output layer with 10 units\n", 352 | "\n", 353 | "You can consult the [PyTorch documentation](https://pytorch.org/docs/stable/index.html), in particular all the [neural network building blocks can be found in the `torch.nn` documentation](https://pytorch.org/docs/stable/nn.html).\n", 354 | "\n", 355 | "The code below is missing the model definition. You can copy any suitable layers from the example above." 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "class ComplexCNN(nn.Module):\n", 365 | "    def __init__(self):\n", 366 | "        super().__init__()\n", 367 | "        self.layers = nn.Sequential(\n", 368 | "            # TASK 1: ADD LAYERS HERE\n", 369 | "        )\n", 370 | "\n", 371 | "    def forward(self, x):\n", 372 | "        return self.layers(x)\n" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "If you want to see an example answer, change the type of the cell below to \"Code\" in the menu bar above and then execute the cell. Execute the cell again to run the example code.\n", 380 | "\n", 381 | "**Note:** in Google Colab you can [click here](https://github.com/csc-training/intro-to-dl/blob/master/day1/solutions/pytorch-mnist-cnn-example-answer.py) and copy the answer manually." 
382 | ] 383 | }, 384 | { 385 | "cell_type": "raw", 386 | "metadata": {}, 387 | "source": [ 388 | "%load solutions/pytorch-mnist-cnn-example-answer.py" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "ex1_model = ComplexCNN()\n", 398 | "print(ex1_model)\n", 399 | "\n", 400 | "assert len(ex1_model.layers) > 0, \"ERROR: You need to write the missing model definition above!\"\n", 401 | "ex1_model = ex1_model.to(device)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "ex1_criterion = nn.CrossEntropyLoss()\n", 411 | "ex1_optimizer = torch.optim.Adam(ex1_model.parameters())" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "%%time\n", 421 | "\n", 422 | "losses = []\n", 423 | "accuracies = []\n", 424 | "epochs = 5\n", 425 | "for epoch in range(epochs):\n", 426 | " print(f\"Epoch: {epoch+1} ...\")\n", 427 | " loss, acc = train(train_loader, ex1_model, ex1_criterion, ex1_optimizer)\n", 428 | " losses.append(loss)\n", 429 | " accuracies.append(acc)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "plt.subplot(2,1,1)\n", 439 | "plt.plot(losses)\n", 440 | "plt.ylabel(\"Loss\")\n", 441 | "plt.subplot(2,1,2)\n", 442 | "plt.plot(accuracies)\n", 443 | "plt.ylabel(\"Accuracy\");" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "test(test_loader, ex1_model, ex1_criterion)" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "## Task 2: Tune training parameters\n", 460 | "\n", 461 | "Try to improve the classification accuracy, in particular by trying different optimizers and playing with the parameters of the training process.\n", 462 | "\n", 463 | "See optimizers available in PyTorch here: \n", 464 | "\n", 465 | "You can take the model created in Task 1 as a starting point. Below is a code example which you can modify." 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "If you wish to change the batch size, you need to re-define the data loaders." 
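,
"\n",
"For the optimizer, one possible variation (an illustration, not a recommended answer) is to replace the `torch.optim.Adam` line in the cell below with plain SGD plus momentum:\n",
"\n",
"```python\n",
"ex2_optimizer = torch.optim.SGD(ex2_model.parameters(), lr=0.01, momentum=0.9)\n",
"```"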
473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "batch_size = 32\n", 482 | "\n", 483 | "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", 484 | "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "ex2_model = ComplexCNN().to(device)\n", 494 | "\n", 495 | "ex2_criterion = nn.CrossEntropyLoss()\n", 496 | "ex2_optimizer = torch.optim.Adam(ex2_model.parameters())\n", 497 | "\n", 498 | "epochs = 5\n", 499 | "for epoch in range(epochs):\n", 500 | " print(f\"Epoch: {epoch+1} ...\")\n", 501 | " train(train_loader, ex2_model, ex2_criterion, ex2_optimizer)" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "test(test_loader, ex2_model, ex2_criterion)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "## Extra: View model summary\n", 518 | "\n", 519 | "One way to view more information about the model is to use an external package such as [Torchinfo](https://github.com/TylerYep/torchinfo). It is not installed in the standard Pytorch module in LUMI, so you need to install it using pip:" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "!pip3 install torchinfo" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "from torchinfo import summary" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "summary(model, input_size=(batch_size, 1, 28, 28), \n", 547 | " col_names=[\"input_size\", \"output_size\", \"kernel_size\", \"num_params\"])" 548 | ] 549 | } 550 | ], 551 | "metadata": { 552 | "kernelspec": { 553 | "display_name": "Python 3 (ipykernel)", 554 | "language": "python", 555 | "name": "python3" 556 | }, 557 | "language_info": { 558 | "codemirror_mode": { 559 | "name": "ipython", 560 | "version": 3 561 | }, 562 | "file_extension": ".py", 563 | "mimetype": "text/x-python", 564 | "name": "python", 565 | "nbconvert_exporter": "python", 566 | "pygments_lexer": "ipython3", 567 | "version": "3.10.12" 568 | } 569 | }, 570 | "nbformat": 4, 571 | "nbformat_minor": 4 572 | } 573 | -------------------------------------------------------------------------------- /day1/04b-pytorch-imdb-huggingface.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2edd322b-1e0a-4fc4-919b-e808ad472cfe", 6 | "metadata": {}, 7 | "source": [ 8 | "# IMDB movie review sentiment classification using Hugging Face models\n", 9 | "\n", 10 | "In this notebook, we'll test pre-trained sentiment analysis models and later finetune a DistilBERT model to perform IMDB movie review sentiment classification. This notebook is adapted from [Getting Started with Sentiment Analysis using Python](https://huggingface.co/blog/sentiment-analysis-python)." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "c9243fc9-da3f-470d-9e0f-9aaa9528efcd", 16 | "metadata": {}, 17 | "source": [ 18 | "Import the libraries" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "e1227c62-d120-4908-8d35-6bf0f236be50", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from transformers import pipeline\n", 29 | "import torch\n", 30 | "from datasets import load_dataset\n", 31 | "from transformers import AutoTokenizer\n", 32 | "from transformers import DataCollatorWithPadding\n", 33 | "from transformers import AutoModelForSequenceClassification\n", 34 | "import numpy as np\n", 35 | "import evaluate\n", 36 | "from huggingface_hub import notebook_login\n", 37 | "from transformers import TrainingArguments, Trainer\n", 38 | "from transformers import pipeline" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "35124c9a-1f07-416b-834d-f9c7508f682c", 44 | "metadata": {}, 45 | "source": [ 46 | "Check if PyTorch is using the GPU" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "b124df20-1f8a-4a5e-9975-4798bfdaf0f8", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "print('Using PyTorch version:', torch.__version__)\n", 57 | "if torch.cuda.is_available():\n", 58 | " print('Using GPU, device name:', torch.cuda.get_device_name(0))\n", 59 | " device = torch.device('cuda')\n", 60 | "else:\n", 61 | " print('No GPU found, using CPU instead.') \n", 62 | " device = torch.device('cpu')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "f5800ebf-82bd-4cdc-9067-66ee8480d528", 68 | "metadata": {}, 69 | "source": [ 70 | "## Use Pre-trained Sentiment Analysis Models" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "6c82b0f7-62d8-4e3f-9e99-ef3ebc6522bc", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "sentiment_pipeline = pipeline(\"sentiment-analysis\", device=device)\n", 81 | "data = [\"I love you\", \"I hate you\"]\n", 82 | "sentiment_pipeline(data)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "e97a94f5-1548-46f2-a2f9-1715113e90ab", 88 | "metadata": { 89 | "jp-MarkdownHeadingCollapsed": true 90 | }, 91 | "source": [ 92 | "- This code snippet above utilizes the **[pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines)** class to generate predictions using models from the Hub. It applies the [default sentiment analysis model](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) to evaluate the provided list of text data.\n", 93 | "- The analysis results are **POSITIVE** for first entry and **NEGATIVE** for the second entry." 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "c273d882-0aa9-4d86-a1fa-fd518e2c3ce0", 99 | "metadata": {}, 100 | "source": [ 101 | "One can also use a specific sentiment analysis model by providing the name of the model, e.g., if you want a sentiment analysis model for tweets, you can specify the model id." 
102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "c183b485-adc5-447d-b3b7-bb66e173c80a", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "specific_model = pipeline(model=\"finiteautomata/bertweet-base-sentiment-analysis\", device = device)\n", 112 | "specific_model(data)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "daf45bd1-3964-4d32-944b-edd0783163bb", 118 | "metadata": {}, 119 | "source": [ 120 | "## Fine-tuning DistilBERT model using IMDB dataset " 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "2ec252bb-b0b0-48dc-8e3a-b65b72f931c1", 126 | "metadata": {}, 127 | "source": [ 128 | "- The [IMDB](https://huggingface.co/datasets/stanfordnlp/imdb) dataset contains 50000 movies reviews from the Internet Movie Database, split into 25000 reviews for training and 25000 reviews for testing. Half of the reviews are positive and half are negative. \n", 129 | "\n", 130 | "- The IMDB dataset is relatively large, so let's use 5000 samples for training to speed up our process for this exercise." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "48bb7a7d-9194-4904-bc91-bd1adb191ea1", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "imdb = load_dataset(\"imdb\")\n", 141 | "small_train_dataset = imdb[\"train\"].shuffle(seed=0).select([i for i in list(range(5000))])\n", 142 | "test_dataset = imdb[\"test\"]" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "f40b569b-f984-4adc-9d21-2fd3ce72b9b4", 148 | "metadata": {}, 149 | "source": [ 150 | "Let's look at two samples from the IMDB dataset. One negative (label: `0`) and one positive (label: `1`) review." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "554c9661-89d7-45f1-a222-fa6de2468713", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "for i in (2, 12500): \n", 161 | " print(imdb[\"train\"][i])\n", 162 | " print()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "82fe9183-d554-482c-8f3a-a75096e10e14", 168 | "metadata": {}, 169 | "source": [ 170 | "To preprocess our data, we will use DistilBERT tokenizer:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "dbf1e5fc-3831-47d0-ab3f-01d6dd70482a", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "88e45582-a0c3-4a80-8a06-d4ce232a1ead", 186 | "metadata": {}, 187 | "source": [ 188 | "- Next, we will prepare the text inputs for the model for both splits of our dataset (training and test) by using the map method:" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "9ab50bd2-e54b-4e31-a162-24923b763731", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "def preprocess_function(examples):\n", 199 | " return tokenizer(examples[\"text\"], truncation=True)\n", 200 | " \n", 201 | "tokenized_train = small_train_dataset.map(preprocess_function, batched=True)\n", 202 | "tokenized_test = test_dataset.map(preprocess_function, batched=True)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "a1b9b4c0-9aea-481b-bb26-5f9bd18228c1", 208 | "metadata": {}, 209 | "source": [ 210 | "- To speed up training, let's use a data_collator to convert your training samples to PyTorch 
tensors and concatenate them with the correct amount of padding:" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "f10d80c5-77c3-43a6-a7d9-47d97deef882", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "0455d055-7b18-47a2-b983-e7a35e46398f", 226 | "metadata": {}, 227 | "source": [ 228 | "### Training the model\n", 229 | "- We will be throwing away the pretraining head of the DistilBERT model and replacing it with a classification head fine-tuned for sentiment analysis. This enables us to transfer the knowledge from DistilBERT to our custom model." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "1644ded6-7d6a-43d2-b303-3d43eb316e4c", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "id": "b6577793-6f3d-48a3-b13e-95116be72685", 245 | "metadata": {}, 246 | "source": [ 247 | "- Then, let's define the metrics you will be using to evaluate how good is your fine-tuned model (accuracy and f1 score)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "81da7601-9ca0-45ee-94f8-d9d773fff695", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "def compute_metrics(eval_pred):\n", 258 | " eval_accuracy = evaluate.load(\"accuracy\")\n", 259 | " eval_f1 = evaluate.load(\"f1\")\n", 260 | " \n", 261 | " logits, labels = eval_pred\n", 262 | " predictions = np.argmax(logits, axis=-1)\n", 263 | " accuracy = eval_accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"]\n", 264 | " f1 = eval_f1.compute(predictions=predictions, references=labels)[\"f1\"]\n", 265 | " return {\"accuracy\": accuracy, \"f1\": f1}" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "id": "ad53c65b-3463-4707-a8ad-ebd2de387133", 271 | "metadata": {}, 272 | "source": [ 273 | "- Define the training arguments" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "96b56df8-a9ac-41ec-9ad7-86c0e0cec2f7", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "repo_name = \"finetuning-sentiment-model-5000-samples\"\n", 284 | " \n", 285 | "training_args = TrainingArguments(\n", 286 | " output_dir=repo_name,\n", 287 | " learning_rate=2e-5,\n", 288 | " per_device_train_batch_size=16,\n", 289 | " per_device_eval_batch_size=16,\n", 290 | " num_train_epochs=2,\n", 291 | " weight_decay=0.01,\n", 292 | " save_strategy=\"epoch\",\n", 293 | " push_to_hub=False,\n", 294 | " report_to=\"none\"\n", 295 | ")\n", 296 | " \n", 297 | "trainer = Trainer(\n", 298 | " model=model,\n", 299 | " args=training_args,\n", 300 | " train_dataset=tokenized_train,\n", 301 | " eval_dataset=tokenized_test,\n", 302 | " tokenizer=tokenizer,\n", 303 | " data_collator=data_collator,\n", 304 | " compute_metrics=compute_metrics,\n", 305 | ")" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "id": "eaad4946-5446-4cd1-ae2f-8502d2e3037f", 311 | "metadata": {}, 312 | "source": [ 313 | "- Start training" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "id": "e53f771c-d1fb-4a5b-a426-ae5bd53964df", 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | 
"trainer.train()" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "id": "ec3eee23-b26c-4687-8d27-7d075d76d3a9", 329 | "metadata": {}, 330 | "source": [ 331 | "- Evaluate the model" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "id": "2a951518-0ee7-4e73-b47b-f679b7e6e628", 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "trainer.evaluate()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "id": "61722a1b-3347-469c-b0d9-89398e4601ca", 347 | "metadata": {}, 348 | "source": [ 349 | "- Model inference" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "id": "a3db7322-4f03-4fd2-996b-948e3c271da4", 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "pipe = pipeline(\"sentiment-analysis\", model=model, tokenizer=tokenizer, device=device)\n", 360 | "pipe([\"I love this move\", \"This movie sucks!\"])" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "4d40ac81-bf5b-4ab7-b713-61fd22151020", 366 | "metadata": {}, 367 | "source": [ 368 | "## Task 1 Run this script with GPU" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "4f8eb61e-a673-4b22-8c6d-7262946964f6", 374 | "metadata": {}, 375 | "source": [ 376 | "## Task 2 Compare the test dataset accuracy achieved from finetuned DistilBERT model and the previous RNN model. What do you notice?" 377 | ] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "Python 3 (ipykernel)", 383 | "language": "python", 384 | "name": "python3" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.10.12" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 5 401 | } 402 | -------------------------------------------------------------------------------- /day1/README.md: -------------------------------------------------------------------------------- 1 | # Day 1 2 | 3 | ## Exercise sessions 4 | 5 | ### Exercise 1 6 | 7 | Introduction to Notebooks, PyTorch fundamentals. 8 | 9 | * *01-pytorch-test-setup.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/01-pytorch-test-setup.ipynb) 10 | 11 | ### Exercise 2 12 | 13 | MNIST classification with MLPs. 14 | 15 | * *02-pytorch-mnist-mlp.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/02-pytorch-mnist-mlp.ipynb) 16 | 17 | ### Exercise 3 18 | 19 | Image classification with CNNs. 20 | 21 | * *03-pytorch-mnist-cnn.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/03-pytorch-mnist-cnn.ipynb) 22 | 23 | ### Exercise 4 24 | 25 | Text sentiment classification with RNNs and using a pre-trained DistilBERT from Hugging Face. 26 | 27 | * *04a-pytorch-imdb-rnn.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/04a-pytorch-imdb-rnn.ipynb) 28 | * *04b-pytorch-imdb-huggingface.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/04b-pytorch-imdb-huggingface.ipynb) 29 | 30 | ## Setup 31 | 32 | We will use Jupyter Notebooks for all exercises on Day 1. There are several ways to set up a Jupyter environment for running the exercises: 33 | 34 | 35 | ### 1. LUMI web user interface 36 | 37 | *The default option.* 38 | 39 | 1. Go to the [LUMI web user interface](https://www.lumi.csc.fi/). 40 | 2. Login with Haka (Finnish university or research institute) or a CSC account (anyone with a valid CSC account) 41 | 3. Click "Jupyter for courses" (this works only if you have been added to the course project) 42 | 4. Make sure the selections are correct: 43 | - Reservation: PDL_CPU (during course day 1), No reservation (otherwise) 44 | - Project: project_462000863 45 | - Course module: Practical_Deep_Learning 46 | - if you do not see the course module listed, try "Restart Web Server" from the top-right "question-mark-inside-a-circle" menu item 47 | - Working directory: /users/your-username-here 48 | 5. Click "Launch" 49 | 6. Once the application has started, click "Connect to Jupyter" 50 | 7. If you are not familiar with Jupyter, take a moment to get to know the interface 51 | - open a new notebook (*File* -> *New* -> *Notebook*, on the menubar) 52 | - select *"Python 3"* as the kernel for the notebook 53 | - write some Python code into a Jupyter *cell* 54 | - execute the cell with *shift-enter* 55 | 56 | ### 2. CSC Noppe 57 | 58 | CSC's Noppe (https://noppe.csc.fi) provides easy-to-use environments for working with data and programming. You access everything via your web browser, and the CSC cloud environment does the computing in the background. There should be enough resources for everyone to launch a notebook instance, but unfortunately no GPUs. 59 | 60 | 1. Go to the [Noppe](https://noppe.csc.fi) frontpage 61 | 2. Login according to your selected login method: 62 | - **Haka or Virtu** (users from Finnish universities and research institutes) 63 | 1. Press the Login button on the frontpage 64 | 2. Press the Haka or Virtu button 65 | 3. Select the right organization 66 | 4. Enter your login information 67 | - **Special login** (if you have been given a separate username and password for the course) 68 | 1. Press the "Special Login" button on the Notebooks frontpage (below the Login button) 69 | 2. Enter your login information (the username goes into the email slot) 70 | 3. Start the "Practical Deep Learning" application 71 | - You might find it quicker if you select the "Machine Learning" tab 72 | - Click the round start button next to the "Practical Deep Learning" card 73 | - Wait for the session to launch 74 | 4. Once the Jupyter Notebook dashboard appears, navigate to `intro-to-dl/day1` 75 | 5. If you are not familiar with Jupyter, take a moment to get to know the interface 76 | - open a new notebook (*File* -> *New* -> *Notebook*, on the menubar) 77 | - select *"Python 3"* as the kernel for the notebook 78 | - write some Python code into a Jupyter *cell* 79 | - execute the cell with *shift-enter* 80 | 81 | #### :warning: Note 82 | The notebook sessions have a limited time (4 h), after which they, and any data or changes, will be *destroyed*. If you wish to save any files, you need to download them. 83 | 84 | ### 3. Running Jupyter on your laptop 85 | 86 | If you have a laptop that has both Jupyter and the other necessary Python packages installed, it is possible to use it. 
This works especially well if the laptop has an NVIDIA or AMD GPU that has been properly set up (CUDA, cuDNN or ROCm). 87 | 88 | * `git clone https://github.com/csc-training/intro-to-dl.git` 89 | * try to run the `day1/01-pytorch-test-setup.ipynb` notebook without errors 90 | 91 | ### 4. Google Colaboratory 92 | 93 | Google has a free Jupyter Notebooks service you may want to try out. No guarantees, but it does have GPUs available! A Google account is needed to use Colaboratory. 94 | 95 | * Click the corresponding Colab link [above in this document](#exercise-sessions) 96 | * If needed, sign in to your Google account using the "Sign in" button in the top-right corner 97 | * To use a GPU, select: Runtime => Change runtime type => Hardware accelerator: GPU 98 | * Some exercises require the `datasets` library, which isn't pre-installed on Colab. You can just run this in a cell: 99 | 100 | ``` 101 | !pip install datasets==2.21.0 102 | ``` 103 | -------------------------------------------------------------------------------- /day1/solutions/pytorch-imdb-rnn-example-answer.py: -------------------------------------------------------------------------------- 1 | embedding_dims = 50 2 | lstm_units = 32 3 | 4 | class TwoLayeredRNN(nn.Module): 5 | def __init__(self): 6 | super().__init__() 7 | self.emb = nn.Embedding(nb_words, embedding_dims) 8 | self.dropout = nn.Dropout(0.2) 9 | self.lstm = nn.LSTM(embedding_dims, lstm_units, num_layers=2, 10 | batch_first=True) 11 | self.linear = nn.Linear(lstm_units, 1) 12 | 13 | # With bidirectional 14 | #self.lstm = nn.LSTM(embedding_dims, lstm_units, num_layers=2, 15 | # batch_first=True, bidirectional=True) 16 | #self.linear = nn.Linear(lstm_units*2, 1) 17 | 18 | self.sigmoid = nn.Sigmoid() 19 | 20 | def forward(self, x): 21 | x = self.emb(x) 22 | x = self.dropout(x) 23 | x, (hn, cn) = self.lstm(x) 24 | x = self.linear(x[:, -1, :]) 25 | return self.sigmoid(x.view(-1)) 26 | -------------------------------------------------------------------------------- /day1/solutions/pytorch-mnist-cnn-example-answer.py: -------------------------------------------------------------------------------- 1 | class ComplexCNN(nn.Module): 2 | def __init__(self): 3 | super().__init__() 4 | self.layers = nn.Sequential( 5 | nn.Conv2d(1, 32, kernel_size=3, padding='valid'), 6 | nn.ReLU(), 7 | nn.Conv2d(32, 32, kernel_size=3, padding='valid'), 8 | nn.ReLU(), 9 | nn.MaxPool2d(kernel_size=2), 10 | nn.Dropout(0.25), 11 | nn.Flatten(), 12 | nn.Linear(12*12*32, 128), 13 | nn.ReLU(), 14 | nn.Dropout(0.5), 15 | nn.Linear(128, 10) 16 | ) 17 | 18 | def forward(self, x): 19 | return self.layers(x) -------------------------------------------------------------------------------- /day1/solutions/pytorch-mnist-mlp-example-answer.py: -------------------------------------------------------------------------------- 1 | class TwoLayerMLP(nn.Module): 2 | def __init__(self): 3 | super().__init__() 4 | self.layers = nn.Sequential( 5 | nn.Flatten(), 6 | nn.Linear(28*28, 50), 7 | nn.ReLU(), 8 | nn.Dropout(0.2), 9 | nn.Linear(50, 50), 10 | nn.ReLU(), 11 | nn.Dropout(0.2), 12 | nn.Linear(50, 10) 13 | ) 14 | 15 | def forward(self, x): 16 | return self.layers(x) 17 | -------------------------------------------------------------------------------- /day2/Exercise_5.md: -------------------------------------------------------------------------------- 1 | # Exercise 5 2 | 3 | In this exercise, we study image classification with two datasets: 4 | 5 | - [_Dogs vs. 
cats_](imgs/dvc.png) (dvc), where we train on 2000 images, each 6 | depicting either a cat or a dog, 7 | - [_German traffic signs_](imgs/gtsrb-montage.png) (gtsrb), where we train on 8 | 5535 images with [43 types of traffic signs](imgs/traffic-signs.png). 9 | 10 | ## Task 1 11 | 12 | ### Dogs vs. cats 13 | 14 | Starting with the _Dogs vs. cats_ (dvc) database, train, evaluate and report the 15 | accuracy with two different approaches: 16 | 17 | - CNN trained from scratch: [pytorch_dvc_cnn_simple.py](pytorch_dvc_cnn_simple.py) 18 | - Using a pre-trained CNN (VGG16) and fine-tuning: 19 | [pytorch_dvc_cnn_pretrained.py](pytorch_dvc_cnn_pretrained.py) 20 | 21 | You can run the training directly with the corresponding script listed above, 22 | for example: 23 | 24 | sbatch run.sh pytorch_dvc_cnn_simple.py 25 | 26 | As a reminder, you can check the status of your runs with the command: 27 | 28 | squeue --me 29 | 30 | The output of the run will appear in a file named `slurm-RUN_ID.out` 31 | where `RUN_ID` is the Slurm batch job id. You can check the last ten 32 | lines of that file with the command: 33 | 34 | tail slurm-RUN_ID.out 35 | 36 | Use `tail -f` if you want to continuously follow the progress of the 37 | output. (Press Ctrl-C when you want to stop following the file.) 38 | 39 | After training, the script runs an evaluation on the test set. You 40 | should find its results towards the end of the output log, on a line 41 | starting with "Testing"; it contains the accuracy 42 | (percentage of correctly classified images). 43 | 44 | Check the outputs of each run. Note that the pre-trained model will 45 | print out two results: once after the initial training, and again after 46 | fine-tuning. Which model gave the best test set result? Does 47 | fine-tuning improve the result? 48 | 49 | 50 | ### German traffic signs 51 | 52 | Repeat the experiment with the _German traffic signs_ (gtsrb) database. Which 53 | model gives the best result in this case? Compare the results with the previous 54 | dvc results. 55 | 56 | The scripts are named in the same way as before, just replace "dvc" with 57 | "gtsrb": 58 | 59 | - CNN trained from scratch: [pytorch_gtsrb_cnn_simple.py](pytorch_gtsrb_cnn_simple.py) 60 | - Using a pre-trained CNN (VGG16) and fine-tuning: 61 | [pytorch_gtsrb_cnn_pretrained.py](pytorch_gtsrb_cnn_pretrained.py) 62 | 63 | 64 | ## Task 2 65 | 66 | Pick one database (dvc or gtsrb) and try to improve the result, e.g., by 67 | tweaking the model or the training parameters (optimizer, batch size, number of 68 | epochs, etc.). 69 | 70 | ## Extracurricular 1 71 | 72 | There are scripts for both _Dogs vs. cats_ and _German traffic signs_ using 73 | Vision Transformers (ViTs). Compare these with the previous approaches. 74 | 75 | - [pytorch_dvc_vit.py](pytorch_dvc_vit.py): _Dogs vs. cats_ with a pre-trained ViT 76 | - [pytorch_gtsrb_vit.py](pytorch_gtsrb_vit.py): _German traffic signs_ with a pre-trained ViT 77 | 78 | ## Extracurricular 2 79 | 80 | There is another small dataset, [Aliens and predators](imgs/avp.png) 81 | (avp), with 694 training and 200 validation images in the directory 82 | `/scratch/project_462000863/data/avp` on LUMI. Modify the scripts for 83 | _Dogs vs. cats_ to classify between them. 
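For orientation, the modification is mostly a matter of pointing the data loading at the new directory while keeping a two-class setup. Below is a minimal sketch of the idea, assuming an `ImageFolder`-style layout with a `train` subdirectory; the layout and names are assumptions for illustration, not the scripts' actual code:

```python
import os
from torchvision import datasets, transforms

# Hypothetical sketch: the avp path comes from the exercise text above,
# but the train/validation subdirectory names are assumed here.
datadir = '/scratch/project_462000863/data/avp'
train_dir = os.path.join(datadir, 'train')

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # a typical input size for ImageNet-pretrained CNNs
    transforms.ToTensor(),
])
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
print(train_dataset.classes)  # should print the two class names
```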
84 | 85 | -------------------------------------------------------------------------------- /day2/Exercise_6.md: -------------------------------------------------------------------------------- 1 | # Exercise 6 2 | 3 | In this exercise, we study text categorization using the [_20 4 | newsgroups_](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) 5 | (20ng) dataset. The dataset contains 20,000 text documents (Usenet messages) 6 | in 20 categories (newsgroups or topics). For the embeddings in the RNN and CNN models we use pre-trained 100-dimensional [GloVe](https://nlp.stanford.edu/projects/glove/) vectors. 7 | 8 | ## Task 1 9 | 10 | Try three different approaches for text classification with the _20 newsgroups_ 11 | (20ng) dataset: 12 | 13 | - Recurrent neural network (RNN): [pytorch_20ng_rnn.py](pytorch_20ng_rnn.py) 14 | - BERT fine-tuning: [pytorch_20ng_bert.py](pytorch_20ng_bert.py) 15 | - Convolutional neural network (CNN): [pytorch_20ng_cnn.py](pytorch_20ng_cnn.py) 16 | 17 | Run all three models and compare their accuracies and run times. 18 | 19 | ## Task 2 20 | 21 | Pick one model (RNN, CNN or BERT) and try to improve the results, e.g., by 22 | tweaking the model or the training parameters (optimizer, batch size, number of 23 | epochs, etc.). 24 | 25 | You can also work on replacing BERT with another Transformers model (for example 26 | [DistilBert](https://huggingface.co/docs/transformers/master/en/model_doc/distilbert)). 27 | See also the [HuggingFace Transformers documentation](https://huggingface.co/transformers/). 28 | 29 | -------------------------------------------------------------------------------- /day2/Exercise_7.md: -------------------------------------------------------------------------------- 1 | # Exercise 7 2 | 3 | In this exercise, we take a pre-trained GPT-3-like model from the 4 | Hugging Face repository and fine-tune it with movie reviews from the 5 | IMDB dataset: http://ai.stanford.edu/~amaas/data/sentiment/ 6 | 7 | ## Task 1 8 | 9 | Run the fine-tuning of the GPT model with the script 10 | [pytorch_imdb_gpt.py](pytorch_imdb_gpt.py): 11 | 12 | ```bash 13 | sbatch run.sh pytorch_imdb_gpt.py 14 | ``` 15 | 16 | You can tweak some of the parameters in the script. For example, 17 | `max_steps` in `TrainingArguments` sets how many batches the model will 18 | train on. By default it is set to `max_steps=5000`, which runs for about 19 | 15 minutes on LUMI. Here are Hugging Face's notes on the many things 20 | that can be tried for improving training: 21 | 22 | 23 | At the end of the run it prints the perplexity on the test set. This 24 | is a measure of how well our trained model predicts the test set 25 | samples. The lower the value, the better. 26 | 27 | Also make a note of where the model is stored; it should be in a 28 | directory like 29 | `/scratch/project_462000863/data/users/$USER/gpt-imdb-model/`, where 30 | `$USER` is replaced with your username on LUMI. Take a look into that 31 | directory: 32 | 33 | ``` 34 | ls -ltr /scratch/project_462000863/data/users/$USER/gpt-imdb-model/ 35 | ``` 36 | 37 | This should list all the files and subdirectories, with the most 38 | recent ones at the bottom. Depending on your training configuration it 39 | should have stored several checkpoints; the latest one is usually the 40 | best one. 41 | 42 | ## Task 2 43 | 44 | You can try generating some movie reviews interactively with the 45 | notebook [pytorch_generate_gpt.ipynb](pytorch_generate_gpt.ipynb). 
You 46 | should be able to open the notebook as normal via "Jupyter for 47 | courses". GPUs are not needed for generating text. 48 | 49 | You need to point the `path_to_model` variable to a checkpoint of the 50 | model you trained in Task 1, for example something like 51 | `/scratch/project_462000863/data/users/$USER/gpt-imdb-model/checkpoint-5000` 52 | (here you need to replace `$USER` with your actual username). 53 | 54 | Experiment with different sampling strategies. At the end of the 55 | notebook there is also code to try the original distilgpt2 model. Does 56 | our fine-tuned model produce any better results? 57 | 58 | You can also try a model that we prepared earlier, which was trained for 59 | a full hour: 60 | 61 | ``` 62 | path_to_model = "/scratch/project_462000863/data/users/mvsjober/gpt-imdb-model/checkpoint-65000/" 63 | ``` 64 | -------------------------------------------------------------------------------- /day2/Exercise_8.md: -------------------------------------------------------------------------------- 1 | # Exercise 8 2 | 3 | In this exercise, we try using multiple GPUs. 4 | 5 | We have prepared a few examples where some of the earlier exercises 6 | have been converted to use DistributedDataParallel (DDP). 7 | 8 | - `pytorch_dvc_cnn_pretrained_multigpu.py`, which implements PyTorch 9 | DDP on the pre-trained CNN for cats-vs-dogs. You can try this with 10 | the `run-2gpus.sh` script. 11 | 12 | - `pytorch_imdb_gpt_multigpu.py`, which implements PyTorch DDP with the 13 | Hugging Face trainer. Use `run-2gpus.sh`. 14 | 15 | Run these scripts, and also try with 8 GPUs using `run-8gpus.sh`. 16 | 17 | - Can you see any speed improvement between using 1, 2 or 8 GPUs? 18 | - Do you get the same accuracy? 19 | - Consider per-GPU batch size vs effective batch size. (Hint: with DDP you can check the number of GPUs with `dist.get_world_size()`.) 20 | 21 | You can check if your runs are actually using multiple GPUs with the 22 | `rocm-smi` command. Check the `JOBID` of your running job with `squeue 23 | --me`, then run (replacing JOBID with the real number): 24 | 25 | srun --overlap --pty --jobid=JOBID bash 26 | 27 | This opens a new shell session on the same machine as your job. Here 28 | you can check your processes with `top` or the state of the GPUs with 29 | `rocm-smi`. A useful command to follow GPU usage is: 30 | 31 | watch rocm-smi 32 | 33 | It will update every 2 seconds. It should show values above 0% in the 34 | GPU% column for all the GPUs you intend to use. Press Ctrl-C to exit 35 | this view. 36 | -------------------------------------------------------------------------------- /day2/README.md: -------------------------------------------------------------------------------- 1 | # Day 2 2 | 3 | ## Exercise sessions 4 | 5 | * [Exercise 5: Image classification](Exercise_5.md) 6 | * [Exercise 6: Text categorization](Exercise_6.md) 7 | * [Exercise 7: Text generation](Exercise_7.md) 8 | * [Exercise 8: Using multiple GPUs](Exercise_8.md) 9 | 10 | ## Setup 11 | 12 | 1. Login to LUMI using either: 13 | - the web user interface at https://www.lumi.csc.fi/ ("Go to login") and start "Login node shell", or 14 | - login with your username and SSH key to `lumi.csc.fi`; for more instructions, see the LUMI documentation 15 | 16 | 2. 
In the login node shell, or SSH session, set up the module environment for using PyTorch: 17 | 18 | ```bash 19 | module purge 20 | module use /appl/local/csc/modulefiles/ 21 | module load pytorch 22 | ``` 23 | (In the LUMI web UI login node shell you can use Shift-Insert to paste if you copy commands from here.) 24 | 25 | 3. Go to the exercise directory: 26 | - if you ran the exercises of day 1 using LUMI's "Jupyter for courses", you should already have the repository cloned in your home directory 27 | 28 | ```bash 29 | cd PDL-2025-04/intro-to-dl/day2 30 | ``` 31 | 32 | If you don't have it, you can also clone it yourself: 33 | 34 | ```bash 35 | mkdir PDL-2025-04 36 | cd PDL-2025-04 37 | git clone https://github.com/csc-training/intro-to-dl 38 | cd intro-to-dl/day2 39 | ``` 40 | 41 | ## Edit and submit jobs 42 | 43 | 1. Edit the Python script, either by: 44 | - navigating to the file in the LUMI web UI file browser (Files → Home Directory → PDL-2025-04 → intro-to-dl → day2) and selecting "Edit" on that file (under the three-dots "⋮" menu), or 45 | - opening it with your favorite text editor in the terminal, for example: 46 | ```bash 47 | nano pytorch_test.py 48 | ``` 49 | 50 | 2. Submit the job: 51 | 52 | ```bash 53 | sbatch run.sh pytorch_test.py 54 | ``` 55 | 56 | 3. See the status of your jobs or the queue you are using: 57 | 58 | ```bash 59 | squeue --me 60 | squeue -p small-g 61 | ``` 62 | 63 | 4. After the job has finished, examine the results: 64 | 65 | ```bash 66 | less slurm-xxxxxxxx.out 67 | ``` 68 | 69 | 5. Go back to step 1 until you are happy with the results. 70 | 71 | ## Optional: TensorBoard 72 | 73 | You can use TensorBoard either via the LUMI web user interface (recommended) or via the terminal using SSH port forwarding. Both approaches are explained below. 74 | 75 | ### Via the LUMI web interface (the recommended method) 76 | 77 | 1. Log in via https://www.lumi.csc.fi/ 78 | 2. Select the menu item: Apps → TensorBoard 79 | 3. In the form: 80 | - Select course project: project_462000863 81 | - Specify the "TensorBoard log directory"; it's where you have cloned the course repository plus "day2/logs", for example: 82 | `~/PDL-2025-04/intro-to-dl/day2/logs`. You can run `pwd` in the terminal to find out the full path where you are working. 83 | - Leave the rest at the default settings 84 | 4. Click "Launch" 85 | 5. Wait until you see the "Connect to Tensorboard" button, then click it. 86 | 6. When you're done using TensorBoard, please go to "My Interactive Sessions" in the LUMI web user interface and "Cancel" the session. (It will automatically terminate once the reserved time is up, but it's always better to release the resource as soon as possible so that others can use it.) 87 | 88 | ### Via SSH port forwarding 89 | 90 | 1. Login again from a terminal window to LUMI with SSH port forwarding: 91 | 92 | ```bash 93 | ssh -L PORT:localhost:PORT lumi.csc.fi 94 | ``` 95 | 96 | Replace `PORT` with a freely selectable port number (>1023). By default, TensorBoard uses port 6006, but **select a different port** to avoid overlaps. 97 | 98 | 2. Set up the module environment and start the TensorBoard server: 99 | 100 | ```bash 101 | module purge 102 | module use /appl/local/csc/modulefiles/ 103 | module load tensorflow 104 | singularity_wrapper exec tensorboard --logdir=PDL-2025-04/intro-to-dl/day2/logs --port=PORT --bind_all 105 | ``` 106 | 107 | 3. To access TensorBoard, point your web browser to *localhost:PORT*.
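Either way, TensorBoard only displays what the training scripts have already written under `day2/logs`: the day 2 scripts create a timestamped log directory with `tensorboardX` and write their metrics into it. A minimal sketch of that pattern, for reference (the tag and values below are made-up examples):

```python
import os
from datetime import datetime
import tensorboardX

# Each run gets its own timestamped directory under day2/logs
time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logdir = os.path.join(os.getcwd(), "logs", "example-" + time_str)
os.makedirs(logdir)

log = tensorboardX.SummaryWriter(logdir)
log.add_scalar("train_accuracy", 0.85, 3)  # value 0.85 logged for epoch 3
log.close()
```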
108 | -------------------------------------------------------------------------------- /day2/imgs/avp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/intro-to-dl/3ce19c7b5d9860ae64f15698b2b362397287e075/day2/imgs/avp.png -------------------------------------------------------------------------------- /day2/imgs/dvc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/intro-to-dl/3ce19c7b5d9860ae64f15698b2b362397287e075/day2/imgs/dvc.png -------------------------------------------------------------------------------- /day2/imgs/gtsrb-montage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/intro-to-dl/3ce19c7b5d9860ae64f15698b2b362397287e075/day2/imgs/gtsrb-montage.png -------------------------------------------------------------------------------- /day2/imgs/traffic-signs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/intro-to-dl/3ce19c7b5d9860ae64f15698b2b362397287e075/day2/imgs/traffic-signs.png -------------------------------------------------------------------------------- /day2/logs/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /day2/pytorch_20ng_bert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # 20 newsgroup text classification with BERT finetuning 5 | # 6 | # In this script, we'll use a pre-trained BERT 7 | # (https://arxiv.org/abs/1810.04805) model for text classification 8 | # using PyTorch and HuggingFace's Transformers 9 | # (https://github.com/huggingface/transformers). 
10 | 11 | import torch 12 | from torch.utils.data import (TensorDataset, DataLoader, 13 | RandomSampler, SequentialSampler) 14 | from transformers import BertTokenizer 15 | from transformers import BertForSequenceClassification 16 | from transformers import AdamW, get_linear_schedule_with_warmup 17 | 18 | from packaging.version import Version as LV 19 | 20 | from sklearn.model_selection import train_test_split 21 | 22 | from datetime import datetime 23 | 24 | import os 25 | import sys 26 | 27 | import numpy as np 28 | 29 | torch.manual_seed(42) 30 | 31 | if torch.cuda.is_available(): 32 | device = torch.device('cuda') 33 | else: 34 | device = torch.device('cpu') 35 | 36 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 37 | assert LV(torch.__version__) >= LV("1.0.0") 38 | 39 | 40 | def correct(output, target): 41 | predicted = output.argmax(1) # pick class with largest network output 42 | correct_ones = (predicted == target).type(torch.float) 43 | return correct_ones.sum().item() # count number of correct ones 44 | 45 | 46 | def train(data_loader, model, scheduler, optimizer): 47 | model.train() 48 | 49 | num_batches = 0 50 | num_items = 0 51 | 52 | total_loss = 0 53 | total_correct = 0 54 | for input_ids, input_mask, labels in data_loader: 55 | # Copy data and targets to GPU 56 | input_ids = input_ids.to(device) 57 | input_mask = input_mask.to(device) 58 | labels = labels.to(device) 59 | 60 | # Do a forward pass 61 | output = model(input_ids, token_type_ids=None, 62 | attention_mask=input_mask, labels=labels) 63 | 64 | loss = output[0] 65 | logits = output[1] 66 | 67 | total_loss += loss 68 | num_batches += 1 69 | 70 | # Count number of correct 71 | total_correct += correct(logits, labels) 72 | num_items += len(labels) 73 | 74 | # Backpropagation 75 | loss.backward() 76 | optimizer.step() 77 | optimizer.zero_grad() 78 | scheduler.step() 79 | 80 | return { 81 | 'loss': total_loss/num_batches, 82 | 'accuracy': total_correct/num_items 83 | } 84 | 85 | 86 | def test(test_loader, model): 87 | model.eval() 88 | 89 | num_batches = len(test_loader) 90 | num_items = len(test_loader.dataset) 91 | 92 | test_loss = 0 93 | total_correct = 0 94 | 95 | with torch.no_grad(): 96 | for input_ids, input_mask, labels in test_loader: 97 | # Copy data and targets to GPU 98 | input_ids = input_ids.to(device) 99 | input_mask = input_mask.to(device) 100 | labels = labels.to(device) 101 | 102 | # Do a forward pass 103 | output = model(input_ids, token_type_ids=None, 104 | attention_mask=input_mask) 105 | 106 | logits = output[0] 107 | 108 | # Count number of correct digits 109 | total_correct += correct(logits, labels) 110 | 111 | return { 112 | 'loss': test_loss/num_batches, 113 | 'accuracy': total_correct/num_items 114 | } 115 | 116 | 117 | def log_measures(ret, log, prefix, epoch): 118 | if log is not None: 119 | for key, value in ret.items(): 120 | log.add_scalar(prefix + "_" + key, value, epoch) 121 | 122 | 123 | def main(): 124 | try: 125 | import tensorboardX 126 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 127 | logdir = os.path.join(os.getcwd(), "logs", "20ng-bert-" + time_str) 128 | print('TensorBoard log directory:', logdir) 129 | os.makedirs(logdir) 130 | log = tensorboardX.SummaryWriter(logdir) 131 | except (ImportError, FileExistsError): 132 | log = None 133 | 134 | datapath = os.getenv('DATADIR') 135 | if datapath is None: 136 | print("Please set DATADIR environment variable!") 137 | sys.exit(1) 138 | 139 | # 20 Newsgroups data set 140 | text_data_dir = 
os.path.join(datapath, "20_newsgroup") 141 | 142 | print('Processing text dataset') 143 | 144 | texts = []  # list of text samples 145 | labels_index = {}  # dictionary mapping label name to numeric id 146 | labels = []  # list of label ids 147 | for name in sorted(os.listdir(text_data_dir)): 148 | path = os.path.join(text_data_dir, name) 149 | if os.path.isdir(path): 150 | label_id = len(labels_index) 151 | labels_index[name] = label_id 152 | print('-', name, label_id) 153 | for fname in sorted(os.listdir(path)): 154 | if fname.isdigit(): 155 | fpath = os.path.join(path, fname) 156 | args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} 157 | with open(fpath, **args) as f: 158 | t = f.read() 159 | i = t.find('\n\n')  # skip header 160 | if 0 < i: 161 | t = t[i:] 162 | texts.append(t) 163 | labels.append(label_id) 164 | 165 | print('Found %s texts.' % len(texts)) 166 | 167 | # Split the data into a training set and a test set using 168 | # scikit-learn's train_test_split(). 169 | 170 | TEST_SET = 4000 171 | 172 | (sentences_train, sentences_test, 173 | labels_train, labels_test) = train_test_split(texts, labels, 174 | test_size=TEST_SET, 175 | shuffle=True, 176 | random_state=42) 177 | 178 | print('Length of training texts:', len(sentences_train)) 179 | print('Length of training labels:', len(labels_train)) 180 | print('Length of test texts:', len(sentences_test)) 181 | print('Length of test labels:', len(labels_test)) 182 | 183 | # The token [CLS] is a special token required by BERT at the beginning 184 | # of the sentence. 185 | 186 | sentences_train = ["[CLS] " + s for s in sentences_train] 187 | sentences_test = ["[CLS] " + s for s in sentences_test] 188 | 189 | print("The first training sentence:") 190 | print(sentences_train[0], 'LABEL:', labels_train[0]) 191 | 192 | # Next we specify the pre-trained BERT model we are going to use. The 193 | # model `"bert-base-uncased"` is the lowercased "base" model 194 | # (12-layer, 768-hidden, 12-heads, 110M parameters). 195 | # 196 | # We load the used vocabulary from the BERT model, and use the BERT 197 | # tokenizer to convert the sentences into tokens that match the data 198 | # the BERT model was trained on. 199 | 200 | print('Initializing BertTokenizer') 201 | 202 | BERTMODEL = 'bert-base-uncased' 203 | 204 | tokenizer = BertTokenizer.from_pretrained(BERTMODEL, do_lower_case=True) 205 | 206 | tokenized_train = [tokenizer.tokenize(s) for s in sentences_train] 207 | tokenized_test = [tokenizer.tokenize(s) for s in sentences_test] 208 | 209 | print("The full tokenized first training sentence:") 210 | print(tokenized_train[0]) 211 | 212 | # Now we set the maximum sequence lengths for our training and test 213 | # sentences as `MAX_LEN_TRAIN` and `MAX_LEN_TEST`. The maximum length 214 | # supported by the used BERT model is 512. 215 | # 216 | # The token `[SEP]` is another special token required by BERT at the 217 | # end of the sentence. 218 | 219 | MAX_LEN_TRAIN, MAX_LEN_TEST = 128, 512 220 | 221 | tokenized_train = [t[:(MAX_LEN_TRAIN-1)]+['[SEP]'] for t in tokenized_train] 222 | tokenized_test = [t[:(MAX_LEN_TEST-1)]+['[SEP]'] for t in tokenized_test] 223 | 224 | print("The truncated tokenized first training sentence:") 225 | print(tokenized_train[0]) 226 | 227 | # Next we use the BERT tokenizer to convert each token into an integer 228 | # index in the BERT vocabulary. We also pad any shorter sequences to 229 | # `MAX_LEN_TRAIN` or `MAX_LEN_TEST` indices with trailing zeros. 
230 | 231 | ids_train = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_train] 232 | ids_train = np.array([np.pad(i, (0, MAX_LEN_TRAIN-len(i)), 233 | mode='constant') for i in ids_train]) 234 | 235 | ids_test = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_test] 236 | ids_test = np.array([np.pad(i, (0, MAX_LEN_TEST-len(i)), 237 | mode='constant') for i in ids_test]) 238 | 239 | print("The indices of the first training sentence:") 240 | print(ids_train[0]) 241 | 242 | # BERT also requires *attention masks*, with 1 for each real token in 243 | # the sequences and 0 for the padding: 244 | 245 | amasks_train, amasks_test = [], [] 246 | 247 | for seq in ids_train: 248 | seq_mask = [float(i > 0) for i in seq] 249 | amasks_train.append(seq_mask) 250 | 251 | for seq in ids_test: 252 | seq_mask = [float(i > 0) for i in seq] 253 | amasks_test.append(seq_mask) 254 | 255 | # We use again scikit-learn's train_test_split to use 10% of our 256 | # training data as a validation set, and then convert all data into 257 | # torch.tensors. 258 | 259 | (train_inputs, validation_inputs, 260 | train_labels, validation_labels) = train_test_split(ids_train, 261 | labels_train, 262 | random_state=42, 263 | test_size=0.1) 264 | (train_masks, validation_masks, 265 | _, _) = train_test_split(amasks_train, ids_train, 266 | random_state=42, test_size=0.1) 267 | 268 | train_inputs = torch.tensor(train_inputs) 269 | train_labels = torch.tensor(train_labels) 270 | train_masks = torch.tensor(train_masks) 271 | 272 | validation_inputs = torch.tensor(validation_inputs) 273 | validation_labels = torch.tensor(validation_labels) 274 | validation_masks = torch.tensor(validation_masks) 275 | 276 | test_inputs = torch.tensor(ids_test) 277 | test_labels = torch.tensor(labels_test) 278 | test_masks = torch.tensor(amasks_test) 279 | 280 | # Next we create PyTorch DataLoaders for all data sets. 281 | # 282 | # For fine-tuning BERT on a specific task, the authors recommend a 283 | # batch size of 16 or 32. 284 | 285 | BATCH_SIZE = 32 286 | 287 | print('Train: ', end="") 288 | train_dataset = TensorDataset(train_inputs, train_masks, 289 | train_labels) 290 | train_sampler = RandomSampler(train_dataset) 291 | train_loader = DataLoader(train_dataset, sampler=train_sampler, 292 | batch_size=BATCH_SIZE) 293 | print(len(train_dataset), 'messages') 294 | 295 | print('Validation: ', end="") 296 | validation_dataset = TensorDataset(validation_inputs, validation_masks, 297 | validation_labels) 298 | validation_sampler = SequentialSampler(validation_dataset) 299 | validation_loader = DataLoader(validation_dataset, 300 | sampler=validation_sampler, 301 | batch_size=BATCH_SIZE) 302 | print(len(validation_dataset), 'messages') 303 | 304 | print('Test: ', end="") 305 | test_dataset = TensorDataset(test_inputs, test_masks, test_labels) 306 | test_sampler = SequentialSampler(test_dataset) 307 | test_loader = DataLoader(test_dataset, sampler=test_sampler, 308 | batch_size=BATCH_SIZE) 309 | print(len(test_dataset), 'messages') 310 | 311 | # ## BERT model initialization 312 | # 313 | # We now load a pretrained BERT model with a single linear 314 | # classification layer added on top. 
315 | 316 | print('Initializing BertForSequenceClassification') 317 | 318 | model = BertForSequenceClassification.from_pretrained(BERTMODEL, 319 | num_labels=20) 320 | model = model.to(device) 321 | 322 | # We set the remaining hyperparameters needed for fine-tuning the 323 | # pretrained model: 324 | # * num_epochs: the number of training epochs in fine-tuning 325 | # (recommended values between 2 and 4) 326 | # * weight_decay: weight decay for the Adam optimizer 327 | # * lr: learning rate for the Adam optimizer (2e-5 to 5e-5 recommended) 328 | # * warmup_steps: number of warmup steps to (linearly) reach the set 329 | # learning rate 330 | # 331 | # We also need to grab the training parameters from the pretrained model. 332 | 333 | num_epochs = 4 334 | weight_decay = 0.01 335 | lr = 2e-5 336 | warmup_steps = int(0.2*len(train_loader)) 337 | 338 | no_decay = ['bias', 'LayerNorm.weight'] 339 | optimizer_grouped_parameters = [ 340 | {'params': [p for n, p in model.named_parameters() 341 | if not any(nd in n for nd in no_decay)], 342 | 'weight_decay': weight_decay}, 343 | {'params': [p for n, p in model.named_parameters() 344 | if any(nd in n for nd in no_decay)], 345 | 'weight_decay': 0.0} 346 | ] 347 | optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8) 348 | scheduler = get_linear_schedule_with_warmup( 349 | optimizer, num_warmup_steps=warmup_steps, 350 | num_training_steps=len(train_loader)*num_epochs) 351 | 352 | # Training loop 353 | start_time = datetime.now() 354 | for epoch in range(num_epochs): 355 | train_ret = train(train_loader, model, scheduler, optimizer) 356 | log_measures(train_ret, log, "train", epoch) 357 | 358 | val_ret = test(validation_loader, model) 359 | log_measures(val_ret, log, "val", epoch) 360 | print(f"Epoch {epoch+1}: " 361 | f"train loss: {train_ret['loss']:.6f} " 362 | f"train accuracy: {train_ret['accuracy']:.2%}, " 363 | f"val accuracy: {val_ret['accuracy']:.2%}") 364 | 365 | end_time = datetime.now() 366 | print('Total training time: {}.'.format(end_time - start_time)) 367 | 368 | # Inference 369 | ret = test(test_loader, model) 370 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 371 | 372 | 373 | if __name__ == "__main__": 374 | main() 375 | -------------------------------------------------------------------------------- /day2/pytorch_20ng_cnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 20 Newsgroups text classification with pre-trained word embeddings 5 | # 6 | # In this script, we'll use pre-trained [GloVe word 7 | # embeddings](http://nlp.stanford.edu/projects/glove/) for text 8 | # classification using PyTorch. 
9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.autograd import Variable 15 | from torch.utils.data import TensorDataset, DataLoader 16 | 17 | from packaging.version import Version as LV 18 | 19 | from gensim.utils import simple_preprocess 20 | from gensim.corpora import Dictionary 21 | 22 | from sklearn.model_selection import train_test_split 23 | from sklearn.metrics import confusion_matrix 24 | 25 | from datetime import datetime 26 | 27 | import os 28 | import sys 29 | 30 | import numpy as np 31 | 32 | torch.manual_seed(42) 33 | 34 | if torch.cuda.is_available(): 35 | device = torch.device('cuda') 36 | else: 37 | device = torch.device('cpu') 38 | 39 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 40 | assert(LV(torch.__version__) >= LV("1.0.0")) 41 | 42 | 43 | class Net(nn.Module): 44 | def __init__(self, embedding_matrix): 45 | super(Net, self).__init__() 46 | self.emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=True) 47 | self.layers = nn.Sequential( 48 | nn.Conv1d(100, 128, 5), # output: batch_size x 128 x seq_len-4 49 | nn.ReLU(), 50 | nn.MaxPool1d(5), # output: bs x 128 x 199 51 | nn.Conv1d(128, 128, 5), # output: bs x 128 x 199 52 | nn.ReLU(), 53 | nn.MaxPool1d(5), # output: bs x 128 x 39 54 | nn.Conv1d(128, 128, 5), # output: bs x 128 x 35 55 | nn.ReLU(), 56 | nn.AdaptiveMaxPool1d(1) # output: bs x 128 x 1 57 | ) 58 | self.linear_layers = nn.Sequential( 59 | nn.Flatten(), 60 | nn.Linear(128, 128), 61 | nn.ReLU(), 62 | nn.Linear(128, 20), 63 | ) 64 | 65 | def forward(self, x): 66 | x = self.emb(x) # output from embedding: batch_size x seq_len x embedding dim. 67 | x = x.transpose(1,2) # change to: batch_size x embedding_dim x seq_len 68 | x = self.layers(x) 69 | x = self.linear_layers(x) 70 | return x 71 | 72 | 73 | def correct(output, target): 74 | predicted = output.argmax(1) # pick class with largest network output 75 | correct_ones = (predicted == target).type(torch.float) 76 | return correct_ones.sum().item() # count number of correct ones 77 | 78 | 79 | def train(data_loader, model, criterion, optimizer): 80 | model.train() 81 | 82 | num_batches = 0 83 | num_items = 0 84 | 85 | total_loss = 0 86 | total_correct = 0 87 | for data, target in data_loader: 88 | # Copy data and targets to GPU 89 | data = data.to(device) 90 | target = target.to(device) 91 | 92 | # Do a forward pass 93 | output = model(data) 94 | 95 | # Calculate the loss 96 | loss = criterion(output, target) 97 | total_loss += loss 98 | num_batches += 1 99 | 100 | # Count number of correct 101 | total_correct += correct(output, target) 102 | num_items += len(target) 103 | 104 | # Backpropagation 105 | loss.backward() 106 | optimizer.step() 107 | optimizer.zero_grad() 108 | 109 | return { 110 | 'loss': total_loss/num_batches, 111 | 'accuracy': total_correct/num_items 112 | } 113 | 114 | 115 | def test(test_loader, model, criterion): 116 | model.eval() 117 | 118 | num_batches = len(test_loader) 119 | num_items = len(test_loader.dataset) 120 | 121 | test_loss = 0 122 | total_correct = 0 123 | 124 | with torch.no_grad(): 125 | for data, target in test_loader: 126 | # Copy data and targets to GPU 127 | data = data.to(device) 128 | target = target.to(device) 129 | 130 | # Do a forward pass 131 | output = model(data) 132 | 133 | # Calculate the loss 134 | loss = criterion(output, target) 135 | test_loss += loss.item() 136 | 137 | # Count number of correct digits 138 | total_correct += 
correct(output, target) 139 | 140 | return { 141 | 'loss': test_loss/num_batches, 142 | 'accuracy': total_correct/num_items 143 | } 144 | 145 | 146 | def log_measures(ret, log, prefix, epoch): 147 | if log is not None: 148 | for key, value in ret.items(): 149 | log.add_scalar(prefix + "_" + key, value, epoch) 150 | 151 | 152 | def main(): 153 | try: 154 | import tensorboardX 155 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 156 | logdir = os.path.join(os.getcwd(), "logs", "20ng-cnn-" + time_str) 157 | print('TensorBoard log directory:', logdir) 158 | os.makedirs(logdir) 159 | log = tensorboardX.SummaryWriter(logdir) 160 | except (ImportError, FileExistsError): 161 | log = None 162 | 163 | # ## GloVe word embeddings 164 | # 165 | # Let's begin by loading a datafile containing pre-trained word 166 | # embeddings. The datafile contains 100-dimensional embeddings for 167 | # 400,000 English words. 168 | 169 | datapath = os.getenv('DATADIR') 170 | if datapath is None: 171 | print("Please set DATADIR environment variable!") 172 | sys.exit(1) 173 | 174 | glove_dir = os.path.join(datapath, "glove.6B") 175 | 176 | print('Indexing word vectors.') 177 | 178 | embeddings_index = {} 179 | with open(os.path.join(glove_dir, 'glove.6B.100d.txt')) as f: 180 | for line in f: 181 | values = line.split() 182 | word = values[0] 183 | coefs = np.asarray(values[1:], dtype='float32') 184 | embeddings_index[word] = coefs 185 | 186 | print('Found %s word vectors.' % len(embeddings_index)) 187 | 188 | 189 | # ## 20 Newsgroups data set 190 | # 191 | # Next we'll load the [20 Newsgroups] 192 | # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) 193 | # data set. 194 | # 195 | # The dataset contains 20000 messages collected from 20 different 196 | # Usenet newsgroups (1000 messages from each group): 197 | # 198 | # | alt.atheism | soc.religion.christian | comp.windows.x | sci.crypt 199 | # | talk.politics.guns | comp.sys.ibm.pc.hardware | rec.autos | sci.electronics 200 | # | talk.politics.mideast | comp.graphics | rec.motorcycles | sci.space 201 | # | talk.politics.misc | comp.os.ms-windows.misc | rec.sport.baseball | sci.med 202 | # | talk.religion.misc | comp.sys.mac.hardware | rec.sport.hockey | misc.forsale 203 | 204 | text_data_dir = os.path.join(datapath, "20_newsgroup") 205 | 206 | print('Processing text dataset') 207 | 208 | texts = [] # list of text samples 209 | labels_index = {} # dictionary mapping label name to numeric id 210 | labels = [] # list of label ids 211 | for name in sorted(os.listdir(text_data_dir)): 212 | path = os.path.join(text_data_dir, name) 213 | if os.path.isdir(path): 214 | label_id = len(labels_index) 215 | labels_index[name] = label_id 216 | print('-', name, label_id) 217 | for fname in sorted(os.listdir(path)): 218 | if fname.isdigit(): 219 | fpath = os.path.join(path, fname) 220 | args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} 221 | with open(fpath, **args) as f: 222 | t = f.read() 223 | i = t.find('\n\n') # skip header 224 | if 0 < i: 225 | t = t[i:] 226 | texts.append(t) 227 | labels.append(label_id) 228 | 229 | print('Found %s texts.' % len(texts)) 230 | 231 | # Tokenize the texts using gensim. 232 | 233 | tokens = list() 234 | for text in texts: 235 | tokens.append(simple_preprocess(text)) 236 | 237 | # Vectorize the text samples into a 2D integer tensor. 
238 | 239 | MAX_NUM_WORDS = 10000 # 2 words reserved: 0=pad, 1=oov 240 | MAX_SEQUENCE_LENGTH = 1000 241 | 242 | dictionary = Dictionary(tokens) 243 | dictionary.filter_extremes(no_below=0, no_above=1.0, 244 | keep_n=MAX_NUM_WORDS-2) 245 | 246 | word_index = dictionary.token2id 247 | print('Found %s unique tokens.' % len(word_index)) 248 | 249 | data = [dictionary.doc2idx(t) for t in tokens] 250 | 251 | # Truncate and pad sequences. 252 | 253 | data = [i[:MAX_SEQUENCE_LENGTH] for i in data] 254 | data = np.array([np.pad(i, (MAX_SEQUENCE_LENGTH-len(i), 0), 255 | mode='constant', constant_values=-2) 256 | for i in data], dtype=int) 257 | data = data + 2 258 | 259 | print('Shape of data tensor:', data.shape) 260 | print('Length of label vector:', len(labels)) 261 | 262 | # Split the data into a training set and a validation set 263 | 264 | VALIDATION_SET, TEST_SET = 1000, 4000 265 | 266 | x_train, x_test, y_train, y_test = train_test_split(data, labels, 267 | test_size=TEST_SET, 268 | shuffle=True, 269 | random_state=42) 270 | 271 | x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, 272 | test_size=VALIDATION_SET, 273 | shuffle=False) 274 | 275 | print('Shape of training data tensor:', x_train.shape) 276 | print('Length of training label vector:', len(y_train)) 277 | print('Shape of validation data tensor:', x_val.shape) 278 | print('Length of validation label vector:', len(y_val)) 279 | print('Shape of test data tensor:', x_test.shape) 280 | print('Length of test label vector:', len(y_test)) 281 | 282 | # Create PyTorch DataLoaders for all data sets: 283 | 284 | BATCH_SIZE = 128 285 | 286 | print('Train: ', end="") 287 | train_dataset = TensorDataset(torch.LongTensor(x_train), 288 | torch.LongTensor(y_train)) 289 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, 290 | shuffle=True, num_workers=4) 291 | print(len(train_dataset), 'messages') 292 | 293 | print('Validation: ', end="") 294 | validation_dataset = TensorDataset(torch.LongTensor(x_val), 295 | torch.LongTensor(y_val)) 296 | validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, 297 | shuffle=False, num_workers=4) 298 | print(len(validation_dataset), 'messages') 299 | 300 | print('Test: ', end="") 301 | test_dataset = TensorDataset(torch.LongTensor(x_test), 302 | torch.LongTensor(y_test)) 303 | test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, 304 | shuffle=False, num_workers=4) 305 | print(len(test_dataset), 'messages') 306 | 307 | # Prepare the embedding matrix: 308 | 309 | print('Preparing embedding matrix.') 310 | 311 | EMBEDDING_DIM = 100 312 | 313 | embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM)) 314 | n_not_found = 0 315 | for word, i in word_index.items(): 316 | if i >= MAX_NUM_WORDS-2: 317 | continue 318 | embedding_vector = embeddings_index.get(word) 319 | if embedding_vector is not None: 320 | # words not found in embedding index will be all-zeros. 
321 | embedding_matrix[i+2] = embedding_vector 322 | else: 323 | n_not_found += 1 324 | 325 | embedding_matrix = torch.FloatTensor(embedding_matrix) 326 | print('Shape of embedding matrix:', embedding_matrix.shape) 327 | print('Words not found in pre-trained embeddings:', n_not_found) 328 | 329 | model = Net(embedding_matrix) 330 | model = model.to(device) 331 | 332 | # optimizer = optim.RMSprop(model.parameters(), lr=0.001) 333 | optimizer = optim.Adam(model.parameters(), lr=0.0005) 334 | 335 | criterion = nn.CrossEntropyLoss() 336 | 337 | print(model) 338 | 339 | num_epochs = 40 340 | 341 | # Training loop 342 | start_time = datetime.now() 343 | for epoch in range(num_epochs): 344 | train_ret = train(train_loader, model, criterion, optimizer) 345 | log_measures(train_ret, log, "train", epoch) 346 | 347 | val_ret = test(validation_loader, model, criterion) 348 | log_measures(val_ret, log, "val", epoch) 349 | print(f"Epoch {epoch+1}: " 350 | f"train loss: {train_ret['loss']:.6f} " 351 | f"train accuracy: {train_ret['accuracy']:.2%}, " 352 | f"val accuracy: {val_ret['accuracy']:.2%}") 353 | 354 | end_time = datetime.now() 355 | print('Total training time: {}.'.format(end_time - start_time)) 356 | 357 | # Inference 358 | ret = test(test_loader, model, criterion) 359 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 360 | 361 | 362 | if __name__ == "__main__": 363 | main() 364 | -------------------------------------------------------------------------------- /day2/pytorch_20ng_rnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 20 Newsgroups text classification with pre-trained word embeddings 5 | # 6 | # In this script, we'll use pre-trained [GloVe word 7 | # embeddings](http://nlp.stanford.edu/projects/glove/) for text 8 | # classification using PyTorch. 
9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.autograd import Variable 15 | from torch.utils.data import TensorDataset, DataLoader 16 | 17 | from packaging.version import Version as LV 18 | 19 | from gensim.utils import simple_preprocess 20 | from gensim.corpora import Dictionary 21 | 22 | from sklearn.model_selection import train_test_split 23 | from sklearn.metrics import confusion_matrix 24 | 25 | from datetime import datetime 26 | 27 | import os 28 | import sys 29 | 30 | import numpy as np 31 | 32 | torch.manual_seed(42) 33 | 34 | if torch.cuda.is_available(): 35 | device = torch.device('cuda') 36 | else: 37 | device = torch.device('cpu') 38 | 39 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 40 | assert(LV(torch.__version__) >= LV("1.0.0")) 41 | 42 | 43 | class Net(nn.Module): 44 | def __init__(self, embedding_matrix): 45 | super(Net, self).__init__() 46 | self.emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=True) 47 | self.lstm = nn.LSTM(100, 128, num_layers=2, batch_first=True) 48 | self.linear = nn.Linear(128,20) 49 | 50 | def forward(self, x): 51 | x = self.emb(x) 52 | 53 | # LSTM also returns the values of the internal h_n and c_n parameters 54 | x, (hn, cn) = self.lstm(x) 55 | 56 | # we pick only the last output after having processed the whole sequence 57 | x = self.linear(x[:, -1, :]) 58 | 59 | return x 60 | 61 | 62 | def correct(output, target): 63 | predicted = output.argmax(1) # pick class with largest network output 64 | correct_ones = (predicted == target).type(torch.float) 65 | return correct_ones.sum().item() # count number of correct ones 66 | 67 | 68 | def train(data_loader, model, criterion, optimizer): 69 | model.train() 70 | 71 | num_batches = 0 72 | num_items = 0 73 | 74 | total_loss = 0 75 | total_correct = 0 76 | for data, target in data_loader: 77 | # Copy data and targets to GPU 78 | data = data.to(device) 79 | target = target.to(device) 80 | 81 | # Do a forward pass 82 | output = model(data) 83 | 84 | # Calculate the loss 85 | loss = criterion(output, target) 86 | total_loss += loss 87 | num_batches += 1 88 | 89 | # Count number of correct 90 | total_correct += correct(output, target) 91 | num_items += len(target) 92 | 93 | # Backpropagation 94 | loss.backward() 95 | optimizer.step() 96 | optimizer.zero_grad() 97 | 98 | return { 99 | 'loss': total_loss/num_batches, 100 | 'accuracy': total_correct/num_items 101 | } 102 | 103 | 104 | def test(test_loader, model, criterion): 105 | model.eval() 106 | 107 | num_batches = len(test_loader) 108 | num_items = len(test_loader.dataset) 109 | 110 | test_loss = 0 111 | total_correct = 0 112 | 113 | with torch.no_grad(): 114 | for data, target in test_loader: 115 | # Copy data and targets to GPU 116 | data = data.to(device) 117 | target = target.to(device) 118 | 119 | # Do a forward pass 120 | output = model(data) 121 | 122 | # Calculate the loss 123 | loss = criterion(output, target) 124 | test_loss += loss.item() 125 | 126 | # Count number of correct digits 127 | total_correct += correct(output, target) 128 | 129 | return { 130 | 'loss': test_loss/num_batches, 131 | 'accuracy': total_correct/num_items 132 | } 133 | 134 | 135 | def log_measures(ret, log, prefix, epoch): 136 | if log is not None: 137 | for key, value in ret.items(): 138 | log.add_scalar(prefix + "_" + key, value, epoch) 139 | 140 | 141 | def main(): 142 | try: 143 | import tensorboardX 144 | time_str = 
datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 145 | logdir = os.path.join(os.getcwd(), "logs", "20ng-rnn-" + time_str) 146 | print('TensorBoard log directory:', logdir) 147 | os.makedirs(logdir) 148 | log = tensorboardX.SummaryWriter(logdir) 149 | except (ImportError, FileExistsError): 150 | log = None 151 | 152 | # ## GloVe word embeddings 153 | # 154 | # Let's begin by loading a datafile containing pre-trained word 155 | # embeddings. The datafile contains 100-dimensional embeddings for 156 | # 400,000 English words. 157 | 158 | datapath = os.getenv('DATADIR') 159 | if datapath is None: 160 | print("Please set DATADIR environment variable!") 161 | sys.exit(1) 162 | 163 | glove_dir = os.path.join(datapath, "glove.6B") 164 | 165 | print('Indexing word vectors.') 166 | 167 | embeddings_index = {} 168 | with open(os.path.join(glove_dir, 'glove.6B.100d.txt')) as f: 169 | for line in f: 170 | values = line.split() 171 | word = values[0] 172 | coefs = np.asarray(values[1:], dtype='float32') 173 | embeddings_index[word] = coefs 174 | 175 | print('Found %s word vectors.' % len(embeddings_index)) 176 | 177 | 178 | # ## 20 Newsgroups data set 179 | # 180 | # Next we'll load the [20 Newsgroups] 181 | # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) 182 | # data set. 183 | # 184 | # The dataset contains 20000 messages collected from 20 different 185 | # Usenet newsgroups (1000 messages from each group): 186 | # 187 | # | alt.atheism | soc.religion.christian | comp.windows.x | sci.crypt 188 | # | talk.politics.guns | comp.sys.ibm.pc.hardware | rec.autos | sci.electronics 189 | # | talk.politics.mideast | comp.graphics | rec.motorcycles | sci.space 190 | # | talk.politics.misc | comp.os.ms-windows.misc | rec.sport.baseball | sci.med 191 | # | talk.religion.misc | comp.sys.mac.hardware | rec.sport.hockey | misc.forsale 192 | 193 | text_data_dir = os.path.join(datapath, "20_newsgroup") 194 | 195 | print('Processing text dataset') 196 | 197 | texts = [] # list of text samples 198 | labels_index = {} # dictionary mapping label name to numeric id 199 | labels = [] # list of label ids 200 | for name in sorted(os.listdir(text_data_dir)): 201 | path = os.path.join(text_data_dir, name) 202 | if os.path.isdir(path): 203 | label_id = len(labels_index) 204 | labels_index[name] = label_id 205 | print('-', name, label_id) 206 | for fname in sorted(os.listdir(path)): 207 | if fname.isdigit(): 208 | fpath = os.path.join(path, fname) 209 | args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} 210 | with open(fpath, **args) as f: 211 | t = f.read() 212 | i = t.find('\n\n') # skip header 213 | if 0 < i: 214 | t = t[i:] 215 | texts.append(t) 216 | labels.append(label_id) 217 | 218 | print('Found %s texts.' % len(texts)) 219 | 220 | # Tokenize the texts using gensim. 221 | 222 | tokens = list() 223 | for text in texts: 224 | tokens.append(simple_preprocess(text)) 225 | 226 | # Vectorize the text samples into a 2D integer tensor. 227 | 228 | MAX_NUM_WORDS = 10000 # 2 words reserved: 0=pad, 1=oov 229 | MAX_SEQUENCE_LENGTH = 1000 230 | 231 | dictionary = Dictionary(tokens) 232 | dictionary.filter_extremes(no_below=0, no_above=1.0, 233 | keep_n=MAX_NUM_WORDS-2) 234 | 235 | word_index = dictionary.token2id 236 | print('Found %s unique tokens.' % len(word_index)) 237 | 238 | data = [dictionary.doc2idx(t) for t in tokens] 239 | 240 | # Truncate and pad sequences. 
241 | 242 | data = [i[:MAX_SEQUENCE_LENGTH] for i in data] 243 | data = np.array([np.pad(i, (MAX_SEQUENCE_LENGTH-len(i), 0), 244 | mode='constant', constant_values=-2) 245 | for i in data], dtype=int) 246 | data = data + 2 247 | 248 | print('Shape of data tensor:', data.shape) 249 | print('Length of label vector:', len(labels)) 250 | 251 | # Split the data into a training set and a validation set 252 | 253 | VALIDATION_SET, TEST_SET = 1000, 4000 254 | 255 | x_train, x_test, y_train, y_test = train_test_split(data, labels, 256 | test_size=TEST_SET, 257 | shuffle=True, 258 | random_state=42) 259 | 260 | x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, 261 | test_size=VALIDATION_SET, 262 | shuffle=False) 263 | 264 | print('Shape of training data tensor:', x_train.shape) 265 | print('Length of training label vector:', len(y_train)) 266 | print('Shape of validation data tensor:', x_val.shape) 267 | print('Length of validation label vector:', len(y_val)) 268 | print('Shape of test data tensor:', x_test.shape) 269 | print('Length of test label vector:', len(y_test)) 270 | 271 | # Create PyTorch DataLoaders for all data sets: 272 | 273 | BATCH_SIZE = 128 274 | 275 | print('Train: ', end="") 276 | train_dataset = TensorDataset(torch.LongTensor(x_train), 277 | torch.LongTensor(y_train)) 278 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, 279 | shuffle=True, num_workers=4) 280 | print(len(train_dataset), 'messages') 281 | 282 | print('Validation: ', end="") 283 | validation_dataset = TensorDataset(torch.LongTensor(x_val), 284 | torch.LongTensor(y_val)) 285 | validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, 286 | shuffle=False, num_workers=4) 287 | print(len(validation_dataset), 'messages') 288 | 289 | print('Test: ', end="") 290 | test_dataset = TensorDataset(torch.LongTensor(x_test), 291 | torch.LongTensor(y_test)) 292 | test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, 293 | shuffle=False, num_workers=4) 294 | print(len(test_dataset), 'messages') 295 | 296 | # Prepare the embedding matrix: 297 | 298 | print('Preparing embedding matrix.') 299 | 300 | EMBEDDING_DIM = 100 301 | 302 | embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM)) 303 | n_not_found = 0 304 | for word, i in word_index.items(): 305 | if i >= MAX_NUM_WORDS-2: 306 | continue 307 | embedding_vector = embeddings_index.get(word) 308 | if embedding_vector is not None: 309 | # words not found in embedding index will be all-zeros. 
310 | embedding_matrix[i+2] = embedding_vector 311 | else: 312 | n_not_found += 1 313 | 314 | embedding_matrix = torch.FloatTensor(embedding_matrix) 315 | print('Shape of embedding matrix:', embedding_matrix.shape) 316 | print('Words not found in pre-trained embeddings:', n_not_found) 317 | 318 | model = Net(embedding_matrix) 319 | model = model.to(device) 320 | 321 | optimizer = optim.RMSprop(model.parameters(), lr=0.005) 322 | #optimizer = optim.Adam(model.parameters()) 323 | criterion = nn.CrossEntropyLoss() 324 | 325 | print(model) 326 | 327 | num_epochs = 20 328 | 329 | # Training loop 330 | start_time = datetime.now() 331 | for epoch in range(num_epochs): 332 | train_ret = train(train_loader, model, criterion, optimizer) 333 | log_measures(train_ret, log, "train", epoch) 334 | 335 | val_ret = test(validation_loader, model, criterion) 336 | log_measures(val_ret, log, "val", epoch) 337 | print(f"Epoch {epoch+1}: " 338 | f"train loss: {train_ret['loss']:.6f} " 339 | f"train accuracy: {train_ret['accuracy']:.2%}, " 340 | f"val accuracy: {val_ret['accuracy']:.2%}") 341 | 342 | end_time = datetime.now() 343 | print('Total training time: {}.'.format(end_time - start_time)) 344 | 345 | # Inference 346 | ret = test(test_loader, model, criterion) 347 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 348 | 349 | 350 | if __name__ == "__main__": 351 | main() 352 | -------------------------------------------------------------------------------- /day2/pytorch_dvc_cnn_pretrained.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Dogs-vs-cats classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of dogs from images of cats using PyTorch. 8 | # 9 | # ## Option 2: Reuse a pre-trained CNN 10 | # 11 | # Here we'll use the VGG16 pre-trained network: 12 | # https://pytorch.org/docs/stable/torchvision/models.html#torchvision.models.vgg16 13 | # 14 | # It has weights learned using ImageNet. We remove the top layers and 15 | # freeze the pre-trained weights, and then stack our own, randomly 16 | # initialized, layers on top of the VGG16 network. 
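# A quick sanity check on the classifier input size used below: VGG16's
# feature extractor halves (with floor) the spatial resolution five times,
# so the 150x150 inputs used in this script come out as 512 feature maps of
# size 4x4, hence the 512*4*4 units flattened into the first Linear layer.
# The same arithmetic as a sketch (the helper name is ours):
#
#   def vgg16_feature_side(side):
#       for _ in range(5):      # five 2x2 max-pool stages in VGG16 features
#           side = side // 2
#       return side
#
#   vgg16_feature_side(150)     # -> 4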
17 | # 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.optim as optim 22 | from torch.utils.data import DataLoader 23 | from torchvision import datasets, transforms, models 24 | from packaging.version import Version as LV 25 | from datetime import datetime 26 | import os 27 | import sys 28 | 29 | torch.manual_seed(42) 30 | 31 | if torch.cuda.is_available(): 32 | device = torch.device('cuda') 33 | else: 34 | device = torch.device('cpu') 35 | 36 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 37 | assert LV(torch.__version__) >= LV("1.0.0") 38 | 39 | 40 | class PretrainedNet(nn.Module): 41 | def __init__(self): 42 | super(PretrainedNet, self).__init__() 43 | self.vgg_features = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features 44 | 45 | # Freeze the VGG16 layers 46 | for param in self.vgg_features.parameters(): 47 | param.requires_grad = False 48 | 49 | # Add our own layers on top 50 | self.own_layers = nn.Sequential( 51 | nn.Flatten(), 52 | nn.Linear(512*4*4, 64), 53 | nn.ReLU(), 54 | nn.Linear(64, 1), 55 | nn.Sigmoid() 56 | ) 57 | 58 | def forward(self, x): 59 | x = self.vgg_features(x) 60 | return self.own_layers(x).squeeze() 61 | 62 | 63 | def correct(output, target): 64 | class_pred = output.round().int() # set to 0 for <0.5, 1 for >0.5 65 | correct_ones = class_pred == target.int() # 1 for correct, 0 for incorrect 66 | return correct_ones.sum().item() # count number of correct ones 67 | 68 | 69 | def train(data_loader, model, criterion, optimizer): 70 | model.train() 71 | 72 | num_batches = 0 73 | num_items = 0 74 | 75 | total_loss = 0 76 | total_correct = 0 77 | for data, target in data_loader: 78 | # Copy data and targets to GPU 79 | data = data.to(device) 80 | target = target.to(device).to(torch.float) 81 | 82 | # Do a forward pass 83 | output = model(data) 84 | 85 | # Calculate the loss 86 | loss = criterion(output, target) 87 | total_loss += loss 88 | num_batches += 1 89 | 90 | # Count number of correct 91 | total_correct += correct(output, target) 92 | num_items += len(target) 93 | 94 | # Backpropagation 95 | loss.backward() 96 | optimizer.step() 97 | optimizer.zero_grad() 98 | 99 | return { 100 | 'loss': total_loss/num_batches, 101 | 'accuracy': total_correct/num_items 102 | } 103 | 104 | 105 | def test(test_loader, model, criterion): 106 | model.eval() 107 | 108 | num_batches = len(test_loader) 109 | num_items = len(test_loader.dataset) 110 | 111 | test_loss = 0 112 | total_correct = 0 113 | 114 | with torch.no_grad(): 115 | for data, target in test_loader: 116 | # Copy data and targets to GPU 117 | data = data.to(device) 118 | target = target.to(device).to(torch.float) 119 | 120 | # Do a forward pass 121 | output = model(data) 122 | 123 | # Calculate the loss 124 | loss = criterion(output, target) 125 | test_loss += loss.item() 126 | 127 | # Count number of correct digits 128 | total_correct += correct(output, target) 129 | 130 | return { 131 | 'loss': test_loss/num_batches, 132 | 'accuracy': total_correct/num_items 133 | } 134 | 135 | 136 | def log_measures(ret, log, prefix, epoch): 137 | if log is not None: 138 | for key, value in ret.items(): 139 | log.add_scalar(prefix + "_" + key, value, epoch) 140 | 141 | 142 | def main(): 143 | # TensorBoard for logging 144 | try: 145 | import tensorboardX 146 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 147 | logdir = os.path.join(os.getcwd(), "logs", "dvc-pretrained-" + time_str) 148 | print('TensorBoard log directory:', logdir) 149 | os.makedirs(logdir) 150 | log = 
tensorboardX.SummaryWriter(logdir) 151 | except ImportError: 152 | log = None 153 | 154 | # The training dataset consists of 2000 images of dogs and cats, split 155 | # in half. In addition, the validation set consists of 1000 images, 156 | # and the test set of 22000 images. 157 | # 158 | # First, we'll resize all training and validation images to a fixed 159 | # size. 160 | # 161 | # Then, to make the most of our limited number of training examples, 162 | # we'll apply random transformations to them each time we are looping 163 | # over them. This way, we "augment" our training dataset to contain 164 | # more data. There are various transformations available in 165 | # torchvision, see: 166 | # https://pytorch.org/docs/stable/torchvision/transforms.html 167 | 168 | datapath = os.getenv('DATADIR') 169 | if datapath is None: 170 | print("Please set DATADIR environment variable!") 171 | sys.exit(1) 172 | datapath = os.path.join(datapath, 'dogs-vs-cats/train-2000') 173 | 174 | input_image_size = (150, 150) 175 | 176 | data_transform = transforms.Compose([ 177 | transforms.Resize(input_image_size), 178 | transforms.RandomAffine(degrees=0, translate=None, 179 | scale=(0.8, 1.2), shear=0.2), 180 | transforms.RandomHorizontalFlip(), 181 | transforms.ToTensor() 182 | ]) 183 | 184 | noop_transform = transforms.Compose([ 185 | transforms.Resize(input_image_size), 186 | transforms.ToTensor() 187 | ]) 188 | 189 | # Data loaders 190 | batch_size = 25 191 | 192 | print('Train: ', end="") 193 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 194 | transform=data_transform) 195 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 196 | shuffle=True, num_workers=4) 197 | print('Found', len(train_dataset), 'images belonging to', 198 | len(train_dataset.classes), 'classes') 199 | 200 | print('Validation: ', end="") 201 | validation_dataset = datasets.ImageFolder(root=datapath+'/validation', 202 | transform=noop_transform) 203 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 204 | shuffle=False, num_workers=4) 205 | print('Found', len(validation_dataset), 'images belonging to', 206 | len(validation_dataset.classes), 'classes') 207 | 208 | print('Test: ', end="") 209 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 210 | transform=noop_transform) 211 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 212 | shuffle=False, num_workers=4) 213 | print('Found', len(test_dataset), 'images belonging to', 214 | len(test_dataset.classes), 'classes') 215 | 216 | # Define the network and training parameters 217 | model = PretrainedNet() 218 | model = model.to(device) 219 | optimizer = optim.SGD(model.parameters(), lr=0.01) 220 | criterion = nn.BCELoss() 221 | 222 | print(model) 223 | 224 | num_epochs = 10 225 | 226 | # Training loop 227 | start_time = datetime.now() 228 | for epoch in range(num_epochs): 229 | train_ret = train(train_loader, model, criterion, optimizer) 230 | log_measures(train_ret, log, "train", epoch) 231 | 232 | val_ret = test(validation_loader, model, criterion) 233 | log_measures(val_ret, log, "val", epoch) 234 | print(f"Epoch {epoch+1}: " 235 | f"train accuracy: {train_ret['accuracy']:.2%}, " 236 | f"val accuracy: {val_ret['accuracy']:.2%}") 237 | 238 | end_time = datetime.now() 239 | print('Total training time: {}.'.format(end_time - start_time)) 240 | 241 | # Inference 242 | ret = test(test_loader, model, criterion) 243 | print("\nTesting (pretrained, before fine-tuning): " 244 | f"accuracy: {ret['accuracy']:.2%}\n") 
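    # (Background for the fine-tuning step below: the children of VGG16's
    # "features" module are numbered 0-30, and the last convolution block
    # spans indices 24-30, with its Conv2d modules at 24, 26 and 28. That is
    # why the code below unfreezes layers whose index satisfies
    # int(name) >= 24. A handy one-line sketch for verifying what ends up
    # trainable:
    #
    #     sum(p.numel() for p in model.parameters() if p.requires_grad)
    # )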
245 | 246 | # Fine-tuning 247 | # 248 | # Once the top layers have learned some reasonable weights, we can 249 | # continue training by unfreezing the last convolution block of 250 | # VGG16 so that it may adapt to our data. The learning rate should 251 | # be smaller than usual. 252 | # 253 | # Below we loop over all layers and set only the last three Conv2d 254 | # layers to trainable. In the printout we mark trainable layers 255 | # with '+', frozen with '-'. Other layers don't have trainable 256 | # parameters. 257 | 258 | print("Marking layers for training (+) or frozen (-):") 259 | for name, layer in model.vgg_features.named_children(): 260 | note = ' ' 261 | for param in layer.parameters(): 262 | note = '-' 263 | if int(name) >= 24: 264 | param.requires_grad = True 265 | note = '+' 266 | print(name, note, layer, len(param)) 267 | 268 | # We set up the training, note that we need to give only the 269 | # parameters that are set to be trainable. 270 | params = filter(lambda p: p.requires_grad, model.parameters()) 271 | #optimizer = optim.SGD(model.parameters(), lr=1e-3) 272 | optimizer = optim.RMSprop(params, lr=1e-5) 273 | criterion = nn.BCELoss() 274 | 275 | # Note that before continuing the training, we create a separate 276 | # TensorBoard log directory. 277 | if log is not None: 278 | logdir_pt = logdir + '-pretrained-finetune' 279 | os.makedirs(logdir_pt) 280 | log = tensorboardX.SummaryWriter(logdir_pt) 281 | 282 | prev_epochs = num_epochs 283 | num_epochs = 20 284 | 285 | start_time = datetime.now() 286 | for epoch in range(prev_epochs, prev_epochs+num_epochs): 287 | train_ret = train(train_loader, model, criterion, optimizer) 288 | log_measures(train_ret, log, "train", epoch) 289 | 290 | val_ret = test(validation_loader, model, criterion) 291 | log_measures(val_ret, log, "val", epoch) 292 | 293 | print(f"Epoch {epoch+1}: " 294 | f"train loss: {train_ret['loss']:.6f} " 295 | f"train accuracy: {train_ret['accuracy']:.2%}, " 296 | f"val accuracy: {val_ret['accuracy']:.2%}") 297 | 298 | end_time = datetime.now() 299 | print('Total fine-tuning time: {}.'.format(end_time - start_time)) 300 | 301 | # Inference 302 | ret = test(test_loader, model, criterion) 303 | print("\nTesting (pretrained, after fine-tuning): " 304 | f"accuracy: {ret['accuracy']:.2%}\n") 305 | 306 | 307 | if __name__ == "__main__": 308 | main() 309 | -------------------------------------------------------------------------------- /day2/pytorch_dvc_cnn_pretrained_multigpu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Dogs-vs-cats classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of dogs from images of cats using PyTorch. 8 | # 9 | # ## Option 2: Reuse a pre-trained CNN 10 | # 11 | # Here we'll use the VGG16 pre-trained network: 12 | # https://pytorch.org/docs/stable/torchvision/models.html#torchvision.models.vgg16 13 | # 14 | # It has weights learned using ImageNet. We remove the top layers and 15 | # freeze the pre-trained weights, and then stack our own, randomly 16 | # initialized, layers on top of the VGG16 network. 
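# This is the multi-GPU variant of pytorch_dvc_cnn_pretrained.py: the model
# is wrapped in DistributedDataParallel, the training data is sharded with a
# DistributedSampler, and printing and TensorBoard logging are done on
# rank 0 only.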
17 | # 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.optim as optim 22 | from torch.utils.data import DataLoader 23 | from torch.utils.tensorboard import SummaryWriter 24 | from torchvision import datasets, transforms, models 25 | from packaging.version import Version as LV 26 | from datetime import datetime 27 | import os 28 | import sys 29 | 30 | import torch.distributed as dist 31 | from torch.nn.parallel import DistributedDataParallel 32 | from torch.utils.data.distributed import DistributedSampler 33 | 34 | torch.manual_seed(42) 35 | 36 | if torch.cuda.is_available(): 37 | device = torch.device('cuda') 38 | else: 39 | device = torch.device('cpu') 40 | 41 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 42 | assert LV(torch.__version__) >= LV("1.0.0") 43 | 44 | 45 | class PretrainedNet(nn.Module): 46 | def __init__(self): 47 | super(PretrainedNet, self).__init__() 48 | self.vgg_features = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features 49 | 50 | # Freeze the VGG16 layers 51 | for param in self.vgg_features.parameters(): 52 | param.requires_grad = False 53 | 54 | # Add our own layers on top 55 | self.own_layers = nn.Sequential( 56 | nn.Flatten(), 57 | nn.Linear(512*4*4, 64), 58 | nn.ReLU(), 59 | nn.Linear(64, 1), 60 | nn.Sigmoid() 61 | ) 62 | 63 | def forward(self, x): 64 | x = self.vgg_features(x) 65 | return self.own_layers(x).squeeze() 66 | 67 | 68 | def correct(output, target): 69 | class_pred = output.round().int() # set to 0 for <0.5, 1 for >0.5 70 | correct_ones = class_pred == target.int() # 1 for correct, 0 for incorrect 71 | return correct_ones.sum().item() # count number of correct ones 72 | 73 | 74 | def train(data_loader, model, criterion, optimizer): 75 | model.train() 76 | 77 | num_batches = 0 78 | num_items = 0 79 | 80 | total_loss = 0 81 | total_correct = 0 82 | for data, target in data_loader: 83 | # Copy data and targets to GPU 84 | data = data.to(device) 85 | target = target.to(device).to(torch.float) 86 | 87 | # Do a forward pass 88 | output = model(data) 89 | 90 | # Calculate the loss 91 | loss = criterion(output, target) 92 | total_loss += loss 93 | num_batches += 1 94 | 95 | # Count number of correct 96 | total_correct += correct(output, target) 97 | num_items += len(target) 98 | 99 | # Backpropagation 100 | loss.backward() 101 | optimizer.step() 102 | optimizer.zero_grad() 103 | 104 | return { 105 | 'loss': total_loss/num_batches, 106 | 'accuracy': total_correct/num_items 107 | } 108 | 109 | 110 | def test(test_loader, model, criterion): 111 | model.eval() 112 | 113 | num_batches = len(test_loader) 114 | num_items = len(test_loader.dataset) 115 | 116 | test_loss = 0 117 | total_correct = 0 118 | 119 | with torch.no_grad(): 120 | for data, target in test_loader: 121 | # Copy data and targets to GPU 122 | data = data.to(device) 123 | target = target.to(device).to(torch.float) 124 | 125 | # Do a forward pass 126 | output = model(data) 127 | 128 | # Calculate the loss 129 | loss = criterion(output, target) 130 | test_loss += loss.item() 131 | 132 | # Count number of correct digits 133 | total_correct += correct(output, target) 134 | 135 | return { 136 | 'loss': test_loss/num_batches, 137 | 'accuracy': total_correct/num_items 138 | } 139 | 140 | 141 | def log_measures(ret, log, prefix, epoch): 142 | if log is not None: 143 | for key, value in ret.items(): 144 | log.add_scalar(prefix + "_" + key, value, epoch) 145 | 146 | 147 | def main(): 148 | # Initialize PyTorch distributed 149 | 
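    # (This script is meant to be launched with torchrun -- see
    # run-2gpus-torchrun.sh -- which starts one process per GPU and sets the
    # RANK, LOCAL_RANK and WORLD_SIZE environment variables that
    # init_process_group() and the code below rely on. An illustrative
    # launch, with flags assumed here rather than taken from the course run
    # scripts:
    #
    #     torchrun --standalone --nproc_per_node=2 \
    #         pytorch_dvc_cnn_pretrained_multigpu.py
    # )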
dist.init_process_group(backend='nccl') 150 | 151 | local_rank = int(os.environ['LOCAL_RANK']) 152 | torch.cuda.set_device(local_rank) 153 | 154 | rank_0 = dist.get_rank() == 0 155 | 156 | # TensorBoard for logging 157 | log = None 158 | try: 159 | if rank_0: 160 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 161 | logdir = os.path.join(os.getcwd(), "logs", "dvc-pretrained-" + time_str) 162 | print('TensorBoard log directory:', logdir) 163 | os.makedirs(logdir) 164 | log = SummaryWriter(logdir) 165 | except ImportError: 166 | pass 167 | 168 | # The training dataset consists of 2000 images of dogs and cats, split 169 | # in half. In addition, the validation set consists of 1000 images, 170 | # and the test set of 22000 images. 171 | # 172 | # First, we'll resize all training and validation images to a fixed 173 | # size. 174 | # 175 | # Then, to make the most of our limited number of training examples, 176 | # we'll apply random transformations to them each time we are looping 177 | # over them. This way, we "augment" our training dataset to contain 178 | # more data. There are various transformations available in 179 | # torchvision, see: 180 | # https://pytorch.org/docs/stable/torchvision/transforms.html 181 | 182 | datapath = os.getenv('DATADIR') 183 | if datapath is None: 184 | print("Please set DATADIR environment variable!") 185 | sys.exit(1) 186 | datapath = os.path.join(datapath, 'dogs-vs-cats/train-2000') 187 | 188 | input_image_size = (150, 150) 189 | 190 | data_transform = transforms.Compose([ 191 | transforms.Resize(input_image_size), 192 | transforms.RandomAffine(degrees=0, translate=None, 193 | scale=(0.8, 1.2), shear=0.2), 194 | transforms.RandomHorizontalFlip(), 195 | transforms.ToTensor() 196 | ]) 197 | 198 | noop_transform = transforms.Compose([ 199 | transforms.Resize(input_image_size), 200 | transforms.ToTensor() 201 | ]) 202 | 203 | # Data loaders 204 | batch_size = 25 205 | 206 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 207 | transform=data_transform) 208 | train_sampler = DistributedSampler(train_dataset, drop_last=True) 209 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 210 | shuffle=False, num_workers=4, 211 | sampler=train_sampler) 212 | if rank_0: 213 | print('Train: ', end="") 214 | print('Found', len(train_dataset), 'images belonging to', 215 | len(train_dataset.classes), 'classes') 216 | 217 | validation_dataset = datasets.ImageFolder(root=datapath+'/validation', 218 | transform=noop_transform) 219 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 220 | shuffle=False, num_workers=4) 221 | if rank_0: 222 | print('Validation: ', end="") 223 | print('Found', len(validation_dataset), 'images belonging to', 224 | len(validation_dataset.classes), 'classes') 225 | 226 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 227 | transform=noop_transform) 228 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 229 | shuffle=False, num_workers=4) 230 | if rank_0: 231 | print('Test: ', end="") 232 | print('Found', len(test_dataset), 'images belonging to', 233 | len(test_dataset.classes), 'classes') 234 | 235 | # Define the network and training parameters 236 | model = PretrainedNet() 237 | model = model.to(device) 238 | model = DistributedDataParallel(model, device_ids=[local_rank]) 239 | if rank_0: 240 | print(model) 241 | 242 | optimizer = optim.SGD(model.parameters(), lr=0.01) 243 | criterion = nn.BCELoss() 244 | 245 | num_epochs = 10 246 | 247 | # Training loop 248 | 
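    # (One detail worth knowing about DistributedSampler: its shuffling is
    # seeded by the epoch number, so one would normally call
    # train_sampler.set_epoch(epoch) at the top of each epoch to get a
    # different shuffle every time -- a minimal sketch:
    #
    #     for epoch in range(num_epochs):
    #         train_sampler.set_epoch(epoch)
    #         ...
    # )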
start_time = datetime.now() 249 | for epoch in range(num_epochs): 250 | train_ret = train(train_loader, model, criterion, optimizer) 251 | if rank_0: 252 | log_measures(train_ret, log, "train", epoch) 253 | 254 | val_ret = test(validation_loader, model, criterion) 255 | if rank_0: 256 | log_measures(val_ret, log, "val", epoch) 257 | print(f"Epoch {epoch+1}: " 258 | f"train accuracy: {train_ret['accuracy']:.2%}, " 259 | f"val accuracy: {val_ret['accuracy']:.2%}") 260 | 261 | end_time = datetime.now() 262 | if rank_0: 263 | print('Total training time: {}.'.format(end_time - start_time)) 264 | 265 | # Inference 266 | ret = test(test_loader, model, criterion) 267 | print("\nTesting (pretrained, before fine-tuning): " 268 | f"accuracy: {ret['accuracy']:.2%}\n") 269 | 270 | # Fine-tuning 271 | # 272 | # Once the top layers have learned some reasonable weights, we can 273 | # continue training by unfreezing the last convolution block of 274 | # VGG16 so that it may adapt to our data. The learning rate should 275 | # be smaller than usual. 276 | # 277 | # Below we loop over all layers and set only the last three Conv2d 278 | # layers to trainable. In the printout we mark trainable layers 279 | # with '+', frozen with '-'. Other layers don't have trainable 280 | # parameters. 281 | 282 | if rank_0: 283 | print("Marking layers for training (+) or frozen (-):") 284 | for name, layer in model.module.vgg_features.named_children(): 285 | note = ' ' 286 | for param in layer.parameters(): 287 | note = '-' 288 | if int(name) >= 24: 289 | param.requires_grad = True 290 | note = '+' 291 | if rank_0: 292 | print(name, note, layer, len(param)) 293 | 294 | # We set up the training, note that we need to give only the 295 | # parameters that are set to be trainable. 296 | params = filter(lambda p: p.requires_grad, model.parameters()) 297 | #optimizer = optim.SGD(model.parameters(), lr=1e-3) 298 | optimizer = optim.RMSprop(params, lr=1e-5) 299 | criterion = nn.BCELoss() 300 | 301 | # Note that before continuing the training, we create a separate 302 | # TensorBoard log directory. 
303 | if log is not None: 304 | logdir_pt = logdir + '-pretrained-finetune' 305 | os.makedirs(logdir_pt) 306 | log = SummaryWriter(logdir_pt) 307 | 308 | prev_epochs = num_epochs 309 | num_epochs = 20 310 | 311 | start_time = datetime.now() 312 | for epoch in range(prev_epochs, prev_epochs+num_epochs): 313 | train_ret = train(train_loader, model, criterion, optimizer) 314 | if rank_0: 315 | log_measures(train_ret, log, "train", epoch) 316 | 317 | val_ret = test(validation_loader, model, criterion) 318 | 319 | if rank_0: 320 | log_measures(val_ret, log, "val", epoch) 321 | 322 | print(f"Epoch {epoch+1}: " 323 | f"train loss: {train_ret['loss']:.6f} " 324 | f"train accuracy: {train_ret['accuracy']:.2%}, " 325 | f"val accuracy: {val_ret['accuracy']:.2%}") 326 | 327 | end_time = datetime.now() 328 | if rank_0: 329 | print('Total fine-tuning time: {}.'.format(end_time - start_time)) 330 | 331 | # Inference 332 | if rank_0: 333 | ret = test(test_loader, model, criterion) 334 | print("\nTesting (pretrained, after fine-tuning): " 335 | f"accuracy: {ret['accuracy']:.2%}\n") 336 | 337 | 338 | if __name__ == "__main__": 339 | main() 340 | -------------------------------------------------------------------------------- /day2/pytorch_dvc_cnn_simple.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Dogs-vs-cats classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of dogs from images of cats using PyTorch. 8 | # 9 | # ## Option 1: Train a small CNN from scratch 10 | # 11 | # Similarly as with MNIST digits, we can start from scratch and train 12 | # a CNN for the classification task. However, due to the small number 13 | # of training images, a large network will easily overfit, regardless 14 | # of the data augmentation. 
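# For reference, the 17*17*64 flatten size in the network below follows from
# the 150x150 inputs: each 3x3 convolution (no padding) trims 2 pixels and
# each 2x2 max-pool halves the result, so 150 -> 148 -> 74 -> 72 -> 36 ->
# 34 -> 17, leaving 64 feature maps of 17x17. The same arithmetic as a
# sketch (the helper name is ours):
#
#   def conv_pool_side(side, stages=3):
#       for _ in range(stages):
#           side = (side - 2) // 2    # 3x3 'valid' conv, then 2x2 max-pool
#       return side
#
#   conv_pool_side(150)               # -> 17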
15 | 16 | import torch 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | from torch.utils.data import DataLoader 20 | from torchvision import datasets, transforms 21 | from packaging.version import Version as LV 22 | from datetime import datetime 23 | import os 24 | import sys 25 | 26 | torch.manual_seed(42) 27 | 28 | if torch.cuda.is_available(): 29 | device = torch.device('cuda') 30 | else: 31 | device = torch.device('cpu') 32 | 33 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 34 | assert LV(torch.__version__) >= LV("1.0.0") 35 | 36 | 37 | class Net(nn.Module): 38 | def __init__(self): 39 | super(Net, self).__init__() 40 | self.layers = nn.Sequential( 41 | nn.Conv2d(3, 32, (3, 3)), 42 | nn.ReLU(), 43 | nn.MaxPool2d((2, 2)), 44 | 45 | nn.Conv2d(32, 32, (3, 3)), 46 | nn.ReLU(), 47 | nn.MaxPool2d((2, 2)), 48 | 49 | nn.Conv2d(32, 64, (3, 3)), 50 | nn.ReLU(), 51 | nn.MaxPool2d((2, 2)), 52 | 53 | nn.Flatten(), # flatten 2D to 1D 54 | nn.Linear(17*17*64, 64), 55 | nn.ReLU(), 56 | nn.Dropout(0.5), 57 | nn.Linear(64, 1), 58 | nn.Sigmoid() 59 | ) 60 | 61 | def forward(self, x): 62 | return self.layers(x).squeeze() 63 | 64 | 65 | def correct(output, target): 66 | class_pred = output.round().int() # set to 0 for <0.5, 1 for >0.5 67 | correct_ones = class_pred == target.int() # 1 for correct, 0 for incorrect 68 | return correct_ones.sum().item() # count number of correct ones 69 | 70 | 71 | def train(data_loader, model, criterion, optimizer): 72 | model.train() 73 | 74 | num_batches = 0 75 | num_items = 0 76 | 77 | total_loss = 0 78 | total_correct = 0 79 | for data, target in data_loader: 80 | # Copy data and targets to GPU 81 | data = data.to(device) 82 | target = target.to(device).to(torch.float) 83 | 84 | # Do a forward pass 85 | output = model(data) 86 | 87 | # Calculate the loss 88 | loss = criterion(output, target) 89 | total_loss += loss 90 | num_batches += 1 91 | 92 | # Count number of correct 93 | total_correct += correct(output, target) 94 | num_items += len(target) 95 | 96 | # Backpropagation 97 | loss.backward() 98 | optimizer.step() 99 | optimizer.zero_grad() 100 | 101 | return { 102 | 'loss': total_loss/num_batches, 103 | 'accuracy': total_correct/num_items 104 | } 105 | 106 | 107 | def test(test_loader, model, criterion): 108 | model.eval() 109 | 110 | num_batches = len(test_loader) 111 | num_items = len(test_loader.dataset) 112 | 113 | test_loss = 0 114 | total_correct = 0 115 | 116 | with torch.no_grad(): 117 | for data, target in test_loader: 118 | # Copy data and targets to GPU 119 | data = data.to(device) 120 | target = target.to(device).to(torch.float) 121 | 122 | # Do a forward pass 123 | output = model(data) 124 | 125 | # Calculate the loss 126 | loss = criterion(output, target) 127 | test_loss += loss.item() 128 | 129 | # Count number of correct digits 130 | total_correct += correct(output, target) 131 | 132 | return { 133 | 'loss': test_loss/num_batches, 134 | 'accuracy': total_correct/num_items 135 | } 136 | 137 | 138 | def log_measures(ret, log, prefix, epoch): 139 | if log is not None: 140 | for key, value in ret.items(): 141 | log.add_scalar(prefix + "_" + key, value, epoch) 142 | 143 | 144 | def main(): 145 | # TensorBoard for logging 146 | try: 147 | import tensorboardX 148 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 149 | logdir = os.path.join(os.getcwd(), "logs", "dvc-" + time_str) 150 | print('TensorBoard log directory:', logdir) 151 | os.makedirs(logdir) 152 | log = tensorboardX.SummaryWriter(logdir) 153 | 
except ImportError: 154 | log = None 155 | 156 | # The training dataset consists of 2000 images of dogs and cats, split 157 | # in half. In addition, the validation set consists of 1000 images, 158 | # and the test set of 22000 images. 159 | # 160 | # First, we'll resize all training and validation images to a fixed 161 | # size. 162 | # 163 | # Then, to make the most of our limited number of training examples, 164 | # we'll apply random transformations to them each time we are looping 165 | # over them. This way, we "augment" our training dataset to contain 166 | # more data. There are various transformations available in 167 | # torchvision, see: 168 | # https://pytorch.org/docs/stable/torchvision/transforms.html 169 | 170 | datapath = os.getenv('DATADIR') 171 | if datapath is None: 172 | print("Please set DATADIR environment variable!") 173 | sys.exit(1) 174 | datapath = os.path.join(datapath, 'dogs-vs-cats/train-2000') 175 | 176 | input_image_size = (150, 150) 177 | 178 | data_transform = transforms.Compose([ 179 | transforms.Resize(input_image_size), 180 | transforms.RandomAffine(degrees=0, translate=None, 181 | scale=(0.8, 1.2), shear=0.2), 182 | transforms.RandomHorizontalFlip(), 183 | transforms.ToTensor() 184 | ]) 185 | 186 | noop_transform = transforms.Compose([ 187 | transforms.Resize(input_image_size), 188 | transforms.ToTensor() 189 | ]) 190 | 191 | # Data loaders 192 | batch_size = 25 193 | 194 | print('Train: ', end="") 195 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 196 | transform=data_transform) 197 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 198 | shuffle=True, num_workers=4) 199 | print('Found', len(train_dataset), 'images belonging to', 200 | len(train_dataset.classes), 'classes') 201 | 202 | print('Validation: ', end="") 203 | validation_dataset = datasets.ImageFolder(root=datapath+'/validation', 204 | transform=noop_transform) 205 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 206 | shuffle=False, num_workers=4) 207 | print('Found', len(validation_dataset), 'images belonging to', 208 | len(validation_dataset.classes), 'classes') 209 | 210 | print('Test: ', end="") 211 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 212 | transform=noop_transform) 213 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 214 | shuffle=False, num_workers=4) 215 | print('Found', len(test_dataset), 'images belonging to', 216 | len(test_dataset.classes), 'classes') 217 | 218 | # Define the network and training parameters 219 | model = Net() 220 | model = model.to(device) 221 | optimizer = optim.SGD(model.parameters(), lr=0.05) 222 | criterion = nn.BCELoss() 223 | 224 | print(model) 225 | 226 | num_epochs = 50 227 | 228 | # Training loop 229 | start_time = datetime.now() 230 | for epoch in range(num_epochs): 231 | train_ret = train(train_loader, model, criterion, optimizer) 232 | log_measures(train_ret, log, "train", epoch) 233 | 234 | val_ret = test(validation_loader, model, criterion) 235 | log_measures(val_ret, log, "val", epoch) 236 | print(f"Epoch {epoch+1}: " 237 | f"train loss: {train_ret['loss']:.6f} " 238 | f"train accuracy: {train_ret['accuracy']:.2%}, " 239 | f"val accuracy: {val_ret['accuracy']:.2%}") 240 | 241 | end_time = datetime.now() 242 | print('Total training time: {}.'.format(end_time - start_time)) 243 | 244 | # Inference 245 | ret = test(test_loader, model, criterion) 246 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 247 | 248 | 249 | if __name__ == "__main__": 250 | 
main() 251 | -------------------------------------------------------------------------------- /day2/pytorch_dvc_vit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Dogs-vs-cats classification with CNNs 5 | # 6 | # In this script, we'll finetune a Vision Transformer 7 | # (https://arxiv.org/abs/2010.11929) (ViT) to classify images of cats 8 | # and dogs using PyTorch and HuggingFace Transformers: 9 | # https://github.com/huggingface/transformers 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | from torch.utils.data import DataLoader 15 | from torchvision import datasets, transforms 16 | from packaging.version import Version as LV 17 | from datetime import datetime 18 | import os 19 | import sys 20 | 21 | from transformers import AutoImageProcessor, ViTForImageClassification 22 | from transformers import __version__ as transformers_version 23 | 24 | torch.manual_seed(42) 25 | 26 | if torch.cuda.is_available(): 27 | device = torch.device('cuda') 28 | else: 29 | device = torch.device('cpu') 30 | 31 | print('Using PyTorch version:', torch.__version__, 32 | 'Transformers version:', transformers_version, 33 | 'Device:', device) 34 | assert LV(torch.__version__) >= LV("1.0.0") 35 | 36 | 37 | def correct(output, target): 38 | class_pred = (output > 0).int() # set to 0 for <0.5, 1 for >0.5 39 | correct_ones = class_pred == target.int() # 1 for correct, 0 for incorrect 40 | return correct_ones.sum().item() # count number of correct ones 41 | 42 | 43 | def train(data_loader, model, criterion, optimizer): 44 | model.train() 45 | 46 | num_batches = 0 47 | num_items = 0 48 | 49 | total_loss = 0 50 | total_correct = 0 51 | 52 | for data, target in data_loader: 53 | # Copy data and targets to GPU 54 | data = data.to(device) 55 | target = target.to(device) 56 | 57 | # Do a forward pass 58 | output = model(data).logits.squeeze() 59 | 60 | # Calculate the loss 61 | loss = criterion(output, target) 62 | total_loss += loss 63 | num_batches += 1 64 | 65 | # Count number of correct 66 | total_correct += correct(output, target) 67 | num_items += len(target) 68 | 69 | # Backpropagation 70 | loss.backward() 71 | optimizer.step() 72 | optimizer.zero_grad() 73 | 74 | return { 75 | 'loss': total_loss/num_batches, 76 | 'accuracy': total_correct/num_items 77 | } 78 | 79 | 80 | def test(test_loader, model, criterion): 81 | model.eval() 82 | 83 | num_batches = len(test_loader) 84 | num_items = len(test_loader.dataset) 85 | 86 | test_loss = 0 87 | total_correct = 0 88 | 89 | with torch.no_grad(): 90 | for data, target in test_loader: 91 | # Copy data and targets to GPU 92 | data = data.to(device) 93 | target = target.to(device) 94 | 95 | # Do a forward pass 96 | output = model(data).logits.squeeze() 97 | 98 | # Calculate the loss 99 | loss = criterion(output, target) 100 | test_loss += loss.item() 101 | 102 | # Count number of correct digits 103 | total_correct += correct(output, target) 104 | 105 | return { 106 | 'loss': test_loss/num_batches, 107 | 'accuracy': total_correct/num_items 108 | } 109 | 110 | 111 | def log_measures(ret, log, prefix, epoch): 112 | if log is not None: 113 | for key, value in ret.items(): 114 | log.add_scalar(prefix + "_" + key, value, epoch) 115 | 116 | 117 | class ImageClassificationCollator: 118 | def __init__(self, processor): 119 | self.processor = processor 120 | 121 | def __call__(self, batch): 122 | data = self.processor([x[0] for x in batch], 
do_rescale=False, 123 | return_tensors='pt').pixel_values 124 | targets = torch.tensor([x[1] for x in batch], dtype=torch.float32) 125 | return data, targets 126 | 127 | 128 | def main(): 129 | # TensorBoard for logging 130 | try: 131 | import tensorboardX 132 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 133 | logdir = os.path.join(os.getcwd(), "logs", "dvc-vit-" + time_str) 134 | print('TensorBoard log directory:', logdir) 135 | os.makedirs(logdir) 136 | log = tensorboardX.SummaryWriter(logdir) 137 | except ImportError: 138 | log = None 139 | 140 | # The training dataset consists of 2000 images of dogs and cats, split 141 | # in half. In addition, the validation set consists of 1000 images, 142 | # and the test set of 22000 images. 143 | # 144 | # First, we'll resize all training and validation images to a fixed 145 | # size. 146 | # 147 | # Then, to make the most of our limited number of training examples, 148 | # we'll apply random transformations to them each time we are looping 149 | # over them. This way, we "augment" our training dataset to contain 150 | # more data. There are various transformations available in 151 | # torchvision, see: 152 | # https://pytorch.org/docs/stable/torchvision/transforms.html 153 | 154 | datapath = os.getenv('DATADIR') 155 | if datapath is None: 156 | print("Please set DATADIR environment variable!") 157 | sys.exit(1) 158 | datapath = os.path.join(datapath, 'dogs-vs-cats/train-2000') 159 | 160 | # Data loaders 161 | batch_size = 32 162 | 163 | vitmodel = 'google/vit-base-patch16-224' 164 | processor = AutoImageProcessor.from_pretrained(vitmodel) 165 | collator = ImageClassificationCollator(processor) 166 | 167 | print('Train: ', end="") 168 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 169 | transform=transforms.ToTensor()) 170 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 171 | shuffle=True, num_workers=4, 172 | collate_fn=collator) 173 | print('Found', len(train_dataset), 'images belonging to', 174 | len(train_dataset.classes), 'classes') 175 | 176 | print('Validation: ', end="") 177 | validation_dataset = datasets.ImageFolder(root=datapath+'/validation', 178 | transform=transforms.ToTensor()) 179 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 180 | shuffle=False, num_workers=4, 181 | collate_fn=collator) 182 | print('Found', len(validation_dataset), 'images belonging to', 183 | len(validation_dataset.classes), 'classes') 184 | 185 | print('Test: ', end="") 186 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 187 | transform=transforms.ToTensor()) 188 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 189 | shuffle=False, num_workers=4, 190 | collate_fn=collator) 191 | print('Found', len(test_dataset), 'images belonging to', 192 | len(test_dataset.classes), 'classes') 193 | 194 | # Define the network and training parameters 195 | model = ViTForImageClassification.from_pretrained( 196 | vitmodel, num_labels=1, ignore_mismatched_sizes=True) 197 | model = model.to(device) 198 | optimizer = optim.Adam(model.parameters(), lr=1e-5) 199 | criterion = nn.BCEWithLogitsLoss() 200 | 201 | print(model) 202 | 203 | num_epochs = 10 204 | 205 | # Training loop 206 | start_time = datetime.now() 207 | for epoch in range(num_epochs): 208 | train_ret = train(train_loader, model, criterion, optimizer) 209 | log_measures(train_ret, log, "train", epoch) 210 | 211 | val_ret = test(validation_loader, model, criterion) 212 | log_measures(val_ret, log, "val", epoch) 213 | 
print(f"Epoch {epoch+1}: " 214 | f"train loss: {train_ret['loss']:.6f} " 215 | f"train accuracy: {train_ret['accuracy']:.2%}, " 216 | f"val accuracy: {val_ret['accuracy']:.2%}") 217 | 218 | end_time = datetime.now() 219 | print('Total training time: {}.'.format(end_time - start_time)) 220 | 221 | # Inference 222 | ret = test(test_loader, model, criterion) 223 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 224 | 225 | 226 | if __name__ == "__main__": 227 | main() 228 | -------------------------------------------------------------------------------- /day2/pytorch_generate_gpt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c0e6e42c-a2a3-4ba9-be85-056035147486", 6 | "metadata": {}, 7 | "source": [ 8 | "# IMDB movie review text generation\n", 9 | "\n", 10 | "Once you have fine-tuned your model you can test it interactively with this notebook." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "fa458fec-a1e9-4960-9a9f-c7f21d0a7b6e", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from transformers import pipeline\n", 21 | "\n", 22 | "path_to_model = \"/scratch/project_462000863/data/users/YOUR_USERNAME_HERE/gpt-imdb-model/checkpoint-5000/\"\n", 23 | "generator = pipeline(\"text-generation\", model=path_to_model)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "3a5ecc40-1c1d-4c9d-a41c-937bbbbaf025", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def print_output(output):\n", 34 | " for item in output:\n", 35 | " text = item['generated_text']\n", 36 | " text = text.replace(\"
\", \"\\n\")\n", 37 | " print('-', text)\n", 38 | " print()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "bf677501-f93d-46b1-a618-0fb792cd44cd", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "output = generator(\"This movie was\")\n", 49 | "print_output(output)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "73fb8536-887f-4fad-a0b3-190d1749a594", 55 | "metadata": {}, 56 | "source": [ 57 | "## Experiment with the generation strategy\n", 58 | "\n", 59 | "You can play with the text generation if you wish. Text generation strategies are discussed here: https://huggingface.co/docs/transformers/generation_strategies\n", 60 | "\n", 61 | "Note that we are here using the easy-to-use `TextGenerationPipeline` and its `generator()` function, but the link discusses the `model.generate()` method. The same parameters can be used, though, the pipeline just takes care of some of the pre- and post-processing.\n", 62 | "\n", 63 | "In particular these parameters of the `generator()` function might be interesting:\n", 64 | "\n", 65 | "- `max_new_tokens`: the maximum number of tokens to generate\n", 66 | "- `num_beams`: activate Beam search by setting this > 1\n", 67 | "- `do_sample`: activate multinomial sampling if set to True\n", 68 | "- `num_return_sequences`: the number of candidate sentences to return (available only for beam search and sampling)\n", 69 | "\n", 70 | "Here is a nice blog post explaining in more detail about the different generation strategies: https://huggingface.co/blog/how-to-generate" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "6816b3f3-9a0f-4ca8-a7d9-d7962b0207fc", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "output = generator(\"This movie was awful because\", num_return_sequences=1, max_new_tokens=100, do_sample=True)\n", 81 | "print_output(output)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "df008ff8-cb03-488f-b643-4aa2314de52c", 87 | "metadata": {}, 88 | "source": [ 89 | "## Compare with the original model without fine-tuning\n", 90 | "\n", 91 | "We can also load the original `distilgpt2` model and see how it would have worked without fine-tuning." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "3ba1f550-970e-419a-aaff-d4e821bacc87", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "generator_orig = pipeline(\"text-generation\", model='distilgpt2')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "4995c393-29ad-4df1-b01a-83cd85008297", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "output = generator_orig(\"This movie was awful because\", num_return_sequences=1, max_new_tokens=100, do_sample=True)\n", 112 | "print_output(output)" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.10.12" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 5 137 | } 138 | -------------------------------------------------------------------------------- /day2/pytorch_gtsrb_cnn_pretrained.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Traffic sign classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of traffic signs from the German Traffic Sign 8 | # Recognition Benchmark: 9 | # https://benchmark.ini.rub.de/gtsrb_news.html 10 | # 11 | # ## Option 2: Reuse a pre-trained CNN 12 | # 13 | # Here we'll use the VGG16 pre-trained network: 14 | # https://pytorch.org/docs/stable/torchvision/models.html#torchvision.models.vgg16 15 | # 16 | # It has weights learned using ImageNet. We remove the top layers and 17 | # freeze the pre-trained weights, and then stack our own, randomly 18 | # initialized, layers on top of the VGG16 network. 19 | # 20 | 21 | import torch 22 | import torch.nn as nn 23 | import torch.optim as optim 24 | from torch.utils.data import DataLoader 25 | from torchvision import datasets, transforms, models 26 | from packaging.version import Version as LV 27 | from datetime import datetime 28 | import os 29 | import sys 30 | 31 | torch.manual_seed(42) 32 | 33 | if torch.cuda.is_available(): 34 | device = torch.device('cuda') 35 | else: 36 | device = torch.device('cpu') 37 | 38 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 39 | assert LV(torch.__version__) >= LV("1.0.0") 40 | 41 | 42 | class PretrainedNet(nn.Module): 43 | def __init__(self): 44 | super(PretrainedNet, self).__init__() 45 | self.vgg_features = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features 46 | 47 | # Freeze the VGG16 layers 48 | for param in self.vgg_features.parameters(): 49 | param.requires_grad = False 50 | 51 | # Add our own layers on top 52 | self.own_layers = nn.Sequential( 53 | nn.Flatten(), 54 | nn.Linear(512*2*2, 256), 55 | nn.ReLU(), 56 | nn.Linear(256, 43) 57 | ) 58 | 59 | def forward(self, x): 60 | x = self.vgg_features(x) 61 | return self.own_layers(x).squeeze() 62 | 63 | # 64 | # There are some broken folders, but we need to keep the class indices 65 | # the same. We created a custom Dataset class to handle this. 
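# (Two notes. First, the 512*2*2 classifier input above follows from the
# 75x75 inputs used in this script: five halvings in VGG16's feature
# extractor leave 512 feature maps of size 2x2 (75 -> 37 -> 18 -> 9 -> 4
# -> 2). Second, the class below is used later in this script for the
# validation set, which is missing two folders, roughly:
#
#     validation_dataset = ImageFolderRemoveDirs(root=..., transform=...,
#                                                remove_dirs=['00027', '00039'])
# )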
66 | # 67 | class ImageFolderRemoveDirs(datasets.ImageFolder): 68 | def __init__(self, root, transform, remove_dirs): 69 | self.remove_dirs = remove_dirs 70 | super(ImageFolderRemoveDirs, self).__init__(root=root, transform=transform) 71 | 72 | def find_classes(self, directory): 73 | classes, class_to_idx = super(ImageFolderRemoveDirs, self).find_classes(directory) 74 | for d in self.remove_dirs: 75 | print('Removing directory', d) 76 | classes.remove(d) 77 | del class_to_idx[d] 78 | return classes, class_to_idx 79 | 80 | def correct(output, target): 81 | predicted = output.argmax(1) # pick class with largest network output 82 | correct_ones = (predicted == target).type(torch.float) 83 | return correct_ones.sum().item() # count number of correct ones 84 | 85 | def train(data_loader, model, criterion, optimizer): 86 | model.train() 87 | 88 | num_batches = 0 89 | num_items = 0 90 | 91 | total_loss = 0 92 | total_correct = 0 93 | for data, target in data_loader: 94 | # Copy data and targets to GPU 95 | data = data.to(device) 96 | target = target.to(device) 97 | 98 | # Do a forward pass 99 | output = model(data) 100 | 101 | # Calculate the loss 102 | loss = criterion(output, target) 103 | total_loss += loss 104 | num_batches += 1 105 | 106 | # Count number of correct 107 | total_correct += correct(output, target) 108 | num_items += len(target) 109 | 110 | # Backpropagation 111 | loss.backward() 112 | optimizer.step() 113 | optimizer.zero_grad() 114 | 115 | return { 116 | 'loss': total_loss/num_batches, 117 | 'accuracy': total_correct/num_items 118 | } 119 | 120 | 121 | def test(test_loader, model, criterion): 122 | model.eval() 123 | 124 | num_batches = len(test_loader) 125 | num_items = len(test_loader.dataset) 126 | 127 | test_loss = 0 128 | total_correct = 0 129 | 130 | with torch.no_grad(): 131 | for data, target in test_loader: 132 | # Copy data and targets to GPU 133 | data = data.to(device) 134 | target = target.to(device) 135 | 136 | # Do a forward pass 137 | output = model(data) 138 | 139 | # Calculate the loss 140 | loss = criterion(output, target) 141 | test_loss += loss.item() 142 | 143 | # Count number of correct digits 144 | total_correct += correct(output, target) 145 | 146 | return { 147 | 'loss': test_loss/num_batches, 148 | 'accuracy': total_correct/num_items 149 | } 150 | 151 | 152 | def log_measures(ret, log, prefix, epoch): 153 | if log is not None: 154 | for key, value in ret.items(): 155 | log.add_scalar(prefix + "_" + key, value, epoch) 156 | 157 | 158 | def main(): 159 | # TensorBoard for logging 160 | try: 161 | import tensorboardX 162 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 163 | logdir = os.path.join(os.getcwd(), "logs", "gtsrb-pretrained-" + time_str) 164 | print('TensorBoard log directory:', logdir) 165 | os.makedirs(logdir) 166 | log = tensorboardX.SummaryWriter(logdir) 167 | except ImportError: 168 | log = None 169 | 170 | # The training dataset consists of 5535 images of traffic signs of 171 | # varying size. There are 43 different types of traffic signs. 172 | # 173 | # The validation and test sets consist of 999 and 12630 images, 174 | # respectively. 175 | # 176 | # Then, to make the most of our limited number of training examples, 177 | # we'll apply random transformations to them each time we are looping 178 | # over them. This way, we "augment" our training dataset to contain 179 | # more data. 
There are various transformations available in 180 | # torchvision, see: 181 | # https://pytorch.org/docs/stable/torchvision/transforms.html 182 | 183 | datapath = os.getenv('DATADIR') 184 | if datapath is None: 185 | print("Please set DATADIR environment variable!") 186 | sys.exit(1) 187 | datapath = os.path.join(datapath, 'gtsrb/train-5535') 188 | 189 | input_image_size = (75, 75) 190 | 191 | data_transform = transforms.Compose([ 192 | transforms.Resize(input_image_size), 193 | transforms.RandomAffine(degrees=0, translate=None, 194 | scale=(0.8, 1.2), shear=0.2), 195 | # transforms.RandomHorizontalFlip(), 196 | transforms.ToTensor() 197 | ]) 198 | 199 | noop_transform = transforms.Compose([ 200 | transforms.Resize(input_image_size), 201 | transforms.ToTensor() 202 | ]) 203 | 204 | # Data loaders 205 | batch_size = 25 206 | 207 | print('Train: ', end="") 208 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 209 | transform=data_transform) 210 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 211 | shuffle=True, num_workers=4) 212 | print('Found', len(train_dataset), 'images belonging to', 213 | len(train_dataset.classes), 'classes') 214 | 215 | print('Validation: ', end="") 216 | validation_dataset = ImageFolderRemoveDirs(root=datapath+'/validation', 217 | transform=noop_transform, 218 | remove_dirs=['00027', '00039']) 219 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 220 | shuffle=False, num_workers=4) 221 | print('Found', len(validation_dataset), 'images belonging to', 222 | len(validation_dataset.classes), 'classes') 223 | 224 | print('Test: ', end="") 225 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 226 | transform=noop_transform) 227 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 228 | shuffle=False, num_workers=4) 229 | print('Found', len(test_dataset), 'images belonging to', 230 | len(test_dataset.classes), 'classes') 231 | 232 | # Define the network and training parameters 233 | model = PretrainedNet() 234 | model = model.to(device) 235 | optimizer = optim.SGD(model.parameters(), lr=0.01) 236 | criterion = nn.CrossEntropyLoss() 237 | 238 | print(model) 239 | 240 | num_epochs = 20 241 | 242 | # Training loop 243 | start_time = datetime.now() 244 | for epoch in range(num_epochs): 245 | train_ret = train(train_loader, model, criterion, optimizer) 246 | log_measures(train_ret, log, "train", epoch) 247 | 248 | val_ret = test(validation_loader, model, criterion) 249 | log_measures(val_ret, log, "val", epoch) 250 | print(f"Epoch {epoch+1}: " 251 | f"train accuracy: {train_ret['accuracy']:.2%}, " 252 | f"val accuracy: {val_ret['accuracy']:.2%}") 253 | 254 | end_time = datetime.now() 255 | print('Total training time: {}.'.format(end_time - start_time)) 256 | 257 | # Inference 258 | ret = test(test_loader, model, criterion) 259 | print("\nTesting (pretrained, before fine-tuning): " 260 | f"accuracy: {ret['accuracy']:.2%}\n") 261 | 262 | # Fine-tuning 263 | # 264 | # Once the top layers have learned some reasonable weights, we can 265 | # continue training by unfreezing the last convolution block of 266 | # VGG16 so that it may adapt to our data. The learning rate should 267 | # be smaller than usual. 268 | # 269 | # Below we loop over all layers and set only the last three Conv2d 270 | # layers to trainable. In the printout we mark trainable layers 271 | # with '+', frozen with '-'. Other layers don't have trainable 272 | # parameters. 
273 | 274 | print("Marking layers for training (+) or frozen (-):") 275 | for name, layer in model.vgg_features.named_children(): 276 | note = ' ' 277 | for param in layer.parameters(): 278 | note = '-' 279 | if int(name) >= 24: 280 | param.requires_grad = True 281 | note = '+' 282 | print(name, note, layer, len(param)) 283 | 284 | # We set up the training, note that we need to give only the 285 | # parameters that are set to be trainable. 286 | params = filter(lambda p: p.requires_grad, model.parameters()) 287 | #optimizer = optim.SGD(model.parameters(), lr=1e-3) 288 | optimizer = optim.RMSprop(params, lr=1e-5) 289 | criterion = nn.CrossEntropyLoss() 290 | 291 | # Note that before continuing the training, we create a separate 292 | # TensorBoard log directory. 293 | if log is not None: 294 | logdir_pt = logdir + '-pretrained-finetune' 295 | os.makedirs(logdir_pt) 296 | log = tensorboardX.SummaryWriter(logdir_pt) 297 | 298 | prev_epochs = num_epochs 299 | num_epochs = 20 300 | 301 | start_time = datetime.now() 302 | for epoch in range(prev_epochs, prev_epochs+num_epochs): 303 | train_ret = train(train_loader, model, criterion, optimizer) 304 | log_measures(train_ret, log, "train", epoch) 305 | 306 | val_ret = test(validation_loader, model, criterion) 307 | log_measures(val_ret, log, "val", epoch) 308 | 309 | print(f"Epoch {epoch+1}: " 310 | f"train loss: {train_ret['loss']:.6f} " 311 | f"train accuracy: {train_ret['accuracy']:.2%}, " 312 | f"val accuracy: {val_ret['accuracy']:.2%}") 313 | 314 | end_time = datetime.now() 315 | print('Total fine-tuning time: {}.'.format(end_time - start_time)) 316 | 317 | # Inference 318 | ret = test(test_loader, model, criterion) 319 | print("\nTesting (pretrained, after fine-tuning): " 320 | f"accuracy: {ret['accuracy']:.2%}\n") 321 | 322 | 323 | if __name__ == "__main__": 324 | main() 325 | -------------------------------------------------------------------------------- /day2/pytorch_gtsrb_cnn_simple.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Traffic sign classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of traffic signs from the German Traffic Sign 8 | # Recognition Benchmark: 9 | # https://benchmark.ini.rub.de/gtsrb_news.html 10 | # 11 | # ## Option 1: Train a small CNN from scratch 12 | # 13 | # Similarly as with MNIST digits, we can start from scratch and train 14 | # a CNN for the classification task. However, due to the small number 15 | # of training images, a large network will easily overfit, regardless 16 | # of the data augmentation. 
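# For reference, the 7*7*64 flatten size in the network below follows from
# the 75x75 inputs: each 3x3 convolution (no padding) trims 2 pixels and
# each 2x2 max-pool halves the result, so 75 -> 73 -> 36 -> 34 -> 17 ->
# 15 -> 7, leaving 64 feature maps of 7x7. As a one-line check:
#
#   (((75 - 2)//2 - 2)//2 - 2)//2    # -> 7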
17 | 
18 | import torch
19 | import torch.nn as nn
20 | import torch.nn.functional as F
21 | import torch.optim as optim
22 | from torch.utils.data import DataLoader
23 | from torchvision import datasets, transforms
24 | from packaging.version import Version as LV
25 | from datetime import datetime
26 | import os
27 | import sys
28 | 
29 | torch.manual_seed(42)
30 | 
31 | if torch.cuda.is_available():
32 |     device = torch.device('cuda')
33 | else:
34 |     device = torch.device('cpu')
35 | 
36 | print('Using PyTorch version:', torch.__version__, ' Device:', device)
37 | assert(LV(torch.__version__) >= LV("1.0.0"))
38 | 
39 | 
40 | class Net(nn.Module):
41 |     def __init__(self):
42 |         super(Net, self).__init__()
43 |         self.layers = nn.Sequential(
44 |             nn.Conv2d(3, 32, (3, 3)),
45 |             nn.ReLU(),
46 |             nn.MaxPool2d((2, 2)),
47 | 
48 |             nn.Conv2d(32, 32, (3, 3)),
49 |             nn.ReLU(),
50 |             nn.MaxPool2d((2, 2)),
51 | 
52 |             nn.Conv2d(32, 64, (3, 3)),
53 |             nn.ReLU(),
54 |             nn.MaxPool2d((2, 2)),
55 | 
56 |             nn.Flatten(),  # flatten 2D to 1D
57 |             nn.Linear(7*7*64, 128),
58 |             nn.ReLU(),
59 |             nn.Dropout(0.5),
60 |             nn.Linear(128, 43)
61 |         )
62 | 
63 |     def forward(self, x):
64 |         return self.layers(x).squeeze()
65 | 
66 | 
67 | #
68 | # The validation set has some broken class folders, but we need to keep
69 | # the class indices the same, so we use a custom Dataset class.
70 | #
71 | class ImageFolderRemoveDirs(datasets.ImageFolder):
72 |     def __init__(self, root, transform, remove_dirs):
73 |         self.remove_dirs = remove_dirs
74 |         super(ImageFolderRemoveDirs, self).__init__(root=root, transform=transform)
75 | 
76 |     def find_classes(self, directory):
77 |         classes, class_to_idx = super(ImageFolderRemoveDirs, self).find_classes(directory)
78 |         for d in self.remove_dirs:
79 |             print('Removing directory', d)
80 |             classes.remove(d)
81 |             del class_to_idx[d]
82 |         return classes, class_to_idx
83 | 
84 | 
85 | def correct(output, target):
86 |     predicted = output.argmax(1)  # pick class with largest network output
87 |     correct_ones = (predicted == target).type(torch.float)
88 |     return correct_ones.sum().item()  # count number of correct ones
89 | 
90 | 
91 | def train(data_loader, model, criterion, optimizer):
92 |     model.train()
93 | 
94 |     num_batches = 0
95 |     num_items = 0
96 | 
97 |     total_loss = 0
98 |     total_correct = 0
99 |     for data, target in data_loader:
100 |         # Copy data and targets to GPU
101 |         data = data.to(device)
102 |         target = target.to(device)
103 | 
104 |         # Do a forward pass
105 |         output = model(data)
106 | 
107 |         # Calculate the loss
108 |         loss = criterion(output, target)
109 |         total_loss += loss.item()  # .item() avoids keeping autograd graphs alive
110 |         num_batches += 1
111 | 
112 |         # Count number of correct
113 |         total_correct += correct(output, target)
114 |         num_items += len(target)
115 | 
116 |         # Backpropagation
117 |         loss.backward()
118 |         optimizer.step()
119 |         optimizer.zero_grad()
120 | 
121 |     return {
122 |         'loss': total_loss/num_batches,
123 |         'accuracy': total_correct/num_items
124 |     }
125 | 
126 | 
127 | def test(test_loader, model, criterion):
128 |     model.eval()
129 | 
130 |     num_batches = len(test_loader)
131 |     num_items = len(test_loader.dataset)
132 | 
133 |     test_loss = 0
134 |     total_correct = 0
135 | 
136 |     with torch.no_grad():
137 |         for data, target in test_loader:
138 |             # Copy data and targets to GPU
139 |             data = data.to(device)
140 |             target = target.to(device)
141 | 
142 |             # Do a forward pass
143 |             output = model(data)
144 | 
145 |             # Calculate the loss
146 |             loss = criterion(output, target)
147 |             test_loss += loss.item()
148 | 
149 |             # Count number of correct predictions
150 |             total_correct += 
correct(output, target) 151 | 152 | return { 153 | 'loss': test_loss/num_batches, 154 | 'accuracy': total_correct/num_items 155 | } 156 | 157 | 158 | def log_measures(ret, log, prefix, epoch): 159 | if log is not None: 160 | for key, value in ret.items(): 161 | log.add_scalar(prefix + "_" + key, value, epoch) 162 | 163 | 164 | def main(): 165 | # TensorBoard for logging 166 | try: 167 | import tensorboardX 168 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 169 | logdir = os.path.join(os.getcwd(), "logs", "gtsrb-" + time_str) 170 | print('TensorBoard log directory:', logdir) 171 | os.makedirs(logdir) 172 | log = tensorboardX.SummaryWriter(logdir) 173 | except ImportError: 174 | log = None 175 | 176 | 177 | # The training dataset consists of 5535 images of traffic signs of 178 | # varying size. There are 43 different types of traffic signs. 179 | # 180 | # The validation and test sets consist of 999 and 12630 images, 181 | # respectively. 182 | 183 | # First, we'll resize all training and validation images to a fixed 184 | # size. 185 | # 186 | # Then, to make the most of our limited number of training examples, 187 | # we'll apply random transformations to them each time we are looping 188 | # over them. This way, we "augment" our training dataset to contain 189 | # more data. There are various transformations available in 190 | # torchvision, see: 191 | # https://pytorch.org/docs/stable/torchvision/transforms.html 192 | 193 | datapath = os.getenv('DATADIR') 194 | if datapath is None: 195 | print("Please set DATADIR environment variable!") 196 | sys.exit(1) 197 | datapath = os.path.join(datapath, 'gtsrb/train-5535') 198 | 199 | input_image_size = (75, 75) 200 | 201 | data_transform = transforms.Compose([ 202 | transforms.Resize(input_image_size), 203 | transforms.RandomAffine(degrees=0, translate=None, 204 | scale=(0.8, 1.2), shear=0.2), 205 | # transforms.RandomHorizontalFlip(), 206 | transforms.ToTensor() 207 | ]) 208 | 209 | noop_transform = transforms.Compose([ 210 | transforms.Resize(input_image_size), 211 | transforms.ToTensor() 212 | ]) 213 | 214 | # Data loaders 215 | batch_size = 50 216 | 217 | print('Train: ', end="") 218 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 219 | transform=data_transform) 220 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 221 | shuffle=True, num_workers=4) 222 | print('Found', len(train_dataset), 'images belonging to', 223 | len(train_dataset.classes), 'classes') 224 | 225 | print('Validation: ', end="") 226 | validation_dataset = ImageFolderRemoveDirs(root=datapath+'/validation', 227 | transform=noop_transform, 228 | remove_dirs=['00027', '00039']) 229 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 230 | shuffle=False, num_workers=4) 231 | print('Found', len(validation_dataset), 'images belonging to', 232 | len(validation_dataset.classes), 'classes') 233 | 234 | print('Test: ', end="") 235 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 236 | transform=noop_transform) 237 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 238 | shuffle=False, num_workers=4) 239 | print('Found', len(test_dataset), 'images belonging to', 240 | len(test_dataset.classes), 'classes') 241 | 242 | # Define the network and training parameters 243 | model = Net() 244 | model = model.to(device) 245 | optimizer = optim.SGD(model.parameters(), lr=0.03) 246 | #optimizer = optim.RMSprop(model.parameters()) 247 | criterion = nn.CrossEntropyLoss() 248 | 249 | print(model) 250 | 251 | 
num_epochs = 50
252 | 
253 |     # Training loop
254 |     start_time = datetime.now()
255 |     for epoch in range(num_epochs):
256 |         train_ret = train(train_loader, model, criterion, optimizer)
257 |         log_measures(train_ret, log, "train", epoch)
258 | 
259 |         val_ret = test(validation_loader, model, criterion)
260 |         log_measures(val_ret, log, "val", epoch)
261 |         print(f"Epoch {epoch+1}: "
262 |               f"train loss: {train_ret['loss']:.6f} "
263 |               f"train accuracy: {train_ret['accuracy']:.2%}, "
264 |               f"val accuracy: {val_ret['accuracy']:.2%}")
265 | 
266 |     end_time = datetime.now()
267 |     print('Total training time: {}.'.format(end_time - start_time))
268 | 
269 |     # Inference
270 |     ret = test(test_loader, model, criterion)
271 |     print(f"\nTesting: accuracy: {ret['accuracy']:.2%}")
272 | 
273 | 
274 | if __name__ == "__main__":
275 |     main()
276 | 
--------------------------------------------------------------------------------
/day2/pytorch_gtsrb_vit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # # Traffic sign classification with Vision Transformers
5 | #
6 | # In this script, we'll fine-tune a Vision Transformer
7 | # (https://arxiv.org/abs/2010.11929) (ViT) to classify images of
8 | # traffic signs using PyTorch and HuggingFace Transformers:
9 | # https://github.com/huggingface/transformers
10 | 
11 | import torch
12 | import torch.nn as nn
13 | import torch.optim as optim
14 | from torch.utils.data import DataLoader
15 | from torchvision import datasets, transforms
16 | from packaging.version import Version as LV
17 | from datetime import datetime
18 | import os
19 | import sys
20 | 
21 | from transformers import AutoImageProcessor, ViTForImageClassification
22 | from transformers import __version__ as transformers_version
23 | 
24 | torch.manual_seed(42)
25 | 
26 | if torch.cuda.is_available():
27 |     device = torch.device('cuda')
28 | else:
29 |     device = torch.device('cpu')
30 | 
31 | print('Using PyTorch version:', torch.__version__,
32 |       'Transformers version:', transformers_version,
33 |       'Device:', device)
34 | assert LV(torch.__version__) >= LV("1.0.0")
35 | 
36 | #
37 | # The validation set has some broken class folders, but we need to keep
38 | # the class indices the same, so we use a custom Dataset class.
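# The trick in the class below is that it deletes the removed folders
# from class_to_idx instead of letting ImageFolder rebuild the mapping,
# so the surviving classes keep their original training-set indices.
# A minimal sketch of the difference:
#
#     classes = ['00026', '00027', '00028']
#     class_to_idx = {c: i for i, c in enumerate(classes)}  # 00028 -> 2
#     del class_to_idx['00027']  # keeps 00028 -> 2, labels stay aligned
#     # re-enumerating the remaining classes would give 00028 -> 1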
39 | #
40 | class ImageFolderRemoveDirs(datasets.ImageFolder):
41 |     def __init__(self, root, transform, remove_dirs):
42 |         self.remove_dirs = remove_dirs
43 |         super(ImageFolderRemoveDirs, self).__init__(root=root, transform=transform)
44 | 
45 |     def find_classes(self, directory):
46 |         classes, class_to_idx = super(ImageFolderRemoveDirs, self).find_classes(directory)
47 |         for d in self.remove_dirs:
48 |             print('Removing directory', d)
49 |             classes.remove(d)
50 |             del class_to_idx[d]
51 |         return classes, class_to_idx
52 | 
53 | 
54 | def correct(output, target):
55 |     predicted = output.argmax(1)  # pick class with largest network output
56 |     correct_ones = (predicted == target).type(torch.float)
57 |     return correct_ones.sum().item()  # count number of correct ones
58 | 
59 | def train(data_loader, model, criterion, optimizer):
60 |     model.train()
61 | 
62 |     num_batches = 0
63 |     num_items = 0
64 | 
65 |     total_loss = 0
66 |     total_correct = 0
67 | 
68 |     for data, target in data_loader:
69 |         # Copy data and targets to GPU
70 |         data = data.to(device)
71 |         target = target.to(device)
72 | 
73 |         # Do a forward pass; the class scores are in the .logits
74 |         # attribute of the HuggingFace model output
75 |         output = model(data).logits
76 | 
77 |         # Calculate the loss
78 |         loss = criterion(output, target)
79 |         total_loss += loss.item()
80 |         num_batches += 1
81 | 
82 |         # Count number of correct
83 |         total_correct += correct(output, target)
84 |         num_items += len(target)
85 | 
86 |         # Backpropagation
87 |         loss.backward()
88 |         optimizer.step()
89 |         optimizer.zero_grad()
90 | 
91 |     return {
92 |         'loss': total_loss/num_batches,
93 |         'accuracy': total_correct/num_items
94 |     }
95 | 
96 | 
97 | def test(test_loader, model, criterion):
98 |     model.eval()
99 | 
100 |     num_batches = len(test_loader)
101 |     num_items = len(test_loader.dataset)
102 | 
103 |     test_loss = 0
104 |     total_correct = 0
105 | 
106 |     with torch.no_grad():
107 |         for data, target in test_loader:
108 |             # Copy data and targets to GPU
109 |             data = data.to(device)
110 |             target = target.to(device)
111 | 
112 |             # Do a forward pass
113 |             output = model(data).logits
114 | 
115 |             # Calculate the loss
116 |             loss = criterion(output, target)
117 |             test_loss += loss.item()
118 | 
119 |             # Count number of correct predictions
120 |             total_correct += correct(output, target)
121 | 
122 |     return {
123 |         'loss': test_loss/num_batches,
124 |         'accuracy': total_correct/num_items
125 |     }
126 | 
127 | 
128 | def log_measures(ret, log, prefix, epoch):
129 |     if log is not None:
130 |         for key, value in ret.items():
131 |             log.add_scalar(prefix + "_" + key, value, epoch)
132 | 
133 | 
134 | class ImageClassificationCollator:
135 |     def __init__(self, processor):
136 |         self.processor = processor
137 | 
138 |     def __call__(self, batch):
139 |         data = self.processor([x[0] for x in batch], do_rescale=False,
140 |                               return_tensors='pt').pixel_values
141 |         targets = torch.tensor([x[1] for x in batch])
142 |         return data, targets
143 | 
144 | 
145 | def main():
146 |     # TensorBoard for logging
147 |     try:
148 |         import tensorboardX
149 |         time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
150 |         logdir = os.path.join(os.getcwd(), "logs", "gtsrb-vit-" + time_str)
151 |         print('TensorBoard log directory:', logdir)
152 |         os.makedirs(logdir)
153 |         log = tensorboardX.SummaryWriter(logdir)
154 |     except ImportError:
155 |         log = None
156 | 
157 |     # The training dataset consists of 5535 images of traffic signs of
158 |     # varying size. There are 43 different types of traffic signs. The
159 |     # validation and test sets consist of 999 and 12630 images, respectively.
160 |     #
161 |     # Unlike in the CNN scripts, we don't resize or augment the images
162 |     # ourselves here. The transforms below only convert the images to
163 |     # tensors.
164 |     #
165 |     # Instead, the ViT image processor loaded further down resizes and
166 |     # normalizes each batch on the fly (via the collator) to the fixed
167 |     # input resolution and value range that the pretrained model
168 |     # expects; do_rescale=False tells it that ToTensor() has already
169 |     # scaled the pixel values to [0, 1].
170 | 
171 |     datapath = os.getenv('DATADIR')
172 |     if datapath is None:
173 |         print("Please set DATADIR environment variable!")
174 |         sys.exit(1)
175 |     datapath = os.path.join(datapath, 'gtsrb/train-5535')
176 | 
177 |     # Data loaders
178 |     batch_size = 32
179 | 
180 |     vitmodel = 'google/vit-base-patch16-224'
181 |     processor = AutoImageProcessor.from_pretrained(vitmodel)
182 |     collator = ImageClassificationCollator(processor)
183 | 
184 |     print('Train: ', end="")
185 |     train_dataset = datasets.ImageFolder(root=datapath+'/train',
186 |                                          transform=transforms.ToTensor())
187 |     train_loader = DataLoader(train_dataset, batch_size=batch_size,
188 |                               shuffle=True, num_workers=4,
189 |                               collate_fn=collator)
190 |     print('Found', len(train_dataset), 'images belonging to',
191 |           len(train_dataset.classes), 'classes')
192 | 
193 |     print('Validation: ', end="")
194 |     validation_dataset = ImageFolderRemoveDirs(root=datapath+'/validation',
195 |                                                transform=transforms.ToTensor(),
196 |                                                remove_dirs=['00027', '00039'])
197 |     validation_loader = DataLoader(validation_dataset, batch_size=batch_size,
198 |                                    shuffle=False, num_workers=4,
199 |                                    collate_fn=collator)
200 |     print('Found', len(validation_dataset), 'images belonging to',
201 |           len(validation_dataset.classes), 'classes')
202 | 
203 |     print('Test: ', end="")
204 |     test_dataset = datasets.ImageFolder(root=datapath+'/test',
205 |                                         transform=transforms.ToTensor())
206 |     test_loader = DataLoader(test_dataset, batch_size=batch_size,
207 |                              shuffle=False, num_workers=4,
208 |                              collate_fn=collator)
209 |     print('Found', len(test_dataset), 'images belonging to',
210 |           len(test_dataset.classes), 'classes')
211 | 
212 |     # Define the network and training parameters
213 |     model = ViTForImageClassification.from_pretrained(
214 |         vitmodel, num_labels=43, ignore_mismatched_sizes=True)
215 |     model = model.to(device)
216 |     optimizer = optim.Adam(model.parameters(), lr=1e-5)
217 |     criterion = nn.CrossEntropyLoss()
218 | 
219 |     print(model)
220 | 
221 |     num_epochs = 5
222 | 
223 |     # Training loop
224 |     start_time = datetime.now()
225 |     for epoch in range(num_epochs):
226 |         train_ret = train(train_loader, model, criterion, optimizer)
227 |         log_measures(train_ret, log, "train", epoch)
228 | 
229 |         val_ret = test(validation_loader, model, criterion)
230 |         log_measures(val_ret, log, "val", epoch)
231 |         print(f"Epoch {epoch+1}: "
232 |               f"train loss: {train_ret['loss']:.6f} "
233 |               f"train accuracy: {train_ret['accuracy']:.2%}, "
234 |               f"val accuracy: {val_ret['accuracy']:.2%}")
235 | 
236 |     end_time = datetime.now()
237 |     print('Total training time: {}.'.format(end_time - start_time))
238 | 
239 |     # Inference
240 |     ret = test(test_loader, model, criterion)
241 |     print(f"\nTesting: accuracy: {ret['accuracy']:.2%}")
242 | 
243 | 
244 | if __name__ == "__main__":
245 |     main()
246 | 
--------------------------------------------------------------------------------
/day2/pytorch_imdb_gpt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # # 
IMDB movie review text generation 5 | # 6 | # In this script, we'll fine-tune a GPT3-like model to generate more 7 | # movie reviews based on a prompt. 8 | 9 | import math 10 | import os 11 | import sys 12 | import time 13 | from pprint import pprint 14 | 15 | import torch 16 | from datasets import load_dataset 17 | from transformers import ( 18 | AutoModelForCausalLM, 19 | AutoTokenizer, 20 | DataCollatorForLanguageModeling, 21 | PreTrainedTokenizerFast, 22 | Trainer, 23 | TrainingArguments 24 | ) 25 | 26 | 27 | def preprocess_data(train_dataset, eval_dataset, 28 | tokenizer: PreTrainedTokenizerFast, 29 | training_args: TrainingArguments): 30 | # IMDb examples are presented as a dictionary: 31 | # { 32 | # 'text': the review text as a string, 33 | # 'label': a sentiment label as an integer, 34 | # }. 35 | # 36 | # We tokenize the text and add the special token for indicating 37 | # the end of the text at the end of each review. We also truncate 38 | # reviews to a maximum length to avoid excessively long sequences 39 | # during training. As we have no use for the label, we discard 40 | # it. 41 | max_length = 128 42 | 43 | def tokenize(x): 44 | texts = [example + tokenizer.eos_token for example in x["text"]] 45 | return tokenizer( 46 | texts, 47 | max_length=max_length, 48 | truncation=True, 49 | add_special_tokens=True, 50 | return_overflowing_tokens=True, 51 | return_length=False, 52 | ) 53 | 54 | train_dataset_tokenized = train_dataset.map( 55 | tokenize, 56 | remove_columns=["text", "label"], 57 | batched=True, 58 | batch_size=training_args.train_batch_size, 59 | num_proc=training_args.dataloader_num_workers, 60 | ) 61 | 62 | eval_dataset_tokenized = eval_dataset.map( 63 | tokenize, 64 | remove_columns=["text", "label"], 65 | batched=True, 66 | num_proc=training_args.dataloader_num_workers, 67 | ) 68 | 69 | # We split a small amount of training data as "validation" test 70 | # set to keep track of evaluation of the loss on non-training data 71 | # during training. This is purely because computing the loss on 72 | # the full evaluation dataset takes much longer. 73 | train_validate_splits = train_dataset_tokenized.train_test_split( 74 | test_size=1000, seed=42, keep_in_memory=True 75 | ) 76 | train_dataset_tokenized = train_validate_splits["train"] 77 | validate_dataset_tokenized = train_validate_splits["test"] 78 | 79 | return (train_dataset_tokenized, validate_dataset_tokenized, 80 | eval_dataset_tokenized) 81 | 82 | 83 | if __name__ == "__main__": 84 | # Determine which device to train the model on, CPU or GPU 85 | print('Using PyTorch version:', torch.__version__) 86 | if torch.cuda.is_available(): 87 | device = torch.device('cuda') 88 | print('Using GPU, device name:', torch.cuda.get_device_name(0)) 89 | else: 90 | print('No GPU found, using CPU instead.') 91 | device = torch.device('cpu') 92 | 93 | # Use DATADIR environment variable to set path for data 94 | datapath = os.getenv('DATADIR') 95 | if datapath is None: 96 | print("Please set DATADIR environment variable!") 97 | sys.exit(1) 98 | user_datapath = os.path.join(datapath, "users", os.getenv('USER')) 99 | os.makedirs(user_datapath, exist_ok=True) 100 | 101 | # ## IMDB data set 102 | # 103 | # Next we'll load the IMDB data set, this time using the Hugging Face 104 | # datasets library: https://huggingface.co/docs/datasets/index. 
105 |     #
106 |     # The dataset contains 100,000 movie reviews from the Internet
107 |     # Movie Database: 25,000 labeled reviews for training, 25,000 for
108 |     # testing, and 50,000 unlabeled reviews (the "unsupervised" split).
109 | 
110 |     train_dataset = load_dataset("imdb", keep_in_memory=True,
111 |                                  split="train+unsupervised")
112 |     test_dataset = load_dataset("imdb", keep_in_memory=True,
113 |                                 split="test")
114 | 
115 |     # Let's print one sample from the dataset.
116 |     print('Sample from dataset')
117 |     pprint(train_dataset[200])
118 | 
119 |     # #### Loading the pretrained GPT-Neo model
120 |     #
121 |     # We'll use the gpt-neo models from the Hugging Face library:
122 |     # https://huggingface.co/EleutherAI/gpt-neo-125m
123 |     pretrained_model = "EleutherAI/gpt-neo-125m"
124 | 
125 |     # If you have time, you can also test with a larger 1.3 billion
126 |     # parameter version of the same model:
127 |     # https://huggingface.co/EleutherAI/gpt-neo-1.3B
128 | 
129 |     # pretrained_model = "EleutherAI/gpt-neo-1.3B"
130 | 
131 |     # Load the tokenizer associated with the model
132 |     print("Loading model and tokenizer")
133 |     start = time.time()
134 |     tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
135 |     tokenizer.pad_token = tokenizer.eos_token
136 | 
137 |     # Load the actual base model from Hugging Face
138 |     model = AutoModelForCausalLM.from_pretrained(pretrained_model)
139 |     model.to(device)
140 |     stop = time.time()
141 |     print(f"Loading model and tokenizer took: {stop-start:.2f} seconds")
142 | 
143 |     # Setting up the training configuration
144 |     train_batch_size = 32
145 |     test_batch_size = 128
146 | 
147 |     output_dir = os.path.join(user_datapath, "gpt-imdb-model")
148 |     training_args = TrainingArguments(
149 |         output_dir=output_dir,
150 |         overwrite_output_dir=True,
151 |         save_strategy="steps",  # save a snapshot of the model every
152 |         save_steps=100,         # 100 steps
153 |         save_total_limit=4,     # only keep the last 4 snapshots
154 |         logging_dir="logs",
155 |         eval_strategy="steps",
156 |         eval_steps=1000,  # compute validation loss every 1000 steps
157 |         learning_rate=2e-5,
158 |         weight_decay=0.01,
159 |         bf16=True,  # use bfloat16, a 16-bit floating point format
160 |         per_device_train_batch_size=train_batch_size,
161 |         per_device_eval_batch_size=test_batch_size,
162 |         max_steps=5000,
163 |         dataloader_num_workers=7,
164 |         dataloader_pin_memory=True,
165 |         report_to=["tensorboard"],  # log statistics for tensorboard
166 |     )
167 | 
168 |     # ## Preprocessing of training data
169 |     #
170 |     # We tokenize the data, split off a small validation set from the
171 |     # training data, and set up a collator that arranges the
172 |     # variable-length samples into padded batches.
173 | 
174 |     (train_dataset_tokenized, validate_dataset_tokenized,
175 |      test_dataset_tokenized) = preprocess_data(train_dataset,
176 |                                                test_dataset,
177 |                                                tokenizer,
178 |                                                training_args)
179 | 
180 |     collator = DataCollatorForLanguageModeling(
181 |         tokenizer, mlm=False, return_tensors="pt"
182 |     )
183 | 
184 |     # Sanity check: what does the training data look like after preprocessing?
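    # Each tokenized example is a dict of Python lists, typically
    # {'input_ids': [...], 'attention_mask': [...]}. The labels needed
    # for causal language modeling are added later by the collator:
    # with mlm=False, DataCollatorForLanguageModeling pads the batch and
    # sets labels to a copy of input_ids, with the padding positions
    # replaced by -100 so the loss ignores them. A quick sketch:
    #
    #     batch = collator([train_dataset_tokenized[i] for i in range(2)])
    #     print(batch.keys())  # input_ids, attention_mask, labels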
185 | print("Sample of tokenized data") 186 | for b in train_dataset_tokenized: 187 | pprint(b, compact=True) 188 | print("Length of input_ids:", len(b["input_ids"])) 189 | break 190 | print("Length of dataset (tokenized)", len(train_dataset_tokenized)) 191 | 192 | trainer = Trainer( 193 | model=model, 194 | args=training_args, 195 | tokenizer=tokenizer, 196 | data_collator=collator, 197 | train_dataset=train_dataset_tokenized, 198 | eval_dataset=validate_dataset_tokenized, 199 | ) 200 | 201 | trainer.train() 202 | 203 | print() 204 | print("Training done, you can find all the model checkpoints in", 205 | output_dir) 206 | 207 | with torch.no_grad(): 208 | model.eval() 209 | 210 | # Calculate perplexity 211 | validate_results = trainer.evaluate() 212 | test_results = trainer.evaluate(test_dataset_tokenized) 213 | 214 | print(f'Perplexity (val): {math.exp(validate_results["eval_loss"]):.2f}') 215 | print(f'Perplexity (test): {math.exp(test_results["eval_loss"]):.2f}') 216 | 217 | # Let's print a few sample generated reviews 218 | prompt = "The movie about LUMI AI Factory was great because" 219 | input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device) 220 | outputs = model.generate(input_ids, do_sample=True, max_length=80, 221 | num_return_sequences=4) 222 | decoded_outputs = tokenizer.batch_decode(outputs, 223 | skip_special_tokens=True) 224 | 225 | print('Sample generated review:') 226 | for txt in decoded_outputs: 227 | print('-', txt) 228 | -------------------------------------------------------------------------------- /day2/pytorch_imdb_gpt_multigpu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # IMDB movie review text generation 5 | # 6 | # In this script, we'll fine-tune a GPT3-like model to generate more 7 | # movie reviews based on a prompt. 8 | 9 | import math 10 | import os 11 | import sys 12 | import time 13 | from pprint import pprint 14 | 15 | import torch 16 | import torch.distributed as dist 17 | 18 | from datasets import load_dataset 19 | from transformers import ( 20 | AutoModelForCausalLM, 21 | AutoTokenizer, 22 | DataCollatorForLanguageModeling, 23 | PreTrainedTokenizerFast, 24 | Trainer, 25 | TrainingArguments 26 | ) 27 | 28 | 29 | def preprocess_data(train_dataset, eval_dataset, 30 | tokenizer: PreTrainedTokenizerFast, 31 | training_args: TrainingArguments): 32 | # IMDb examples are presented as a dictionary: 33 | # { 34 | # 'text': the review text as a string, 35 | # 'label': a sentiment label as an integer, 36 | # }. 37 | # 38 | # We tokenize the text and add the special token for indicating 39 | # the end of the text at the end of each review. We also truncate 40 | # reviews to a maximum length to avoid excessively long sequences 41 | # during training. As we have no use for the label, we discard 42 | # it. 
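    # Because return_overflowing_tokens=True is passed below, a review
    # longer than max_length is not simply cut off: the fast tokenizer
    # splits it into several rows of up to max_length tokens each, so
    # no training text is discarded. Roughly:
    #
    #     enc = tokenizer(["word " * 300], max_length=128, truncation=True,
    #                     return_overflowing_tokens=True)
    #     print(len(enc["input_ids"]))  # ~3 chunks instead of 1 row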
43 |     max_length = 128
44 | 
45 |     def tokenize(x):
46 |         texts = [example + tokenizer.eos_token for example in x["text"]]
47 |         return tokenizer(
48 |             texts,
49 |             max_length=max_length,
50 |             truncation=True,
51 |             add_special_tokens=True,
52 |             return_overflowing_tokens=True,
53 |             return_length=False,
54 |         )
55 | 
56 |     train_dataset_tokenized = train_dataset.map(
57 |         tokenize,
58 |         remove_columns=["text", "label"],
59 |         batched=True,
60 |         batch_size=training_args.train_batch_size,
61 |         num_proc=training_args.dataloader_num_workers,
62 |     )
63 | 
64 |     eval_dataset_tokenized = eval_dataset.map(
65 |         tokenize,
66 |         remove_columns=["text", "label"],
67 |         batched=True,
68 |         num_proc=training_args.dataloader_num_workers,
69 |     )
70 | 
71 |     # We split a small amount of training data as "validation" test
72 |     # set to keep track of evaluation of the loss on non-training data
73 |     # during training. This is purely because computing the loss on
74 |     # the full evaluation dataset takes much longer.
75 |     train_validate_splits = train_dataset_tokenized.train_test_split(
76 |         test_size=1000, seed=42, keep_in_memory=True
77 |     )
78 |     train_dataset_tokenized = train_validate_splits["train"]
79 |     validate_dataset_tokenized = train_validate_splits["test"]
80 | 
81 |     return (train_dataset_tokenized, validate_dataset_tokenized,
82 |             eval_dataset_tokenized)
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     # Determine which device to train the model on, CPU or GPU
87 |     print('Using PyTorch version:', torch.__version__)
88 |     if torch.cuda.is_available():
89 |         device = torch.device('cuda')
90 |         print('Using GPU, device name:', torch.cuda.get_device_name(0))
91 |     else:
92 |         print('No GPU found, using CPU instead.')
93 |         device = torch.device('cpu')
94 | 
95 |     dist.init_process_group(backend='nccl')
96 |     rank_0 = dist.get_rank() == 0
97 | 
98 |     # Use DATADIR environment variable to set path for data
99 |     datapath = os.getenv('DATADIR')
100 |     if datapath is None:
101 |         print("Please set DATADIR environment variable!")
102 |         sys.exit(1)
103 |     user_datapath = os.path.join(datapath, "users", os.getenv('USER'))
104 |     os.makedirs(user_datapath, exist_ok=True)
105 | 
106 |     # ## IMDB data set
107 |     #
108 |     # Next we'll load the IMDB data set, this time using the Hugging Face
109 |     # datasets library: https://huggingface.co/docs/datasets/index.
110 |     #
111 |     # The dataset contains 100,000 movie reviews from the Internet
112 |     # Movie Database: 25,000 labeled reviews for training, 25,000 for
113 |     # testing, and 50,000 unlabeled reviews (the "unsupervised" split).
114 | 
115 |     train_dataset = load_dataset("imdb", keep_in_memory=True,
116 |                                  split="train+unsupervised")
117 |     test_dataset = load_dataset("imdb", keep_in_memory=True,
118 |                                 split="test")
119 | 
120 |     # Let's print one sample from the dataset.
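    # Under torchrun every process runs this script, so an unguarded
    # print would appear once per GPU. Gating output on rank 0, as
    # below, keeps the log readable. A common variant is a small
    # helper, sketched here:
    #
    #     def print0(*args, **kwargs):
    #         if dist.get_rank() == 0:
    #             print(*args, **kwargs)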
121 |     if rank_0:
122 |         print('Sample from dataset')
123 |         pprint(train_dataset[200])
124 | 
125 |     # #### Loading the pretrained GPT-Neo model
126 |     #
127 |     # We'll use the gpt-neo models from the Hugging Face library:
128 |     # https://huggingface.co/EleutherAI/gpt-neo-125m
129 |     pretrained_model = "EleutherAI/gpt-neo-125m"
130 | 
131 |     # If you have time, you can also test with a larger 1.3 billion
132 |     # parameter version of the same model:
133 |     # https://huggingface.co/EleutherAI/gpt-neo-1.3B
134 |     #pretrained_model = "EleutherAI/gpt-neo-1.3B"
135 | 
136 |     # Load the tokenizer associated with the model
137 |     print("Loading model and tokenizer")
138 |     start = time.time()
139 |     tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
140 |     tokenizer.pad_token = tokenizer.eos_token
141 | 
142 |     # Load the actual base model from Hugging Face
143 |     model = AutoModelForCausalLM.from_pretrained(pretrained_model)
144 |     model.to(device)
145 |     stop = time.time()
146 |     print(f"Loading model and tokenizer took: {stop-start:.2f} seconds")
147 | 
148 |     # Setting up the training configuration
149 |     train_batch_size = 32
150 |     test_batch_size = 128
151 | 
152 |     output_dir = os.path.join(user_datapath, "gpt-imdb-model")
153 |     training_args = TrainingArguments(
154 |         output_dir=output_dir,
155 |         overwrite_output_dir=True,
156 |         save_strategy="steps",  # save a snapshot of the model every
157 |         save_steps=100,         # 100 steps
158 |         save_total_limit=4,     # only keep the last 4 snapshots
159 |         logging_dir="logs",
160 |         eval_strategy="steps",
161 |         eval_steps=1000,  # compute validation loss every 1000 steps
162 |         learning_rate=2e-5,
163 |         weight_decay=0.01,
164 |         bf16=True,  # use bfloat16, a 16-bit floating point format
165 |         per_device_train_batch_size=train_batch_size,
166 |         per_device_eval_batch_size=test_batch_size,
167 |         max_steps=5000,
168 |         dataloader_num_workers=7,
169 |         dataloader_pin_memory=True,
170 |         report_to=["tensorboard"],  # log statistics for tensorboard
171 |     )
172 | 
173 |     # ## Preprocessing of training data
174 |     #
175 |     # We tokenize the data, split off a small validation set from the
176 |     # training data, and set up a collator that arranges the
177 |     # variable-length samples into padded batches.
178 | 
179 |     (train_dataset_tokenized, validate_dataset_tokenized,
180 |      test_dataset_tokenized) = preprocess_data(train_dataset,
181 |                                                test_dataset,
182 |                                                tokenizer,
183 |                                                training_args)
184 | 
185 |     collator = DataCollatorForLanguageModeling(
186 |         tokenizer, mlm=False, return_tensors="pt"
187 |     )
188 | 
189 |     # Sanity check: what does the training data look like after preprocessing?
190 | if rank_0: 191 | print("Sample of tokenized data") 192 | for b in train_dataset_tokenized: 193 | pprint(b, compact=True) 194 | print("Length of input_ids:", len(b["input_ids"])) 195 | break 196 | print("Length of dataset (tokenized)", len(train_dataset_tokenized)) 197 | 198 | trainer = Trainer( 199 | model=model, 200 | args=training_args, 201 | tokenizer=tokenizer, 202 | data_collator=collator, 203 | train_dataset=train_dataset_tokenized, 204 | eval_dataset=validate_dataset_tokenized, 205 | ) 206 | 207 | trainer.train() 208 | 209 | if rank_0: 210 | print() 211 | print("Training done, you can find all the model checkpoints in", 212 | output_dir) 213 | 214 | with torch.no_grad(): 215 | model.eval() 216 | 217 | # Calculate perplexity 218 | validate_results = trainer.evaluate() 219 | test_results = trainer.evaluate(test_dataset_tokenized) 220 | 221 | if rank_0: 222 | print(f'Perplexity (val): {math.exp(validate_results["eval_loss"]):.2f}') 223 | print(f'Perplexity (test): {math.exp(test_results["eval_loss"]):.2f}') 224 | 225 | # Let's print a few sample generated reviews 226 | prompt = "The movie about LUMI AI Factory was great because" 227 | input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device) 228 | outputs = model.generate(input_ids, do_sample=True, max_length=80, 229 | num_return_sequences=4) 230 | decoded_outputs = tokenizer.batch_decode(outputs, 231 | skip_special_tokens=True) 232 | 233 | print('Sample generated review:') 234 | for txt in decoded_outputs: 235 | print('-', txt) 236 | -------------------------------------------------------------------------------- /day2/pytorch_test.py: -------------------------------------------------------------------------------- 1 | # Script for testing the PyTorch setup 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision 6 | from torch.utils.data import DataLoader 7 | from torchvision import datasets 8 | import torchvision.transforms as transforms 9 | 10 | from packaging.version import Version as LV 11 | from tqdm import tqdm 12 | 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | import seaborn as sns 16 | sns.set() 17 | 18 | print('Using PyTorch version:', torch.__version__) 19 | assert(LV(torch.__version__) >= LV("2.0")) 20 | 21 | if torch.cuda.is_available(): 22 | print('Using GPU, device name:', torch.cuda.get_device_name(0)) 23 | device = torch.device('cuda') 24 | else: 25 | print('No GPU found, using CPU instead.') 26 | device = torch.device('cpu') 27 | 28 | # Create some tensors 29 | x = torch.ones(3, 4) 30 | data = [[1, 2, 3],[4, 5, 6]] 31 | y = torch.tensor(data, dtype=torch.float) 32 | 33 | # Copy them to the GPU 34 | x = x.to(device) 35 | y = y.to(device) 36 | 37 | # Perform matrix multiplication on GPU 38 | z = y.matmul(x) 39 | 40 | print("z =", z) 41 | -------------------------------------------------------------------------------- /day2/run-2gpus-torchrun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_462000863 3 | #SBATCH --partition=small-g 4 | #SBATCH --ntasks=1 5 | #SBATCH --cpus-per-task=14 6 | #SBATCH --gpus-per-node=2 7 | #SBATCH --mem=120G 8 | #SBATCH --time=1:00:00 9 | ##SBATCH --reservation=pdl_day2-no-ood 10 | 11 | module purge 12 | module use /appl/local/csc/modulefiles/ 13 | module load pytorch 14 | 15 | COURSE_SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}" 16 | 17 | export DATADIR=$COURSE_SCRATCH/data 18 | export TORCH_HOME=$COURSE_SCRATCH/torch-cache 19 | export 
HF_HOME=$COURSE_SCRATCH/hf-cache
20 | export MLFLOW_TRACKING_URI=$COURSE_SCRATCH/data/users/$USER/mlruns
21 | 
22 | set -xv
23 | torchrun --standalone --nnodes=1 --nproc_per_node=$SLURM_GPUS_PER_NODE "$@"
24 | 
--------------------------------------------------------------------------------
/day2/run-2gpus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --account=project_462000863
3 | #SBATCH --partition=small-g
4 | #SBATCH --ntasks=1
5 | #SBATCH --cpus-per-task=14
6 | #SBATCH --gpus-per-node=2
7 | #SBATCH --mem=120G
8 | #SBATCH --time=1:00:00
9 | #SBATCH --reservation=pdl_day2-no-ood
10 | 
11 | module purge
12 | module use /appl/local/csc/modulefiles/
13 | module load pytorch/2.4
14 | 
15 | COURSE_SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}"
16 | 
17 | export DATADIR=$COURSE_SCRATCH/data
18 | export TORCH_HOME=$COURSE_SCRATCH/torch-cache
19 | export HF_HOME=$COURSE_SCRATCH/hf-cache
20 | 
21 | export MLFLOW_TRACKING_URI=$COURSE_SCRATCH/data/users/$USER/mlruns
22 | export TOKENIZERS_PARALLELISM=false
23 | 
24 | umask 002
25 | 
26 | set -xv
27 | torchrun --standalone --nnodes=1 --nproc_per_node=$SLURM_GPUS_PER_NODE "$@"
28 | 
--------------------------------------------------------------------------------
/day2/run-8gpus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --account=project_462000863
3 | #SBATCH --partition=small-g
4 | #SBATCH --ntasks=1
5 | #SBATCH --cpus-per-task=56
6 | #SBATCH --gpus-per-node=8
7 | #SBATCH --mem=480G
8 | #SBATCH --time=1:00:00
9 | #SBATCH --reservation=pdl_day2-no-ood
10 | 
11 | module purge
12 | module use /appl/local/csc/modulefiles/
13 | module load pytorch/2.4
14 | 
15 | COURSE_SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}"
16 | 
17 | export DATADIR=$COURSE_SCRATCH/data
18 | export TORCH_HOME=$COURSE_SCRATCH/torch-cache
19 | export HF_HOME=$COURSE_SCRATCH/hf-cache
20 | 
21 | export MLFLOW_TRACKING_URI=$COURSE_SCRATCH/data/users/$USER/mlruns
22 | export TOKENIZERS_PARALLELISM=false
23 | 
24 | umask 002
25 | 
26 | set -xv
27 | torchrun --standalone --nnodes=1 --nproc_per_node=$SLURM_GPUS_PER_NODE "$@"
28 | 
--------------------------------------------------------------------------------
/day2/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --account=project_462000863
3 | #SBATCH --partition=small-g
4 | #SBATCH --ntasks=1
5 | #SBATCH --cpus-per-task=7
6 | #SBATCH --gpus-per-task=1
7 | #SBATCH --mem=60G
8 | #SBATCH --time=1:00:00
9 | #SBATCH --reservation=pdl_day2-no-ood
10 | 
11 | module purge
12 | module use /appl/local/csc/modulefiles/
13 | module load pytorch/2.4
14 | 
15 | COURSE_SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}"
16 | 
17 | export DATADIR=$COURSE_SCRATCH/data
18 | export TORCH_HOME=$COURSE_SCRATCH/torch-cache
19 | export HF_HOME=$COURSE_SCRATCH/hf-cache
20 | 
21 | export MLFLOW_TRACKING_URI=$COURSE_SCRATCH/data/users/$USER/mlruns
22 | export TOKENIZERS_PARALLELISM=false
23 | 
24 | umask 002
25 | 
26 | set -xv
27 | python3 "$@"
28 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | matplotlib
3 | seaborn
4 | notebook
5 | pydot
6 | scikit-learn
7 | 
--------------------------------------------------------------------------------