├── .gitignore ├── .notebooks-setup └── get-started.bash ├── LICENSE ├── README.md ├── day1 ├── 01-pytorch-test-setup.ipynb ├── 02-pytorch-mnist-mlp.ipynb ├── 03-pytorch-mnist-cnn.ipynb ├── 04a-pytorch-imdb-rnn.ipynb ├── 04b-pytorch-imdb-huggingface.ipynb ├── README.md └── solutions │ ├── pytorch-imdb-rnn-example-answer.py │ ├── pytorch-mnist-cnn-example-answer.py │ └── pytorch-mnist-mlp-example-answer.py ├── day2 ├── Exercise_5.md ├── Exercise_6.md ├── Exercise_7.md ├── Exercise_8.md ├── README.md ├── imgs │ ├── avp.png │ ├── dvc.png │ ├── gtsrb-montage.png │ └── traffic-signs.png ├── logs │ └── .gitignore ├── pytorch_20ng_bert.py ├── pytorch_20ng_cnn.py ├── pytorch_20ng_rnn.py ├── pytorch_dvc_cnn_pretrained.py ├── pytorch_dvc_cnn_pretrained_multigpu.py ├── pytorch_dvc_cnn_simple.py ├── pytorch_dvc_vit.py ├── pytorch_generate_gpt.ipynb ├── pytorch_gtsrb_cnn_pretrained.py ├── pytorch_gtsrb_cnn_simple.py ├── pytorch_gtsrb_vit.py ├── pytorch_imdb_gpt.py ├── pytorch_imdb_gpt_multigpu.py ├── pytorch_test.py ├── run-2gpus-torchrun.sh ├── run-2gpus.sh ├── run-8gpus.sh └── run.sh └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | day2/*.png 2 | day2/*.out 3 | day2/*.h5 4 | day2/*.pt 5 | .ipynb_checkpoints 6 | *~ 7 | data/ 8 | __pycache__/ 9 | day1/**/model.png 10 | day1/optional/pml_utils.py 11 | day2/mlruns/ 12 | -------------------------------------------------------------------------------- /.notebooks-setup/get-started.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## Script that downloads the code for doing the Deep Learning course exercises 3 | cd /home/jovyan 4 | 5 | # git reflog requires a name and email if user is not in passwd 6 | # even if you're only cloning 7 | export GIT_COMMITTER_NAME=anonymous 8 | export GIT_COMMITTER_EMAIL=anon@localhost 9 | 10 | #git clone -b vuokatti2021 --single-branch https://github.com/csc-training/intro-to-dl.git 11 | git clone https://github.com/csc-training/intro-to-dl 12 | 13 | rmdir work 14 | rm get-started.bash 15 | #pip install imageio h5py tqdm 16 | #pip install scikit-learn 17 | #pip install torchtext 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019-2024 CSC - IT Center for Science Ltd. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to deep learning 2 | 3 | This repository contains the exercise materials for the [CSC](https://www.csc.fi/) course [Practical deep learning](https://csc.fi/koulutuskalenteri/practical-deep-learning-5/). 4 | 5 | - [Exercises for day 1](day1/README.md) 6 | - [Exercises for day 2](day2/README.md) 7 | -------------------------------------------------------------------------------- /day1/01-pytorch-test-setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Notebook for testing the PyTorch setup\n", 11 | "\n", 12 | "This notebook is for testing the [PyTorch](http://pytorch.org/) setup. Below is a set of required imports. \n", 13 | "\n", 14 | "Run the cell, and no error messages should appear." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false, 22 | "deletable": true, 23 | "editable": true, 24 | "jupyter": { 25 | "outputs_hidden": false 26 | } 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "%matplotlib inline\n", 31 | "\n", 32 | "import torch\n", 33 | "import torch.nn as nn\n", 34 | "import torchvision\n", 35 | "from torch.utils.data import DataLoader\n", 36 | "from torchvision import datasets\n", 37 | "import torchvision.transforms as transforms\n", 38 | "\n", 39 | "from datasets import load_dataset\n", 40 | "from tokenizers import Tokenizer\n", 41 | "from tokenizers import models, trainers, pre_tokenizers, normalizers, processors\n", 42 | "\n", 43 | "from packaging.version import Version as LV\n", 44 | "from tqdm import tqdm\n", 45 | "\n", 46 | "import numpy as np\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "import seaborn as sns\n", 49 | "sns.set()\n", 50 | "\n", 51 | "print('Using PyTorch version:', torch.__version__)\n", 52 | "assert(LV(torch.__version__) >= LV(\"2.0\"))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Let's check if we have GPU available." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "deletable": true, 67 | "editable": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "if torch.cuda.is_available():\n", 72 | " print('Using GPU, device name:', torch.cuda.get_device_name(0))\n", 73 | " device = torch.device('cuda')\n", 74 | "else:\n", 75 | " print('No GPU found, using CPU instead.') \n", 76 | " device = torch.device('cpu')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Tensors in PyTorch\n", 84 | "\n", 85 | "Tensors are data structures that contain vectors, matrices or higher-dimensional arrays. They are similar to NumPy's ndarrays, except that PyTorch tensors can also run on GPUs and other hardware accelerators. 
Also check the [PyTorch Tensors tutorial](https://pytorch.org/tutorials/beginner/basics/tensorqs_tutorial.html).\n", 86 | "\n", 87 | "Let's create some tensors and investigate their shapes and data types." 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "x = torch.ones(3, 4)\n", 97 | "print(type(x))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "print(\"x.shape =\",x.shape)\n", 107 | "print(\"x.dtype =\", x.dtype)\n", 108 | "print(\"x =\", x)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "data = [[1, 2, 3],[4, 5, 6]]\n", 118 | "y = torch.tensor(data, dtype=torch.float)\n", 119 | "\n", 120 | "print(\"y.shape =\", y.shape)\n", 121 | "print(\"y.dtype =\", y.dtype)\n", 122 | "print(\"y =\", y)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "### Operations on tensors\n", 130 | "\n", 131 | "There are a lot of built-in [operations that can be run on tensors](https://pytorch.org/docs/stable/torch.html). Let's try matrix multiplication:" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "# This computes the matrix product y x\n", 141 | "z = y.matmul(x)\n", 142 | "\n", 143 | "print(\"z.shape =\", z.shape)\n", 144 | "print(\"z.dtype =\", z.dtype)\n", 145 | "print(\"z =\", z)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### Devices\n", 153 | "\n", 154 | "We mentioned that PyTorch tensors can also be used on GPUs. We can check what device our tensors is on with `x.device`, we can move it to another device with `x.to(device)` where `device` can be defined dynamically based on if we have GPU available or not. We already did this above with code similar to this:\n", 155 | "\n", 156 | "```python\n", 157 | "if torch.cuda.is_available():\n", 158 | " device = torch.device('cuda')\n", 159 | "else:\n", 160 | " device = torch.device('cpu')\n", 161 | "```\n", 162 | "\n", 163 | "If we don't have a GPU the tensor will just stay on the CPU." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "print(\"(before) x.device =\", x.device)\n", 173 | "x = x.to(device)\n", 174 | "print(\"(after) x.device =\", x.device)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "If our tensors are now on the GPU, the matrix multiplication will also take place on the GPU and be much faster (of course not something we would notice in this trivial example)." 
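,
"\n",
"A rough way to actually see the difference is to time a larger multiplication on each device. The sketch below is purely illustrative and not part of the exercises; note that GPU operations run asynchronously, so we call `torch.cuda.synchronize()` before reading the clock:\n",
"\n",
"```python\n",
"import time\n",
"import torch\n",
"\n",
"a = torch.randn(2000, 2000)\n",
"t0 = time.time()\n",
"a.matmul(a)\n",
"print('CPU:', time.time() - t0, 'seconds')\n",
"\n",
"if torch.cuda.is_available():\n",
"    a = a.to('cuda')\n",
"    a.matmul(a)               # warm-up: the first CUDA call includes one-time setup costs\n",
"    torch.cuda.synchronize()  # wait for queued GPU work before starting the timer\n",
"    t0 = time.time()\n",
"    a.matmul(a)\n",
"    torch.cuda.synchronize()\n",
"    print('GPU:', time.time() - t0, 'seconds')\n",
"```"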
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "y = y.to(device)\n", 191 | "z = y.matmul(x)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "print(\"z.device =\", z.device)\n", 201 | "print(\"z =\", z)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [] 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3 (ipykernel)", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.10.12" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 4 233 | } 234 | -------------------------------------------------------------------------------- /day1/02-pytorch-mnist-mlp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digit classification with MLPs\n", 8 | "\n", 9 | "In this notebook, we'll train a multi-layer perceptron model to classify MNIST digits using **PyTorch**. \n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import torch\n", 23 | "import torch.nn as nn\n", 24 | "from torch.utils.data import DataLoader\n", 25 | "from torchvision import datasets\n", 26 | "from torchvision.transforms import ToTensor\n", 27 | "\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import os\n", 31 | "\n", 32 | "print('Using PyTorch version:', torch.__version__)\n", 33 | "if torch.cuda.is_available():\n", 34 | " print('Using GPU, device name:', torch.cuda.get_device_name(0))\n", 35 | " device = torch.device('cuda')\n", 36 | "else:\n", 37 | " print('No GPU found, using CPU instead.') \n", 38 | " device = torch.device('cpu')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Loading data\n", 46 | "\n", 47 | "PyTorch has two classes from [`torch.utils.data` to work with data](https://pytorch.org/docs/stable/data.html#module-torch.utils.data): \n", 48 | "- [Dataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) which represents the actual data items, such as images or pieces of text, and their labels\n", 49 | "- [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) which is used for processing the dataset in batches in an efficient manner.\n", 50 | "\n", 51 | "Here we will use TorchVision and `torchvision.datasets` which provides easy access to [many common visual datasets](https://pytorch.org/vision/stable/datasets.html). In this example we'll use the [MNIST class](https://pytorch.org/vision/stable/generated/torchvision.datasets.MNIST.html#torchvision.datasets.MNIST) for loading the [MNIST dataset](https://en.wikipedia.org/wiki/MNIST_database)." 
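,
"\n",
"As background, a `Dataset` is simply a class that implements `__len__` and `__getitem__`. A minimal hand-written example of the protocol (illustrative only; the exercises use the ready-made `MNIST` class below) could look like this:\n",
"\n",
"```python\n",
"from torch.utils.data import Dataset\n",
"\n",
"class PairDataset(Dataset):\n",
"    # a minimal Dataset wrapping two equal-length sequences\n",
"    def __init__(self, inputs, labels):\n",
"        self.inputs = inputs\n",
"        self.labels = labels\n",
"\n",
"    def __len__(self):\n",
"        return len(self.inputs)   # number of items in the dataset\n",
"\n",
"    def __getitem__(self, idx):\n",
"        return self.inputs[idx], self.labels[idx]   # one (data, label) pair\n",
"```"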
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "batch_size = 32\n", 61 | "\n", 62 | "slurm_project = os.getenv('SLURM_JOB_ACCOUNT')\n", 63 | "data_dir = os.path.join('/scratch', slurm_project, 'data') if slurm_project else './data'\n", 64 | "print('data_dir =', data_dir)\n", 65 | "\n", 66 | "train_dataset = datasets.MNIST(data_dir, train=True, download=True, transform=ToTensor())\n", 67 | "test_dataset = datasets.MNIST(data_dir, train=False, transform=ToTensor())\n", 68 | "\n", 69 | "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", 70 | "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "The data loaders provide a way of iterating (making a loop over) the datasets, each time getting a new batch of data with the given batch size.\n", 78 | "\n", 79 | "The first element of the data batch (`data`) is a 4th-order tensor of size (`batch_size`, 1, 28, 28), i.e. it consists of a batch of images of size 1x28x28 pixels, where the first value is the number of color channels (only 1 in this case as it's gray scale).\n", 80 | "\n", 81 | "The second element of the batch (`target`) is a vector containing the correct (or \"target\") classes (\"0\", \"1\", ..., \"9\") for each training digit." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "for (data, target) in train_loader:\n", 91 | "    print('data:', data.size(), 'type:', data.type())\n", 92 | "    print('target:', target.size(), 'type:', target.type())\n", 93 | "    break" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Here are the first 10 training digits:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "pltsize=1\n", 110 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 111 | "\n", 112 | "for i in range(10):\n", 113 | "    plt.subplot(1,10,i+1)\n", 114 | "    plt.axis('off')\n", 115 | "    plt.imshow(data[i,:,:,:].numpy().reshape(28,28), cmap=\"gray_r\")\n", 116 | "    plt.title('Class: '+str(target[i].item()))" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Multi-layer perceptron (MLP) network\n", 124 | "\n", 125 | "In PyTorch, a neural network is defined as a Python class. It needs to have two methods:\n", 126 | "\n", 127 | "- `__init__()` which initializes the layers used in the network\n", 128 | "- `forward()` which defines how the network performs a forward pass\n", 129 | "\n", 130 | "PyTorch will then automatically generate a `backward()` method that computes the gradients based on the computation done in the forward pass.\n", 131 | "\n", 132 | "All the [neural network building blocks defined in PyTorch can be found in the torch.nn documentation](https://pytorch.org/docs/stable/nn.html).\n", 133 | "\n", 134 | "We use `nn.Sequential` to more easily create a simple sequential neural network:\n", 135 | "\n", 136 | "- First we need to \"flatten\" the 2D image into a vector with `nn.Flatten`\n", 137 | "\n", 138 | "- Next a fully-connected layer with 20 neurons is created with `nn.Linear`. Note that we need to specify the number of input and output connections. 
In this case there are 28x28=784 inputs, and 20 outputs\n", 139 | "\n", 140 | "- Next, a ReLU non-linear activation\n", 141 | "\n", 142 | "- Finally the output of the last layer needs to be a 10-dimensional vector to match the ground truth of ten classes (the ten digits).\n", 143 | "\n", 144 | "The output of the last layer should be normalized with softmax, but this is actually included implicitly in the loss function in PyTorch (see below)." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "class SimpleMLP(nn.Module):\n", 154 | " def __init__(self):\n", 155 | " super().__init__()\n", 156 | " self.layers = nn.Sequential(\n", 157 | " nn.Flatten(),\n", 158 | " nn.Linear(28*28, 20),\n", 159 | " nn.ReLU(),\n", 160 | " nn.Linear(20, 10)\n", 161 | " )\n", 162 | "\n", 163 | " def forward(self, x):\n", 164 | " return self.layers(x)\n", 165 | "\n", 166 | "model = SimpleMLP().to(device)\n", 167 | "print(model)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "# Training the model\n", 175 | "\n", 176 | "In order to train the model we need to define a loss function and an optimizer.\n", 177 | "\n", 178 | "For a classification task we typically use the cross entropy loss. For this we can use the class [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html).\n", 179 | "\n", 180 | "**Note:** if you read the documentation of `CrossEntropyLoss` carefully you will see that it expects the unnormalized raw outputs of the model as softmax is included implicitly in PyTorch's implementation of `CrossEntropyLoss`. This is why we don't need to explicitly use softmax in the network definition above.\n", 181 | "\n", 182 | "Finally, we need to define an optimizer, which tells how to update the model parameters based on the computed gradients. There are [several different optimizer algorithms implemented in PyTorch](https://pytorch.org/docs/stable/optim.html#algorithms)." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "criterion = nn.CrossEntropyLoss()\n", 192 | "optimizer = torch.optim.Adam(model.parameters())" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "In PyTorch we have to write the training loop ourselves.\n", 200 | "\n", 201 | "The code below consists of two loops:\n", 202 | "\n", 203 | "- The outer loop goes over a number of *epochs*. An epoch is a single pass through the whole training data.\n", 204 | "- The inner loop goes over all the batches of the dataset. Here we have defined the batch size to be 32, so images are handled 32 at a time.\n", 205 | "\n", 206 | "For each batch we:\n", 207 | "\n", 208 | "- Copy the data to the GPU with the `.to(device)` method. If we don't have a GPU, these commands will not do anything.\n", 209 | "\n", 210 | "- Do a forward pass, which is as simple as: `output = model(data)`\n", 211 | "\n", 212 | "- Finally we calculate the loss - that is the error between the output of the network and the target we want to get - using the `criterion` function we defined earlier\n", 213 | "\n", 214 | "- The last lines do the backward propagation with `loss.backward()`, the weights are updated with `optimizer.step()` and finally we need to zero the gradient counters with `optimizer.zero_grad()`." 
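,
"\n",
"Stripped of the bookkeeping, the body of the inner loop boils down to this pattern (an illustrative fragment that reuses the same names as the full `train` function defined below):\n",
"\n",
"```python\n",
"output = model(data)              # forward pass\n",
"loss = criterion(output, target)  # error between output and target\n",
"loss.backward()                   # backpropagation: compute gradients\n",
"optimizer.step()                  # update the weights\n",
"optimizer.zero_grad()             # reset the gradient counters\n",
"```"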
215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "First, a helper function to calculate the number of correctly classified digits." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "def correct(output, target):\n", 231 | " predicted_digits = output.argmax(1) # pick digit with largest network output\n", 232 | " correct_ones = (predicted_digits == target).type(torch.float) # 1.0 for correct, 0.0 for incorrect\n", 233 | " return correct_ones.sum().item() # count number of correct ones\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Next a function for a single training epoch." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "def train(data_loader, model, criterion, optimizer):\n", 250 | " model.train()\n", 251 | "\n", 252 | " num_batches = len(data_loader)\n", 253 | " num_items = len(data_loader.dataset)\n", 254 | "\n", 255 | " total_loss = 0\n", 256 | " total_correct = 0\n", 257 | " for data, target in data_loader:\n", 258 | " # Copy data and targets to GPU\n", 259 | " data = data.to(device)\n", 260 | " target = target.to(device)\n", 261 | " \n", 262 | " # Do a forward pass\n", 263 | " output = model(data)\n", 264 | " \n", 265 | " # Calculate the loss\n", 266 | " loss = criterion(output, target)\n", 267 | " total_loss += loss\n", 268 | "\n", 269 | " # Count number of correct digits\n", 270 | " total_correct += correct(output, target)\n", 271 | " \n", 272 | " # Backpropagation\n", 273 | " loss.backward()\n", 274 | " optimizer.step()\n", 275 | " optimizer.zero_grad()\n", 276 | "\n", 277 | " train_loss = total_loss/num_batches\n", 278 | " accuracy = total_correct/num_items\n", 279 | " print(f\"Average loss: {train_loss:7f}, accuracy: {accuracy:.2%}\")\n" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "%%time\n", 289 | "\n", 290 | "epochs = 10\n", 291 | "for epoch in range(epochs):\n", 292 | " print(f\"Training epoch: {epoch+1}\")\n", 293 | " train(train_loader, model, criterion, optimizer)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Inference\n", 301 | "\n", 302 | "For a better measure of the quality of the model, let's see the model accuracy for the test data.\n", 303 | "\n", 304 | "The code is similar to the training code: we just loop over the whole testset, but no need to do backpropagation or calculate any gradients this time." 
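,
"\n",
"As a side note, the trained model can also be applied to a single image. A small sketch, assuming the model and datasets defined earlier in this notebook:\n",
"\n",
"```python\n",
"model.eval()\n",
"with torch.no_grad():\n",
"    image, label = test_dataset[0]                 # one (image, label) pair\n",
"    output = model(image.unsqueeze(0).to(device))  # unsqueeze adds a batch dimension of 1\n",
"    print('predicted:', output.argmax(1).item(), 'true:', label)\n",
"```"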
305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "def test(test_loader, model, criterion):\n", 314 | " model.eval()\n", 315 | "\n", 316 | " num_batches = len(test_loader)\n", 317 | " num_items = len(test_loader.dataset)\n", 318 | "\n", 319 | " test_loss = 0\n", 320 | " total_correct = 0\n", 321 | "\n", 322 | " with torch.no_grad():\n", 323 | " for data, target in test_loader:\n", 324 | " # Copy data and targets to GPU\n", 325 | " data = data.to(device)\n", 326 | " target = target.to(device)\n", 327 | " \n", 328 | " # Do a forward pass\n", 329 | " output = model(data)\n", 330 | " \n", 331 | " # Calculate the loss\n", 332 | " loss = criterion(output, target)\n", 333 | " test_loss += loss.item()\n", 334 | " \n", 335 | " # Count number of correct digits\n", 336 | " total_correct += correct(output, target)\n", 337 | "\n", 338 | " test_loss = test_loss/num_batches\n", 339 | " accuracy = total_correct/num_items\n", 340 | "\n", 341 | " print(f\"Testset accuracy: {100*accuracy:>0.1f}%, average loss: {test_loss:>7f}\")" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "test(test_loader, model, criterion)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Task 1: Model with two linear layers\n", 358 | "\n", 359 | "Your task is to try the same problem as above, but with a more complex model. The new model should have **two linear layers**, each with:\n", 360 | "\n", 361 | "- 50 units\n", 362 | "- ReLU activation\n", 363 | "- each followed by a dropout layer with a rate of 0.2 - hint: try [nn.Dropout](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html#torch.nn.Dropout)\n", 364 | "\n", 365 | "Dropout randomly sets a fraction of inputs to zero during training, which is one approach to regularization and can sometimes help to prevent overfitting.\n", 366 | "\n", 367 | "You can consult the [PyTorch documentation](https://pytorch.org/docs/stable/index.html), in particular all the [neural network building blocks can be found in the `torch.nn` documentation](https://pytorch.org/docs/stable/nn.html).\n", 368 | "\n", 369 | "The code below is missing the model definition. You can copy any suitable layers from the example above." 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "class TwoLayerMLP(nn.Module):\n", 379 | " def __init__(self):\n", 380 | " super().__init__()\n", 381 | " self.layers = nn.Sequential(\n", 382 | " # TASK 1: ADD LAYERS HERE\n", 383 | " )\n", 384 | "\n", 385 | " def forward(self, x):\n", 386 | " return self.layers(x)\n" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "If you want to see an example answer, change the type of the cell below to \"Code\" in the menu bar above and then execute the cell. Execute the cell again to run the example code.\n", 394 | "\n", 395 | "**Note:** in Google Colab you can [click here](https://github.com/csc-training/intro-to-dl/blob/master/day1/solutions/pytorch-mnist-mlp-example-answer.py) and copy the answer manually." 
396 | ] 397 | }, 398 | { 399 | "cell_type": "raw", 400 | "metadata": {}, 401 | "source": [ 402 | "%load solutions/pytorch-mnist-mlp-example-answer.py" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "ex1_model = TwoLayerMLP()\n", 412 | "print(ex1_model)\n", 413 | "\n", 414 | "assert len(ex1_model.layers) > 0, \"ERROR: You need to write the missing model definition above!\"\n", 415 | "ex1_model = ex1_model.to(device)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "ex1_criterion = nn.CrossEntropyLoss()\n", 425 | "ex1_optimizer = torch.optim.Adam(ex1_model.parameters())" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "%%time\n", 435 | "\n", 436 | "epochs = 10\n", 437 | "for epoch in range(epochs):\n", 438 | " print(f\"Epoch: {epoch+1} ...\")\n", 439 | " train(train_loader, ex1_model, ex1_criterion, ex1_optimizer)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "test(test_loader, ex1_model, ex1_criterion)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "## Task 2: Model tuning\n", 456 | "\n", 457 | "Modify the MLP model. Try to improve the classification accuracy, or experiment with the effects of different parameters. If you are interested in the state-of-the-art performance on permutation invariant MNIST, see e.g. [this paper](https://arxiv.org/abs/1507.02672) by Aalto University / The Curious AI Company researchers." 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "## Task 3: Fashion-MNIST\n", 472 | "\n", 473 | "MNIST can be replaced with Fashion-MNIST, which can be used as drop-in replacement for MNIST. Fashion-MNIST contains images of 10 fashion categories:\n", 474 | "\n", 475 | "Label|Description|Label|Description\n", 476 | "--- | --- |--- | ---\n", 477 | "0|T-shirt/top|5|Sandal\n", 478 | "1|Trouser|6|Shirt\n", 479 | "2|Pullover|7|Sneaker\n", 480 | "3|Dress|8|Bag\n", 481 | "4|Coat|9|Ankle boot\n", 482 | "\n", 483 | "Replace the loading of MNIST data with Fashion-MNIST in the beginning of this notebook and re-run the experiments. [Fashion-MNIST can be found with the dataset class `FashionMNIST`](https://pytorch.org/vision/stable/generated/torchvision.datasets.FashionMNIST.html#torchvision.datasets.FashionMNIST)." 
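,
"\n",
"As a sketch of the replacement (the rest of the notebook stays unchanged), the dataset loading cell would become:\n",
"\n",
"```python\n",
"train_dataset = datasets.FashionMNIST(data_dir, train=True, download=True, transform=ToTensor())\n",
"test_dataset = datasets.FashionMNIST(data_dir, train=False, transform=ToTensor())\n",
"```"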
484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [] 492 | } 493 | ], 494 | "metadata": { 495 | "kernelspec": { 496 | "display_name": "Python 3 (ipykernel)", 497 | "language": "python", 498 | "name": "python3" 499 | }, 500 | "language_info": { 501 | "codemirror_mode": { 502 | "name": "ipython", 503 | "version": 3 504 | }, 505 | "file_extension": ".py", 506 | "mimetype": "text/x-python", 507 | "name": "python", 508 | "nbconvert_exporter": "python", 509 | "pygments_lexer": "ipython3", 510 | "version": "3.10.12" 511 | } 512 | }, 513 | "nbformat": 4, 514 | "nbformat_minor": 4 515 | } 516 | -------------------------------------------------------------------------------- /day1/03-pytorch-mnist-cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digit classification with CNNs\n", 8 | "\n", 9 | "In this notebook, we'll train a convolutional neural network (CNN, ConvNet) to classify MNIST digits using **PyTorch**. \n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import torch\n", 23 | "import torch.nn as nn\n", 24 | "from torch.utils.data import DataLoader\n", 25 | "from torchvision import datasets\n", 26 | "from torchvision.transforms import ToTensor\n", 27 | "\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "from tqdm import tqdm\n", 31 | "import os\n", 32 | "\n", 33 | "print('Using PyTorch version:', torch.__version__)\n", 34 | "if torch.cuda.is_available():\n", 35 | " print('Using GPU, device name:', torch.cuda.get_device_name(0))\n", 36 | " device = torch.device('cuda')\n", 37 | "else:\n", 38 | " print('No GPU found, using CPU instead.') \n", 39 | " device = torch.device('cpu')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## MNIST data set" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "batch_size = 32\n", 56 | "\n", 57 | "slurm_project = os.getenv('SLURM_JOB_ACCOUNT')\n", 58 | "data_dir = os.path.join('/scratch', slurm_project, 'data') if slurm_project else './data'\n", 59 | "print('data_dir =', data_dir)\n", 60 | "\n", 61 | "train_dataset = datasets.MNIST(data_dir, train=True, download=True, transform=ToTensor())\n", 62 | "test_dataset = datasets.MNIST(data_dir, train=False, transform=ToTensor())\n", 63 | "\n", 64 | "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", 65 | "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Convolutional neural network\n", 73 | "\n", 74 | "Now we are ready to create a convolutional model. 
As before we use `nn.Sequential` to easily create a sequence of layers.\n", 75 | "\n", 76 | "Here we use:\n", 77 | "\n", 78 | "- [Conv2d](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html#torch.nn.Conv2d), which operates on 2D matrices so we input the digit images directly to the model (no need to \"flatten\" at this point),\n", 79 | "- [MaxPool2d](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html#torch.nn.MaxPool2d) reduces the spatial dimensions, that is, makes the image smaller,\n", 80 | "- Finally we flatten the image to a vector and add two linear layers.\n", 81 | "\n", 82 | "All the [neural network building blocks defined in PyTorch can be found in the torch.nn documentation](https://pytorch.org/docs/stable/nn.html).\n", 83 | "\n", 84 | "The output of the last layer should be normalized with softmax, but this is actually included implicitly in the loss function in PyTorch (see below)." 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "class SimpleCNN(nn.Module):\n", 94 | "    def __init__(self):\n", 95 | "        super().__init__()\n", 96 | "        self.layers = nn.Sequential(\n", 97 | "            nn.Conv2d(1, 32, kernel_size=3, padding='valid'),\n", 98 | "            nn.ReLU(),\n", 99 | "            nn.MaxPool2d(kernel_size=2),\n", 100 | "            nn.Flatten(),\n", 101 | "            nn.Linear(32*13*13, 128),\n", 102 | "            nn.ReLU(),\n", 103 | "            nn.Linear(128, 10)\n", 104 | "        )\n", 105 | "\n", 106 | "    def forward(self, x):\n", 107 | "        return self.layers(x)\n", 108 | "\n", 109 | "model = SimpleCNN().to(device)\n", 110 | "print(model)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "**Note:** one slightly tricky thing in the code above is that you have to know the input dimension for the first linear layer. This is the output of the `Conv2d` followed by the `MaxPool2d`. This can be reasoned as follows:\n", 118 | "\n", 119 | "- the input to `Conv2d` will be 1x28x28 as the images have a single color channel (gray scale) and have a width and height of 28x28\n", 120 | "- the output of `Conv2d` will be 32x26x26 as the color channels are replaced by the outputs of the 32 convolution kernels, and due to the valid padding and kernel size of 3x3 a border of 1 pixel will be excluded\n", 121 | "- `MaxPool2d` will remove every second pixel along each dimension, so we get 32x13x13=5408\n", 122 | "\n", 123 | "If you are lazy you can also just guess something and run the code. The error message will tell you what size it expected to have!" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Learning\n", 131 | "\n", 132 | "Now let's train the CNN model.\n", 133 | "\n", 134 | "First we'll define the same functions as in the previous exercise. 
We've made a few minor additions:\n", 135 | "- In the `train` function we added `tqdm` to print a nicer progress bar as the training will be a bit slower this time.\n", 136 | "- We return the loss and accuracy so we can do some plotting" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "def correct(output, target):\n", 146 | " predicted_digits = output.argmax(1) # pick digit with largest network output\n", 147 | " correct_ones = (predicted_digits == target).type(torch.float) # 1.0 for correct, 0.0 for incorrect\n", 148 | " return correct_ones.sum().item() # count number of correct ones\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def train(data_loader, model, criterion, optimizer):\n", 158 | " model.train()\n", 159 | "\n", 160 | " num_batches = len(data_loader)\n", 161 | " num_items = len(data_loader.dataset)\n", 162 | "\n", 163 | " total_loss = 0\n", 164 | " total_correct = 0\n", 165 | " for data, target in tqdm(data_loader, total=num_batches):\n", 166 | " # Copy data and targets to GPU\n", 167 | " data = data.to(device)\n", 168 | " target = target.to(device)\n", 169 | " \n", 170 | " # Do a forward pass\n", 171 | " output = model(data)\n", 172 | " \n", 173 | " # Calculate the loss\n", 174 | " loss = criterion(output, target)\n", 175 | " total_loss += loss\n", 176 | "\n", 177 | " # Count number of correct digits\n", 178 | " total_correct += correct(output, target)\n", 179 | " \n", 180 | " # Backpropagation\n", 181 | " loss.backward()\n", 182 | " optimizer.step()\n", 183 | " optimizer.zero_grad()\n", 184 | "\n", 185 | " train_loss = total_loss/num_batches\n", 186 | " accuracy = total_correct/num_items\n", 187 | " print(f\"Average loss: {train_loss:7f}, accuracy: {accuracy:.2%}\")\n", 188 | " return train_loss.item(), accuracy\n" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "criterion = nn.CrossEntropyLoss()\n", 198 | "optimizer = torch.optim.Adam(model.parameters())" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "This is a relatively complex model, so training is considerably slower than with MLPs. 
" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "%%time\n", 215 | "\n", 216 | "losses = []\n", 217 | "accuracies = []\n", 218 | "epochs = 5\n", 219 | "for epoch in range(epochs):\n", 220 | " print(f\"Training epoch: {epoch+1}\")\n", 221 | " loss, acc = train(train_loader, model, criterion, optimizer)\n", 222 | " losses.append(loss)\n", 223 | " accuracies.append(acc)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "Let's plot how the loss and accuracy change over the epochs" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "plt.subplot(2,1,1)\n", 240 | "plt.plot(losses)\n", 241 | "plt.ylabel(\"Loss\")\n", 242 | "plt.subplot(2,1,2)\n", 243 | "plt.plot(accuracies)\n", 244 | "plt.ylabel(\"Accuracy\");" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "### Inference\n", 252 | "\n", 253 | "Here we have the same `test` function as before." 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "def test(test_loader, model, criterion):\n", 263 | " model.eval()\n", 264 | "\n", 265 | " num_batches = len(test_loader)\n", 266 | " num_items = len(test_loader.dataset)\n", 267 | "\n", 268 | " test_loss = 0\n", 269 | " total_correct = 0\n", 270 | "\n", 271 | " with torch.no_grad():\n", 272 | " for data, target in test_loader:\n", 273 | " # Copy data and targets to GPU\n", 274 | " data = data.to(device)\n", 275 | " target = target.to(device)\n", 276 | " \n", 277 | " # Do a forward pass\n", 278 | " output = model(data)\n", 279 | " \n", 280 | " # Calculate the loss\n", 281 | " loss = criterion(output, target)\n", 282 | " test_loss += loss.item()\n", 283 | " \n", 284 | " # Count number of correct digits\n", 285 | " total_correct += correct(output, target)\n", 286 | "\n", 287 | " test_loss = test_loss/num_batches\n", 288 | " accuracy = total_correct/num_items\n", 289 | "\n", 290 | " print(f\"Testset accuracy: {100*accuracy:>0.1f}%, average loss: {test_loss:>7f}\")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "test(test_loader, model, criterion)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "Let's take a look at the convolution weights of the Conv2D layer." 
307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "for i, module in enumerate(model.modules()):\n", 316 | "    print(i, type(module))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "weights = [module for module in model.modules()][2].weight.data.cpu().numpy()  # .cpu() is needed if the model is on the GPU\n", 326 | "\n", 327 | "for i in range(weights.shape[0]):\n", 328 | "    plt.subplot(4, 8, i+1)\n", 329 | "    c = weights[i][0]\n", 330 | "    c = c - np.min(c)\n", 331 | "    c = c / np.max(c)\n", 332 | "    plt.imshow(c, cmap='gray', interpolation='nearest')\n", 333 | "    plt.axis('off')\n" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## Task 1: A more complex CNN model\n", 341 | "\n", 342 | "Your task is to try the same problem as above, but with two convolutional layers. The new model should have the following layers in order:\n", 343 | "\n", 344 | "- Convolutional (`Conv2d`) layer with 32 units and 3x3 kernels, valid padding + ReLU activation\n", 345 | "- Another identical convolutional layer + ReLU activation\n", 346 | "- Max pooling (`MaxPool2d`) layer with 2x2 pooling size\n", 347 | "- Dropout with 0.25 rate\n", 348 | "- Flatten\n", 349 | "- Dense layer with 128 units\n", 350 | "- Dropout with 0.5 rate\n", 351 | "- Dense output layer with 10 units\n", 352 | "\n", 353 | "You can consult the [PyTorch documentation](https://pytorch.org/docs/stable/index.html), in particular all the [neural network building blocks can be found in the `torch.nn` documentation](https://pytorch.org/docs/stable/nn.html).\n", 354 | "\n", 355 | "The code below is missing the model definition. You can copy any suitable layers from the example above." 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "class ComplexCNN(nn.Module):\n", 365 | "    def __init__(self):\n", 366 | "        super().__init__()\n", 367 | "        self.layers = nn.Sequential(\n", 368 | "            # TASK 1: ADD LAYERS HERE\n", 369 | "        )\n", 370 | "\n", 371 | "    def forward(self, x):\n", 372 | "        return self.layers(x)\n" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "If you want to see an example answer, change the type of the cell below to \"Code\" in the menu bar above and then execute the cell. Execute the cell again to run the example code.\n", 380 | "\n", 381 | "**Note:** in Google Colab you can [click here](https://github.com/csc-training/intro-to-dl/blob/master/day1/solutions/pytorch-mnist-cnn-example-answer.py) and copy the answer manually." 
382 | ] 383 | }, 384 | { 385 | "cell_type": "raw", 386 | "metadata": {}, 387 | "source": [ 388 | "%load solutions/pytorch-mnist-cnn-example-answer.py" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "ex1_model = ComplexCNN()\n", 398 | "print(ex1_model)\n", 399 | "\n", 400 | "assert len(ex1_model.layers) > 0, \"ERROR: You need to write the missing model definition above!\"\n", 401 | "ex1_model = ex1_model.to(device)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "ex1_criterion = nn.CrossEntropyLoss()\n", 411 | "ex1_optimizer = torch.optim.Adam(ex1_model.parameters())" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "%%time\n", 421 | "\n", 422 | "losses = []\n", 423 | "accuracies = []\n", 424 | "epochs = 5\n", 425 | "for epoch in range(epochs):\n", 426 | " print(f\"Epoch: {epoch+1} ...\")\n", 427 | " loss, acc = train(train_loader, ex1_model, ex1_criterion, ex1_optimizer)\n", 428 | " losses.append(loss)\n", 429 | " accuracies.append(acc)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "plt.subplot(2,1,1)\n", 439 | "plt.plot(losses)\n", 440 | "plt.ylabel(\"Loss\")\n", 441 | "plt.subplot(2,1,2)\n", 442 | "plt.plot(accuracies)\n", 443 | "plt.ylabel(\"Accuracy\");" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "test(test_loader, ex1_model, ex1_criterion)" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "## Task 2: Tune training parameters\n", 460 | "\n", 461 | "Try to improve the classification accuracy, in particular by trying different optimizers and playing with the parameters of the training process.\n", 462 | "\n", 463 | "See optimizers available in PyTorch here: \n", 464 | "\n", 465 | "You can take the model created in Task 1 as a starting point. Below is a code example which you can modify." 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "If you wish to change the batch size, you need to re-define the data loaders." 
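,
"\n",
"For the optimizer, one possible variation (an illustration, not a recommended answer) is to replace the `torch.optim.Adam` line in the cell below with plain SGD plus momentum:\n",
"\n",
"```python\n",
"ex2_optimizer = torch.optim.SGD(ex2_model.parameters(), lr=0.01, momentum=0.9)\n",
"```"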
473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "batch_size = 32\n", 482 | "\n", 483 | "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", 484 | "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "ex2_model = ComplexCNN().to(device)\n", 494 | "\n", 495 | "ex2_criterion = nn.CrossEntropyLoss()\n", 496 | "ex2_optimizer = torch.optim.Adam(ex2_model.parameters())\n", 497 | "\n", 498 | "epochs = 5\n", 499 | "for epoch in range(epochs):\n", 500 | " print(f\"Epoch: {epoch+1} ...\")\n", 501 | " train(train_loader, ex2_model, ex2_criterion, ex2_optimizer)" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "test(test_loader, ex2_model, ex2_criterion)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "## Extra: View model summary\n", 518 | "\n", 519 | "One way to view more information about the model is to use an external package such as [Torchinfo](https://github.com/TylerYep/torchinfo). It is not installed in the standard Pytorch module in LUMI, so you need to install it using pip:" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "!pip3 install torchinfo" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "from torchinfo import summary" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "summary(model, input_size=(batch_size, 1, 28, 28), \n", 547 | " col_names=[\"input_size\", \"output_size\", \"kernel_size\", \"num_params\"])" 548 | ] 549 | } 550 | ], 551 | "metadata": { 552 | "kernelspec": { 553 | "display_name": "Python 3 (ipykernel)", 554 | "language": "python", 555 | "name": "python3" 556 | }, 557 | "language_info": { 558 | "codemirror_mode": { 559 | "name": "ipython", 560 | "version": 3 561 | }, 562 | "file_extension": ".py", 563 | "mimetype": "text/x-python", 564 | "name": "python", 565 | "nbconvert_exporter": "python", 566 | "pygments_lexer": "ipython3", 567 | "version": "3.10.12" 568 | } 569 | }, 570 | "nbformat": 4, 571 | "nbformat_minor": 4 572 | } 573 | -------------------------------------------------------------------------------- /day1/04b-pytorch-imdb-huggingface.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2edd322b-1e0a-4fc4-919b-e808ad472cfe", 6 | "metadata": {}, 7 | "source": [ 8 | "# IMDB movie review sentiment classification using Hugging Face models\n", 9 | "\n", 10 | "In this notebook, we'll test pre-trained sentiment analysis models and later finetune a DistilBERT model to perform IMDB movie review sentiment classification. This notebook is adapted from [Getting Started with Sentiment Analysis using Python](https://huggingface.co/blog/sentiment-analysis-python)." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "c9243fc9-da3f-470d-9e0f-9aaa9528efcd", 16 | "metadata": {}, 17 | "source": [ 18 | "Import the libraries" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "e1227c62-d120-4908-8d35-6bf0f236be50", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from transformers import pipeline\n", 29 | "import torch\n", 30 | "from datasets import load_dataset\n", 31 | "from transformers import AutoTokenizer\n", 32 | "from transformers import DataCollatorWithPadding\n", 33 | "from transformers import AutoModelForSequenceClassification\n", 34 | "import numpy as np\n", 35 | "import evaluate\n", 36 | "from huggingface_hub import notebook_login\n", 37 | "from transformers import TrainingArguments, Trainer\n", 38 | "from transformers import pipeline" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "35124c9a-1f07-416b-834d-f9c7508f682c", 44 | "metadata": {}, 45 | "source": [ 46 | "Check if PyTorch is using the GPU" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "b124df20-1f8a-4a5e-9975-4798bfdaf0f8", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "print('Using PyTorch version:', torch.__version__)\n", 57 | "if torch.cuda.is_available():\n", 58 | " print('Using GPU, device name:', torch.cuda.get_device_name(0))\n", 59 | " device = torch.device('cuda')\n", 60 | "else:\n", 61 | " print('No GPU found, using CPU instead.') \n", 62 | " device = torch.device('cpu')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "f5800ebf-82bd-4cdc-9067-66ee8480d528", 68 | "metadata": {}, 69 | "source": [ 70 | "## Use Pre-trained Sentiment Analysis Models" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "6c82b0f7-62d8-4e3f-9e99-ef3ebc6522bc", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "sentiment_pipeline = pipeline(\"sentiment-analysis\", device=device)\n", 81 | "data = [\"I love you\", \"I hate you\"]\n", 82 | "sentiment_pipeline(data)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "e97a94f5-1548-46f2-a2f9-1715113e90ab", 88 | "metadata": { 89 | "jp-MarkdownHeadingCollapsed": true 90 | }, 91 | "source": [ 92 | "- This code snippet above utilizes the **[pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines)** class to generate predictions using models from the Hub. It applies the [default sentiment analysis model](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) to evaluate the provided list of text data.\n", 93 | "- The analysis results are **POSITIVE** for first entry and **NEGATIVE** for the second entry." 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "c273d882-0aa9-4d86-a1fa-fd518e2c3ce0", 99 | "metadata": {}, 100 | "source": [ 101 | "One can also use a specific sentiment analysis model by providing the name of the model, e.g., if you want a sentiment analysis model for tweets, you can specify the model id." 
102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "c183b485-adc5-447d-b3b7-bb66e173c80a", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "specific_model = pipeline(model=\"finiteautomata/bertweet-base-sentiment-analysis\", device = device)\n", 112 | "specific_model(data)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "daf45bd1-3964-4d32-944b-edd0783163bb", 118 | "metadata": {}, 119 | "source": [ 120 | "## Fine-tuning DistilBERT model using IMDB dataset " 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "2ec252bb-b0b0-48dc-8e3a-b65b72f931c1", 126 | "metadata": {}, 127 | "source": [ 128 | "- The [IMDB](https://huggingface.co/datasets/stanfordnlp/imdb) dataset contains 50000 movies reviews from the Internet Movie Database, split into 25000 reviews for training and 25000 reviews for testing. Half of the reviews are positive and half are negative. \n", 129 | "\n", 130 | "- The IMDB dataset is relatively large, so let's use 5000 samples for training to speed up our process for this exercise." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "48bb7a7d-9194-4904-bc91-bd1adb191ea1", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "imdb = load_dataset(\"imdb\")\n", 141 | "small_train_dataset = imdb[\"train\"].shuffle(seed=0).select([i for i in list(range(5000))])\n", 142 | "test_dataset = imdb[\"test\"]" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "f40b569b-f984-4adc-9d21-2fd3ce72b9b4", 148 | "metadata": {}, 149 | "source": [ 150 | "Let's look at two samples from the IMDB dataset. One negative (label: `0`) and one positive (label: `1`) review." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "554c9661-89d7-45f1-a222-fa6de2468713", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "for i in (2, 12500): \n", 161 | " print(imdb[\"train\"][i])\n", 162 | " print()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "82fe9183-d554-482c-8f3a-a75096e10e14", 168 | "metadata": {}, 169 | "source": [ 170 | "To preprocess our data, we will use DistilBERT tokenizer:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "dbf1e5fc-3831-47d0-ab3f-01d6dd70482a", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "id": "88e45582-a0c3-4a80-8a06-d4ce232a1ead", 186 | "metadata": {}, 187 | "source": [ 188 | "- Next, we will prepare the text inputs for the model for both splits of our dataset (training and test) by using the map method:" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "9ab50bd2-e54b-4e31-a162-24923b763731", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "def preprocess_function(examples):\n", 199 | " return tokenizer(examples[\"text\"], truncation=True)\n", 200 | " \n", 201 | "tokenized_train = small_train_dataset.map(preprocess_function, batched=True)\n", 202 | "tokenized_test = test_dataset.map(preprocess_function, batched=True)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "a1b9b4c0-9aea-481b-bb26-5f9bd18228c1", 208 | "metadata": {}, 209 | "source": [ 210 | "- To speed up training, let's use a data_collator to convert your training samples to PyTorch 
tensors and concatenate them with the correct amount of padding:" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "f10d80c5-77c3-43a6-a7d9-47d97deef882", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "0455d055-7b18-47a2-b983-e7a35e46398f", 226 | "metadata": {}, 227 | "source": [ 228 | "### Training the model\n", 229 | "- We will be throwing away the pretraining head of the DistilBERT model and replacing it with a classification head fine-tuned for sentiment analysis. This enables us to transfer the knowledge from DistilBERT to our custom model." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "1644ded6-7d6a-43d2-b303-3d43eb316e4c", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "id": "b6577793-6f3d-48a3-b13e-95116be72685", 245 | "metadata": {}, 246 | "source": [ 247 | "- Then, let's define the metrics you will be using to evaluate how good is your fine-tuned model (accuracy and f1 score)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "81da7601-9ca0-45ee-94f8-d9d773fff695", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "def compute_metrics(eval_pred):\n", 258 | " eval_accuracy = evaluate.load(\"accuracy\")\n", 259 | " eval_f1 = evaluate.load(\"f1\")\n", 260 | " \n", 261 | " logits, labels = eval_pred\n", 262 | " predictions = np.argmax(logits, axis=-1)\n", 263 | " accuracy = eval_accuracy.compute(predictions=predictions, references=labels)[\"accuracy\"]\n", 264 | " f1 = eval_f1.compute(predictions=predictions, references=labels)[\"f1\"]\n", 265 | " return {\"accuracy\": accuracy, \"f1\": f1}" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "id": "ad53c65b-3463-4707-a8ad-ebd2de387133", 271 | "metadata": {}, 272 | "source": [ 273 | "- Define the training arguments" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "96b56df8-a9ac-41ec-9ad7-86c0e0cec2f7", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "repo_name = \"finetuning-sentiment-model-5000-samples\"\n", 284 | " \n", 285 | "training_args = TrainingArguments(\n", 286 | " output_dir=repo_name,\n", 287 | " learning_rate=2e-5,\n", 288 | " per_device_train_batch_size=16,\n", 289 | " per_device_eval_batch_size=16,\n", 290 | " num_train_epochs=2,\n", 291 | " weight_decay=0.01,\n", 292 | " save_strategy=\"epoch\",\n", 293 | " push_to_hub=False,\n", 294 | " report_to=\"none\"\n", 295 | ")\n", 296 | " \n", 297 | "trainer = Trainer(\n", 298 | " model=model,\n", 299 | " args=training_args,\n", 300 | " train_dataset=tokenized_train,\n", 301 | " eval_dataset=tokenized_test,\n", 302 | " tokenizer=tokenizer,\n", 303 | " data_collator=data_collator,\n", 304 | " compute_metrics=compute_metrics,\n", 305 | ")" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "id": "eaad4946-5446-4cd1-ae2f-8502d2e3037f", 311 | "metadata": {}, 312 | "source": [ 313 | "- Start training" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "id": "e53f771c-d1fb-4a5b-a426-ae5bd53964df", 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | 
"trainer.train()" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "id": "ec3eee23-b26c-4687-8d27-7d075d76d3a9", 329 | "metadata": {}, 330 | "source": [ 331 | "- Evaluate the model" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "id": "2a951518-0ee7-4e73-b47b-f679b7e6e628", 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "trainer.evaluate()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "id": "61722a1b-3347-469c-b0d9-89398e4601ca", 347 | "metadata": {}, 348 | "source": [ 349 | "- Model inference" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "id": "a3db7322-4f03-4fd2-996b-948e3c271da4", 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "pipe = pipeline(\"sentiment-analysis\", model=model, tokenizer=tokenizer, device=device)\n", 360 | "pipe([\"I love this move\", \"This movie sucks!\"])" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "4d40ac81-bf5b-4ab7-b713-61fd22151020", 366 | "metadata": {}, 367 | "source": [ 368 | "## Task 1 Run this script with GPU" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "id": "4f8eb61e-a673-4b22-8c6d-7262946964f6", 374 | "metadata": {}, 375 | "source": [ 376 | "## Task 2 Compare the test dataset accuracy achieved from finetuned DistilBERT model and the previous RNN model. What do you notice?" 377 | ] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "Python 3 (ipykernel)", 383 | "language": "python", 384 | "name": "python3" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.10.12" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 5 401 | } 402 | -------------------------------------------------------------------------------- /day1/README.md: -------------------------------------------------------------------------------- 1 | # Day 1 2 | 3 | ## Exercise sessions 4 | 5 | ### Exercise 1 6 | 7 | Introduction to Notebooks, PyTorch fundamentals. 8 | 9 | * *01-pytorch-test-setup.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/01-pytorch-test-setup.ipynb) 10 | 11 | ### Exercise 2 12 | 13 | MNIST classification with MLPs. 14 | 15 | * *02-pytorch-mnist-mlp.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/02-pytorch-mnist-mlp.ipynb) 16 | 17 | ### Exercise 3 18 | 19 | Image classification with CNNs. 20 | 21 | * *03-pytorch-mnist-cnn.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/03-pytorch-mnist-cnn.ipynb) 22 | 23 | ### Exercise 4 24 | 25 | Text sentiment classification with RNNs and using a pre-trained DistilBERT from Hugging Face. 26 | 27 | * *04a-pytorch-imdb-rnn.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/04a-pytorch-imdb-rnn.ipynb) 28 | * *04b-pytorch-imdb-huggingface.ipynb*
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/csc-training/intro-to-dl/blob/master/day1/04b-pytorch-imdb-huggingface.ipynb) 29 | 30 | ## Setup 31 | 32 | We will use Jupyter Notebooks for all exercises on Day 1. There are several ways to set up a Jupyter environment for running the exercises: 33 | 34 | 35 | ### 1. LUMI web user interface 36 | 37 | *The default option.* 38 | 39 | 1. Go to the [LUMI web user interface](https://www.lumi.csc.fi/). 40 | 2. Login with Haka (Finnish university or research institute) or a CSC account (anyone with a valid CSC account) 41 | 3. Click "Jupyter for courses" (this works only if you have been added to the course project) 42 | 4. Make sure the selections are correct: 43 | - Reservation: PDL_CPU (during course day 1), No reservation (otherwise) 44 | - Project: project_462000863 45 | - Course module: Practical_Deep_Learning 46 | - if you do not see the course module listed, try "Restart Web Server" from the top-right "question-mark-inside-a-circle" menu item 47 | - Working directory: /users/your-username-here 48 | 5. Click "Launch" 49 | 6. Once the application has started, click "Connect to Jupyter" 50 | 7. If you are not familiar with Jupyter, take a moment to get to know the interface 51 | - open a new notebook (*File* -> *New* -> *Notebook*, on the menubar) 52 | - select *"Python 3"* as the kernel for the notebook 53 | - write some Python code into a Jupyter *cell* 54 | - execute the cell with *shift-enter* 55 | 56 | ### 2. CSC Noppe 57 | 58 | CSC's Noppe (https://noppe.csc.fi) provides easy-to-use environments for working with data and programming. You access everything via your web browser, and the CSC cloud environment does the computing in the background. There should be enough resources for everyone to launch a notebook instance, but unfortunately no GPUs. 59 | 60 | 1. Go to the [Noppe](https://noppe.csc.fi) frontpage 61 | 2. Login according to your selected login method: 62 | - **Haka or Virtu** (users from Finnish universities and research institutes) 63 | 1. Press the Login button on the frontpage 64 | 2. Press the Haka or Virtu button 65 | 3. Select the right organization 66 | 4. Enter your login information 67 | - **Special login** (if you have been given a separate username and password for the course) 68 | 1. Press the "Special Login" button on the Notebooks frontpage (below the Login button) 69 | 2. Enter your login information (the username goes into the email slot) 70 | 3. Start the "Practical Deep Learning" application 71 | - You might find it quicker if you select the "Machine Learning" tab 72 | - Click the round start button next to the "Practical Deep Learning" card 73 | - Wait for the session to launch 74 | 4. Once the Jupyter Notebook dashboard appears, navigate to `intro-to-dl/day1` 75 | 5. If you are not familiar with Jupyter, take a moment to get to know the interface 76 | - open a new notebook (*File* -> *New* -> *Notebook*, on the menubar) 77 | - select *"Python 3"* as the kernel for the notebook 78 | - write some Python code into a Jupyter *cell* 79 | - execute the cell with *shift-enter* 80 | 81 | #### :warning: Note 82 | The notebook sessions have a limited time (4 h), after which they, and any data or changes, will be *destroyed*. If you wish to save any files, you need to download them. 83 | 84 | ### 3. Running Jupyter on your laptop 85 | 86 | If you have a laptop that has both Jupyter and the other necessary Python packages installed, it is possible to use it. 
This works especially well if the laptop has an NVIDIA or AMD GPU that has been properly set up (CUDA, cuDNN or ROCm). 87 | 88 | * `git clone https://github.com/csc-training/intro-to-dl.git` 89 | * try to run the `day1/01-pytorch-test-setup.ipynb` notebook without errors 90 | 91 | ### 4. Google Colaboratory 92 | 93 | Google has a free Jupyter Notebooks service you may want to try out. No guarantees, but it does have GPUs available! A Google account is needed to use Colaboratory. 94 | 95 | * Click the corresponding Colab link [above in this document](#exercise-sessions) 96 | * If needed, sign in to your Google account using the "Sign in" button in the top-right corner 97 | * To use a GPU, select: Runtime => Change runtime type => Hardware accelerator: GPU 98 | * Some exercises require the `datasets` library, which isn't pre-installed on Colab. You can just run this in a cell: 99 | 100 | ``` 101 | !pip install datasets==2.21.0 102 | ``` 103 | -------------------------------------------------------------------------------- /day1/solutions/pytorch-imdb-rnn-example-answer.py: -------------------------------------------------------------------------------- 1 | embedding_dims = 50 2 | lstm_units = 32 3 | 4 | class TwoLayeredRNN(nn.Module): 5 | def __init__(self): 6 | super().__init__() 7 | self.emb = nn.Embedding(nb_words, embedding_dims) 8 | self.dropout = nn.Dropout(0.2) 9 | self.lstm = nn.LSTM(embedding_dims, lstm_units, num_layers=2, 10 | batch_first=True) 11 | self.linear = nn.Linear(lstm_units, 1) 12 | 13 | # With bidirectional 14 | #self.lstm = nn.LSTM(embedding_dims, lstm_units, num_layers=2, 15 | # batch_first=True, bidirectional=True) 16 | #self.linear = nn.Linear(lstm_units*2, 1) 17 | 18 | self.sigmoid = nn.Sigmoid() 19 | 20 | def forward(self, x): 21 | x = self.emb(x) 22 | x = self.dropout(x) 23 | x, (hn, cn) = self.lstm(x) 24 | x = self.linear(x[:, -1, :]) 25 | return self.sigmoid(x.view(-1)) 26 | -------------------------------------------------------------------------------- /day1/solutions/pytorch-mnist-cnn-example-answer.py: -------------------------------------------------------------------------------- 1 | class ComplexCNN(nn.Module): 2 | def __init__(self): 3 | super().__init__() 4 | self.layers = nn.Sequential( 5 | nn.Conv2d(1, 32, kernel_size=3, padding='valid'), 6 | nn.ReLU(), 7 | nn.Conv2d(32, 32, kernel_size=3, padding='valid'), 8 | nn.ReLU(), 9 | nn.MaxPool2d(kernel_size=2), 10 | nn.Dropout(0.25), 11 | nn.Flatten(), 12 | nn.Linear(12*12*32, 128), 13 | nn.ReLU(), 14 | nn.Dropout(0.5), 15 | nn.Linear(128, 10) 16 | ) 17 | 18 | def forward(self, x): 19 | return self.layers(x) -------------------------------------------------------------------------------- /day1/solutions/pytorch-mnist-mlp-example-answer.py: -------------------------------------------------------------------------------- 1 | class TwoLayerMLP(nn.Module): 2 | def __init__(self): 3 | super().__init__() 4 | self.layers = nn.Sequential( 5 | nn.Flatten(), 6 | nn.Linear(28*28, 50), 7 | nn.ReLU(), 8 | nn.Dropout(0.2), 9 | nn.Linear(50, 50), 10 | nn.ReLU(), 11 | nn.Dropout(0.2), 12 | nn.Linear(50, 10) 13 | ) 14 | 15 | def forward(self, x): 16 | return self.layers(x) 17 | -------------------------------------------------------------------------------- /day2/Exercise_5.md: -------------------------------------------------------------------------------- 1 | # Exercise 5 2 | 3 | In this exercise, we study image classification with two datasets: 4 | 5 | - [_Dogs vs. 
cats_](imgs/dvc.png) (dvc), where we train on 2000 images, each 6 | depicting either a cat or a dog, 7 | - [_German traffic signs_](imgs/gtsrb-montage.png) (gtsrb), where we train on 8 | 5535 images with [43 types of traffic signs](imgs/traffic-signs.png). 9 | 10 | ## Task 1 11 | 12 | ### Dogs vs. cats 13 | 14 | Starting with the _Dogs vs. cats_ (dvc) database, train, evaluate and report the 15 | accuracy with two different approaches: 16 | 17 | - CNN trained from scratch: [pytorch_dvc_cnn_simple.py](pytorch_dvc_cnn_simple.py) 18 | - Using a pre-trained CNN (VGG16) and fine-tuning: 19 | [pytorch_dvc_cnn_pretrained.py](pytorch_dvc_cnn_pretrained.py) 20 | 21 | You can run the training directly with the corresponding script listed above, 22 | for example: 23 | 24 | sbatch run.sh pytorch_dvc_cnn_simple.py 25 | 26 | As a reminder, you can check the status of your runs with the command: 27 | 28 | squeue --me 29 | 30 | The output of the run will appear in a file named `slurm-RUN_ID.out` 31 | where `RUN_ID` is the Slurm batch job id. You can check the last ten 32 | lines of that file with the command: 33 | 34 | tail slurm-RUN_ID.out 35 | 36 | Use `tail -f` if you want to continuously follow the progress of the 37 | output. (Press Ctrl-C when you want to stop following the file.) 38 | 39 | After training, the script runs an evaluation on the test set. You 40 | should find its results towards the end of the output log, on a line 41 | starting with "Testing"; it contains the accuracy 42 | (percentage of correctly classified images). 43 | 44 | Check the outputs of each run. Note that the pre-trained model will 45 | print out two results: once after the initial training, and again after 46 | fine-tuning. Which model gave the best test set result? Does 47 | fine-tuning improve the result? 48 | 49 | 50 | ### German traffic signs 51 | 52 | Repeat the experiment with the _German traffic signs_ (gtsrb) database. Which 53 | model gives the best result in this case? Compare the results with the previous 54 | dvc results. 55 | 56 | The scripts are named in the same way as before, just replace "dvc" with 57 | "gtsrb": 58 | 59 | - CNN trained from scratch: [pytorch_gtsrb_cnn_simple.py](pytorch_gtsrb_cnn_simple.py) 60 | - Using a pre-trained CNN (VGG16) and fine-tuning: 61 | [pytorch_gtsrb_cnn_pretrained.py](pytorch_gtsrb_cnn_pretrained.py) 62 | 63 | 64 | ## Task 2 65 | 66 | Pick one database (dvc or gtsrb) and try to improve the result, e.g., by 67 | tweaking the model or the training parameters (optimizer, batch size, number of 68 | epochs, etc.). 69 | 70 | ## Extracurricular 1 71 | 72 | There are scripts for both _Dogs vs. cats_ and _German traffic signs_ using 73 | Vision Transformers (ViTs). Compare these with the previous approaches. 74 | 75 | - [pytorch_dvc_vit.py](pytorch_dvc_vit.py): _Dogs vs. cats_ with a pre-trained ViT 76 | - [pytorch_gtsrb_vit.py](pytorch_gtsrb_vit.py): _German traffic signs_ with a pre-trained ViT 77 | 78 | ## Extracurricular 2 79 | 80 | There is another small dataset, [Aliens and predators](imgs/avp.png) 81 | (avp), with 694 training and 200 validation images in the directory 82 | `/scratch/project_462000863/data/avp` on LUMI. Modify the scripts for 83 | _Dogs vs. cats_ to classify between them. 
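For orientation, the modification is mostly a matter of pointing the data loading at the new directory while keeping a two-class setup. Below is a minimal sketch of the idea, assuming an `ImageFolder`-style layout with a `train` subdirectory; the layout and names are assumptions for illustration, not the scripts' actual code:

```python
import os
from torchvision import datasets, transforms

# Hypothetical sketch: the avp path comes from the exercise text above,
# but the train/validation subdirectory names are assumed here.
datadir = '/scratch/project_462000863/data/avp'
train_dir = os.path.join(datadir, 'train')

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # a typical input size for ImageNet-pretrained CNNs
    transforms.ToTensor(),
])
train_dataset = datasets.ImageFolder(train_dir, transform=transform)
print(train_dataset.classes)  # should print the two class names
```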
84 | 85 | -------------------------------------------------------------------------------- /day2/Exercise_6.md: -------------------------------------------------------------------------------- 1 | # Exercise 6 2 | 3 | In this exercise, we study text categorization using the [_20 4 | newsgroups_](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) 5 | (20ng) dataset. The dataset contains 20,000 text documents (Usenet messages) 6 | in 20 categories (newsgroups or topics). For the embeddings in the RNN and CNN models we use pre-trained 100-dimensional [GloVe](https://nlp.stanford.edu/projects/glove/) vectors. 7 | 8 | ## Task 1 9 | 10 | Try three different approaches for text classification with the _20 newsgroups_ 11 | (20ng) dataset: 12 | 13 | - Recurrent neural network (RNN): [pytorch_20ng_rnn.py](pytorch_20ng_rnn.py) 14 | - BERT fine-tuning: [pytorch_20ng_bert.py](pytorch_20ng_bert.py) 15 | - Convolutional neural network (CNN): [pytorch_20ng_cnn.py](pytorch_20ng_cnn.py) 16 | 17 | Run all three models and compare their accuracies and run times. 18 | 19 | ## Task 2 20 | 21 | Pick one model (RNN, CNN or BERT) and try to improve the results, e.g., by 22 | tweaking the model or the training parameters (optimizer, batch size, number of 23 | epochs, etc.). 24 | 25 | You can also work on replacing BERT with another Transformers model (for example 26 | [DistilBert](https://huggingface.co/docs/transformers/master/en/model_doc/distilbert)). 27 | See also the [HuggingFace Transformers documentation](https://huggingface.co/transformers/). 28 | 29 | -------------------------------------------------------------------------------- /day2/Exercise_7.md: -------------------------------------------------------------------------------- 1 | # Exercise 7 2 | 3 | In this exercise, we take a pre-trained GPT-3-like model from the 4 | Hugging Face repository and fine-tune it with movie reviews from the 5 | IMDB dataset: http://ai.stanford.edu/~amaas/data/sentiment/ 6 | 7 | ## Task 1 8 | 9 | Run the fine-tuning of the GPT model with the script 10 | [pytorch_imdb_gpt.py](pytorch_imdb_gpt.py): 11 | 12 | ```bash 13 | sbatch run.sh pytorch_imdb_gpt.py 14 | ``` 15 | 16 | You can tweak some of the parameters in the script. For example, 17 | `max_steps` in `TrainingArguments` sets how many batches the model will 18 | train on. By default it is set to `max_steps=5000`, which runs for about 19 | 15 minutes on LUMI. Here are Hugging Face's notes on the many things 20 | that can be tried for improving training: 21 | 22 | 23 | At the end of the run it prints the perplexity on the test set. This 24 | is a measure of how well our trained model predicts the test set 25 | samples. The lower the value, the better. 26 | 27 | Also make a note of where the model is stored; it should be in a 28 | directory like 29 | `/scratch/project_462000863/data/users/$USER/gpt-imdb-model/`, where 30 | `$USER` is replaced with your username on LUMI. Take a look into that 31 | directory: 32 | 33 | ``` 34 | ls -ltr /scratch/project_462000863/data/users/$USER/gpt-imdb-model/ 35 | ``` 36 | 37 | This should list all the files and subdirectories, with the most 38 | recent ones at the bottom. Depending on your training configuration it 39 | should have stored several checkpoints; the latest one is usually the 40 | best one. 41 | 42 | ## Task 2 43 | 44 | You can try generating some movie reviews interactively with the 45 | notebook [pytorch_generate_gpt.ipynb](pytorch_generate_gpt.ipynb). 
You 46 | should be able to open the notebook as normal via "Jupyter for 47 | courses". GPUs are not needed for generating text. 48 | 49 | You need to point the `path_to_model` variable to a checkpoint of the 50 | model you trained in Task 1, for example something like 51 | `/scratch/project_462000863/data/users/$USER/gpt-imdb-model/checkpoint-5000` 52 | (here you need to replace `$USER` with your actual username). 53 | 54 | Experiment with different sampling strategies. At the end of the 55 | notebook there is also code to try the original distilgpt2 model. Does 56 | our fine-tuned model produce any better results? 57 | 58 | You can also try a model that we prepared earlier, which was trained for 59 | a full hour: 60 | 61 | ``` 62 | path_to_model = "/scratch/project_462000863/data/users/mvsjober/gpt-imdb-model/checkpoint-65000/" 63 | ``` 64 | -------------------------------------------------------------------------------- /day2/Exercise_8.md: -------------------------------------------------------------------------------- 1 | # Exercise 8 2 | 3 | In this exercise, we try using multiple GPUs. 4 | 5 | We have prepared a few examples where some of the earlier exercises 6 | have been converted to use DistributedDataParallel (DDP). 7 | 8 | - `pytorch_dvc_cnn_pretrained_multigpu.py`, which implements PyTorch 9 | DDP on the pre-trained CNN for cats-vs-dogs. You can try this with 10 | the `run-2gpus.sh` script. 11 | 12 | - `pytorch_imdb_gpt_multigpu.py`, which implements PyTorch DDP with the 13 | Hugging Face trainer. Use `run-2gpus.sh`. 14 | 15 | Run these scripts, and also try with 8 GPUs using `run-8gpus.sh`. 16 | 17 | - Can you see any speed improvement between using 1, 2 or 8 GPUs? 18 | - Do you get the same accuracy? 19 | - Consider per-GPU batch size vs effective batch size. (Hint: with DDP you can check the number of GPUs with `dist.get_world_size()`.) 20 | 21 | You can check if your runs are actually using multiple GPUs with the 22 | `rocm-smi` command. Check the `JOBID` of your running job with `squeue 23 | --me`, then run (replacing JOBID with the real number): 24 | 25 | srun --overlap --pty --jobid=JOBID bash 26 | 27 | This opens a new shell session on the same machine as your job. Here 28 | you can check your processes with `top` or the state of the GPUs with 29 | `rocm-smi`. A useful command to follow GPU usage is: 30 | 31 | watch rocm-smi 32 | 33 | It will update every 2 seconds. It should show values above 0% in the 34 | GPU% column for all the GPUs you intend to use. Press Ctrl-C to exit 35 | this view. 36 | -------------------------------------------------------------------------------- /day2/README.md: -------------------------------------------------------------------------------- 1 | # Day 2 2 | 3 | ## Exercise sessions 4 | 5 | * [Exercise 5: Image classification](Exercise_5.md) 6 | * [Exercise 6: Text categorization](Exercise_6.md) 7 | * [Exercise 7: Text generation](Exercise_7.md) 8 | * [Exercise 8: Using multiple GPUs](Exercise_8.md) 9 | 10 | ## Setup 11 | 12 | 1. Login to LUMI using either: 13 | - the web user interface at https://www.lumi.csc.fi/ ("Go to login") and start "Login node shell", or 14 | - login with your username and SSH key to `lumi.csc.fi`; for more instructions, see the LUMI documentation 15 | 16 | 2. 
In the login node shell, or SSH session, set up the module environment for using PyTorch: 17 | 18 | ```bash 19 | module purge 20 | module use /appl/local/csc/modulefiles/ 21 | module load pytorch 22 | ``` 23 | (In the LUMI web UI login node shell you can use Shift-Insert to paste if you copy commands from here.) 24 | 25 | 3. Go to the exercise directory: 26 | - if you ran the exercises of day 1 using LUMI's "Jupyter for courses", you should already have the repository cloned in your home directory 27 | 28 | ```bash 29 | cd PDL-2025-04/intro-to-dl/day2 30 | ``` 31 | 32 | If you don't have it, you can also clone it yourself: 33 | 34 | ```bash 35 | mkdir PDL-2025-04 36 | cd PDL-2025-04 37 | git clone https://github.com/csc-training/intro-to-dl 38 | cd intro-to-dl/day2 39 | ``` 40 | 41 | ## Edit and submit jobs 42 | 43 | 1. Edit the Python script, either by: 44 | - navigating to the file in the LUMI web UI file browser (Files → Home Directory → PDL-2025-04 → intro-to-dl → day2) and selecting "Edit" on that file (under the three-dots "⋮" menu), or 45 | - opening it with your favorite text editor in the terminal, for example: 46 | ```bash 47 | nano pytorch_test.py 48 | ``` 49 | 50 | 2. Submit the job: 51 | 52 | ```bash 53 | sbatch run.sh pytorch_test.py 54 | ``` 55 | 56 | 3. See the status of your jobs or the queue you are using: 57 | 58 | ```bash 59 | squeue --me 60 | squeue -p small-g 61 | ``` 62 | 63 | 4. After the job has finished, examine the results: 64 | 65 | ```bash 66 | less slurm-xxxxxxxx.out 67 | ``` 68 | 69 | 5. Go back to step 1 until you are happy with the results. 70 | 71 | ## Optional: TensorBoard 72 | 73 | You can use TensorBoard either via the LUMI web user interface (recommended) or via the terminal using SSH port forwarding. Both approaches are explained below. 74 | 75 | ### Via the LUMI web interface (the recommended method) 76 | 77 | 1. Log in via https://www.lumi.csc.fi/ 78 | 2. Select the menu item: Apps → TensorBoard 79 | 3. In the form: 80 | - Select course project: project_462000863 81 | - Specify the "TensorBoard log directory"; it's where you have cloned the course repository plus "day2/logs", for example: 82 | `~/PDL-2025-04/intro-to-dl/day2/logs`. You can run `pwd` in the terminal to find out the full path where you are working. 83 | - Leave the rest at the default settings 84 | 4. Click "Launch" 85 | 5. Wait until you see the "Connect to Tensorboard" button, then click it. 86 | 6. When you're done using TensorBoard, please go to "My Interactive Sessions" in the LUMI web user interface and "Cancel" the session. (It will automatically terminate once the reserved time is up, but it's always better to release the resource as soon as possible so that others can use it.) 87 | 88 | ### Via SSH port forwarding 89 | 90 | 1. Login again from a terminal window to LUMI with SSH port forwarding: 91 | 92 | ```bash 93 | ssh -L PORT:localhost:PORT lumi.csc.fi 94 | ``` 95 | 96 | Replace `PORT` with a freely selectable port number (>1023). By default, TensorBoard uses port 6006, but **select a different port** to avoid overlaps. 97 | 98 | 2. Set up the module environment and start the TensorBoard server: 99 | 100 | ```bash 101 | module purge 102 | module use /appl/local/csc/modulefiles/ 103 | module load tensorflow 104 | singularity_wrapper exec tensorboard --logdir=PDL-2025-04/intro-to-dl/day2/logs --port=PORT --bind_all 105 | ``` 106 | 107 | 3. To access TensorBoard, point your web browser to *localhost:PORT*.
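Either way, TensorBoard only displays what the training scripts have already written under `day2/logs`: the day 2 scripts create a timestamped log directory with `tensorboardX` and write their metrics into it. A minimal sketch of that pattern, for reference (the tag and values below are made-up examples):

```python
import os
from datetime import datetime
import tensorboardX

# Each run gets its own timestamped directory under day2/logs
time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logdir = os.path.join(os.getcwd(), "logs", "example-" + time_str)
os.makedirs(logdir)

log = tensorboardX.SummaryWriter(logdir)
log.add_scalar("train_accuracy", 0.85, 3)  # value 0.85 logged for epoch 3
log.close()
```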
108 | -------------------------------------------------------------------------------- /day2/imgs/avp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/intro-to-dl/3ce19c7b5d9860ae64f15698b2b362397287e075/day2/imgs/avp.png -------------------------------------------------------------------------------- /day2/imgs/dvc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/intro-to-dl/3ce19c7b5d9860ae64f15698b2b362397287e075/day2/imgs/dvc.png -------------------------------------------------------------------------------- /day2/imgs/gtsrb-montage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/intro-to-dl/3ce19c7b5d9860ae64f15698b2b362397287e075/day2/imgs/gtsrb-montage.png -------------------------------------------------------------------------------- /day2/imgs/traffic-signs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/csc-training/intro-to-dl/3ce19c7b5d9860ae64f15698b2b362397287e075/day2/imgs/traffic-signs.png -------------------------------------------------------------------------------- /day2/logs/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /day2/pytorch_20ng_bert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # 20 newsgroup text classification with BERT finetuning 5 | # 6 | # In this script, we'll use a pre-trained BERT 7 | # (https://arxiv.org/abs/1810.04805) model for text classification 8 | # using PyTorch and HuggingFace's Transformers 9 | # (https://github.com/huggingface/transformers). 
10 | 11 | import torch 12 | from torch.utils.data import (TensorDataset, DataLoader, 13 | RandomSampler, SequentialSampler) 14 | from transformers import BertTokenizer 15 | from transformers import BertForSequenceClassification 16 | from transformers import AdamW, get_linear_schedule_with_warmup 17 | 18 | from packaging.version import Version as LV 19 | 20 | from sklearn.model_selection import train_test_split 21 | 22 | from datetime import datetime 23 | 24 | import os 25 | import sys 26 | 27 | import numpy as np 28 | 29 | torch.manual_seed(42) 30 | 31 | if torch.cuda.is_available(): 32 | device = torch.device('cuda') 33 | else: 34 | device = torch.device('cpu') 35 | 36 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 37 | assert LV(torch.__version__) >= LV("1.0.0") 38 | 39 | 40 | def correct(output, target): 41 | predicted = output.argmax(1) # pick class with largest network output 42 | correct_ones = (predicted == target).type(torch.float) 43 | return correct_ones.sum().item() # count number of correct ones 44 | 45 | 46 | def train(data_loader, model, scheduler, optimizer): 47 | model.train() 48 | 49 | num_batches = 0 50 | num_items = 0 51 | 52 | total_loss = 0 53 | total_correct = 0 54 | for input_ids, input_mask, labels in data_loader: 55 | # Copy data and targets to GPU 56 | input_ids = input_ids.to(device) 57 | input_mask = input_mask.to(device) 58 | labels = labels.to(device) 59 | 60 | # Do a forward pass 61 | output = model(input_ids, token_type_ids=None, 62 | attention_mask=input_mask, labels=labels) 63 | 64 | loss = output[0] 65 | logits = output[1] 66 | 67 | total_loss += loss 68 | num_batches += 1 69 | 70 | # Count number of correct 71 | total_correct += correct(logits, labels) 72 | num_items += len(labels) 73 | 74 | # Backpropagation 75 | loss.backward() 76 | optimizer.step() 77 | optimizer.zero_grad() 78 | scheduler.step() 79 | 80 | return { 81 | 'loss': total_loss/num_batches, 82 | 'accuracy': total_correct/num_items 83 | } 84 | 85 | 86 | def test(test_loader, model): 87 | model.eval() 88 | 89 | num_batches = len(test_loader) 90 | num_items = len(test_loader.dataset) 91 | 92 | test_loss = 0 93 | total_correct = 0 94 | 95 | with torch.no_grad(): 96 | for input_ids, input_mask, labels in test_loader: 97 | # Copy data and targets to GPU 98 | input_ids = input_ids.to(device) 99 | input_mask = input_mask.to(device) 100 | labels = labels.to(device) 101 | 102 | # Do a forward pass 103 | output = model(input_ids, token_type_ids=None, 104 | attention_mask=input_mask) 105 | 106 | logits = output[0] 107 | 108 | # Count number of correct digits 109 | total_correct += correct(logits, labels) 110 | 111 | return { 112 | 'loss': test_loss/num_batches, 113 | 'accuracy': total_correct/num_items 114 | } 115 | 116 | 117 | def log_measures(ret, log, prefix, epoch): 118 | if log is not None: 119 | for key, value in ret.items(): 120 | log.add_scalar(prefix + "_" + key, value, epoch) 121 | 122 | 123 | def main(): 124 | try: 125 | import tensorboardX 126 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 127 | logdir = os.path.join(os.getcwd(), "logs", "20ng-bert-" + time_str) 128 | print('TensorBoard log directory:', logdir) 129 | os.makedirs(logdir) 130 | log = tensorboardX.SummaryWriter(logdir) 131 | except (ImportError, FileExistsError): 132 | log = None 133 | 134 | datapath = os.getenv('DATADIR') 135 | if datapath is None: 136 | print("Please set DATADIR environment variable!") 137 | sys.exit(1) 138 | 139 | # 20 Newsgroups data set 140 | text_data_dir = 
os.path.join(datapath, "20_newsgroup") 141 | 142 | print('Processing text dataset') 143 | 144 | texts = []  # list of text samples 145 | labels_index = {}  # dictionary mapping label name to numeric id 146 | labels = []  # list of label ids 147 | for name in sorted(os.listdir(text_data_dir)): 148 | path = os.path.join(text_data_dir, name) 149 | if os.path.isdir(path): 150 | label_id = len(labels_index) 151 | labels_index[name] = label_id 152 | print('-', name, label_id) 153 | for fname in sorted(os.listdir(path)): 154 | if fname.isdigit(): 155 | fpath = os.path.join(path, fname) 156 | args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} 157 | with open(fpath, **args) as f: 158 | t = f.read() 159 | i = t.find('\n\n')  # skip header 160 | if 0 < i: 161 | t = t[i:] 162 | texts.append(t) 163 | labels.append(label_id) 164 | 165 | print('Found %s texts.' % len(texts)) 166 | 167 | # Split the data into a training set and a test set using 168 | # scikit-learn's train_test_split(). 169 | 170 | TEST_SET = 4000 171 | 172 | (sentences_train, sentences_test, 173 | labels_train, labels_test) = train_test_split(texts, labels, 174 | test_size=TEST_SET, 175 | shuffle=True, 176 | random_state=42) 177 | 178 | print('Length of training texts:', len(sentences_train)) 179 | print('Length of training labels:', len(labels_train)) 180 | print('Length of test texts:', len(sentences_test)) 181 | print('Length of test labels:', len(labels_test)) 182 | 183 | # The token [CLS] is a special token required by BERT at the beginning 184 | # of the sentence. 185 | 186 | sentences_train = ["[CLS] " + s for s in sentences_train] 187 | sentences_test = ["[CLS] " + s for s in sentences_test] 188 | 189 | print("The first training sentence:") 190 | print(sentences_train[0], 'LABEL:', labels_train[0]) 191 | 192 | # Next we specify the pre-trained BERT model we are going to use. The 193 | # model `"bert-base-uncased"` is the lowercased "base" model 194 | # (12-layer, 768-hidden, 12-heads, 110M parameters). 195 | # 196 | # We load the used vocabulary from the BERT model, and use the BERT 197 | # tokenizer to convert the sentences into tokens that match the data 198 | # the BERT model was trained on. 199 | 200 | print('Initializing BertTokenizer') 201 | 202 | BERTMODEL = 'bert-base-uncased' 203 | 204 | tokenizer = BertTokenizer.from_pretrained(BERTMODEL, do_lower_case=True) 205 | 206 | tokenized_train = [tokenizer.tokenize(s) for s in sentences_train] 207 | tokenized_test = [tokenizer.tokenize(s) for s in sentences_test] 208 | 209 | print("The full tokenized first training sentence:") 210 | print(tokenized_train[0]) 211 | 212 | # Now we set the maximum sequence lengths for our training and test 213 | # sentences as `MAX_LEN_TRAIN` and `MAX_LEN_TEST`. The maximum length 214 | # supported by the used BERT model is 512. 215 | # 216 | # The token `[SEP]` is another special token required by BERT at the 217 | # end of the sentence. 218 | 219 | MAX_LEN_TRAIN, MAX_LEN_TEST = 128, 512 220 | 221 | tokenized_train = [t[:(MAX_LEN_TRAIN-1)]+['[SEP]'] for t in tokenized_train] 222 | tokenized_test = [t[:(MAX_LEN_TEST-1)]+['[SEP]'] for t in tokenized_test] 223 | 224 | print("The truncated tokenized first training sentence:") 225 | print(tokenized_train[0]) 226 | 227 | # Next we use the BERT tokenizer to convert each token into an integer 228 | # index in the BERT vocabulary. We also pad any shorter sequences to 229 | # `MAX_LEN_TRAIN` or `MAX_LEN_TEST` indices with trailing zeros. 
230 | 231 | ids_train = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_train] 232 | ids_train = np.array([np.pad(i, (0, MAX_LEN_TRAIN-len(i)), 233 | mode='constant') for i in ids_train]) 234 | 235 | ids_test = [tokenizer.convert_tokens_to_ids(t) for t in tokenized_test] 236 | ids_test = np.array([np.pad(i, (0, MAX_LEN_TEST-len(i)), 237 | mode='constant') for i in ids_test]) 238 | 239 | print("The indices of the first training sentence:") 240 | print(ids_train[0]) 241 | 242 | # BERT also requires *attention masks*, with 1 for each real token in 243 | # the sequences and 0 for the padding: 244 | 245 | amasks_train, amasks_test = [], [] 246 | 247 | for seq in ids_train: 248 | seq_mask = [float(i > 0) for i in seq] 249 | amasks_train.append(seq_mask) 250 | 251 | for seq in ids_test: 252 | seq_mask = [float(i > 0) for i in seq] 253 | amasks_test.append(seq_mask) 254 | 255 | # We use again scikit-learn's train_test_split to use 10% of our 256 | # training data as a validation set, and then convert all data into 257 | # torch.tensors. 258 | 259 | (train_inputs, validation_inputs, 260 | train_labels, validation_labels) = train_test_split(ids_train, 261 | labels_train, 262 | random_state=42, 263 | test_size=0.1) 264 | (train_masks, validation_masks, 265 | _, _) = train_test_split(amasks_train, ids_train, 266 | random_state=42, test_size=0.1) 267 | 268 | train_inputs = torch.tensor(train_inputs) 269 | train_labels = torch.tensor(train_labels) 270 | train_masks = torch.tensor(train_masks) 271 | 272 | validation_inputs = torch.tensor(validation_inputs) 273 | validation_labels = torch.tensor(validation_labels) 274 | validation_masks = torch.tensor(validation_masks) 275 | 276 | test_inputs = torch.tensor(ids_test) 277 | test_labels = torch.tensor(labels_test) 278 | test_masks = torch.tensor(amasks_test) 279 | 280 | # Next we create PyTorch DataLoaders for all data sets. 281 | # 282 | # For fine-tuning BERT on a specific task, the authors recommend a 283 | # batch size of 16 or 32. 284 | 285 | BATCH_SIZE = 32 286 | 287 | print('Train: ', end="") 288 | train_dataset = TensorDataset(train_inputs, train_masks, 289 | train_labels) 290 | train_sampler = RandomSampler(train_dataset) 291 | train_loader = DataLoader(train_dataset, sampler=train_sampler, 292 | batch_size=BATCH_SIZE) 293 | print(len(train_dataset), 'messages') 294 | 295 | print('Validation: ', end="") 296 | validation_dataset = TensorDataset(validation_inputs, validation_masks, 297 | validation_labels) 298 | validation_sampler = SequentialSampler(validation_dataset) 299 | validation_loader = DataLoader(validation_dataset, 300 | sampler=validation_sampler, 301 | batch_size=BATCH_SIZE) 302 | print(len(validation_dataset), 'messages') 303 | 304 | print('Test: ', end="") 305 | test_dataset = TensorDataset(test_inputs, test_masks, test_labels) 306 | test_sampler = SequentialSampler(test_dataset) 307 | test_loader = DataLoader(test_dataset, sampler=test_sampler, 308 | batch_size=BATCH_SIZE) 309 | print(len(test_dataset), 'messages') 310 | 311 | # ## BERT model initialization 312 | # 313 | # We now load a pretrained BERT model with a single linear 314 | # classification layer added on top. 
315 | 316 | print('Initializing BertForSequenceClassification') 317 | 318 | model = BertForSequenceClassification.from_pretrained(BERTMODEL, 319 | num_labels=20) 320 | model = model.to(device) 321 | 322 | # We set the remaining hyperparameters needed for fine-tuning the 323 | # pretrained model: 324 | # * num_epochs: the number of training epochs in fine-tuning 325 | # (recommended values between 2 and 4) 326 | # * weight_decay: weight decay for the Adam optimizer 327 | # * lr: learning rate for the Adam optimizer (2e-5 to 5e-5 recommended) 328 | # * warmup_steps: number of warmup steps to (linearly) reach the set 329 | # learning rate 330 | # 331 | # We also need to grab the training parameters from the pretrained model. 332 | 333 | num_epochs = 4 334 | weight_decay = 0.01 335 | lr = 2e-5 336 | warmup_steps = int(0.2*len(train_loader)) 337 | 338 | no_decay = ['bias', 'LayerNorm.weight'] 339 | optimizer_grouped_parameters = [ 340 | {'params': [p for n, p in model.named_parameters() 341 | if not any(nd in n for nd in no_decay)], 342 | 'weight_decay': weight_decay}, 343 | {'params': [p for n, p in model.named_parameters() 344 | if any(nd in n for nd in no_decay)], 345 | 'weight_decay': 0.0} 346 | ] 347 | optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8) 348 | scheduler = get_linear_schedule_with_warmup( 349 | optimizer, num_warmup_steps=warmup_steps, 350 | num_training_steps=len(train_loader)*num_epochs) 351 | 352 | # Training loop 353 | start_time = datetime.now() 354 | for epoch in range(num_epochs): 355 | train_ret = train(train_loader, model, scheduler, optimizer) 356 | log_measures(train_ret, log, "train", epoch) 357 | 358 | val_ret = test(validation_loader, model) 359 | log_measures(val_ret, log, "val", epoch) 360 | print(f"Epoch {epoch+1}: " 361 | f"train loss: {train_ret['loss']:.6f} " 362 | f"train accuracy: {train_ret['accuracy']:.2%}, " 363 | f"val accuracy: {val_ret['accuracy']:.2%}") 364 | 365 | end_time = datetime.now() 366 | print('Total training time: {}.'.format(end_time - start_time)) 367 | 368 | # Inference 369 | ret = test(test_loader, model) 370 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 371 | 372 | 373 | if __name__ == "__main__": 374 | main() 375 | -------------------------------------------------------------------------------- /day2/pytorch_20ng_cnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 20 Newsgroups text classification with pre-trained word embeddings 5 | # 6 | # In this script, we'll use pre-trained [GloVe word 7 | # embeddings](http://nlp.stanford.edu/projects/glove/) for text 8 | # classification using PyTorch. 
9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.autograd import Variable 15 | from torch.utils.data import TensorDataset, DataLoader 16 | 17 | from packaging.version import Version as LV 18 | 19 | from gensim.utils import simple_preprocess 20 | from gensim.corpora import Dictionary 21 | 22 | from sklearn.model_selection import train_test_split 23 | from sklearn.metrics import confusion_matrix 24 | 25 | from datetime import datetime 26 | 27 | import os 28 | import sys 29 | 30 | import numpy as np 31 | 32 | torch.manual_seed(42) 33 | 34 | if torch.cuda.is_available(): 35 | device = torch.device('cuda') 36 | else: 37 | device = torch.device('cpu') 38 | 39 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 40 | assert(LV(torch.__version__) >= LV("1.0.0")) 41 | 42 | 43 | class Net(nn.Module): 44 | def __init__(self, embedding_matrix): 45 | super(Net, self).__init__() 46 | self.emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=True) 47 | self.layers = nn.Sequential( 48 | nn.Conv1d(100, 128, 5), # output: batch_size x 128 x seq_len-4 49 | nn.ReLU(), 50 | nn.MaxPool1d(5), # output: bs x 128 x 199 51 | nn.Conv1d(128, 128, 5), # output: bs x 128 x 199 52 | nn.ReLU(), 53 | nn.MaxPool1d(5), # output: bs x 128 x 39 54 | nn.Conv1d(128, 128, 5), # output: bs x 128 x 35 55 | nn.ReLU(), 56 | nn.AdaptiveMaxPool1d(1) # output: bs x 128 x 1 57 | ) 58 | self.linear_layers = nn.Sequential( 59 | nn.Flatten(), 60 | nn.Linear(128, 128), 61 | nn.ReLU(), 62 | nn.Linear(128, 20), 63 | ) 64 | 65 | def forward(self, x): 66 | x = self.emb(x) # output from embedding: batch_size x seq_len x embedding dim. 67 | x = x.transpose(1,2) # change to: batch_size x embedding_dim x seq_len 68 | x = self.layers(x) 69 | x = self.linear_layers(x) 70 | return x 71 | 72 | 73 | def correct(output, target): 74 | predicted = output.argmax(1) # pick class with largest network output 75 | correct_ones = (predicted == target).type(torch.float) 76 | return correct_ones.sum().item() # count number of correct ones 77 | 78 | 79 | def train(data_loader, model, criterion, optimizer): 80 | model.train() 81 | 82 | num_batches = 0 83 | num_items = 0 84 | 85 | total_loss = 0 86 | total_correct = 0 87 | for data, target in data_loader: 88 | # Copy data and targets to GPU 89 | data = data.to(device) 90 | target = target.to(device) 91 | 92 | # Do a forward pass 93 | output = model(data) 94 | 95 | # Calculate the loss 96 | loss = criterion(output, target) 97 | total_loss += loss 98 | num_batches += 1 99 | 100 | # Count number of correct 101 | total_correct += correct(output, target) 102 | num_items += len(target) 103 | 104 | # Backpropagation 105 | loss.backward() 106 | optimizer.step() 107 | optimizer.zero_grad() 108 | 109 | return { 110 | 'loss': total_loss/num_batches, 111 | 'accuracy': total_correct/num_items 112 | } 113 | 114 | 115 | def test(test_loader, model, criterion): 116 | model.eval() 117 | 118 | num_batches = len(test_loader) 119 | num_items = len(test_loader.dataset) 120 | 121 | test_loss = 0 122 | total_correct = 0 123 | 124 | with torch.no_grad(): 125 | for data, target in test_loader: 126 | # Copy data and targets to GPU 127 | data = data.to(device) 128 | target = target.to(device) 129 | 130 | # Do a forward pass 131 | output = model(data) 132 | 133 | # Calculate the loss 134 | loss = criterion(output, target) 135 | test_loss += loss.item() 136 | 137 | # Count number of correct digits 138 | total_correct += 
correct(output, target) 139 | 140 | return { 141 | 'loss': test_loss/num_batches, 142 | 'accuracy': total_correct/num_items 143 | } 144 | 145 | 146 | def log_measures(ret, log, prefix, epoch): 147 | if log is not None: 148 | for key, value in ret.items(): 149 | log.add_scalar(prefix + "_" + key, value, epoch) 150 | 151 | 152 | def main(): 153 | try: 154 | import tensorboardX 155 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 156 | logdir = os.path.join(os.getcwd(), "logs", "20ng-cnn-" + time_str) 157 | print('TensorBoard log directory:', logdir) 158 | os.makedirs(logdir) 159 | log = tensorboardX.SummaryWriter(logdir) 160 | except (ImportError, FileExistsError): 161 | log = None 162 | 163 | # ## GloVe word embeddings 164 | # 165 | # Let's begin by loading a datafile containing pre-trained word 166 | # embeddings. The datafile contains 100-dimensional embeddings for 167 | # 400,000 English words. 168 | 169 | datapath = os.getenv('DATADIR') 170 | if datapath is None: 171 | print("Please set DATADIR environment variable!") 172 | sys.exit(1) 173 | 174 | glove_dir = os.path.join(datapath, "glove.6B") 175 | 176 | print('Indexing word vectors.') 177 | 178 | embeddings_index = {} 179 | with open(os.path.join(glove_dir, 'glove.6B.100d.txt')) as f: 180 | for line in f: 181 | values = line.split() 182 | word = values[0] 183 | coefs = np.asarray(values[1:], dtype='float32') 184 | embeddings_index[word] = coefs 185 | 186 | print('Found %s word vectors.' % len(embeddings_index)) 187 | 188 | 189 | # ## 20 Newsgroups data set 190 | # 191 | # Next we'll load the [20 Newsgroups] 192 | # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) 193 | # data set. 194 | # 195 | # The dataset contains 20000 messages collected from 20 different 196 | # Usenet newsgroups (1000 messages from each group): 197 | # 198 | # | alt.atheism | soc.religion.christian | comp.windows.x | sci.crypt 199 | # | talk.politics.guns | comp.sys.ibm.pc.hardware | rec.autos | sci.electronics 200 | # | talk.politics.mideast | comp.graphics | rec.motorcycles | sci.space 201 | # | talk.politics.misc | comp.os.ms-windows.misc | rec.sport.baseball | sci.med 202 | # | talk.religion.misc | comp.sys.mac.hardware | rec.sport.hockey | misc.forsale 203 | 204 | text_data_dir = os.path.join(datapath, "20_newsgroup") 205 | 206 | print('Processing text dataset') 207 | 208 | texts = [] # list of text samples 209 | labels_index = {} # dictionary mapping label name to numeric id 210 | labels = [] # list of label ids 211 | for name in sorted(os.listdir(text_data_dir)): 212 | path = os.path.join(text_data_dir, name) 213 | if os.path.isdir(path): 214 | label_id = len(labels_index) 215 | labels_index[name] = label_id 216 | print('-', name, label_id) 217 | for fname in sorted(os.listdir(path)): 218 | if fname.isdigit(): 219 | fpath = os.path.join(path, fname) 220 | args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} 221 | with open(fpath, **args) as f: 222 | t = f.read() 223 | i = t.find('\n\n') # skip header 224 | if 0 < i: 225 | t = t[i:] 226 | texts.append(t) 227 | labels.append(label_id) 228 | 229 | print('Found %s texts.' % len(texts)) 230 | 231 | # Tokenize the texts using gensim. 232 | 233 | tokens = list() 234 | for text in texts: 235 | tokens.append(simple_preprocess(text)) 236 | 237 | # Vectorize the text samples into a 2D integer tensor. 
238 | 239 | MAX_NUM_WORDS = 10000 # 2 words reserved: 0=pad, 1=oov 240 | MAX_SEQUENCE_LENGTH = 1000 241 | 242 | dictionary = Dictionary(tokens) 243 | dictionary.filter_extremes(no_below=0, no_above=1.0, 244 | keep_n=MAX_NUM_WORDS-2) 245 | 246 | word_index = dictionary.token2id 247 | print('Found %s unique tokens.' % len(word_index)) 248 | 249 | data = [dictionary.doc2idx(t) for t in tokens] 250 | 251 | # Truncate and pad sequences. 252 | 253 | data = [i[:MAX_SEQUENCE_LENGTH] for i in data] 254 | data = np.array([np.pad(i, (MAX_SEQUENCE_LENGTH-len(i), 0), 255 | mode='constant', constant_values=-2) 256 | for i in data], dtype=int) 257 | data = data + 2 258 | 259 | print('Shape of data tensor:', data.shape) 260 | print('Length of label vector:', len(labels)) 261 | 262 | # Split the data into a training set and a validation set 263 | 264 | VALIDATION_SET, TEST_SET = 1000, 4000 265 | 266 | x_train, x_test, y_train, y_test = train_test_split(data, labels, 267 | test_size=TEST_SET, 268 | shuffle=True, 269 | random_state=42) 270 | 271 | x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, 272 | test_size=VALIDATION_SET, 273 | shuffle=False) 274 | 275 | print('Shape of training data tensor:', x_train.shape) 276 | print('Length of training label vector:', len(y_train)) 277 | print('Shape of validation data tensor:', x_val.shape) 278 | print('Length of validation label vector:', len(y_val)) 279 | print('Shape of test data tensor:', x_test.shape) 280 | print('Length of test label vector:', len(y_test)) 281 | 282 | # Create PyTorch DataLoaders for all data sets: 283 | 284 | BATCH_SIZE = 128 285 | 286 | print('Train: ', end="") 287 | train_dataset = TensorDataset(torch.LongTensor(x_train), 288 | torch.LongTensor(y_train)) 289 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, 290 | shuffle=True, num_workers=4) 291 | print(len(train_dataset), 'messages') 292 | 293 | print('Validation: ', end="") 294 | validation_dataset = TensorDataset(torch.LongTensor(x_val), 295 | torch.LongTensor(y_val)) 296 | validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, 297 | shuffle=False, num_workers=4) 298 | print(len(validation_dataset), 'messages') 299 | 300 | print('Test: ', end="") 301 | test_dataset = TensorDataset(torch.LongTensor(x_test), 302 | torch.LongTensor(y_test)) 303 | test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, 304 | shuffle=False, num_workers=4) 305 | print(len(test_dataset), 'messages') 306 | 307 | # Prepare the embedding matrix: 308 | 309 | print('Preparing embedding matrix.') 310 | 311 | EMBEDDING_DIM = 100 312 | 313 | embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM)) 314 | n_not_found = 0 315 | for word, i in word_index.items(): 316 | if i >= MAX_NUM_WORDS-2: 317 | continue 318 | embedding_vector = embeddings_index.get(word) 319 | if embedding_vector is not None: 320 | # words not found in embedding index will be all-zeros. 
321 | embedding_matrix[i+2] = embedding_vector 322 | else: 323 | n_not_found += 1 324 | 325 | embedding_matrix = torch.FloatTensor(embedding_matrix) 326 | print('Shape of embedding matrix:', embedding_matrix.shape) 327 | print('Words not found in pre-trained embeddings:', n_not_found) 328 | 329 | model = Net(embedding_matrix) 330 | model = model.to(device) 331 | 332 | # optimizer = optim.RMSprop(model.parameters(), lr=0.001) 333 | optimizer = optim.Adam(model.parameters(), lr=0.0005) 334 | 335 | criterion = nn.CrossEntropyLoss() 336 | 337 | print(model) 338 | 339 | num_epochs = 40 340 | 341 | # Training loop 342 | start_time = datetime.now() 343 | for epoch in range(num_epochs): 344 | train_ret = train(train_loader, model, criterion, optimizer) 345 | log_measures(train_ret, log, "train", epoch) 346 | 347 | val_ret = test(validation_loader, model, criterion) 348 | log_measures(val_ret, log, "val", epoch) 349 | print(f"Epoch {epoch+1}: " 350 | f"train loss: {train_ret['loss']:.6f} " 351 | f"train accuracy: {train_ret['accuracy']:.2%}, " 352 | f"val accuracy: {val_ret['accuracy']:.2%}") 353 | 354 | end_time = datetime.now() 355 | print('Total training time: {}.'.format(end_time - start_time)) 356 | 357 | # Inference 358 | ret = test(test_loader, model, criterion) 359 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 360 | 361 | 362 | if __name__ == "__main__": 363 | main() 364 | -------------------------------------------------------------------------------- /day2/pytorch_20ng_rnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # 20 Newsgroups text classification with pre-trained word embeddings 5 | # 6 | # In this script, we'll use pre-trained [GloVe word 7 | # embeddings](http://nlp.stanford.edu/projects/glove/) for text 8 | # classification using PyTorch. 
9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | from torch.autograd import Variable 15 | from torch.utils.data import TensorDataset, DataLoader 16 | 17 | from packaging.version import Version as LV 18 | 19 | from gensim.utils import simple_preprocess 20 | from gensim.corpora import Dictionary 21 | 22 | from sklearn.model_selection import train_test_split 23 | from sklearn.metrics import confusion_matrix 24 | 25 | from datetime import datetime 26 | 27 | import os 28 | import sys 29 | 30 | import numpy as np 31 | 32 | torch.manual_seed(42) 33 | 34 | if torch.cuda.is_available(): 35 | device = torch.device('cuda') 36 | else: 37 | device = torch.device('cpu') 38 | 39 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 40 | assert(LV(torch.__version__) >= LV("1.0.0")) 41 | 42 | 43 | class Net(nn.Module): 44 | def __init__(self, embedding_matrix): 45 | super(Net, self).__init__() 46 | self.emb = nn.Embedding.from_pretrained(embedding_matrix, freeze=True) 47 | self.lstm = nn.LSTM(100, 128, num_layers=2, batch_first=True) 48 | self.linear = nn.Linear(128,20) 49 | 50 | def forward(self, x): 51 | x = self.emb(x) 52 | 53 | # LSTM also returns the values of the internal h_n and c_n parameters 54 | x, (hn, cn) = self.lstm(x) 55 | 56 | # we pick only the last output after having processed the whole sequence 57 | x = self.linear(x[:, -1, :]) 58 | 59 | return x 60 | 61 | 62 | def correct(output, target): 63 | predicted = output.argmax(1) # pick class with largest network output 64 | correct_ones = (predicted == target).type(torch.float) 65 | return correct_ones.sum().item() # count number of correct ones 66 | 67 | 68 | def train(data_loader, model, criterion, optimizer): 69 | model.train() 70 | 71 | num_batches = 0 72 | num_items = 0 73 | 74 | total_loss = 0 75 | total_correct = 0 76 | for data, target in data_loader: 77 | # Copy data and targets to GPU 78 | data = data.to(device) 79 | target = target.to(device) 80 | 81 | # Do a forward pass 82 | output = model(data) 83 | 84 | # Calculate the loss 85 | loss = criterion(output, target) 86 | total_loss += loss 87 | num_batches += 1 88 | 89 | # Count number of correct 90 | total_correct += correct(output, target) 91 | num_items += len(target) 92 | 93 | # Backpropagation 94 | loss.backward() 95 | optimizer.step() 96 | optimizer.zero_grad() 97 | 98 | return { 99 | 'loss': total_loss/num_batches, 100 | 'accuracy': total_correct/num_items 101 | } 102 | 103 | 104 | def test(test_loader, model, criterion): 105 | model.eval() 106 | 107 | num_batches = len(test_loader) 108 | num_items = len(test_loader.dataset) 109 | 110 | test_loss = 0 111 | total_correct = 0 112 | 113 | with torch.no_grad(): 114 | for data, target in test_loader: 115 | # Copy data and targets to GPU 116 | data = data.to(device) 117 | target = target.to(device) 118 | 119 | # Do a forward pass 120 | output = model(data) 121 | 122 | # Calculate the loss 123 | loss = criterion(output, target) 124 | test_loss += loss.item() 125 | 126 | # Count number of correct digits 127 | total_correct += correct(output, target) 128 | 129 | return { 130 | 'loss': test_loss/num_batches, 131 | 'accuracy': total_correct/num_items 132 | } 133 | 134 | 135 | def log_measures(ret, log, prefix, epoch): 136 | if log is not None: 137 | for key, value in ret.items(): 138 | log.add_scalar(prefix + "_" + key, value, epoch) 139 | 140 | 141 | def main(): 142 | try: 143 | import tensorboardX 144 | time_str = 
datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 145 | logdir = os.path.join(os.getcwd(), "logs", "20ng-rnn-" + time_str) 146 | print('TensorBoard log directory:', logdir) 147 | os.makedirs(logdir) 148 | log = tensorboardX.SummaryWriter(logdir) 149 | except (ImportError, FileExistsError): 150 | log = None 151 | 152 | # ## GloVe word embeddings 153 | # 154 | # Let's begin by loading a datafile containing pre-trained word 155 | # embeddings. The datafile contains 100-dimensional embeddings for 156 | # 400,000 English words. 157 | 158 | datapath = os.getenv('DATADIR') 159 | if datapath is None: 160 | print("Please set DATADIR environment variable!") 161 | sys.exit(1) 162 | 163 | glove_dir = os.path.join(datapath, "glove.6B") 164 | 165 | print('Indexing word vectors.') 166 | 167 | embeddings_index = {} 168 | with open(os.path.join(glove_dir, 'glove.6B.100d.txt')) as f: 169 | for line in f: 170 | values = line.split() 171 | word = values[0] 172 | coefs = np.asarray(values[1:], dtype='float32') 173 | embeddings_index[word] = coefs 174 | 175 | print('Found %s word vectors.' % len(embeddings_index)) 176 | 177 | 178 | # ## 20 Newsgroups data set 179 | # 180 | # Next we'll load the [20 Newsgroups] 181 | # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) 182 | # data set. 183 | # 184 | # The dataset contains 20000 messages collected from 20 different 185 | # Usenet newsgroups (1000 messages from each group): 186 | # 187 | # | alt.atheism | soc.religion.christian | comp.windows.x | sci.crypt 188 | # | talk.politics.guns | comp.sys.ibm.pc.hardware | rec.autos | sci.electronics 189 | # | talk.politics.mideast | comp.graphics | rec.motorcycles | sci.space 190 | # | talk.politics.misc | comp.os.ms-windows.misc | rec.sport.baseball | sci.med 191 | # | talk.religion.misc | comp.sys.mac.hardware | rec.sport.hockey | misc.forsale 192 | 193 | text_data_dir = os.path.join(datapath, "20_newsgroup") 194 | 195 | print('Processing text dataset') 196 | 197 | texts = [] # list of text samples 198 | labels_index = {} # dictionary mapping label name to numeric id 199 | labels = [] # list of label ids 200 | for name in sorted(os.listdir(text_data_dir)): 201 | path = os.path.join(text_data_dir, name) 202 | if os.path.isdir(path): 203 | label_id = len(labels_index) 204 | labels_index[name] = label_id 205 | print('-', name, label_id) 206 | for fname in sorted(os.listdir(path)): 207 | if fname.isdigit(): 208 | fpath = os.path.join(path, fname) 209 | args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} 210 | with open(fpath, **args) as f: 211 | t = f.read() 212 | i = t.find('\n\n') # skip header 213 | if 0 < i: 214 | t = t[i:] 215 | texts.append(t) 216 | labels.append(label_id) 217 | 218 | print('Found %s texts.' % len(texts)) 219 | 220 | # Tokenize the texts using gensim. 221 | 222 | tokens = list() 223 | for text in texts: 224 | tokens.append(simple_preprocess(text)) 225 | 226 | # Vectorize the text samples into a 2D integer tensor. 227 | 228 | MAX_NUM_WORDS = 10000 # 2 words reserved: 0=pad, 1=oov 229 | MAX_SEQUENCE_LENGTH = 1000 230 | 231 | dictionary = Dictionary(tokens) 232 | dictionary.filter_extremes(no_below=0, no_above=1.0, 233 | keep_n=MAX_NUM_WORDS-2) 234 | 235 | word_index = dictionary.token2id 236 | print('Found %s unique tokens.' % len(word_index)) 237 | 238 | data = [dictionary.doc2idx(t) for t in tokens] 239 | 240 | # Truncate and pad sequences. 
241 | 242 | data = [i[:MAX_SEQUENCE_LENGTH] for i in data] 243 | data = np.array([np.pad(i, (MAX_SEQUENCE_LENGTH-len(i), 0), 244 | mode='constant', constant_values=-2) 245 | for i in data], dtype=int) 246 | data = data + 2 247 | 248 | print('Shape of data tensor:', data.shape) 249 | print('Length of label vector:', len(labels)) 250 | 251 | # Split the data into a training set and a validation set 252 | 253 | VALIDATION_SET, TEST_SET = 1000, 4000 254 | 255 | x_train, x_test, y_train, y_test = train_test_split(data, labels, 256 | test_size=TEST_SET, 257 | shuffle=True, 258 | random_state=42) 259 | 260 | x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, 261 | test_size=VALIDATION_SET, 262 | shuffle=False) 263 | 264 | print('Shape of training data tensor:', x_train.shape) 265 | print('Length of training label vector:', len(y_train)) 266 | print('Shape of validation data tensor:', x_val.shape) 267 | print('Length of validation label vector:', len(y_val)) 268 | print('Shape of test data tensor:', x_test.shape) 269 | print('Length of test label vector:', len(y_test)) 270 | 271 | # Create PyTorch DataLoaders for all data sets: 272 | 273 | BATCH_SIZE = 128 274 | 275 | print('Train: ', end="") 276 | train_dataset = TensorDataset(torch.LongTensor(x_train), 277 | torch.LongTensor(y_train)) 278 | train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, 279 | shuffle=True, num_workers=4) 280 | print(len(train_dataset), 'messages') 281 | 282 | print('Validation: ', end="") 283 | validation_dataset = TensorDataset(torch.LongTensor(x_val), 284 | torch.LongTensor(y_val)) 285 | validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, 286 | shuffle=False, num_workers=4) 287 | print(len(validation_dataset), 'messages') 288 | 289 | print('Test: ', end="") 290 | test_dataset = TensorDataset(torch.LongTensor(x_test), 291 | torch.LongTensor(y_test)) 292 | test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, 293 | shuffle=False, num_workers=4) 294 | print(len(test_dataset), 'messages') 295 | 296 | # Prepare the embedding matrix: 297 | 298 | print('Preparing embedding matrix.') 299 | 300 | EMBEDDING_DIM = 100 301 | 302 | embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM)) 303 | n_not_found = 0 304 | for word, i in word_index.items(): 305 | if i >= MAX_NUM_WORDS-2: 306 | continue 307 | embedding_vector = embeddings_index.get(word) 308 | if embedding_vector is not None: 309 | # words not found in embedding index will be all-zeros. 
310 | embedding_matrix[i+2] = embedding_vector 311 | else: 312 | n_not_found += 1 313 | 314 | embedding_matrix = torch.FloatTensor(embedding_matrix) 315 | print('Shape of embedding matrix:', embedding_matrix.shape) 316 | print('Words not found in pre-trained embeddings:', n_not_found) 317 | 318 | model = Net(embedding_matrix) 319 | model = model.to(device) 320 | 321 | optimizer = optim.RMSprop(model.parameters(), lr=0.005) 322 | #optimizer = optim.Adam(model.parameters()) 323 | criterion = nn.CrossEntropyLoss() 324 | 325 | print(model) 326 | 327 | num_epochs = 20 328 | 329 | # Training loop 330 | start_time = datetime.now() 331 | for epoch in range(num_epochs): 332 | train_ret = train(train_loader, model, criterion, optimizer) 333 | log_measures(train_ret, log, "train", epoch) 334 | 335 | val_ret = test(validation_loader, model, criterion) 336 | log_measures(val_ret, log, "val", epoch) 337 | print(f"Epoch {epoch+1}: " 338 | f"train loss: {train_ret['loss']:.6f} " 339 | f"train accuracy: {train_ret['accuracy']:.2%}, " 340 | f"val accuracy: {val_ret['accuracy']:.2%}") 341 | 342 | end_time = datetime.now() 343 | print('Total training time: {}.'.format(end_time - start_time)) 344 | 345 | # Inference 346 | ret = test(test_loader, model, criterion) 347 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 348 | 349 | 350 | if __name__ == "__main__": 351 | main() 352 | -------------------------------------------------------------------------------- /day2/pytorch_dvc_cnn_pretrained.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Dogs-vs-cats classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of dogs from images of cats using PyTorch. 8 | # 9 | # ## Option 2: Reuse a pre-trained CNN 10 | # 11 | # Here we'll use the VGG16 pre-trained network: 12 | # https://pytorch.org/docs/stable/torchvision/models.html#torchvision.models.vgg16 13 | # 14 | # It has weights learned using ImageNet. We remove the top layers and 15 | # freeze the pre-trained weights, and then stack our own, randomly 16 | # initialized, layers on top of the VGG16 network. 
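# A quick sanity check on the classifier input size used below: VGG16's
# feature extractor halves (with floor) the spatial resolution five times,
# so the 150x150 inputs used in this script come out as 512 feature maps of
# size 4x4, hence the 512*4*4 units flattened into the first Linear layer.
# The same arithmetic as a sketch (the helper name is ours):
#
#   def vgg16_feature_side(side):
#       for _ in range(5):      # five 2x2 max-pool stages in VGG16 features
#           side = side // 2
#       return side
#
#   vgg16_feature_side(150)     # -> 4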
17 | # 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.optim as optim 22 | from torch.utils.data import DataLoader 23 | from torchvision import datasets, transforms, models 24 | from packaging.version import Version as LV 25 | from datetime import datetime 26 | import os 27 | import sys 28 | 29 | torch.manual_seed(42) 30 | 31 | if torch.cuda.is_available(): 32 | device = torch.device('cuda') 33 | else: 34 | device = torch.device('cpu') 35 | 36 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 37 | assert LV(torch.__version__) >= LV("1.0.0") 38 | 39 | 40 | class PretrainedNet(nn.Module): 41 | def __init__(self): 42 | super(PretrainedNet, self).__init__() 43 | self.vgg_features = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features 44 | 45 | # Freeze the VGG16 layers 46 | for param in self.vgg_features.parameters(): 47 | param.requires_grad = False 48 | 49 | # Add our own layers on top 50 | self.own_layers = nn.Sequential( 51 | nn.Flatten(), 52 | nn.Linear(512*4*4, 64), 53 | nn.ReLU(), 54 | nn.Linear(64, 1), 55 | nn.Sigmoid() 56 | ) 57 | 58 | def forward(self, x): 59 | x = self.vgg_features(x) 60 | return self.own_layers(x).squeeze() 61 | 62 | 63 | def correct(output, target): 64 | class_pred = output.round().int() # set to 0 for <0.5, 1 for >0.5 65 | correct_ones = class_pred == target.int() # 1 for correct, 0 for incorrect 66 | return correct_ones.sum().item() # count number of correct ones 67 | 68 | 69 | def train(data_loader, model, criterion, optimizer): 70 | model.train() 71 | 72 | num_batches = 0 73 | num_items = 0 74 | 75 | total_loss = 0 76 | total_correct = 0 77 | for data, target in data_loader: 78 | # Copy data and targets to GPU 79 | data = data.to(device) 80 | target = target.to(device).to(torch.float) 81 | 82 | # Do a forward pass 83 | output = model(data) 84 | 85 | # Calculate the loss 86 | loss = criterion(output, target) 87 | total_loss += loss 88 | num_batches += 1 89 | 90 | # Count number of correct 91 | total_correct += correct(output, target) 92 | num_items += len(target) 93 | 94 | # Backpropagation 95 | loss.backward() 96 | optimizer.step() 97 | optimizer.zero_grad() 98 | 99 | return { 100 | 'loss': total_loss/num_batches, 101 | 'accuracy': total_correct/num_items 102 | } 103 | 104 | 105 | def test(test_loader, model, criterion): 106 | model.eval() 107 | 108 | num_batches = len(test_loader) 109 | num_items = len(test_loader.dataset) 110 | 111 | test_loss = 0 112 | total_correct = 0 113 | 114 | with torch.no_grad(): 115 | for data, target in test_loader: 116 | # Copy data and targets to GPU 117 | data = data.to(device) 118 | target = target.to(device).to(torch.float) 119 | 120 | # Do a forward pass 121 | output = model(data) 122 | 123 | # Calculate the loss 124 | loss = criterion(output, target) 125 | test_loss += loss.item() 126 | 127 | # Count number of correct digits 128 | total_correct += correct(output, target) 129 | 130 | return { 131 | 'loss': test_loss/num_batches, 132 | 'accuracy': total_correct/num_items 133 | } 134 | 135 | 136 | def log_measures(ret, log, prefix, epoch): 137 | if log is not None: 138 | for key, value in ret.items(): 139 | log.add_scalar(prefix + "_" + key, value, epoch) 140 | 141 | 142 | def main(): 143 | # TensorBoard for logging 144 | try: 145 | import tensorboardX 146 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 147 | logdir = os.path.join(os.getcwd(), "logs", "dvc-pretrained-" + time_str) 148 | print('TensorBoard log directory:', logdir) 149 | os.makedirs(logdir) 150 | log = 
tensorboardX.SummaryWriter(logdir) 151 | except ImportError: 152 | log = None 153 | 154 | # The training dataset consists of 2000 images of dogs and cats, split 155 | # in half. In addition, the validation set consists of 1000 images, 156 | # and the test set of 22000 images. 157 | # 158 | # First, we'll resize all training and validation images to a fixed 159 | # size. 160 | # 161 | # Then, to make the most of our limited number of training examples, 162 | # we'll apply random transformations to them each time we are looping 163 | # over them. This way, we "augment" our training dataset to contain 164 | # more data. There are various transformations available in 165 | # torchvision, see: 166 | # https://pytorch.org/docs/stable/torchvision/transforms.html 167 | 168 | datapath = os.getenv('DATADIR') 169 | if datapath is None: 170 | print("Please set DATADIR environment variable!") 171 | sys.exit(1) 172 | datapath = os.path.join(datapath, 'dogs-vs-cats/train-2000') 173 | 174 | input_image_size = (150, 150) 175 | 176 | data_transform = transforms.Compose([ 177 | transforms.Resize(input_image_size), 178 | transforms.RandomAffine(degrees=0, translate=None, 179 | scale=(0.8, 1.2), shear=0.2), 180 | transforms.RandomHorizontalFlip(), 181 | transforms.ToTensor() 182 | ]) 183 | 184 | noop_transform = transforms.Compose([ 185 | transforms.Resize(input_image_size), 186 | transforms.ToTensor() 187 | ]) 188 | 189 | # Data loaders 190 | batch_size = 25 191 | 192 | print('Train: ', end="") 193 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 194 | transform=data_transform) 195 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 196 | shuffle=True, num_workers=4) 197 | print('Found', len(train_dataset), 'images belonging to', 198 | len(train_dataset.classes), 'classes') 199 | 200 | print('Validation: ', end="") 201 | validation_dataset = datasets.ImageFolder(root=datapath+'/validation', 202 | transform=noop_transform) 203 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 204 | shuffle=False, num_workers=4) 205 | print('Found', len(validation_dataset), 'images belonging to', 206 | len(validation_dataset.classes), 'classes') 207 | 208 | print('Test: ', end="") 209 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 210 | transform=noop_transform) 211 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 212 | shuffle=False, num_workers=4) 213 | print('Found', len(test_dataset), 'images belonging to', 214 | len(test_dataset.classes), 'classes') 215 | 216 | # Define the network and training parameters 217 | model = PretrainedNet() 218 | model = model.to(device) 219 | optimizer = optim.SGD(model.parameters(), lr=0.01) 220 | criterion = nn.BCELoss() 221 | 222 | print(model) 223 | 224 | num_epochs = 10 225 | 226 | # Training loop 227 | start_time = datetime.now() 228 | for epoch in range(num_epochs): 229 | train_ret = train(train_loader, model, criterion, optimizer) 230 | log_measures(train_ret, log, "train", epoch) 231 | 232 | val_ret = test(validation_loader, model, criterion) 233 | log_measures(val_ret, log, "val", epoch) 234 | print(f"Epoch {epoch+1}: " 235 | f"train accuracy: {train_ret['accuracy']:.2%}, " 236 | f"val accuracy: {val_ret['accuracy']:.2%}") 237 | 238 | end_time = datetime.now() 239 | print('Total training time: {}.'.format(end_time - start_time)) 240 | 241 | # Inference 242 | ret = test(test_loader, model, criterion) 243 | print("\nTesting (pretrained, before fine-tuning): " 244 | f"accuracy: {ret['accuracy']:.2%}\n") 
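    # (Background for the fine-tuning step below: the children of VGG16's
    # "features" module are numbered 0-30, and the last convolution block
    # spans indices 24-30, with its Conv2d modules at 24, 26 and 28. That is
    # why the code below unfreezes layers whose index satisfies
    # int(name) >= 24. A handy one-line sketch for verifying what ends up
    # trainable:
    #
    #     sum(p.numel() for p in model.parameters() if p.requires_grad)
    # )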
245 | 246 | # Fine-tuning 247 | # 248 | # Once the top layers have learned some reasonable weights, we can 249 | # continue training by unfreezing the last convolution block of 250 | # VGG16 so that it may adapt to our data. The learning rate should 251 | # be smaller than usual. 252 | # 253 | # Below we loop over all layers and set only the last three Conv2d 254 | # layers to trainable. In the printout we mark trainable layers 255 | # with '+', frozen with '-'. Other layers don't have trainable 256 | # parameters. 257 | 258 | print("Marking layers for training (+) or frozen (-):") 259 | for name, layer in model.vgg_features.named_children(): 260 | note = ' ' 261 | for param in layer.parameters(): 262 | note = '-' 263 | if int(name) >= 24: 264 | param.requires_grad = True 265 | note = '+' 266 | print(name, note, layer, len(param)) 267 | 268 | # We set up the training, note that we need to give only the 269 | # parameters that are set to be trainable. 270 | params = filter(lambda p: p.requires_grad, model.parameters()) 271 | #optimizer = optim.SGD(model.parameters(), lr=1e-3) 272 | optimizer = optim.RMSprop(params, lr=1e-5) 273 | criterion = nn.BCELoss() 274 | 275 | # Note that before continuing the training, we create a separate 276 | # TensorBoard log directory. 277 | if log is not None: 278 | logdir_pt = logdir + '-pretrained-finetune' 279 | os.makedirs(logdir_pt) 280 | log = tensorboardX.SummaryWriter(logdir_pt) 281 | 282 | prev_epochs = num_epochs 283 | num_epochs = 20 284 | 285 | start_time = datetime.now() 286 | for epoch in range(prev_epochs, prev_epochs+num_epochs): 287 | train_ret = train(train_loader, model, criterion, optimizer) 288 | log_measures(train_ret, log, "train", epoch) 289 | 290 | val_ret = test(validation_loader, model, criterion) 291 | log_measures(val_ret, log, "val", epoch) 292 | 293 | print(f"Epoch {epoch+1}: " 294 | f"train loss: {train_ret['loss']:.6f} " 295 | f"train accuracy: {train_ret['accuracy']:.2%}, " 296 | f"val accuracy: {val_ret['accuracy']:.2%}") 297 | 298 | end_time = datetime.now() 299 | print('Total fine-tuning time: {}.'.format(end_time - start_time)) 300 | 301 | # Inference 302 | ret = test(test_loader, model, criterion) 303 | print("\nTesting (pretrained, after fine-tuning): " 304 | f"accuracy: {ret['accuracy']:.2%}\n") 305 | 306 | 307 | if __name__ == "__main__": 308 | main() 309 | -------------------------------------------------------------------------------- /day2/pytorch_dvc_cnn_pretrained_multigpu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Dogs-vs-cats classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of dogs from images of cats using PyTorch. 8 | # 9 | # ## Option 2: Reuse a pre-trained CNN 10 | # 11 | # Here we'll use the VGG16 pre-trained network: 12 | # https://pytorch.org/docs/stable/torchvision/models.html#torchvision.models.vgg16 13 | # 14 | # It has weights learned using ImageNet. We remove the top layers and 15 | # freeze the pre-trained weights, and then stack our own, randomly 16 | # initialized, layers on top of the VGG16 network. 
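# This is the multi-GPU variant of pytorch_dvc_cnn_pretrained.py: the model
# is wrapped in DistributedDataParallel, the training data is sharded with a
# DistributedSampler, and printing and TensorBoard logging are done on
# rank 0 only.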
17 | # 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.optim as optim 22 | from torch.utils.data import DataLoader 23 | from torch.utils.tensorboard import SummaryWriter 24 | from torchvision import datasets, transforms, models 25 | from packaging.version import Version as LV 26 | from datetime import datetime 27 | import os 28 | import sys 29 | 30 | import torch.distributed as dist 31 | from torch.nn.parallel import DistributedDataParallel 32 | from torch.utils.data.distributed import DistributedSampler 33 | 34 | torch.manual_seed(42) 35 | 36 | if torch.cuda.is_available(): 37 | device = torch.device('cuda') 38 | else: 39 | device = torch.device('cpu') 40 | 41 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 42 | assert LV(torch.__version__) >= LV("1.0.0") 43 | 44 | 45 | class PretrainedNet(nn.Module): 46 | def __init__(self): 47 | super(PretrainedNet, self).__init__() 48 | self.vgg_features = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features 49 | 50 | # Freeze the VGG16 layers 51 | for param in self.vgg_features.parameters(): 52 | param.requires_grad = False 53 | 54 | # Add our own layers on top 55 | self.own_layers = nn.Sequential( 56 | nn.Flatten(), 57 | nn.Linear(512*4*4, 64), 58 | nn.ReLU(), 59 | nn.Linear(64, 1), 60 | nn.Sigmoid() 61 | ) 62 | 63 | def forward(self, x): 64 | x = self.vgg_features(x) 65 | return self.own_layers(x).squeeze() 66 | 67 | 68 | def correct(output, target): 69 | class_pred = output.round().int() # set to 0 for <0.5, 1 for >0.5 70 | correct_ones = class_pred == target.int() # 1 for correct, 0 for incorrect 71 | return correct_ones.sum().item() # count number of correct ones 72 | 73 | 74 | def train(data_loader, model, criterion, optimizer): 75 | model.train() 76 | 77 | num_batches = 0 78 | num_items = 0 79 | 80 | total_loss = 0 81 | total_correct = 0 82 | for data, target in data_loader: 83 | # Copy data and targets to GPU 84 | data = data.to(device) 85 | target = target.to(device).to(torch.float) 86 | 87 | # Do a forward pass 88 | output = model(data) 89 | 90 | # Calculate the loss 91 | loss = criterion(output, target) 92 | total_loss += loss 93 | num_batches += 1 94 | 95 | # Count number of correct 96 | total_correct += correct(output, target) 97 | num_items += len(target) 98 | 99 | # Backpropagation 100 | loss.backward() 101 | optimizer.step() 102 | optimizer.zero_grad() 103 | 104 | return { 105 | 'loss': total_loss/num_batches, 106 | 'accuracy': total_correct/num_items 107 | } 108 | 109 | 110 | def test(test_loader, model, criterion): 111 | model.eval() 112 | 113 | num_batches = len(test_loader) 114 | num_items = len(test_loader.dataset) 115 | 116 | test_loss = 0 117 | total_correct = 0 118 | 119 | with torch.no_grad(): 120 | for data, target in test_loader: 121 | # Copy data and targets to GPU 122 | data = data.to(device) 123 | target = target.to(device).to(torch.float) 124 | 125 | # Do a forward pass 126 | output = model(data) 127 | 128 | # Calculate the loss 129 | loss = criterion(output, target) 130 | test_loss += loss.item() 131 | 132 | # Count number of correct digits 133 | total_correct += correct(output, target) 134 | 135 | return { 136 | 'loss': test_loss/num_batches, 137 | 'accuracy': total_correct/num_items 138 | } 139 | 140 | 141 | def log_measures(ret, log, prefix, epoch): 142 | if log is not None: 143 | for key, value in ret.items(): 144 | log.add_scalar(prefix + "_" + key, value, epoch) 145 | 146 | 147 | def main(): 148 | # Initialize PyTorch distributed 149 | 
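    # (This script is meant to be launched with torchrun -- see
    # run-2gpus-torchrun.sh -- which starts one process per GPU and sets the
    # RANK, LOCAL_RANK and WORLD_SIZE environment variables that
    # init_process_group() and the code below rely on. An illustrative
    # launch, with flags assumed here rather than taken from the course run
    # scripts:
    #
    #     torchrun --standalone --nproc_per_node=2 \
    #         pytorch_dvc_cnn_pretrained_multigpu.py
    # )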
dist.init_process_group(backend='nccl') 150 | 151 | local_rank = int(os.environ['LOCAL_RANK']) 152 | torch.cuda.set_device(local_rank) 153 | 154 | rank_0 = dist.get_rank() == 0 155 | 156 | # TensorBoard for logging 157 | log = None 158 | try: 159 | if rank_0: 160 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 161 | logdir = os.path.join(os.getcwd(), "logs", "dvc-pretrained-" + time_str) 162 | print('TensorBoard log directory:', logdir) 163 | os.makedirs(logdir) 164 | log = SummaryWriter(logdir) 165 | except ImportError: 166 | pass 167 | 168 | # The training dataset consists of 2000 images of dogs and cats, split 169 | # in half. In addition, the validation set consists of 1000 images, 170 | # and the test set of 22000 images. 171 | # 172 | # First, we'll resize all training and validation images to a fixed 173 | # size. 174 | # 175 | # Then, to make the most of our limited number of training examples, 176 | # we'll apply random transformations to them each time we are looping 177 | # over them. This way, we "augment" our training dataset to contain 178 | # more data. There are various transformations available in 179 | # torchvision, see: 180 | # https://pytorch.org/docs/stable/torchvision/transforms.html 181 | 182 | datapath = os.getenv('DATADIR') 183 | if datapath is None: 184 | print("Please set DATADIR environment variable!") 185 | sys.exit(1) 186 | datapath = os.path.join(datapath, 'dogs-vs-cats/train-2000') 187 | 188 | input_image_size = (150, 150) 189 | 190 | data_transform = transforms.Compose([ 191 | transforms.Resize(input_image_size), 192 | transforms.RandomAffine(degrees=0, translate=None, 193 | scale=(0.8, 1.2), shear=0.2), 194 | transforms.RandomHorizontalFlip(), 195 | transforms.ToTensor() 196 | ]) 197 | 198 | noop_transform = transforms.Compose([ 199 | transforms.Resize(input_image_size), 200 | transforms.ToTensor() 201 | ]) 202 | 203 | # Data loaders 204 | batch_size = 25 205 | 206 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 207 | transform=data_transform) 208 | train_sampler = DistributedSampler(train_dataset, drop_last=True) 209 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 210 | shuffle=False, num_workers=4, 211 | sampler=train_sampler) 212 | if rank_0: 213 | print('Train: ', end="") 214 | print('Found', len(train_dataset), 'images belonging to', 215 | len(train_dataset.classes), 'classes') 216 | 217 | validation_dataset = datasets.ImageFolder(root=datapath+'/validation', 218 | transform=noop_transform) 219 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 220 | shuffle=False, num_workers=4) 221 | if rank_0: 222 | print('Validation: ', end="") 223 | print('Found', len(validation_dataset), 'images belonging to', 224 | len(validation_dataset.classes), 'classes') 225 | 226 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 227 | transform=noop_transform) 228 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 229 | shuffle=False, num_workers=4) 230 | if rank_0: 231 | print('Test: ', end="") 232 | print('Found', len(test_dataset), 'images belonging to', 233 | len(test_dataset.classes), 'classes') 234 | 235 | # Define the network and training parameters 236 | model = PretrainedNet() 237 | model = model.to(device) 238 | model = DistributedDataParallel(model, device_ids=[local_rank]) 239 | if rank_0: 240 | print(model) 241 | 242 | optimizer = optim.SGD(model.parameters(), lr=0.01) 243 | criterion = nn.BCELoss() 244 | 245 | num_epochs = 10 246 | 247 | # Training loop 248 | 
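    # (One detail worth knowing about DistributedSampler: its shuffling is
    # seeded by the epoch number, so one would normally call
    # train_sampler.set_epoch(epoch) at the top of each epoch to get a
    # different shuffle every time -- a minimal sketch:
    #
    #     for epoch in range(num_epochs):
    #         train_sampler.set_epoch(epoch)
    #         ...
    # )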
start_time = datetime.now() 249 | for epoch in range(num_epochs): 250 | train_ret = train(train_loader, model, criterion, optimizer) 251 | if rank_0: 252 | log_measures(train_ret, log, "train", epoch) 253 | 254 | val_ret = test(validation_loader, model, criterion) 255 | if rank_0: 256 | log_measures(val_ret, log, "val", epoch) 257 | print(f"Epoch {epoch+1}: " 258 | f"train accuracy: {train_ret['accuracy']:.2%}, " 259 | f"val accuracy: {val_ret['accuracy']:.2%}") 260 | 261 | end_time = datetime.now() 262 | if rank_0: 263 | print('Total training time: {}.'.format(end_time - start_time)) 264 | 265 | # Inference 266 | ret = test(test_loader, model, criterion) 267 | print("\nTesting (pretrained, before fine-tuning): " 268 | f"accuracy: {ret['accuracy']:.2%}\n") 269 | 270 | # Fine-tuning 271 | # 272 | # Once the top layers have learned some reasonable weights, we can 273 | # continue training by unfreezing the last convolution block of 274 | # VGG16 so that it may adapt to our data. The learning rate should 275 | # be smaller than usual. 276 | # 277 | # Below we loop over all layers and set only the last three Conv2d 278 | # layers to trainable. In the printout we mark trainable layers 279 | # with '+', frozen with '-'. Other layers don't have trainable 280 | # parameters. 281 | 282 | if rank_0: 283 | print("Marking layers for training (+) or frozen (-):") 284 | for name, layer in model.module.vgg_features.named_children(): 285 | note = ' ' 286 | for param in layer.parameters(): 287 | note = '-' 288 | if int(name) >= 24: 289 | param.requires_grad = True 290 | note = '+' 291 | if rank_0: 292 | print(name, note, layer, len(param)) 293 | 294 | # We set up the training, note that we need to give only the 295 | # parameters that are set to be trainable. 296 | params = filter(lambda p: p.requires_grad, model.parameters()) 297 | #optimizer = optim.SGD(model.parameters(), lr=1e-3) 298 | optimizer = optim.RMSprop(params, lr=1e-5) 299 | criterion = nn.BCELoss() 300 | 301 | # Note that before continuing the training, we create a separate 302 | # TensorBoard log directory. 
303 | if log is not None: 304 | logdir_pt = logdir + '-pretrained-finetune' 305 | os.makedirs(logdir_pt) 306 | log = SummaryWriter(logdir_pt) 307 | 308 | prev_epochs = num_epochs 309 | num_epochs = 20 310 | 311 | start_time = datetime.now() 312 | for epoch in range(prev_epochs, prev_epochs+num_epochs): 313 | train_ret = train(train_loader, model, criterion, optimizer) 314 | if rank_0: 315 | log_measures(train_ret, log, "train", epoch) 316 | 317 | val_ret = test(validation_loader, model, criterion) 318 | 319 | if rank_0: 320 | log_measures(val_ret, log, "val", epoch) 321 | 322 | print(f"Epoch {epoch+1}: " 323 | f"train loss: {train_ret['loss']:.6f} " 324 | f"train accuracy: {train_ret['accuracy']:.2%}, " 325 | f"val accuracy: {val_ret['accuracy']:.2%}") 326 | 327 | end_time = datetime.now() 328 | if rank_0: 329 | print('Total fine-tuning time: {}.'.format(end_time - start_time)) 330 | 331 | # Inference 332 | if rank_0: 333 | ret = test(test_loader, model, criterion) 334 | print("\nTesting (pretrained, after fine-tuning): " 335 | f"accuracy: {ret['accuracy']:.2%}\n") 336 | 337 | 338 | if __name__ == "__main__": 339 | main() 340 | -------------------------------------------------------------------------------- /day2/pytorch_dvc_cnn_simple.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Dogs-vs-cats classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of dogs from images of cats using PyTorch. 8 | # 9 | # ## Option 1: Train a small CNN from scratch 10 | # 11 | # Similarly as with MNIST digits, we can start from scratch and train 12 | # a CNN for the classification task. However, due to the small number 13 | # of training images, a large network will easily overfit, regardless 14 | # of the data augmentation. 
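# For reference, the 17*17*64 flatten size in the network below follows from
# the 150x150 inputs: each 3x3 convolution (no padding) trims 2 pixels and
# each 2x2 max-pool halves the result, so 150 -> 148 -> 74 -> 72 -> 36 ->
# 34 -> 17, leaving 64 feature maps of 17x17. The same arithmetic as a
# sketch (the helper name is ours):
#
#   def conv_pool_side(side, stages=3):
#       for _ in range(stages):
#           side = (side - 2) // 2    # 3x3 'valid' conv, then 2x2 max-pool
#       return side
#
#   conv_pool_side(150)               # -> 17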
15 | 16 | import torch 17 | import torch.nn as nn 18 | import torch.optim as optim 19 | from torch.utils.data import DataLoader 20 | from torchvision import datasets, transforms 21 | from packaging.version import Version as LV 22 | from datetime import datetime 23 | import os 24 | import sys 25 | 26 | torch.manual_seed(42) 27 | 28 | if torch.cuda.is_available(): 29 | device = torch.device('cuda') 30 | else: 31 | device = torch.device('cpu') 32 | 33 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 34 | assert LV(torch.__version__) >= LV("1.0.0") 35 | 36 | 37 | class Net(nn.Module): 38 | def __init__(self): 39 | super(Net, self).__init__() 40 | self.layers = nn.Sequential( 41 | nn.Conv2d(3, 32, (3, 3)), 42 | nn.ReLU(), 43 | nn.MaxPool2d((2, 2)), 44 | 45 | nn.Conv2d(32, 32, (3, 3)), 46 | nn.ReLU(), 47 | nn.MaxPool2d((2, 2)), 48 | 49 | nn.Conv2d(32, 64, (3, 3)), 50 | nn.ReLU(), 51 | nn.MaxPool2d((2, 2)), 52 | 53 | nn.Flatten(), # flatten 2D to 1D 54 | nn.Linear(17*17*64, 64), 55 | nn.ReLU(), 56 | nn.Dropout(0.5), 57 | nn.Linear(64, 1), 58 | nn.Sigmoid() 59 | ) 60 | 61 | def forward(self, x): 62 | return self.layers(x).squeeze() 63 | 64 | 65 | def correct(output, target): 66 | class_pred = output.round().int() # set to 0 for <0.5, 1 for >0.5 67 | correct_ones = class_pred == target.int() # 1 for correct, 0 for incorrect 68 | return correct_ones.sum().item() # count number of correct ones 69 | 70 | 71 | def train(data_loader, model, criterion, optimizer): 72 | model.train() 73 | 74 | num_batches = 0 75 | num_items = 0 76 | 77 | total_loss = 0 78 | total_correct = 0 79 | for data, target in data_loader: 80 | # Copy data and targets to GPU 81 | data = data.to(device) 82 | target = target.to(device).to(torch.float) 83 | 84 | # Do a forward pass 85 | output = model(data) 86 | 87 | # Calculate the loss 88 | loss = criterion(output, target) 89 | total_loss += loss 90 | num_batches += 1 91 | 92 | # Count number of correct 93 | total_correct += correct(output, target) 94 | num_items += len(target) 95 | 96 | # Backpropagation 97 | loss.backward() 98 | optimizer.step() 99 | optimizer.zero_grad() 100 | 101 | return { 102 | 'loss': total_loss/num_batches, 103 | 'accuracy': total_correct/num_items 104 | } 105 | 106 | 107 | def test(test_loader, model, criterion): 108 | model.eval() 109 | 110 | num_batches = len(test_loader) 111 | num_items = len(test_loader.dataset) 112 | 113 | test_loss = 0 114 | total_correct = 0 115 | 116 | with torch.no_grad(): 117 | for data, target in test_loader: 118 | # Copy data and targets to GPU 119 | data = data.to(device) 120 | target = target.to(device).to(torch.float) 121 | 122 | # Do a forward pass 123 | output = model(data) 124 | 125 | # Calculate the loss 126 | loss = criterion(output, target) 127 | test_loss += loss.item() 128 | 129 | # Count number of correct digits 130 | total_correct += correct(output, target) 131 | 132 | return { 133 | 'loss': test_loss/num_batches, 134 | 'accuracy': total_correct/num_items 135 | } 136 | 137 | 138 | def log_measures(ret, log, prefix, epoch): 139 | if log is not None: 140 | for key, value in ret.items(): 141 | log.add_scalar(prefix + "_" + key, value, epoch) 142 | 143 | 144 | def main(): 145 | # TensorBoard for logging 146 | try: 147 | import tensorboardX 148 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 149 | logdir = os.path.join(os.getcwd(), "logs", "dvc-" + time_str) 150 | print('TensorBoard log directory:', logdir) 151 | os.makedirs(logdir) 152 | log = tensorboardX.SummaryWriter(logdir) 153 | 
except ImportError: 154 | log = None 155 | 156 | # The training dataset consists of 2000 images of dogs and cats, split 157 | # in half. In addition, the validation set consists of 1000 images, 158 | # and the test set of 22000 images. 159 | # 160 | # First, we'll resize all training and validation images to a fixed 161 | # size. 162 | # 163 | # Then, to make the most of our limited number of training examples, 164 | # we'll apply random transformations to them each time we are looping 165 | # over them. This way, we "augment" our training dataset to contain 166 | # more data. There are various transformations available in 167 | # torchvision, see: 168 | # https://pytorch.org/docs/stable/torchvision/transforms.html 169 | 170 | datapath = os.getenv('DATADIR') 171 | if datapath is None: 172 | print("Please set DATADIR environment variable!") 173 | sys.exit(1) 174 | datapath = os.path.join(datapath, 'dogs-vs-cats/train-2000') 175 | 176 | input_image_size = (150, 150) 177 | 178 | data_transform = transforms.Compose([ 179 | transforms.Resize(input_image_size), 180 | transforms.RandomAffine(degrees=0, translate=None, 181 | scale=(0.8, 1.2), shear=0.2), 182 | transforms.RandomHorizontalFlip(), 183 | transforms.ToTensor() 184 | ]) 185 | 186 | noop_transform = transforms.Compose([ 187 | transforms.Resize(input_image_size), 188 | transforms.ToTensor() 189 | ]) 190 | 191 | # Data loaders 192 | batch_size = 25 193 | 194 | print('Train: ', end="") 195 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 196 | transform=data_transform) 197 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 198 | shuffle=True, num_workers=4) 199 | print('Found', len(train_dataset), 'images belonging to', 200 | len(train_dataset.classes), 'classes') 201 | 202 | print('Validation: ', end="") 203 | validation_dataset = datasets.ImageFolder(root=datapath+'/validation', 204 | transform=noop_transform) 205 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 206 | shuffle=False, num_workers=4) 207 | print('Found', len(validation_dataset), 'images belonging to', 208 | len(validation_dataset.classes), 'classes') 209 | 210 | print('Test: ', end="") 211 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 212 | transform=noop_transform) 213 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 214 | shuffle=False, num_workers=4) 215 | print('Found', len(test_dataset), 'images belonging to', 216 | len(test_dataset.classes), 'classes') 217 | 218 | # Define the network and training parameters 219 | model = Net() 220 | model = model.to(device) 221 | optimizer = optim.SGD(model.parameters(), lr=0.05) 222 | criterion = nn.BCELoss() 223 | 224 | print(model) 225 | 226 | num_epochs = 50 227 | 228 | # Training loop 229 | start_time = datetime.now() 230 | for epoch in range(num_epochs): 231 | train_ret = train(train_loader, model, criterion, optimizer) 232 | log_measures(train_ret, log, "train", epoch) 233 | 234 | val_ret = test(validation_loader, model, criterion) 235 | log_measures(val_ret, log, "val", epoch) 236 | print(f"Epoch {epoch+1}: " 237 | f"train loss: {train_ret['loss']:.6f} " 238 | f"train accuracy: {train_ret['accuracy']:.2%}, " 239 | f"val accuracy: {val_ret['accuracy']:.2%}") 240 | 241 | end_time = datetime.now() 242 | print('Total training time: {}.'.format(end_time - start_time)) 243 | 244 | # Inference 245 | ret = test(test_loader, model, criterion) 246 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 247 | 248 | 249 | if __name__ == "__main__": 250 | 
main() 251 | -------------------------------------------------------------------------------- /day2/pytorch_dvc_vit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Dogs-vs-cats classification with CNNs 5 | # 6 | # In this script, we'll finetune a Vision Transformer 7 | # (https://arxiv.org/abs/2010.11929) (ViT) to classify images of cats 8 | # and dogs using PyTorch and HuggingFace Transformers: 9 | # https://github.com/huggingface/transformers 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | from torch.utils.data import DataLoader 15 | from torchvision import datasets, transforms 16 | from packaging.version import Version as LV 17 | from datetime import datetime 18 | import os 19 | import sys 20 | 21 | from transformers import AutoImageProcessor, ViTForImageClassification 22 | from transformers import __version__ as transformers_version 23 | 24 | torch.manual_seed(42) 25 | 26 | if torch.cuda.is_available(): 27 | device = torch.device('cuda') 28 | else: 29 | device = torch.device('cpu') 30 | 31 | print('Using PyTorch version:', torch.__version__, 32 | 'Transformers version:', transformers_version, 33 | 'Device:', device) 34 | assert LV(torch.__version__) >= LV("1.0.0") 35 | 36 | 37 | def correct(output, target): 38 | class_pred = (output > 0).int() # set to 0 for <0.5, 1 for >0.5 39 | correct_ones = class_pred == target.int() # 1 for correct, 0 for incorrect 40 | return correct_ones.sum().item() # count number of correct ones 41 | 42 | 43 | def train(data_loader, model, criterion, optimizer): 44 | model.train() 45 | 46 | num_batches = 0 47 | num_items = 0 48 | 49 | total_loss = 0 50 | total_correct = 0 51 | 52 | for data, target in data_loader: 53 | # Copy data and targets to GPU 54 | data = data.to(device) 55 | target = target.to(device) 56 | 57 | # Do a forward pass 58 | output = model(data).logits.squeeze() 59 | 60 | # Calculate the loss 61 | loss = criterion(output, target) 62 | total_loss += loss 63 | num_batches += 1 64 | 65 | # Count number of correct 66 | total_correct += correct(output, target) 67 | num_items += len(target) 68 | 69 | # Backpropagation 70 | loss.backward() 71 | optimizer.step() 72 | optimizer.zero_grad() 73 | 74 | return { 75 | 'loss': total_loss/num_batches, 76 | 'accuracy': total_correct/num_items 77 | } 78 | 79 | 80 | def test(test_loader, model, criterion): 81 | model.eval() 82 | 83 | num_batches = len(test_loader) 84 | num_items = len(test_loader.dataset) 85 | 86 | test_loss = 0 87 | total_correct = 0 88 | 89 | with torch.no_grad(): 90 | for data, target in test_loader: 91 | # Copy data and targets to GPU 92 | data = data.to(device) 93 | target = target.to(device) 94 | 95 | # Do a forward pass 96 | output = model(data).logits.squeeze() 97 | 98 | # Calculate the loss 99 | loss = criterion(output, target) 100 | test_loss += loss.item() 101 | 102 | # Count number of correct digits 103 | total_correct += correct(output, target) 104 | 105 | return { 106 | 'loss': test_loss/num_batches, 107 | 'accuracy': total_correct/num_items 108 | } 109 | 110 | 111 | def log_measures(ret, log, prefix, epoch): 112 | if log is not None: 113 | for key, value in ret.items(): 114 | log.add_scalar(prefix + "_" + key, value, epoch) 115 | 116 | 117 | class ImageClassificationCollator: 118 | def __init__(self, processor): 119 | self.processor = processor 120 | 121 | def __call__(self, batch): 122 | data = self.processor([x[0] for x in batch], 
do_rescale=False, 123 | return_tensors='pt').pixel_values 124 | targets = torch.tensor([x[1] for x in batch], dtype=torch.float32) 125 | return data, targets 126 | 127 | 128 | def main(): 129 | # TensorBoard for logging 130 | try: 131 | import tensorboardX 132 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 133 | logdir = os.path.join(os.getcwd(), "logs", "dvc-vit-" + time_str) 134 | print('TensorBoard log directory:', logdir) 135 | os.makedirs(logdir) 136 | log = tensorboardX.SummaryWriter(logdir) 137 | except ImportError: 138 | log = None 139 | 140 | # The training dataset consists of 2000 images of dogs and cats, split 141 | # in half. In addition, the validation set consists of 1000 images, 142 | # and the test set of 22000 images. 143 | # 144 | # First, we'll resize all training and validation images to a fixed 145 | # size. 146 | # 147 | # Then, to make the most of our limited number of training examples, 148 | # we'll apply random transformations to them each time we are looping 149 | # over them. This way, we "augment" our training dataset to contain 150 | # more data. There are various transformations available in 151 | # torchvision, see: 152 | # https://pytorch.org/docs/stable/torchvision/transforms.html 153 | 154 | datapath = os.getenv('DATADIR') 155 | if datapath is None: 156 | print("Please set DATADIR environment variable!") 157 | sys.exit(1) 158 | datapath = os.path.join(datapath, 'dogs-vs-cats/train-2000') 159 | 160 | # Data loaders 161 | batch_size = 32 162 | 163 | vitmodel = 'google/vit-base-patch16-224' 164 | processor = AutoImageProcessor.from_pretrained(vitmodel) 165 | collator = ImageClassificationCollator(processor) 166 | 167 | print('Train: ', end="") 168 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 169 | transform=transforms.ToTensor()) 170 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 171 | shuffle=True, num_workers=4, 172 | collate_fn=collator) 173 | print('Found', len(train_dataset), 'images belonging to', 174 | len(train_dataset.classes), 'classes') 175 | 176 | print('Validation: ', end="") 177 | validation_dataset = datasets.ImageFolder(root=datapath+'/validation', 178 | transform=transforms.ToTensor()) 179 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 180 | shuffle=False, num_workers=4, 181 | collate_fn=collator) 182 | print('Found', len(validation_dataset), 'images belonging to', 183 | len(validation_dataset.classes), 'classes') 184 | 185 | print('Test: ', end="") 186 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 187 | transform=transforms.ToTensor()) 188 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 189 | shuffle=False, num_workers=4, 190 | collate_fn=collator) 191 | print('Found', len(test_dataset), 'images belonging to', 192 | len(test_dataset.classes), 'classes') 193 | 194 | # Define the network and training parameters 195 | model = ViTForImageClassification.from_pretrained( 196 | vitmodel, num_labels=1, ignore_mismatched_sizes=True) 197 | model = model.to(device) 198 | optimizer = optim.Adam(model.parameters(), lr=1e-5) 199 | criterion = nn.BCEWithLogitsLoss() 200 | 201 | print(model) 202 | 203 | num_epochs = 10 204 | 205 | # Training loop 206 | start_time = datetime.now() 207 | for epoch in range(num_epochs): 208 | train_ret = train(train_loader, model, criterion, optimizer) 209 | log_measures(train_ret, log, "train", epoch) 210 | 211 | val_ret = test(validation_loader, model, criterion) 212 | log_measures(val_ret, log, "val", epoch) 213 | 
print(f"Epoch {epoch+1}: " 214 | f"train loss: {train_ret['loss']:.6f} " 215 | f"train accuracy: {train_ret['accuracy']:.2%}, " 216 | f"val accuracy: {val_ret['accuracy']:.2%}") 217 | 218 | end_time = datetime.now() 219 | print('Total training time: {}.'.format(end_time - start_time)) 220 | 221 | # Inference 222 | ret = test(test_loader, model, criterion) 223 | print(f"\nTesting: accuracy: {ret['accuracy']:.2%}") 224 | 225 | 226 | if __name__ == "__main__": 227 | main() 228 | -------------------------------------------------------------------------------- /day2/pytorch_generate_gpt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c0e6e42c-a2a3-4ba9-be85-056035147486", 6 | "metadata": {}, 7 | "source": [ 8 | "# IMDB movie review text generation\n", 9 | "\n", 10 | "Once you have fine-tuned your model you can test it interactively with this notebook." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "fa458fec-a1e9-4960-9a9f-c7f21d0a7b6e", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from transformers import pipeline\n", 21 | "\n", 22 | "path_to_model = \"/scratch/project_462000863/data/users/YOUR_USERNAME_HERE/gpt-imdb-model/checkpoint-5000/\"\n", 23 | "generator = pipeline(\"text-generation\", model=path_to_model)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "3a5ecc40-1c1d-4c9d-a41c-937bbbbaf025", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def print_output(output):\n", 34 | " for item in output:\n", 35 | " text = item['generated_text']\n", 36 | " text = text.replace(\"
\", \"\\n\")\n", 37 | " print('-', text)\n", 38 | " print()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "bf677501-f93d-46b1-a618-0fb792cd44cd", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "output = generator(\"This movie was\")\n", 49 | "print_output(output)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "73fb8536-887f-4fad-a0b3-190d1749a594", 55 | "metadata": {}, 56 | "source": [ 57 | "## Experiment with the generation strategy\n", 58 | "\n", 59 | "You can play with the text generation if you wish. Text generation strategies are discussed here: https://huggingface.co/docs/transformers/generation_strategies\n", 60 | "\n", 61 | "Note that we are here using the easy-to-use `TextGenerationPipeline` and its `generator()` function, but the link discusses the `model.generate()` method. The same parameters can be used, though, the pipeline just takes care of some of the pre- and post-processing.\n", 62 | "\n", 63 | "In particular these parameters of the `generator()` function might be interesting:\n", 64 | "\n", 65 | "- `max_new_tokens`: the maximum number of tokens to generate\n", 66 | "- `num_beams`: activate Beam search by setting this > 1\n", 67 | "- `do_sample`: activate multinomial sampling if set to True\n", 68 | "- `num_return_sequences`: the number of candidate sentences to return (available only for beam search and sampling)\n", 69 | "\n", 70 | "Here is a nice blog post explaining in more detail about the different generation strategies: https://huggingface.co/blog/how-to-generate" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "6816b3f3-9a0f-4ca8-a7d9-d7962b0207fc", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "output = generator(\"This movie was awful because\", num_return_sequences=1, max_new_tokens=100, do_sample=True)\n", 81 | "print_output(output)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "df008ff8-cb03-488f-b643-4aa2314de52c", 87 | "metadata": {}, 88 | "source": [ 89 | "## Compare with the original model without fine-tuning\n", 90 | "\n", 91 | "We can also load the original `distilgpt2` model and see how it would have worked without fine-tuning." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "3ba1f550-970e-419a-aaff-d4e821bacc87", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "generator_orig = pipeline(\"text-generation\", model='distilgpt2')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "4995c393-29ad-4df1-b01a-83cd85008297", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "output = generator_orig(\"This movie was awful because\", num_return_sequences=1, max_new_tokens=100, do_sample=True)\n", 112 | "print_output(output)" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.10.12" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 5 137 | } 138 | -------------------------------------------------------------------------------- /day2/pytorch_gtsrb_cnn_pretrained.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Traffic sign classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of traffic signs from the German Traffic Sign 8 | # Recognition Benchmark: 9 | # https://benchmark.ini.rub.de/gtsrb_news.html 10 | # 11 | # ## Option 2: Reuse a pre-trained CNN 12 | # 13 | # Here we'll use the VGG16 pre-trained network: 14 | # https://pytorch.org/docs/stable/torchvision/models.html#torchvision.models.vgg16 15 | # 16 | # It has weights learned using ImageNet. We remove the top layers and 17 | # freeze the pre-trained weights, and then stack our own, randomly 18 | # initialized, layers on top of the VGG16 network. 19 | # 20 | 21 | import torch 22 | import torch.nn as nn 23 | import torch.optim as optim 24 | from torch.utils.data import DataLoader 25 | from torchvision import datasets, transforms, models 26 | from packaging.version import Version as LV 27 | from datetime import datetime 28 | import os 29 | import sys 30 | 31 | torch.manual_seed(42) 32 | 33 | if torch.cuda.is_available(): 34 | device = torch.device('cuda') 35 | else: 36 | device = torch.device('cpu') 37 | 38 | print('Using PyTorch version:', torch.__version__, ' Device:', device) 39 | assert LV(torch.__version__) >= LV("1.0.0") 40 | 41 | 42 | class PretrainedNet(nn.Module): 43 | def __init__(self): 44 | super(PretrainedNet, self).__init__() 45 | self.vgg_features = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features 46 | 47 | # Freeze the VGG16 layers 48 | for param in self.vgg_features.parameters(): 49 | param.requires_grad = False 50 | 51 | # Add our own layers on top 52 | self.own_layers = nn.Sequential( 53 | nn.Flatten(), 54 | nn.Linear(512*2*2, 256), 55 | nn.ReLU(), 56 | nn.Linear(256, 43) 57 | ) 58 | 59 | def forward(self, x): 60 | x = self.vgg_features(x) 61 | return self.own_layers(x).squeeze() 62 | 63 | # 64 | # There are some broken folders, but we need to keep the class indices 65 | # the same. We created a custom Dataset class to handle this. 
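# (Two notes. First, the 512*2*2 classifier input above follows from the
# 75x75 inputs used in this script: five halvings in VGG16's feature
# extractor leave 512 feature maps of size 2x2 (75 -> 37 -> 18 -> 9 -> 4
# -> 2). Second, the class below is used later in this script for the
# validation set, which is missing two folders, roughly:
#
#     validation_dataset = ImageFolderRemoveDirs(root=..., transform=...,
#                                                remove_dirs=['00027', '00039'])
# )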
66 | # 67 | class ImageFolderRemoveDirs(datasets.ImageFolder): 68 | def __init__(self, root, transform, remove_dirs): 69 | self.remove_dirs = remove_dirs 70 | super(ImageFolderRemoveDirs, self).__init__(root=root, transform=transform) 71 | 72 | def find_classes(self, directory): 73 | classes, class_to_idx = super(ImageFolderRemoveDirs, self).find_classes(directory) 74 | for d in self.remove_dirs: 75 | print('Removing directory', d) 76 | classes.remove(d) 77 | del class_to_idx[d] 78 | return classes, class_to_idx 79 | 80 | def correct(output, target): 81 | predicted = output.argmax(1) # pick class with largest network output 82 | correct_ones = (predicted == target).type(torch.float) 83 | return correct_ones.sum().item() # count number of correct ones 84 | 85 | def train(data_loader, model, criterion, optimizer): 86 | model.train() 87 | 88 | num_batches = 0 89 | num_items = 0 90 | 91 | total_loss = 0 92 | total_correct = 0 93 | for data, target in data_loader: 94 | # Copy data and targets to GPU 95 | data = data.to(device) 96 | target = target.to(device) 97 | 98 | # Do a forward pass 99 | output = model(data) 100 | 101 | # Calculate the loss 102 | loss = criterion(output, target) 103 | total_loss += loss 104 | num_batches += 1 105 | 106 | # Count number of correct 107 | total_correct += correct(output, target) 108 | num_items += len(target) 109 | 110 | # Backpropagation 111 | loss.backward() 112 | optimizer.step() 113 | optimizer.zero_grad() 114 | 115 | return { 116 | 'loss': total_loss/num_batches, 117 | 'accuracy': total_correct/num_items 118 | } 119 | 120 | 121 | def test(test_loader, model, criterion): 122 | model.eval() 123 | 124 | num_batches = len(test_loader) 125 | num_items = len(test_loader.dataset) 126 | 127 | test_loss = 0 128 | total_correct = 0 129 | 130 | with torch.no_grad(): 131 | for data, target in test_loader: 132 | # Copy data and targets to GPU 133 | data = data.to(device) 134 | target = target.to(device) 135 | 136 | # Do a forward pass 137 | output = model(data) 138 | 139 | # Calculate the loss 140 | loss = criterion(output, target) 141 | test_loss += loss.item() 142 | 143 | # Count number of correct digits 144 | total_correct += correct(output, target) 145 | 146 | return { 147 | 'loss': test_loss/num_batches, 148 | 'accuracy': total_correct/num_items 149 | } 150 | 151 | 152 | def log_measures(ret, log, prefix, epoch): 153 | if log is not None: 154 | for key, value in ret.items(): 155 | log.add_scalar(prefix + "_" + key, value, epoch) 156 | 157 | 158 | def main(): 159 | # TensorBoard for logging 160 | try: 161 | import tensorboardX 162 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 163 | logdir = os.path.join(os.getcwd(), "logs", "gtsrb-pretrained-" + time_str) 164 | print('TensorBoard log directory:', logdir) 165 | os.makedirs(logdir) 166 | log = tensorboardX.SummaryWriter(logdir) 167 | except ImportError: 168 | log = None 169 | 170 | # The training dataset consists of 5535 images of traffic signs of 171 | # varying size. There are 43 different types of traffic signs. 172 | # 173 | # The validation and test sets consist of 999 and 12630 images, 174 | # respectively. 175 | # 176 | # Then, to make the most of our limited number of training examples, 177 | # we'll apply random transformations to them each time we are looping 178 | # over them. This way, we "augment" our training dataset to contain 179 | # more data. 
There are various transformations available in 180 | # torchvision, see: 181 | # https://pytorch.org/docs/stable/torchvision/transforms.html 182 | 183 | datapath = os.getenv('DATADIR') 184 | if datapath is None: 185 | print("Please set DATADIR environment variable!") 186 | sys.exit(1) 187 | datapath = os.path.join(datapath, 'gtsrb/train-5535') 188 | 189 | input_image_size = (75, 75) 190 | 191 | data_transform = transforms.Compose([ 192 | transforms.Resize(input_image_size), 193 | transforms.RandomAffine(degrees=0, translate=None, 194 | scale=(0.8, 1.2), shear=0.2), 195 | # transforms.RandomHorizontalFlip(), 196 | transforms.ToTensor() 197 | ]) 198 | 199 | noop_transform = transforms.Compose([ 200 | transforms.Resize(input_image_size), 201 | transforms.ToTensor() 202 | ]) 203 | 204 | # Data loaders 205 | batch_size = 25 206 | 207 | print('Train: ', end="") 208 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 209 | transform=data_transform) 210 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 211 | shuffle=True, num_workers=4) 212 | print('Found', len(train_dataset), 'images belonging to', 213 | len(train_dataset.classes), 'classes') 214 | 215 | print('Validation: ', end="") 216 | validation_dataset = ImageFolderRemoveDirs(root=datapath+'/validation', 217 | transform=noop_transform, 218 | remove_dirs=['00027', '00039']) 219 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 220 | shuffle=False, num_workers=4) 221 | print('Found', len(validation_dataset), 'images belonging to', 222 | len(validation_dataset.classes), 'classes') 223 | 224 | print('Test: ', end="") 225 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 226 | transform=noop_transform) 227 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 228 | shuffle=False, num_workers=4) 229 | print('Found', len(test_dataset), 'images belonging to', 230 | len(test_dataset.classes), 'classes') 231 | 232 | # Define the network and training parameters 233 | model = PretrainedNet() 234 | model = model.to(device) 235 | optimizer = optim.SGD(model.parameters(), lr=0.01) 236 | criterion = nn.CrossEntropyLoss() 237 | 238 | print(model) 239 | 240 | num_epochs = 20 241 | 242 | # Training loop 243 | start_time = datetime.now() 244 | for epoch in range(num_epochs): 245 | train_ret = train(train_loader, model, criterion, optimizer) 246 | log_measures(train_ret, log, "train", epoch) 247 | 248 | val_ret = test(validation_loader, model, criterion) 249 | log_measures(val_ret, log, "val", epoch) 250 | print(f"Epoch {epoch+1}: " 251 | f"train accuracy: {train_ret['accuracy']:.2%}, " 252 | f"val accuracy: {val_ret['accuracy']:.2%}") 253 | 254 | end_time = datetime.now() 255 | print('Total training time: {}.'.format(end_time - start_time)) 256 | 257 | # Inference 258 | ret = test(test_loader, model, criterion) 259 | print("\nTesting (pretrained, before fine-tuning): " 260 | f"accuracy: {ret['accuracy']:.2%}\n") 261 | 262 | # Fine-tuning 263 | # 264 | # Once the top layers have learned some reasonable weights, we can 265 | # continue training by unfreezing the last convolution block of 266 | # VGG16 so that it may adapt to our data. The learning rate should 267 | # be smaller than usual. 268 | # 269 | # Below we loop over all layers and set only the last three Conv2d 270 | # layers to trainable. In the printout we mark trainable layers 271 | # with '+', frozen with '-'. Other layers don't have trainable 272 | # parameters. 
273 | 274 | print("Marking layers for training (+) or frozen (-):") 275 | for name, layer in model.vgg_features.named_children(): 276 | note = ' ' 277 | for param in layer.parameters(): 278 | note = '-' 279 | if int(name) >= 24: 280 | param.requires_grad = True 281 | note = '+' 282 | print(name, note, layer, len(param)) 283 | 284 | # We set up the training, note that we need to give only the 285 | # parameters that are set to be trainable. 286 | params = filter(lambda p: p.requires_grad, model.parameters()) 287 | #optimizer = optim.SGD(model.parameters(), lr=1e-3) 288 | optimizer = optim.RMSprop(params, lr=1e-5) 289 | criterion = nn.CrossEntropyLoss() 290 | 291 | # Note that before continuing the training, we create a separate 292 | # TensorBoard log directory. 293 | if log is not None: 294 | logdir_pt = logdir + '-pretrained-finetune' 295 | os.makedirs(logdir_pt) 296 | log = tensorboardX.SummaryWriter(logdir_pt) 297 | 298 | prev_epochs = num_epochs 299 | num_epochs = 20 300 | 301 | start_time = datetime.now() 302 | for epoch in range(prev_epochs, prev_epochs+num_epochs): 303 | train_ret = train(train_loader, model, criterion, optimizer) 304 | log_measures(train_ret, log, "train", epoch) 305 | 306 | val_ret = test(validation_loader, model, criterion) 307 | log_measures(val_ret, log, "val", epoch) 308 | 309 | print(f"Epoch {epoch+1}: " 310 | f"train loss: {train_ret['loss']:.6f} " 311 | f"train accuracy: {train_ret['accuracy']:.2%}, " 312 | f"val accuracy: {val_ret['accuracy']:.2%}") 313 | 314 | end_time = datetime.now() 315 | print('Total fine-tuning time: {}.'.format(end_time - start_time)) 316 | 317 | # Inference 318 | ret = test(test_loader, model, criterion) 319 | print("\nTesting (pretrained, after fine-tuning): " 320 | f"accuracy: {ret['accuracy']:.2%}\n") 321 | 322 | 323 | if __name__ == "__main__": 324 | main() 325 | -------------------------------------------------------------------------------- /day2/pytorch_gtsrb_cnn_simple.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Traffic sign classification with CNNs 5 | # 6 | # In this script, we'll train a convolutional neural network (CNN) to 7 | # classify images of traffic signs from the German Traffic Sign 8 | # Recognition Benchmark: 9 | # https://benchmark.ini.rub.de/gtsrb_news.html 10 | # 11 | # ## Option 1: Train a small CNN from scratch 12 | # 13 | # Similarly as with MNIST digits, we can start from scratch and train 14 | # a CNN for the classification task. However, due to the small number 15 | # of training images, a large network will easily overfit, regardless 16 | # of the data augmentation. 
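# For reference, the 7*7*64 flatten size in the network below follows from
# the 75x75 inputs: each 3x3 convolution (no padding) trims 2 pixels and
# each 2x2 max-pool halves the result, so 75 -> 73 -> 36 -> 34 -> 17 ->
# 15 -> 7, leaving 64 feature maps of 7x7. As a one-line check:
#
#   (((75 - 2)//2 - 2)//2 - 2)//2    # -> 7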
17 | 
18 | import torch
19 | import torch.nn as nn
20 | import torch.nn.functional as F
21 | import torch.optim as optim
22 | from torch.utils.data import DataLoader
23 | from torchvision import datasets, transforms
24 | from packaging.version import Version as LV
25 | from datetime import datetime
26 | import os
27 | import sys
28 | 
29 | torch.manual_seed(42)
30 | 
31 | if torch.cuda.is_available():
32 |     device = torch.device('cuda')
33 | else:
34 |     device = torch.device('cpu')
35 | 
36 | print('Using PyTorch version:', torch.__version__, ' Device:', device)
37 | assert(LV(torch.__version__) >= LV("1.0.0"))
38 | 
39 | 
40 | class Net(nn.Module):
41 |     def __init__(self):
42 |         super(Net, self).__init__()
43 |         self.layers = nn.Sequential(
44 |             nn.Conv2d(3, 32, (3, 3)),
45 |             nn.ReLU(),
46 |             nn.MaxPool2d((2, 2)),
47 | 
48 |             nn.Conv2d(32, 32, (3, 3)),
49 |             nn.ReLU(),
50 |             nn.MaxPool2d((2, 2)),
51 | 
52 |             nn.Conv2d(32, 64, (3, 3)),
53 |             nn.ReLU(),
54 |             nn.MaxPool2d((2, 2)),
55 | 
56 |             nn.Flatten(),  # flatten 2D to 1D
57 |             nn.Linear(7*7*64, 128),
58 |             nn.ReLU(),
59 |             nn.Dropout(0.5),
60 |             nn.Linear(128, 43)
61 |         )
62 | 
63 |     def forward(self, x):
64 |         return self.layers(x).squeeze()
65 | 
66 | 
67 | #
68 | # The validation set has some broken class folders, but we need to keep
69 | # the class indices the same, so we use a custom Dataset class.
70 | #
71 | class ImageFolderRemoveDirs(datasets.ImageFolder):
72 |     def __init__(self, root, transform, remove_dirs):
73 |         self.remove_dirs = remove_dirs
74 |         super(ImageFolderRemoveDirs, self).__init__(root=root, transform=transform)
75 | 
76 |     def find_classes(self, directory):
77 |         classes, class_to_idx = super(ImageFolderRemoveDirs, self).find_classes(directory)
78 |         for d in self.remove_dirs:
79 |             print('Removing directory', d)
80 |             classes.remove(d)
81 |             del class_to_idx[d]
82 |         return classes, class_to_idx
83 | 
84 | 
85 | def correct(output, target):
86 |     predicted = output.argmax(1)  # pick class with largest network output
87 |     correct_ones = (predicted == target).type(torch.float)
88 |     return correct_ones.sum().item()  # count number of correct ones
89 | 
90 | 
91 | def train(data_loader, model, criterion, optimizer):
92 |     model.train()
93 | 
94 |     num_batches = 0
95 |     num_items = 0
96 | 
97 |     total_loss = 0
98 |     total_correct = 0
99 |     for data, target in data_loader:
100 |         # Copy data and targets to GPU
101 |         data = data.to(device)
102 |         target = target.to(device)
103 | 
104 |         # Do a forward pass
105 |         output = model(data)
106 | 
107 |         # Calculate the loss
108 |         loss = criterion(output, target)
109 |         total_loss += loss.item()  # .item() avoids keeping autograd graphs alive
110 |         num_batches += 1
111 | 
112 |         # Count number of correct
113 |         total_correct += correct(output, target)
114 |         num_items += len(target)
115 | 
116 |         # Backpropagation
117 |         loss.backward()
118 |         optimizer.step()
119 |         optimizer.zero_grad()
120 | 
121 |     return {
122 |         'loss': total_loss/num_batches,
123 |         'accuracy': total_correct/num_items
124 |     }
125 | 
126 | 
127 | def test(test_loader, model, criterion):
128 |     model.eval()
129 | 
130 |     num_batches = len(test_loader)
131 |     num_items = len(test_loader.dataset)
132 | 
133 |     test_loss = 0
134 |     total_correct = 0
135 | 
136 |     with torch.no_grad():
137 |         for data, target in test_loader:
138 |             # Copy data and targets to GPU
139 |             data = data.to(device)
140 |             target = target.to(device)
141 | 
142 |             # Do a forward pass
143 |             output = model(data)
144 | 
145 |             # Calculate the loss
146 |             loss = criterion(output, target)
147 |             test_loss += loss.item()
148 | 
149 |             # Count number of correct predictions
150 |             total_correct += 
correct(output, target) 151 | 152 | return { 153 | 'loss': test_loss/num_batches, 154 | 'accuracy': total_correct/num_items 155 | } 156 | 157 | 158 | def log_measures(ret, log, prefix, epoch): 159 | if log is not None: 160 | for key, value in ret.items(): 161 | log.add_scalar(prefix + "_" + key, value, epoch) 162 | 163 | 164 | def main(): 165 | # TensorBoard for logging 166 | try: 167 | import tensorboardX 168 | time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 169 | logdir = os.path.join(os.getcwd(), "logs", "gtsrb-" + time_str) 170 | print('TensorBoard log directory:', logdir) 171 | os.makedirs(logdir) 172 | log = tensorboardX.SummaryWriter(logdir) 173 | except ImportError: 174 | log = None 175 | 176 | 177 | # The training dataset consists of 5535 images of traffic signs of 178 | # varying size. There are 43 different types of traffic signs. 179 | # 180 | # The validation and test sets consist of 999 and 12630 images, 181 | # respectively. 182 | 183 | # First, we'll resize all training and validation images to a fixed 184 | # size. 185 | # 186 | # Then, to make the most of our limited number of training examples, 187 | # we'll apply random transformations to them each time we are looping 188 | # over them. This way, we "augment" our training dataset to contain 189 | # more data. There are various transformations available in 190 | # torchvision, see: 191 | # https://pytorch.org/docs/stable/torchvision/transforms.html 192 | 193 | datapath = os.getenv('DATADIR') 194 | if datapath is None: 195 | print("Please set DATADIR environment variable!") 196 | sys.exit(1) 197 | datapath = os.path.join(datapath, 'gtsrb/train-5535') 198 | 199 | input_image_size = (75, 75) 200 | 201 | data_transform = transforms.Compose([ 202 | transforms.Resize(input_image_size), 203 | transforms.RandomAffine(degrees=0, translate=None, 204 | scale=(0.8, 1.2), shear=0.2), 205 | # transforms.RandomHorizontalFlip(), 206 | transforms.ToTensor() 207 | ]) 208 | 209 | noop_transform = transforms.Compose([ 210 | transforms.Resize(input_image_size), 211 | transforms.ToTensor() 212 | ]) 213 | 214 | # Data loaders 215 | batch_size = 50 216 | 217 | print('Train: ', end="") 218 | train_dataset = datasets.ImageFolder(root=datapath+'/train', 219 | transform=data_transform) 220 | train_loader = DataLoader(train_dataset, batch_size=batch_size, 221 | shuffle=True, num_workers=4) 222 | print('Found', len(train_dataset), 'images belonging to', 223 | len(train_dataset.classes), 'classes') 224 | 225 | print('Validation: ', end="") 226 | validation_dataset = ImageFolderRemoveDirs(root=datapath+'/validation', 227 | transform=noop_transform, 228 | remove_dirs=['00027', '00039']) 229 | validation_loader = DataLoader(validation_dataset, batch_size=batch_size, 230 | shuffle=False, num_workers=4) 231 | print('Found', len(validation_dataset), 'images belonging to', 232 | len(validation_dataset.classes), 'classes') 233 | 234 | print('Test: ', end="") 235 | test_dataset = datasets.ImageFolder(root=datapath+'/test', 236 | transform=noop_transform) 237 | test_loader = DataLoader(test_dataset, batch_size=batch_size, 238 | shuffle=False, num_workers=4) 239 | print('Found', len(test_dataset), 'images belonging to', 240 | len(test_dataset.classes), 'classes') 241 | 242 | # Define the network and training parameters 243 | model = Net() 244 | model = model.to(device) 245 | optimizer = optim.SGD(model.parameters(), lr=0.03) 246 | #optimizer = optim.RMSprop(model.parameters()) 247 | criterion = nn.CrossEntropyLoss() 248 | 249 | print(model) 250 | 251 | 
num_epochs = 50
252 | 
253 |     # Training loop
254 |     start_time = datetime.now()
255 |     for epoch in range(num_epochs):
256 |         train_ret = train(train_loader, model, criterion, optimizer)
257 |         log_measures(train_ret, log, "train", epoch)
258 | 
259 |         val_ret = test(validation_loader, model, criterion)
260 |         log_measures(val_ret, log, "val", epoch)
261 |         print(f"Epoch {epoch+1}: "
262 |               f"train loss: {train_ret['loss']:.6f} "
263 |               f"train accuracy: {train_ret['accuracy']:.2%}, "
264 |               f"val accuracy: {val_ret['accuracy']:.2%}")
265 | 
266 |     end_time = datetime.now()
267 |     print('Total training time: {}.'.format(end_time - start_time))
268 | 
269 |     # Inference
270 |     ret = test(test_loader, model, criterion)
271 |     print(f"\nTesting: accuracy: {ret['accuracy']:.2%}")
272 | 
273 | 
274 | if __name__ == "__main__":
275 |     main()
276 | 
--------------------------------------------------------------------------------
/day2/pytorch_gtsrb_vit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # # Traffic sign classification with Vision Transformers
5 | #
6 | # In this script, we'll fine-tune a Vision Transformer
7 | # (https://arxiv.org/abs/2010.11929) (ViT) to classify images of
8 | # traffic signs using PyTorch and HuggingFace Transformers:
9 | # https://github.com/huggingface/transformers
10 | 
11 | import torch
12 | import torch.nn as nn
13 | import torch.optim as optim
14 | from torch.utils.data import DataLoader
15 | from torchvision import datasets, transforms
16 | from packaging.version import Version as LV
17 | from datetime import datetime
18 | import os
19 | import sys
20 | 
21 | from transformers import AutoImageProcessor, ViTForImageClassification
22 | from transformers import __version__ as transformers_version
23 | 
24 | torch.manual_seed(42)
25 | 
26 | if torch.cuda.is_available():
27 |     device = torch.device('cuda')
28 | else:
29 |     device = torch.device('cpu')
30 | 
31 | print('Using PyTorch version:', torch.__version__,
32 |       'Transformers version:', transformers_version,
33 |       'Device:', device)
34 | assert LV(torch.__version__) >= LV("1.0.0")
35 | 
36 | #
37 | # The validation set has some broken class folders, but we need to keep
38 | # the class indices the same, so we use a custom Dataset class.
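# The trick in the class below is that it deletes the removed folders
# from class_to_idx instead of letting ImageFolder rebuild the mapping,
# so the surviving classes keep their original training-set indices.
# A minimal sketch of the difference:
#
#     classes = ['00026', '00027', '00028']
#     class_to_idx = {c: i for i, c in enumerate(classes)}  # 00028 -> 2
#     del class_to_idx['00027']  # keeps 00028 -> 2, labels stay aligned
#     # re-enumerating the remaining classes would give 00028 -> 1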
39 | #
40 | class ImageFolderRemoveDirs(datasets.ImageFolder):
41 |     def __init__(self, root, transform, remove_dirs):
42 |         self.remove_dirs = remove_dirs
43 |         super(ImageFolderRemoveDirs, self).__init__(root=root, transform=transform)
44 | 
45 |     def find_classes(self, directory):
46 |         classes, class_to_idx = super(ImageFolderRemoveDirs, self).find_classes(directory)
47 |         for d in self.remove_dirs:
48 |             print('Removing directory', d)
49 |             classes.remove(d)
50 |             del class_to_idx[d]
51 |         return classes, class_to_idx
52 | 
53 | 
54 | def correct(output, target):
55 |     predicted = output.argmax(1)  # pick class with largest network output
56 |     correct_ones = (predicted == target).type(torch.float)
57 |     return correct_ones.sum().item()  # count number of correct ones
58 | 
59 | def train(data_loader, model, criterion, optimizer):
60 |     model.train()
61 | 
62 |     num_batches = 0
63 |     num_items = 0
64 | 
65 |     total_loss = 0
66 |     total_correct = 0
67 | 
68 |     for data, target in data_loader:
69 |         # Copy data and targets to GPU
70 |         data = data.to(device)
71 |         target = target.to(device)
72 | 
73 |         # Do a forward pass; the class scores are in the .logits
74 |         # attribute of the HuggingFace model output
75 |         output = model(data).logits
76 | 
77 |         # Calculate the loss
78 |         loss = criterion(output, target)
79 |         total_loss += loss.item()
80 |         num_batches += 1
81 | 
82 |         # Count number of correct
83 |         total_correct += correct(output, target)
84 |         num_items += len(target)
85 | 
86 |         # Backpropagation
87 |         loss.backward()
88 |         optimizer.step()
89 |         optimizer.zero_grad()
90 | 
91 |     return {
92 |         'loss': total_loss/num_batches,
93 |         'accuracy': total_correct/num_items
94 |     }
95 | 
96 | 
97 | def test(test_loader, model, criterion):
98 |     model.eval()
99 | 
100 |     num_batches = len(test_loader)
101 |     num_items = len(test_loader.dataset)
102 | 
103 |     test_loss = 0
104 |     total_correct = 0
105 | 
106 |     with torch.no_grad():
107 |         for data, target in test_loader:
108 |             # Copy data and targets to GPU
109 |             data = data.to(device)
110 |             target = target.to(device)
111 | 
112 |             # Do a forward pass
113 |             output = model(data).logits
114 | 
115 |             # Calculate the loss
116 |             loss = criterion(output, target)
117 |             test_loss += loss.item()
118 | 
119 |             # Count number of correct predictions
120 |             total_correct += correct(output, target)
121 | 
122 |     return {
123 |         'loss': test_loss/num_batches,
124 |         'accuracy': total_correct/num_items
125 |     }
126 | 
127 | 
128 | def log_measures(ret, log, prefix, epoch):
129 |     if log is not None:
130 |         for key, value in ret.items():
131 |             log.add_scalar(prefix + "_" + key, value, epoch)
132 | 
133 | 
134 | class ImageClassificationCollator:
135 |     def __init__(self, processor):
136 |         self.processor = processor
137 | 
138 |     def __call__(self, batch):
139 |         data = self.processor([x[0] for x in batch], do_rescale=False,
140 |                               return_tensors='pt').pixel_values
141 |         targets = torch.tensor([x[1] for x in batch])
142 |         return data, targets
143 | 
144 | 
145 | def main():
146 |     # TensorBoard for logging
147 |     try:
148 |         import tensorboardX
149 |         time_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
150 |         logdir = os.path.join(os.getcwd(), "logs", "gtsrb-vit-" + time_str)
151 |         print('TensorBoard log directory:', logdir)
152 |         os.makedirs(logdir)
153 |         log = tensorboardX.SummaryWriter(logdir)
154 |     except ImportError:
155 |         log = None
156 | 
157 |     # The training dataset consists of 5535 images of traffic signs of
158 |     # varying size. There are 43 different types of traffic signs. The
159 |     # validation and test sets consist of 999 and 12630 images, respectively.
160 |     #
161 |     # Unlike in the CNN scripts, we don't resize or augment the images
162 |     # ourselves here. The transforms below only convert the images to
163 |     # tensors.
164 |     #
165 |     # Instead, the ViT image processor loaded further down resizes and
166 |     # normalizes each batch on the fly (via the collator) to the fixed
167 |     # input resolution and value range that the pretrained model
168 |     # expects; do_rescale=False tells it that ToTensor() has already
169 |     # scaled the pixel values to [0, 1].
170 | 
171 |     datapath = os.getenv('DATADIR')
172 |     if datapath is None:
173 |         print("Please set DATADIR environment variable!")
174 |         sys.exit(1)
175 |     datapath = os.path.join(datapath, 'gtsrb/train-5535')
176 | 
177 |     # Data loaders
178 |     batch_size = 32
179 | 
180 |     vitmodel = 'google/vit-base-patch16-224'
181 |     processor = AutoImageProcessor.from_pretrained(vitmodel)
182 |     collator = ImageClassificationCollator(processor)
183 | 
184 |     print('Train: ', end="")
185 |     train_dataset = datasets.ImageFolder(root=datapath+'/train',
186 |                                          transform=transforms.ToTensor())
187 |     train_loader = DataLoader(train_dataset, batch_size=batch_size,
188 |                               shuffle=True, num_workers=4,
189 |                               collate_fn=collator)
190 |     print('Found', len(train_dataset), 'images belonging to',
191 |           len(train_dataset.classes), 'classes')
192 | 
193 |     print('Validation: ', end="")
194 |     validation_dataset = ImageFolderRemoveDirs(root=datapath+'/validation',
195 |                                                transform=transforms.ToTensor(),
196 |                                                remove_dirs=['00027', '00039'])
197 |     validation_loader = DataLoader(validation_dataset, batch_size=batch_size,
198 |                                    shuffle=False, num_workers=4,
199 |                                    collate_fn=collator)
200 |     print('Found', len(validation_dataset), 'images belonging to',
201 |           len(validation_dataset.classes), 'classes')
202 | 
203 |     print('Test: ', end="")
204 |     test_dataset = datasets.ImageFolder(root=datapath+'/test',
205 |                                         transform=transforms.ToTensor())
206 |     test_loader = DataLoader(test_dataset, batch_size=batch_size,
207 |                              shuffle=False, num_workers=4,
208 |                              collate_fn=collator)
209 |     print('Found', len(test_dataset), 'images belonging to',
210 |           len(test_dataset.classes), 'classes')
211 | 
212 |     # Define the network and training parameters
213 |     model = ViTForImageClassification.from_pretrained(
214 |         vitmodel, num_labels=43, ignore_mismatched_sizes=True)
215 |     model = model.to(device)
216 |     optimizer = optim.Adam(model.parameters(), lr=1e-5)
217 |     criterion = nn.CrossEntropyLoss()
218 | 
219 |     print(model)
220 | 
221 |     num_epochs = 5
222 | 
223 |     # Training loop
224 |     start_time = datetime.now()
225 |     for epoch in range(num_epochs):
226 |         train_ret = train(train_loader, model, criterion, optimizer)
227 |         log_measures(train_ret, log, "train", epoch)
228 | 
229 |         val_ret = test(validation_loader, model, criterion)
230 |         log_measures(val_ret, log, "val", epoch)
231 |         print(f"Epoch {epoch+1}: "
232 |               f"train loss: {train_ret['loss']:.6f} "
233 |               f"train accuracy: {train_ret['accuracy']:.2%}, "
234 |               f"val accuracy: {val_ret['accuracy']:.2%}")
235 | 
236 |     end_time = datetime.now()
237 |     print('Total training time: {}.'.format(end_time - start_time))
238 | 
239 |     # Inference
240 |     ret = test(test_loader, model, criterion)
241 |     print(f"\nTesting: accuracy: {ret['accuracy']:.2%}")
242 | 
243 | 
244 | if __name__ == "__main__":
245 |     main()
246 | 
--------------------------------------------------------------------------------
/day2/pytorch_imdb_gpt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | # # 
IMDB movie review text generation 5 | # 6 | # In this script, we'll fine-tune a GPT3-like model to generate more 7 | # movie reviews based on a prompt. 8 | 9 | import math 10 | import os 11 | import sys 12 | import time 13 | from pprint import pprint 14 | 15 | import torch 16 | from datasets import load_dataset 17 | from transformers import ( 18 | AutoModelForCausalLM, 19 | AutoTokenizer, 20 | DataCollatorForLanguageModeling, 21 | PreTrainedTokenizerFast, 22 | Trainer, 23 | TrainingArguments 24 | ) 25 | 26 | 27 | def preprocess_data(train_dataset, eval_dataset, 28 | tokenizer: PreTrainedTokenizerFast, 29 | training_args: TrainingArguments): 30 | # IMDb examples are presented as a dictionary: 31 | # { 32 | # 'text': the review text as a string, 33 | # 'label': a sentiment label as an integer, 34 | # }. 35 | # 36 | # We tokenize the text and add the special token for indicating 37 | # the end of the text at the end of each review. We also truncate 38 | # reviews to a maximum length to avoid excessively long sequences 39 | # during training. As we have no use for the label, we discard 40 | # it. 41 | max_length = 128 42 | 43 | def tokenize(x): 44 | texts = [example + tokenizer.eos_token for example in x["text"]] 45 | return tokenizer( 46 | texts, 47 | max_length=max_length, 48 | truncation=True, 49 | add_special_tokens=True, 50 | return_overflowing_tokens=True, 51 | return_length=False, 52 | ) 53 | 54 | train_dataset_tokenized = train_dataset.map( 55 | tokenize, 56 | remove_columns=["text", "label"], 57 | batched=True, 58 | batch_size=training_args.train_batch_size, 59 | num_proc=training_args.dataloader_num_workers, 60 | ) 61 | 62 | eval_dataset_tokenized = eval_dataset.map( 63 | tokenize, 64 | remove_columns=["text", "label"], 65 | batched=True, 66 | num_proc=training_args.dataloader_num_workers, 67 | ) 68 | 69 | # We split a small amount of training data as "validation" test 70 | # set to keep track of evaluation of the loss on non-training data 71 | # during training. This is purely because computing the loss on 72 | # the full evaluation dataset takes much longer. 73 | train_validate_splits = train_dataset_tokenized.train_test_split( 74 | test_size=1000, seed=42, keep_in_memory=True 75 | ) 76 | train_dataset_tokenized = train_validate_splits["train"] 77 | validate_dataset_tokenized = train_validate_splits["test"] 78 | 79 | return (train_dataset_tokenized, validate_dataset_tokenized, 80 | eval_dataset_tokenized) 81 | 82 | 83 | if __name__ == "__main__": 84 | # Determine which device to train the model on, CPU or GPU 85 | print('Using PyTorch version:', torch.__version__) 86 | if torch.cuda.is_available(): 87 | device = torch.device('cuda') 88 | print('Using GPU, device name:', torch.cuda.get_device_name(0)) 89 | else: 90 | print('No GPU found, using CPU instead.') 91 | device = torch.device('cpu') 92 | 93 | # Use DATADIR environment variable to set path for data 94 | datapath = os.getenv('DATADIR') 95 | if datapath is None: 96 | print("Please set DATADIR environment variable!") 97 | sys.exit(1) 98 | user_datapath = os.path.join(datapath, "users", os.getenv('USER')) 99 | os.makedirs(user_datapath, exist_ok=True) 100 | 101 | # ## IMDB data set 102 | # 103 | # Next we'll load the IMDB data set, this time using the Hugging Face 104 | # datasets library: https://huggingface.co/docs/datasets/index. 
105 |     #
106 |     # The dataset contains 100,000 movie reviews from the Internet
107 |     # Movie Database: 25,000 labeled reviews for training, 25,000 for
108 |     # testing, and 50,000 unlabeled reviews (the "unsupervised" split).
109 | 
110 |     train_dataset = load_dataset("imdb", keep_in_memory=True,
111 |                                  split="train+unsupervised")
112 |     test_dataset = load_dataset("imdb", keep_in_memory=True,
113 |                                 split="test")
114 | 
115 |     # Let's print one sample from the dataset.
116 |     print('Sample from dataset')
117 |     pprint(train_dataset[200])
118 | 
119 |     # #### Loading the pretrained GPT-Neo model
120 |     #
121 |     # We'll use the gpt-neo models from the Hugging Face library:
122 |     # https://huggingface.co/EleutherAI/gpt-neo-125m
123 |     pretrained_model = "EleutherAI/gpt-neo-125m"
124 | 
125 |     # If you have time, you can also test with a larger 1.3 billion
126 |     # parameter version of the same model:
127 |     # https://huggingface.co/EleutherAI/gpt-neo-1.3B
128 | 
129 |     # pretrained_model = "EleutherAI/gpt-neo-1.3B"
130 | 
131 |     # Load the tokenizer associated with the model
132 |     print("Loading model and tokenizer")
133 |     start = time.time()
134 |     tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
135 |     tokenizer.pad_token = tokenizer.eos_token
136 | 
137 |     # Load the actual base model from Hugging Face
138 |     model = AutoModelForCausalLM.from_pretrained(pretrained_model)
139 |     model.to(device)
140 |     stop = time.time()
141 |     print(f"Loading model and tokenizer took: {stop-start:.2f} seconds")
142 | 
143 |     # Setting up the training configuration
144 |     train_batch_size = 32
145 |     test_batch_size = 128
146 | 
147 |     output_dir = os.path.join(user_datapath, "gpt-imdb-model")
148 |     training_args = TrainingArguments(
149 |         output_dir=output_dir,
150 |         overwrite_output_dir=True,
151 |         save_strategy="steps",  # save a snapshot of the model every
152 |         save_steps=100,         # 100 steps
153 |         save_total_limit=4,     # only keep the last 4 snapshots
154 |         logging_dir="logs",
155 |         eval_strategy="steps",
156 |         eval_steps=1000,  # compute validation loss every 1000 steps
157 |         learning_rate=2e-5,
158 |         weight_decay=0.01,
159 |         bf16=True,  # use bfloat16, a 16-bit floating point format
160 |         per_device_train_batch_size=train_batch_size,
161 |         per_device_eval_batch_size=test_batch_size,
162 |         max_steps=5000,
163 |         dataloader_num_workers=7,
164 |         dataloader_pin_memory=True,
165 |         report_to=["tensorboard"],  # log statistics for tensorboard
166 |     )
167 | 
168 |     # ## Preprocessing of training data
169 |     #
170 |     # We tokenize the data, split off a small validation set from the
171 |     # training data, and set up a collator that arranges the
172 |     # variable-length samples into padded batches.
173 | 
174 |     (train_dataset_tokenized, validate_dataset_tokenized,
175 |      test_dataset_tokenized) = preprocess_data(train_dataset,
176 |                                                test_dataset,
177 |                                                tokenizer,
178 |                                                training_args)
179 | 
180 |     collator = DataCollatorForLanguageModeling(
181 |         tokenizer, mlm=False, return_tensors="pt"
182 |     )
183 | 
184 |     # Sanity check: what does the training data look like after preprocessing?
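    # Each tokenized example is a dict of Python lists, typically
    # {'input_ids': [...], 'attention_mask': [...]}. The labels needed
    # for causal language modeling are added later by the collator:
    # with mlm=False, DataCollatorForLanguageModeling pads the batch and
    # sets labels to a copy of input_ids, with the padding positions
    # replaced by -100 so the loss ignores them. A quick sketch:
    #
    #     batch = collator([train_dataset_tokenized[i] for i in range(2)])
    #     print(batch.keys())  # input_ids, attention_mask, labels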
185 | print("Sample of tokenized data") 186 | for b in train_dataset_tokenized: 187 | pprint(b, compact=True) 188 | print("Length of input_ids:", len(b["input_ids"])) 189 | break 190 | print("Length of dataset (tokenized)", len(train_dataset_tokenized)) 191 | 192 | trainer = Trainer( 193 | model=model, 194 | args=training_args, 195 | tokenizer=tokenizer, 196 | data_collator=collator, 197 | train_dataset=train_dataset_tokenized, 198 | eval_dataset=validate_dataset_tokenized, 199 | ) 200 | 201 | trainer.train() 202 | 203 | print() 204 | print("Training done, you can find all the model checkpoints in", 205 | output_dir) 206 | 207 | with torch.no_grad(): 208 | model.eval() 209 | 210 | # Calculate perplexity 211 | validate_results = trainer.evaluate() 212 | test_results = trainer.evaluate(test_dataset_tokenized) 213 | 214 | print(f'Perplexity (val): {math.exp(validate_results["eval_loss"]):.2f}') 215 | print(f'Perplexity (test): {math.exp(test_results["eval_loss"]):.2f}') 216 | 217 | # Let's print a few sample generated reviews 218 | prompt = "The movie about LUMI AI Factory was great because" 219 | input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device) 220 | outputs = model.generate(input_ids, do_sample=True, max_length=80, 221 | num_return_sequences=4) 222 | decoded_outputs = tokenizer.batch_decode(outputs, 223 | skip_special_tokens=True) 224 | 225 | print('Sample generated review:') 226 | for txt in decoded_outputs: 227 | print('-', txt) 228 | -------------------------------------------------------------------------------- /day2/pytorch_imdb_gpt_multigpu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # IMDB movie review text generation 5 | # 6 | # In this script, we'll fine-tune a GPT3-like model to generate more 7 | # movie reviews based on a prompt. 8 | 9 | import math 10 | import os 11 | import sys 12 | import time 13 | from pprint import pprint 14 | 15 | import torch 16 | import torch.distributed as dist 17 | 18 | from datasets import load_dataset 19 | from transformers import ( 20 | AutoModelForCausalLM, 21 | AutoTokenizer, 22 | DataCollatorForLanguageModeling, 23 | PreTrainedTokenizerFast, 24 | Trainer, 25 | TrainingArguments 26 | ) 27 | 28 | 29 | def preprocess_data(train_dataset, eval_dataset, 30 | tokenizer: PreTrainedTokenizerFast, 31 | training_args: TrainingArguments): 32 | # IMDb examples are presented as a dictionary: 33 | # { 34 | # 'text': the review text as a string, 35 | # 'label': a sentiment label as an integer, 36 | # }. 37 | # 38 | # We tokenize the text and add the special token for indicating 39 | # the end of the text at the end of each review. We also truncate 40 | # reviews to a maximum length to avoid excessively long sequences 41 | # during training. As we have no use for the label, we discard 42 | # it. 
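    # Because return_overflowing_tokens=True is passed below, a review
    # longer than max_length is not simply cut off: the fast tokenizer
    # splits it into several rows of up to max_length tokens each, so
    # no training text is discarded. Roughly:
    #
    #     enc = tokenizer(["word " * 300], max_length=128, truncation=True,
    #                     return_overflowing_tokens=True)
    #     print(len(enc["input_ids"]))  # ~3 chunks instead of 1 row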
43 |     max_length = 128
44 | 
45 |     def tokenize(x):
46 |         texts = [example + tokenizer.eos_token for example in x["text"]]
47 |         return tokenizer(
48 |             texts,
49 |             max_length=max_length,
50 |             truncation=True,
51 |             add_special_tokens=True,
52 |             return_overflowing_tokens=True,
53 |             return_length=False,
54 |         )
55 | 
56 |     train_dataset_tokenized = train_dataset.map(
57 |         tokenize,
58 |         remove_columns=["text", "label"],
59 |         batched=True,
60 |         batch_size=training_args.train_batch_size,
61 |         num_proc=training_args.dataloader_num_workers,
62 |     )
63 | 
64 |     eval_dataset_tokenized = eval_dataset.map(
65 |         tokenize,
66 |         remove_columns=["text", "label"],
67 |         batched=True,
68 |         num_proc=training_args.dataloader_num_workers,
69 |     )
70 | 
71 |     # We split a small amount of training data as "validation" test
72 |     # set to keep track of evaluation of the loss on non-training data
73 |     # during training. This is purely because computing the loss on
74 |     # the full evaluation dataset takes much longer.
75 |     train_validate_splits = train_dataset_tokenized.train_test_split(
76 |         test_size=1000, seed=42, keep_in_memory=True
77 |     )
78 |     train_dataset_tokenized = train_validate_splits["train"]
79 |     validate_dataset_tokenized = train_validate_splits["test"]
80 | 
81 |     return (train_dataset_tokenized, validate_dataset_tokenized,
82 |             eval_dataset_tokenized)
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     # Determine which device to train the model on, CPU or GPU
87 |     print('Using PyTorch version:', torch.__version__)
88 |     if torch.cuda.is_available():
89 |         device = torch.device('cuda')
90 |         print('Using GPU, device name:', torch.cuda.get_device_name(0))
91 |     else:
92 |         print('No GPU found, using CPU instead.')
93 |         device = torch.device('cpu')
94 | 
95 |     dist.init_process_group(backend='nccl')
96 |     rank_0 = dist.get_rank() == 0
97 | 
98 |     # Use DATADIR environment variable to set path for data
99 |     datapath = os.getenv('DATADIR')
100 |     if datapath is None:
101 |         print("Please set DATADIR environment variable!")
102 |         sys.exit(1)
103 |     user_datapath = os.path.join(datapath, "users", os.getenv('USER'))
104 |     os.makedirs(user_datapath, exist_ok=True)
105 | 
106 |     # ## IMDB data set
107 |     #
108 |     # Next we'll load the IMDB data set, this time using the Hugging Face
109 |     # datasets library: https://huggingface.co/docs/datasets/index.
110 |     #
111 |     # The dataset contains 100,000 movie reviews from the Internet
112 |     # Movie Database: 25,000 labeled reviews for training, 25,000 for
113 |     # testing, and 50,000 unlabeled reviews (the "unsupervised" split).
114 | 
115 |     train_dataset = load_dataset("imdb", keep_in_memory=True,
116 |                                  split="train+unsupervised")
117 |     test_dataset = load_dataset("imdb", keep_in_memory=True,
118 |                                 split="test")
119 | 
120 |     # Let's print one sample from the dataset.
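    # Under torchrun every process runs this script, so an unguarded
    # print would appear once per GPU. Gating output on rank 0, as
    # below, keeps the log readable. A common variant is a small
    # helper, sketched here:
    #
    #     def print0(*args, **kwargs):
    #         if dist.get_rank() == 0:
    #             print(*args, **kwargs)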
121 |     if rank_0:
122 |         print('Sample from dataset')
123 |         pprint(train_dataset[200])
124 | 
125 |     # #### Loading the pretrained GPT-Neo model
126 |     #
127 |     # We'll use the gpt-neo models from the Hugging Face library:
128 |     # https://huggingface.co/EleutherAI/gpt-neo-125m
129 |     pretrained_model = "EleutherAI/gpt-neo-125m"
130 | 
131 |     # If you have time, you can also test with a larger 1.3 billion
132 |     # parameter version of the same model:
133 |     # https://huggingface.co/EleutherAI/gpt-neo-1.3B
134 |     #pretrained_model = "EleutherAI/gpt-neo-1.3B"
135 | 
136 |     # Load the tokenizer associated with the model
137 |     print("Loading model and tokenizer")
138 |     start = time.time()
139 |     tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
140 |     tokenizer.pad_token = tokenizer.eos_token
141 | 
142 |     # Load the actual base model from Hugging Face
143 |     model = AutoModelForCausalLM.from_pretrained(pretrained_model)
144 |     model.to(device)
145 |     stop = time.time()
146 |     print(f"Loading model and tokenizer took: {stop-start:.2f} seconds")
147 | 
148 |     # Setting up the training configuration
149 |     train_batch_size = 32
150 |     test_batch_size = 128
151 | 
152 |     output_dir = os.path.join(user_datapath, "gpt-imdb-model")
153 |     training_args = TrainingArguments(
154 |         output_dir=output_dir,
155 |         overwrite_output_dir=True,
156 |         save_strategy="steps",  # save a snapshot of the model every
157 |         save_steps=100,         # 100 steps
158 |         save_total_limit=4,     # only keep the last 4 snapshots
159 |         logging_dir="logs",
160 |         eval_strategy="steps",
161 |         eval_steps=1000,  # compute validation loss every 1000 steps
162 |         learning_rate=2e-5,
163 |         weight_decay=0.01,
164 |         bf16=True,  # use bfloat16, a 16-bit floating point format
165 |         per_device_train_batch_size=train_batch_size,
166 |         per_device_eval_batch_size=test_batch_size,
167 |         max_steps=5000,
168 |         dataloader_num_workers=7,
169 |         dataloader_pin_memory=True,
170 |         report_to=["tensorboard"],  # log statistics for tensorboard
171 |     )
172 | 
173 |     # ## Preprocessing of training data
174 |     #
175 |     # We tokenize the data, split off a small validation set from the
176 |     # training data, and set up a collator that arranges the
177 |     # variable-length samples into padded batches.
178 | 
179 |     (train_dataset_tokenized, validate_dataset_tokenized,
180 |      test_dataset_tokenized) = preprocess_data(train_dataset,
181 |                                                test_dataset,
182 |                                                tokenizer,
183 |                                                training_args)
184 | 
185 |     collator = DataCollatorForLanguageModeling(
186 |         tokenizer, mlm=False, return_tensors="pt"
187 |     )
188 | 
189 |     # Sanity check: what does the training data look like after preprocessing?
190 | if rank_0: 191 | print("Sample of tokenized data") 192 | for b in train_dataset_tokenized: 193 | pprint(b, compact=True) 194 | print("Length of input_ids:", len(b["input_ids"])) 195 | break 196 | print("Length of dataset (tokenized)", len(train_dataset_tokenized)) 197 | 198 | trainer = Trainer( 199 | model=model, 200 | args=training_args, 201 | tokenizer=tokenizer, 202 | data_collator=collator, 203 | train_dataset=train_dataset_tokenized, 204 | eval_dataset=validate_dataset_tokenized, 205 | ) 206 | 207 | trainer.train() 208 | 209 | if rank_0: 210 | print() 211 | print("Training done, you can find all the model checkpoints in", 212 | output_dir) 213 | 214 | with torch.no_grad(): 215 | model.eval() 216 | 217 | # Calculate perplexity 218 | validate_results = trainer.evaluate() 219 | test_results = trainer.evaluate(test_dataset_tokenized) 220 | 221 | if rank_0: 222 | print(f'Perplexity (val): {math.exp(validate_results["eval_loss"]):.2f}') 223 | print(f'Perplexity (test): {math.exp(test_results["eval_loss"]):.2f}') 224 | 225 | # Let's print a few sample generated reviews 226 | prompt = "The movie about LUMI AI Factory was great because" 227 | input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device) 228 | outputs = model.generate(input_ids, do_sample=True, max_length=80, 229 | num_return_sequences=4) 230 | decoded_outputs = tokenizer.batch_decode(outputs, 231 | skip_special_tokens=True) 232 | 233 | print('Sample generated review:') 234 | for txt in decoded_outputs: 235 | print('-', txt) 236 | -------------------------------------------------------------------------------- /day2/pytorch_test.py: -------------------------------------------------------------------------------- 1 | # Script for testing the PyTorch setup 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torchvision 6 | from torch.utils.data import DataLoader 7 | from torchvision import datasets 8 | import torchvision.transforms as transforms 9 | 10 | from packaging.version import Version as LV 11 | from tqdm import tqdm 12 | 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | import seaborn as sns 16 | sns.set() 17 | 18 | print('Using PyTorch version:', torch.__version__) 19 | assert(LV(torch.__version__) >= LV("2.0")) 20 | 21 | if torch.cuda.is_available(): 22 | print('Using GPU, device name:', torch.cuda.get_device_name(0)) 23 | device = torch.device('cuda') 24 | else: 25 | print('No GPU found, using CPU instead.') 26 | device = torch.device('cpu') 27 | 28 | # Create some tensors 29 | x = torch.ones(3, 4) 30 | data = [[1, 2, 3],[4, 5, 6]] 31 | y = torch.tensor(data, dtype=torch.float) 32 | 33 | # Copy them to the GPU 34 | x = x.to(device) 35 | y = y.to(device) 36 | 37 | # Perform matrix multiplication on GPU 38 | z = y.matmul(x) 39 | 40 | print("z =", z) 41 | -------------------------------------------------------------------------------- /day2/run-2gpus-torchrun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --account=project_462000863 3 | #SBATCH --partition=small-g 4 | #SBATCH --ntasks=1 5 | #SBATCH --cpus-per-task=14 6 | #SBATCH --gpus-per-node=2 7 | #SBATCH --mem=120G 8 | #SBATCH --time=1:00:00 9 | ##SBATCH --reservation=pdl_day2-no-ood 10 | 11 | module purge 12 | module use /appl/local/csc/modulefiles/ 13 | module load pytorch 14 | 15 | COURSE_SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}" 16 | 17 | export DATADIR=$COURSE_SCRATCH/data 18 | export TORCH_HOME=$COURSE_SCRATCH/torch-cache 19 | export 
HF_HOME=$COURSE_SCRATCH/hf-cache
20 | export MLFLOW_TRACKING_URI=$COURSE_SCRATCH/data/users/$USER/mlruns
21 | 
22 | set -xv
23 | torchrun --standalone --nnodes=1 --nproc_per_node=$SLURM_GPUS_PER_NODE "$@"
24 | 
--------------------------------------------------------------------------------
/day2/run-2gpus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --account=project_462000863
3 | #SBATCH --partition=small-g
4 | #SBATCH --ntasks=1
5 | #SBATCH --cpus-per-task=14
6 | #SBATCH --gpus-per-node=2
7 | #SBATCH --mem=120G
8 | #SBATCH --time=1:00:00
9 | #SBATCH --reservation=pdl_day2-no-ood
10 | 
11 | module purge
12 | module use /appl/local/csc/modulefiles/
13 | module load pytorch/2.4
14 | 
15 | COURSE_SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}"
16 | 
17 | export DATADIR=$COURSE_SCRATCH/data
18 | export TORCH_HOME=$COURSE_SCRATCH/torch-cache
19 | export HF_HOME=$COURSE_SCRATCH/hf-cache
20 | 
21 | export MLFLOW_TRACKING_URI=$COURSE_SCRATCH/data/users/$USER/mlruns
22 | export TOKENIZERS_PARALLELISM=false
23 | 
24 | umask 002
25 | 
26 | set -xv
27 | torchrun --standalone --nnodes=1 --nproc_per_node=$SLURM_GPUS_PER_NODE "$@"
28 | 
--------------------------------------------------------------------------------
/day2/run-8gpus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --account=project_462000863
3 | #SBATCH --partition=small-g
4 | #SBATCH --ntasks=1
5 | #SBATCH --cpus-per-task=56
6 | #SBATCH --gpus-per-node=8
7 | #SBATCH --mem=480G
8 | #SBATCH --time=1:00:00
9 | #SBATCH --reservation=pdl_day2-no-ood
10 | 
11 | module purge
12 | module use /appl/local/csc/modulefiles/
13 | module load pytorch/2.4
14 | 
15 | COURSE_SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}"
16 | 
17 | export DATADIR=$COURSE_SCRATCH/data
18 | export TORCH_HOME=$COURSE_SCRATCH/torch-cache
19 | export HF_HOME=$COURSE_SCRATCH/hf-cache
20 | 
21 | export MLFLOW_TRACKING_URI=$COURSE_SCRATCH/data/users/$USER/mlruns
22 | export TOKENIZERS_PARALLELISM=false
23 | 
24 | umask 002
25 | 
26 | set -xv
27 | torchrun --standalone --nnodes=1 --nproc_per_node=$SLURM_GPUS_PER_NODE "$@"
28 | 
--------------------------------------------------------------------------------
/day2/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --account=project_462000863
3 | #SBATCH --partition=small-g
4 | #SBATCH --ntasks=1
5 | #SBATCH --cpus-per-task=7
6 | #SBATCH --gpus-per-task=1
7 | #SBATCH --mem=60G
8 | #SBATCH --time=1:00:00
9 | #SBATCH --reservation=pdl_day2-no-ood
10 | 
11 | module purge
12 | module use /appl/local/csc/modulefiles/
13 | module load pytorch/2.4
14 | 
15 | COURSE_SCRATCH="/scratch/${SLURM_JOB_ACCOUNT}"
16 | 
17 | export DATADIR=$COURSE_SCRATCH/data
18 | export TORCH_HOME=$COURSE_SCRATCH/torch-cache
19 | export HF_HOME=$COURSE_SCRATCH/hf-cache
20 | 
21 | export MLFLOW_TRACKING_URI=$COURSE_SCRATCH/data/users/$USER/mlruns
22 | export TOKENIZERS_PARALLELISM=false
23 | 
24 | umask 002
25 | 
26 | set -xv
27 | python3 "$@"
28 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | matplotlib
3 | seaborn
4 | notebook
5 | pydot
6 | scikit-learn
7 | 
--------------------------------------------------------------------------------