├── Adam and weight decay
    ├── Tests SGD with Adam and wd.ipynb
    └── cifar10-dawn-adam.ipynb
├── Bug with frozen LSTM layer.ipynb
├── Building a French LM.ipynb
├── Cache pointer.ipynb
├── Cyclical LR and momentums.ipynb
├── DeepPainterlyHarmonization.ipynb
├── Experiments
    ├── Cifar10-mixup-cutout.ipynb
    ├── Post process logs.ipynb
    ├── multiGPU
    │   ├── callbacks.py
    │   ├── databunch.py
    │   ├── sampler.py
    │   ├── train_cifar10.py
    │   └── utils.py
    └── record_logs.py
├── First neural net in pytorch.ipynb
├── Initialize the bias in the final layer of an SSD.ipynb
├── LM_wikitext.ipynb
├── LM_wikitext_MOTAS.ipynb
├── LM_wikitext_mixup.ipynb
├── Learning rate finder.ipynb
├── Lesson 9 loss function
    ├── The loss function from scratch.ipynb
    ├── overlaps0.npy
    ├── overlaps4.npy
    ├── pred_bb.npy
    ├── pred_bb1.npy
    ├── pred_cls.npy
    ├── pred_cls1.npy
    ├── targ_bb.npy
    └── targ_cls.npy
├── README.md
├── Resnet 50 and Darknet 53.ipynb
├── Retina net Pascal.ipynb
├── Retina net Pascal1.ipynb
├── Understanding the new fastai API for scheduling training.ipynb
├── Using the callback system in fastai.ipynb
├── img
    ├── FPN.png
    └── RetinaHead.png
├── mAP
    ├── Computing the mAP metric.ipynb
    └── focus-4b.h5
└── wikitext_103.ipynb


/Bug with frozen LSTM layer.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import torch\n",
 10 |     "from torch.autograd import Variable as V\n",
 11 |     "import torch.nn as nn\n",
 12 |     "import torch.nn.functional as F"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "markdown",
 17 |    "metadata": {},
 18 |    "source": [
 19 |     "Simple model for repro. We have a pretrained Language Model and want to freeze all of it except the embeddings in a first phase."
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 11,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "model = nn.Sequential(nn.Linear(10,20), nn.ReLU(inplace=True),nn.LSTM(20,5, 1)).cuda()"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "Freeze the parameters linked to the LSTM."
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 3,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "for param in list(model.parameters())[2:]: param.requires_grad=False"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "Grab a random tensor and feed it to the model."
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 4,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "x = torch.randn(2,4,10).cuda()\n",
 61 |     "x.requires_grad = True"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 5,
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "z = model(x)"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 6,
 76 |    "metadata": {},
 77 |    "outputs": [
 78 |     {
 79 |      "data": {
 80 |       "text/plain": [
 81 |        "True"
 82 |       ]
 83 |      },
 84 |      "execution_count": 6,
 85 |      "metadata": {},
 86 |      "output_type": "execute_result"
 87 |     }
 88 |    ],
 89 |    "source": [
 90 |     "z[0].requires_grad"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 7,
 96 |    "metadata": {},
 97 |    "outputs": [
 98 |     {
 99 |      "data": {
100 |       "text/plain": [
101 |        "torch.Size([2, 4, 5])"
102 |       ]
103 |      },
104 |      "execution_count": 7,
105 |      "metadata": {},
106 |      "output_type": "execute_result"
107 |     }
108 |    ],
109 |    "source": [
110 |     "z[0].size()"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "metadata": {},
116 |    "source": [
117 |     "Create a random target to get some loss."
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 8,
123 |    "metadata": {},
124 |    "outputs": [],
125 |    "source": [
126 |     "y = torch.Tensor([0,1,2,3, 0,1,2,3]).long().cuda()"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 9,
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "loss = F.cross_entropy(z[0].view(-1,5),y)"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 10,
141 |    "metadata": {},
142 |    "outputs": [
143 |     {
144 |      "ename": "RuntimeError",
145 |      "evalue": "inconsistent range for TensorList output",
146 |      "output_type": "error",
147 |      "traceback": [
148 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
149 |       "\u001b[1;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
150 |       "\u001b[1;32m<ipython-input-10-52a0569421b1>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mloss\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
151 |       "\u001b[1;32m~\\Anaconda3\\envs\\fastai\\lib\\site-packages\\torch\\tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[1;34m(self, gradient, retain_graph, create_graph)\u001b[0m\n\u001b[0;32m     91\u001b[0m                 \u001b[0mproducts\u001b[0m\u001b[1;33m.\u001b[0m \u001b[0mDefaults\u001b[0m \u001b[0mto\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     92\u001b[0m         \"\"\"\n\u001b[1;32m---> 93\u001b[1;33m         \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mautograd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     94\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     95\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mregister_hook\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhook\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
152 |       "\u001b[1;32m~\\Anaconda3\\envs\\fastai\\lib\\site-packages\\torch\\autograd\\__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[1;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables)\u001b[0m\n\u001b[0;32m     87\u001b[0m     Variable._execution_engine.run_backward(\n\u001b[0;32m     88\u001b[0m         \u001b[0mtensors\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgrad_tensors\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 89\u001b[1;33m         allow_unreachable=True)  # allow_unreachable flag\n\u001b[0m\u001b[0;32m     90\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     91\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
153 |       "\u001b[1;31mRuntimeError\u001b[0m: inconsistent range for TensorList output"
154 |      ]
155 |     }
156 |    ],
157 |    "source": [
158 |     "loss.backward()"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": null,
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": []
167 |   }
168 |  ],
169 |  "metadata": {
170 |   "kernelspec": {
171 |    "display_name": "Python 3",
172 |    "language": "python",
173 |    "name": "python3"
174 |   },
175 |   "language_info": {
176 |    "codemirror_mode": {
177 |     "name": "ipython",
178 |     "version": 3
179 |    },
180 |    "file_extension": ".py",
181 |    "mimetype": "text/x-python",
182 |    "name": "python",
183 |    "nbconvert_exporter": "python",
184 |    "pygments_lexer": "ipython3",
185 |    "version": "3.6.4"
186 |   }
187 |  },
188 |  "nbformat": 4,
189 |  "nbformat_minor": 2
190 | }
191 | 


--------------------------------------------------------------------------------
/Cache pointer.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "This notebook goes with [this blog post](https://sgugger.github.io/pointer-cache-for-language-model.html#pointer-cache-for-language-model) that explains what the continuous cache pointer is. This technique was introduce by Grave et al. in [this article](https://arxiv.org/pdf/1612.04426.pdf)."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "%matplotlib inline\n",
 17 |     "%reload_ext autoreload\n",
 18 |     "%autoreload 2"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "This notebook uses the [fastai](https://github.com/fastai/fastai) library."
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 2,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "from fastai.text import *"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "Be sure to change the path to where the data is on your hard drive. The wikitext-2 can be downloaded [here](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/)."
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 3,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "EOS = '<eos>'\n",
 51 |     "PATH=Path('../data/wikitext')"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "As indicated on their website, we just had the EOS token at the end of each line."
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 4,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "def read_file(filename):\n",
 68 |     "    tokens = []\n",
 69 |     "    with open(PATH/filename, encoding='utf8') as f:\n",
 70 |     "        for line in f:\n",
 71 |     "            tokens.append(line.split() + [EOS])\n",
 72 |     "    return np.array(tokens)"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 5,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "trn_tok = read_file('wiki.train.tokens')\n",
 82 |     "val_tok = read_file('wiki.valid.tokens')\n",
 83 |     "tst_tok = read_file('wiki.test.tokens')"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 6,
 89 |    "metadata": {},
 90 |    "outputs": [
 91 |     {
 92 |      "data": {
 93 |       "text/plain": [
 94 |        "36718"
 95 |       ]
 96 |      },
 97 |      "execution_count": 6,
 98 |      "metadata": {},
 99 |      "output_type": "execute_result"
100 |     }
101 |    ],
102 |    "source": [
103 |     "len(trn_tok)"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "metadata": {},
109 |    "source": [
110 |     "We numericliaze the tokens into ids."
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 7,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "cnt = Counter(word for sent in trn_tok for word in sent)\n",
120 |     "itos = [o for o,c in cnt.most_common()]\n",
121 |     "itos.insert(0,'_pad_')"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 8,
127 |    "metadata": {},
128 |    "outputs": [
129 |     {
130 |      "data": {
131 |       "text/plain": [
132 |        "33279"
133 |       ]
134 |      },
135 |      "execution_count": 8,
136 |      "metadata": {},
137 |      "output_type": "execute_result"
138 |     }
139 |    ],
140 |    "source": [
141 |     "vocab_size = len(itos); vocab_size"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "And here is the way from tokens to ids."
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 9,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "stoi = collections.defaultdict(lambda : 5, {w:i for i,w in enumerate(itos)})"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 10,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "trn_ids = np.array([([stoi[w] for w in s]) for s in trn_tok])\n",
167 |     "val_ids = np.array([([stoi[w] for w in s]) for s in val_tok])\n",
168 |     "tst_ids = np.array([([stoi[w] for w in s]) for s in tst_tok])"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "markdown",
173 |    "metadata": {},
174 |    "source": [
175 |     "Thos are the parameters of our model"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 11,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "em_sz,nh,nl = 400,1150,3\n",
185 |     "drops = np.array([0.6,0.4,0.5,0.05,0.2])"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "This is just to create a learner object that won't be used since we don't train here."
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": 12,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "bptt, bs = 5,2"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": 13,
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "trn_dl = LanguageModelLoader(np.concatenate(trn_ids), bs, bptt)\n",
211 |     "val_dl = LanguageModelLoader(np.concatenate(val_ids), bs, bptt)\n",
212 |     "md = LanguageModelData(PATH, 0, vocab_size, trn_dl, val_dl, bs=bs, bptt=bptt)"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": 14,
218 |    "metadata": {},
219 |    "outputs": [],
220 |    "source": [
221 |     "opt_fn = partial(optim.SGD, momentum=0.9)\n",
222 |     "learner= md.get_model(opt_fn, em_sz, nh, nl,\n",
223 |     "    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "The model I use as en example is stored [here](https://s3.us-east-2.amazonaws.com/sgugger/best.h5). Be sure to have the file best.h5 in a directory called models where the variable PATH points to (our replace by any model you've saved)."
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 15,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "learner.load('best')"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "markdown",
244 |    "metadata": {},
245 |    "source": [
246 |     "Let's begin by computing how well our model is doing before anything else. To do that we will need a way to go through all of our text, but instead of using the fastai LanguageModelLoader (who randomly modifies the bptt) we'll change the code to have a fixed bptt.\n",
247 |     "\n",
248 |     "Also we don't want to do mini-batches on this validation because it resets the hidden state at each batch, making us lose valuable information. It makes a tiny bit of difference as we will see. "
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": 16,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "#Comes from the LanguageModelLoader class, I just removed the minibatch and fixed the bptt.\n",
258 |     "#Now it gives an iterator that will spit bits of size bptt.\n",
259 |     "class TextReader():\n",
260 |     "    def __init__(self, nums, bptt, backwards=False):\n",
261 |     "        self.bptt,self.backwards = bptt,backwards\n",
262 |     "        self.data = self.batchify(nums)\n",
263 |     "        self.i,self.iter = 0,0\n",
264 |     "        self.n = len(self.data)\n",
265 |     "\n",
266 |     "    def __iter__(self):\n",
267 |     "        self.i,self.iter = 0,0\n",
268 |     "        while self.i < self.n-1 and self.iter<len(self):\n",
269 |     "            res = self.get_batch(self.i, self.bptt)\n",
270 |     "            self.i += self.bptt\n",
271 |     "            self.iter += 1\n",
272 |     "            yield res\n",
273 |     "\n",
274 |     "    def __len__(self): return self.n // self.bptt \n",
275 |     "\n",
276 |     "    def batchify(self, data):\n",
277 |     "        data = np.array(data)[:,None]\n",
278 |     "        if self.backwards: data=data[::-1]\n",
279 |     "        return T(data)\n",
280 |     "\n",
281 |     "    def get_batch(self, i, seq_len):\n",
282 |     "        source = self.data\n",
283 |     "        seq_len = min(seq_len, len(source) - 1 - i)\n",
284 |     "        return source[i:i+seq_len], source[i+1:i+1+seq_len].view(-1)"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "markdown",
289 |    "metadata": {},
290 |    "source": [
291 |     "This TextReader will give us an iterator that will allow us to go through the text. "
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": 25,
297 |    "metadata": {},
298 |    "outputs": [],
299 |    "source": [
300 |     "def my_validate(model, source, bptt=2000):\n",
301 |     "    data_source = TextReader(source, bptt)\n",
302 |     "    model.eval()\n",
303 |     "    model.reset()\n",
304 |     "    total_loss = 0.\n",
305 |     "    for inputs, targets in tqdm(data_source):\n",
306 |     "        #The language model throws up a bucnh of things, we'll focus on that later. For now we just want the ouputs.\n",
307 |     "        outputs, raws, outs = model(V(inputs))\n",
308 |     "        #The output doesn't go through softmax so we can use the CrossEntropy loss directly \n",
309 |     "        total_loss += F.cross_entropy(outputs, V(targets), size_average=False).data[0]\n",
310 |     "    #Total size is length of our iterator times bptt\n",
311 |     "    mean = total_loss / (bptt * len(data_source))\n",
312 |     "    #Returns loss and perplexity.\n",
313 |     "    return mean, np.exp(mean)"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": 26,
319 |    "metadata": {},
320 |    "outputs": [
321 |     {
322 |      "name": "stdout",
323 |      "output_type": "stream",
324 |      "text": [
325 |       "\n",
326 |       "100%|██████████| 108/108 [00:37<00:00,  2.85it/s]\n"
327 |      ]
328 |     },
329 |     {
330 |      "data": {
331 |       "text/plain": [
332 |        "(4.304896561234085, 74.06155422085088)"
333 |       ]
334 |      },
335 |      "execution_count": 26,
336 |      "metadata": {},
337 |      "output_type": "execute_result"
338 |     }
339 |    ],
340 |    "source": [
341 |     "my_validate(learner.model, np.concatenate(val_ids))"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "markdown",
346 |    "metadata": {},
347 |    "source": [
348 |     "This model was giving me a final validation loss of 4.317807 when it was computed with mini-batches, so we can see we gained a tiny bit by not reseting the hidden state."
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "code",
353 |    "execution_count": 27,
354 |    "metadata": {},
355 |    "outputs": [
356 |     {
357 |      "name": "stdout",
358 |      "output_type": "stream",
359 |      "text": [
360 |       "100%|██████████| 122/122 [00:42<00:00,  2.84it/s]\n"
361 |      ]
362 |     },
363 |     {
364 |      "data": {
365 |       "text/plain": [
366 |        "(4.25155159972144, 70.21427231666625)"
367 |       ]
368 |      },
369 |      "execution_count": 27,
370 |      "metadata": {},
371 |      "output_type": "execute_result"
372 |     }
373 |    ],
374 |    "source": [
375 |     "my_validate(learner.model, np.concatenate(tst_ids))"
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "markdown",
380 |    "metadata": {},
381 |    "source": [
382 |     "We will need to one-hot encode our targets so we'll use little helper function."
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": 28,
388 |    "metadata": {},
389 |    "outputs": [],
390 |    "source": [
391 |     "def one_hot(vec, size=vocab_size, cuda=True):\n",
392 |     "    a = torch.zeros(len(vec), size)\n",
393 |     "    for i,v in enumerate(vec):\n",
394 |     "        a[i,v] = 1.\n",
395 |     "    return V(a)"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "markdown",
400 |    "metadata": {},
401 |    "source": [
402 |     "Before we write the cache_pointer, let's have a look at what our language model spits out when we send him an input. Looking at the source code of get_language_model, we see our model is of type SequentialRNN and combines an RNNEncoder and a LinearDecoder. \n",
403 |     "\n",
404 |     "SequentialRNN is just to wrap a sequence of models while keeping a reset attribute (to reset the hidden states of the RNN basically). The last model being the LinearDecoder, it's the one that will give the output, so let's have a look.\n",
405 |     "\n",
406 |     "```\n",
407 |     "def forward(self, input):\n",
408 |     "    raw_outputs, outputs = input\n",
409 |     "    output = self.dropout(outputs[-1])\n",
410 |     "    decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))\n",
411 |     "    result = decoded.view(-1, decoded.size(1))\n",
412 |     "    return result, raw_outputs, outputs\n",
413 |     "```\n",
414 |     "It returns three things: the result, which is the decoded version of the last output of our RNNs, and it also returns raw_outputs and outputs, which seem to come from the previous block. So let's have a look at the RNNEncoder forward function.\n",
415 |     "\n",
416 |     "```\n",
417 |     "def forward(self, input):\n",
418 |     "    sl,bs = input.size()\n",
419 |     "    if bs!=self.bs:\n",
420 |     "        self.bs=bs\n",
421 |     "        self.reset()\n",
422 |     "\n",
423 |     "    emb = self.encoder_with_dropout(input, dropout=self.dropoute if self.training else 0)\n",
424 |     "    emb = self.dropouti(emb)\n",
425 |     "\n",
426 |     "    raw_output = emb\n",
427 |     "    new_hidden,raw_outputs,outputs = [],[],[]\n",
428 |     "    for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)):\n",
429 |     "        current_input = raw_output\n",
430 |     "        with warnings.catch_warnings():\n",
431 |     "            warnings.simplefilter(\"ignore\")\n",
432 |     "            raw_output, new_h = rnn(raw_output, self.hidden[l])\n",
433 |     "        new_hidden.append(new_h)\n",
434 |     "        raw_outputs.append(raw_output)\n",
435 |     "        if l != self.nlayers - 1: raw_output = drop(raw_output)\n",
436 |     "        outputs.append(raw_output)\n",
437 |     "\n",
438 |     "    self.hidden = repackage_var(new_hidden)\n",
439 |     "    return raw_outputs, outputs\n",
440 |     "```\n",
441 |     "And now, we see that the raw_ouputs are the outputs (aka the hidden states) of our RNN, then ouputs is the same after dropout has been applied. We will need the real hidden state for our neural cache so we will use the raw_outputs."
442 |    ]
443 |   },
444 |   {
445 |    "cell_type": "markdown",
446 |    "metadata": {},
447 |    "source": [
448 |     "This function will evaluate the model with the cache pointer on top of it. If you want to make a single prediction, you will have to adapt the code a bit. Hyperparameters values are stolen from Stephen Merity et al."
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "code",
453 |    "execution_count": 29,
454 |    "metadata": {},
455 |    "outputs": [],
456 |    "source": [
457 |     "def my_cache_pointer(model, source, theta = 0.662, lambd = 0.1279, window=3785, bptt=2000):\n",
458 |     "    data_source = TextReader(source, bptt)\n",
459 |     "    #Set the model into eval mode.\n",
460 |     "    model.eval()\n",
461 |     "    #Just to create a hidden state.\n",
462 |     "    model.reset()\n",
463 |     "    total_loss = 0.\n",
464 |     "    #Containers for the previous targets/hidden states.\n",
465 |     "    targ_history = None\n",
466 |     "    hid_history = None\n",
467 |     "    for inputs, targets in tqdm(data_source):\n",
468 |     "        outputs, raws, outs = model(V(inputs))\n",
469 |     "        #The outputs aren't softmaxed, sowe have to do it to get the p_vocab vectors.\n",
470 |     "        p_vocab = F.softmax(outputs,1)\n",
471 |     "        #We take the last hidden states (raws contains one Tensor for the results of each layer) and remove the batch dimension.\n",
472 |     "        hiddens = raws[-1].squeeze() \n",
473 |     "        #Start index inside our history.\n",
474 |     "        start = 0 if targ_history is None else targ_history.size(0)\n",
475 |     "        #Add the targets and hidden states to our history.\n",
476 |     "        targ_history = one_hot(targets) if targ_history is None else torch.cat([targ_history, one_hot(targets)])\n",
477 |     "        hid_history = hiddens if hid_history is None else torch.cat([hid_history, hiddens])\n",
478 |     "        for i, pv in enumerate(p_vocab):\n",
479 |     "            #Get the cached values\n",
480 |     "            p = pv\n",
481 |     "            if start + i > 0:\n",
482 |     "                targ_cache = targ_history[:start+i] if start + i <= window else targ_history[start+i-window:start+i]\n",
483 |     "                hid_cache = hid_history[:start+i] if start + i <= window else hid_history[start+i-window:start+i]\n",
484 |     "                #This is explained in the blog post.\n",
485 |     "                all_dot_prods = torch.mv(theta * hid_cache, hiddens[i])\n",
486 |     "                softmaxed = F.softmax(all_dot_prods).unsqueeze(1)\n",
487 |     "                p_cache = (softmaxed.expand_as(targ_cache) * targ_cache).sum(0).squeeze()\n",
488 |     "                p = (1-lambd) * pv + lambd * p_cache\n",
489 |     "            total_loss -= torch.log(p[targets[i]]).data[0]\n",
490 |     "        targ_history = targ_history[-window:]\n",
491 |     "        hid_history = hid_history[-window:]\n",
492 |     "    #Total size is length of our iterator times bptt\n",
493 |     "    mean = total_loss / (bptt * len(data_source))\n",
494 |     "    #Returns loss and perplexity\n",
495 |     "    return mean, np.exp(mean)"
496 |    ]
497 |   },
498 |   {
499 |    "cell_type": "markdown",
500 |    "metadata": {},
501 |    "source": [
502 |     "This differs a bit from the implementation of Stephen Merity et al. [here](https://github.com/salesforce/awd-lstm-lm) since they only start using the cache when they have at least windows values before, but I found slightlybetter results using it since the very beginning.  "
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": 30,
508 |    "metadata": {},
509 |    "outputs": [
510 |     {
511 |      "name": "stdout",
512 |      "output_type": "stream",
513 |      "text": [
514 |       "100%|██████████| 108/108 [21:46<00:00, 12.10s/it]\n"
515 |      ]
516 |     },
517 |     {
518 |      "data": {
519 |       "text/plain": [
520 |        "(3.9970045292146206, 54.434847575761616)"
521 |       ]
522 |      },
523 |      "execution_count": 30,
524 |      "metadata": {},
525 |      "output_type": "execute_result"
526 |     }
527 |    ],
528 |    "source": [
529 |     "my_cache_pointer(learner.model, np.concatenate(val_ids))"
530 |    ]
531 |   },
532 |   {
533 |    "cell_type": "code",
534 |    "execution_count": 31,
535 |    "metadata": {},
536 |    "outputs": [
537 |     {
538 |      "name": "stdout",
539 |      "output_type": "stream",
540 |      "text": [
541 |       "100%|██████████| 122/122 [24:17<00:00, 11.94s/it]\n"
542 |      ]
543 |     },
544 |     {
545 |      "data": {
546 |       "text/plain": [
547 |        "(3.95311762462693, 52.09753447104239)"
548 |       ]
549 |      },
550 |      "execution_count": 31,
551 |      "metadata": {},
552 |      "output_type": "execute_result"
553 |     }
554 |    ],
555 |    "source": [
556 |     "my_cache_pointer(learner.model, np.concatenate(tst_ids))"
557 |    ]
558 |   },
559 |   {
560 |    "cell_type": "markdown",
561 |    "metadata": {},
562 |    "source": [
563 |     "So we went from 74.06/70.21 perplexity to 54.43/52.10, not so bad!"
564 |    ]
565 |   },
566 |   {
567 |    "cell_type": "code",
568 |    "execution_count": null,
569 |    "metadata": {},
570 |    "outputs": [],
571 |    "source": []
572 |   }
573 |  ],
574 |  "metadata": {
575 |   "kernelspec": {
576 |    "display_name": "Python 3",
577 |    "language": "python",
578 |    "name": "python3"
579 |   },
580 |   "language_info": {
581 |    "codemirror_mode": {
582 |     "name": "ipython",
583 |     "version": 3
584 |    },
585 |    "file_extension": ".py",
586 |    "mimetype": "text/x-python",
587 |    "name": "python",
588 |    "nbconvert_exporter": "python",
589 |    "pygments_lexer": "ipython3",
590 |    "version": "3.6.4"
591 |   }
592 |  },
593 |  "nbformat": 4,
594 |  "nbformat_minor": 2
595 | }
596 | 


--------------------------------------------------------------------------------
/Experiments/Cifar10-mixup-cutout.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## CIFAR 10"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "%matplotlib inline\n",
 17 |     "%reload_ext autoreload\n",
 18 |     "%autoreload 2"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "from fastai.conv_learner import *\n",
 28 |     "from fastai.models.cifar10.wideresnet import wrn_22\n",
 29 |     "torch.backends.cudnn.benchmark = True\n",
 30 |     "PATH = Path(\"../data/cifar10/\")\n",
 31 |     "os.makedirs(PATH,exist_ok=True)"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 3,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']\n",
 41 |     "stats = (np.array([ 0.4914 ,  0.48216,  0.44653]), np.array([ 0.24703,  0.24349,  0.26159]))\n",
 42 |     "\n",
 43 |     "bs=128\n",
 44 |     "sz=32"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "# Cutout"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "Cutout is already implemented in the fastai library. Args are n_holes (1 in the paper), size of the hole, probability of applying (default: 0.5). Applying Cutout after normalization or before (which means blanking with 0s or the means) doesn't change the results."
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 4,
 64 |    "metadata": {
 65 |     "scrolled": true
 66 |    },
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomCrop(sz), RandomFlip(), Cutout(1,16)], pad=sz//8)\n",
 70 |     "data = ImageClassifierData.from_paths(PATH, tfms=tfms, bs=bs, val_name='test')"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 9,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "m = wrn_22()\n",
 80 |     "opt_fn = partial(optim.Adam, betas=(0.95,0.99))\n",
 81 |     "learn = ConvLearner.from_model_data(m, data, opt_fn=opt_fn)\n",
 82 |     "learn.crit = nn.CrossEntropyLoss()\n",
 83 |     "learn.metrics = [accuracy]\n",
 84 |     "wd=0.03"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 10,
 90 |    "metadata": {},
 91 |    "outputs": [
 92 |     {
 93 |      "data": {
 94 |       "application/vnd.jupyter.widget-view+json": {
 95 |        "model_id": "e2b08c78f7f341a9bc973c6d2765dacf",
 96 |        "version_major": 2,
 97 |        "version_minor": 0
 98 |       },
 99 |       "text/plain": [
100 |        "HBox(children=(IntProgress(value=0, description='Epoch', max=34), HTML(value='')))"
101 |       ]
102 |      },
103 |      "metadata": {},
104 |      "output_type": "display_data"
105 |     },
106 |     {
107 |      "name": "stdout",
108 |      "output_type": "stream",
109 |      "text": [
110 |       "epoch      trn_loss   val_loss   accuracy                   \n",
111 |       "    0      1.265698   1.131748   0.5859    \n",
112 |       "    1      1.011879   0.925887   0.6743                     \n",
113 |       "    2      0.874358   0.83785    0.7146                      \n",
114 |       "    3      0.756501   0.661968   0.7674                      \n",
115 |       "    4      0.702329   0.576785   0.8056                      \n",
116 |       "    5      0.679879   0.568449   0.8077                      \n",
117 |       "    6      0.624821   0.590153   0.7986                      \n",
118 |       "    7      0.584256   0.636968   0.7912                      \n",
119 |       "    8      0.555692   0.508859   0.8294                      \n",
120 |       "    9      0.521765   0.60031    0.8008                      \n",
121 |       "    10     0.516555   0.498995   0.835                       \n",
122 |       "    11     0.494465   0.593156   0.8126                      \n",
123 |       "    12     0.478316   0.486834   0.8367                      \n",
124 |       "    13     0.453475   0.383599   0.8663                      \n",
125 |       "    14     0.429461   0.397569   0.8682                      \n",
126 |       "    15     0.413949   0.458565   0.8433                      \n",
127 |       "    16     0.393276   0.341568   0.8866                      \n",
128 |       "    17     0.375791   0.33701    0.8912                      \n",
129 |       "    18     0.344201   0.312567   0.8961                      \n",
130 |       "    19     0.30564    0.304323   0.9015                      \n",
131 |       "    20     0.302724   0.277031   0.9075                      \n",
132 |       "    21     0.272228   0.239014   0.9218                      \n",
133 |       "    22     0.254668   0.257381   0.915                       \n",
134 |       "    23     0.234972   0.218338   0.9288                      \n",
135 |       "    24     0.214596   0.223773   0.9261                      \n",
136 |       "    25     0.200005   0.215595   0.9333                      \n",
137 |       "    26     0.178975   0.211857   0.9336                      \n",
138 |       "    27     0.176236   0.209497   0.9337                      \n",
139 |       "    28     0.159292   0.19866    0.9393                      \n",
140 |       "    29     0.140852   0.194932   0.9394                      \n",
141 |       "    30     0.120123   0.193329   0.9411                      \n",
142 |       "    31     0.117817   0.192284   0.9458                      \n",
143 |       "    32     0.104493   0.186788   0.9465                      \n",
144 |       "    33     0.101104   0.185037   0.9481                       \n",
145 |       "\n",
146 |       "CPU times: user 1h 27min 53s, sys: 17min 56s, total: 1h 45min 49s\n",
147 |       "Wall time: 28min 58s\n"
148 |      ]
149 |     },
150 |     {
151 |      "data": {
152 |       "text/plain": [
153 |        "[0.18503666452169418, 0.9481]"
154 |       ]
155 |      },
156 |      "execution_count": 10,
157 |      "metadata": {},
158 |      "output_type": "execute_result"
159 |     }
160 |    ],
161 |    "source": [
162 |     "%time learn.fit(3e-3, 1, cycle_len=34, use_clr_beta=(10,7.5,0.95,0.85), wds=wd, use_wd_sched=True)"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "I did obtain 95% in 34 epochs, but not too sure with which hyper-parameters. Maybe a bit more wd. Anyway, you get the idea."
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "markdown",
174 |    "metadata": {},
175 |    "source": [
176 |     "# Mixup"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 11,
182 |    "metadata": {},
183 |    "outputs": [],
184 |    "source": [
185 |     "class MixUpDataLoader(object):\n",
186 |     "    \"\"\"\n",
187 |     "    Creates a new data loader with mixup from a given dataloader.\n",
188 |     "    \n",
189 |     "    Mixup is applied between a batch and a shuffled version of itself. \n",
190 |     "    If we use a regular beta distribution, this can create near duplicates as some lines might be \n",
191 |     "    1 * original + 0 * shuffled while others could be 0 * original + 1 * shuffled, this is why\n",
192 |     "    there is a trick where we take the maximum of lambda and 1-lambda.\n",
193 |     "    \n",
194 |     "    Arguments:\n",
195 |     "    dl (DataLoader): the data loader to mix up\n",
196 |     "    alpha (float): value of the parameter to use in the beta distribution.\n",
197 |     "    \"\"\"\n",
198 |     "    def __init__(self, dl, alpha):\n",
199 |     "        self.dl, self.alpha = dl, alpha\n",
200 |     "        \n",
201 |     "    def __len__(self):\n",
202 |     "        return len(self.dl)\n",
203 |     "    \n",
204 |     "    def __iter__(self):\n",
205 |     "        for (x, y) in iter(self.dl):\n",
206 |     "            #Taking one different lambda per image speeds up training \n",
207 |     "            lambd = np.random.beta(self.alpha, self.alpha, y.size(0))\n",
208 |     "            #Trick to avoid near duplicates\n",
209 |     "            lambd = np.concatenate([lambd[:,None], 1-lambd[:,None]], 1).max(1)\n",
210 |     "            lambd = to_gpu(VV(lambd))\n",
211 |     "            shuffle = torch.randperm(y.size(0))\n",
212 |     "            x1, y1 = x[shuffle], y[shuffle]\n",
213 |     "            yield (x * lambd.view(lambd.size(0),1,1,1) + x1 * (1-lambd).view(lambd.size(0),1,1,1), [y, y1, lambd])"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 12,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": [
222 |     "class MixUpLoss(nn.Module):\n",
223 |     "    \"\"\"\n",
224 |     "    Adapts the loss function to go with mixup.\n",
225 |     "    \n",
226 |     "    Since the targets aren't one-hot encoded, we use the linearity of the loss function with\n",
227 |     "    regards to the target to mix up the loss instead of one-hot encoded targets.\n",
228 |     "    \n",
229 |     "    Argument:\n",
230 |     "    crit: a loss function. It must have the parameter reduced=False to have the loss per element.\n",
231 |     "    \"\"\"\n",
232 |     "    def __init__(self, crit):\n",
233 |     "        super().__init__()\n",
234 |     "        self.crit = crit()\n",
235 |     "        \n",
236 |     "    def forward(self, output, target):\n",
237 |     "        if not isinstance(target, list): return self.crit(output, target).mean()\n",
238 |     "        loss1, loss2 = self.crit(output,target[0]), self.crit(output,target[1])\n",
239 |     "        return (loss1 * target[2] + loss2 * (1-target[2])).mean()"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 23,
245 |    "metadata": {},
246 |    "outputs": [],
247 |    "source": [
248 |     "tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomCrop(sz), RandomFlip()], pad=sz//8)\n",
249 |     "data = ImageClassifierData.from_paths(PATH, tfms=tfms, bs=bs, val_name='test')"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": 24,
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "mixup_dl = MixUpDataLoader(data.trn_dl, 0.6)"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": 25,
264 |    "metadata": {},
265 |    "outputs": [],
266 |    "source": [
267 |     "m = wrn_22()\n",
268 |     "opt_fn = partial(optim.Adam, betas=(0.95,0.99))\n",
269 |     "learn = ConvLearner.from_model_data(m, data)\n",
270 |     "learn.metrics = [accuracy]\n",
271 |     "wd=1e-4\n",
272 |     "learn.opt_fn = opt_fn\n",
273 |     "learn.data.trn_dl = mixup_dl\n",
274 |     "learn.crit = MixUpLoss(partial(nn.CrossEntropyLoss, reduce=False))"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": 26,
280 |    "metadata": {},
281 |    "outputs": [
282 |     {
283 |      "data": {
284 |       "application/vnd.jupyter.widget-view+json": {
285 |        "model_id": "90adbd92213b4fc28c4b3e0670f59857",
286 |        "version_major": 2,
287 |        "version_minor": 0
288 |       },
289 |       "text/plain": [
290 |        "HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))"
291 |       ]
292 |      },
293 |      "metadata": {},
294 |      "output_type": "display_data"
295 |     },
296 |     {
297 |      "name": "stdout",
298 |      "output_type": "stream",
299 |      "text": [
300 |       "epoch      trn_loss   val_loss   accuracy                   \n",
301 |       "    0      1.577392   1.604844   0.4625    \n",
302 |       "    1      1.381691   1.012047   0.6521                     \n",
303 |       "    2      1.285132   0.81694    0.7366                     \n",
304 |       "    3      1.238322   0.772833   0.7486                     \n",
305 |       "    4      1.208034   0.606934   0.8108                     \n",
306 |       "    5      1.155568   0.715823   0.7695                     \n",
307 |       "    6      1.138336   0.585958   0.8217                     \n",
308 |       "    7      1.107028   0.530601   0.8452                     \n",
309 |       "    8      1.094954   0.536645   0.8332                     \n",
310 |       "    9      1.068575   0.53234    0.8518                     \n",
311 |       "    10     1.068159   0.430891   0.869                      \n",
312 |       "    11     1.043203   0.520973   0.8434                     \n",
313 |       "    12     1.051916   0.504943   0.8461                     \n",
314 |       "    13     1.042599   0.451895   0.8745                     \n",
315 |       "    14     1.024971   0.469676   0.8564                     \n",
316 |       "    15     1.002356   0.427404   0.8823                      \n",
317 |       "    16     0.976197   0.459503   0.8596                      \n",
318 |       "    17     0.956913   0.367024   0.8956                      \n",
319 |       "    18     0.940494   0.359177   0.8988                      \n",
320 |       "    19     0.928801   0.356722   0.894                       \n",
321 |       "    20     0.91274    0.361805   0.9037                      \n",
322 |       "    21     0.889095   0.318632   0.913                       \n",
323 |       "    22     0.876805   0.293216   0.9204                      \n",
324 |       "    23     0.858679   0.269319   0.9275                      \n",
325 |       "    24     0.830605   0.239478   0.9366                      \n",
326 |       "    25     0.824642   0.245498   0.9381                      \n",
327 |       "    26     0.805051   0.217718   0.944                       \n",
328 |       "    27     0.784053   0.217338   0.9459                      \n",
329 |       "    28     0.774043   0.20877    0.9492                      \n",
330 |       "    29     0.76365    0.204386   0.95                        \n",
331 |       "\n",
332 |       "CPU times: user 25min 15s, sys: 12min 58s, total: 38min 14s\n",
333 |       "Wall time: 23min 5s\n"
334 |      ]
335 |     },
336 |     {
337 |      "data": {
338 |       "text/plain": [
339 |        "[0.20438589594364165, 0.95]"
340 |       ]
341 |      },
342 |      "execution_count": 26,
343 |      "metadata": {},
344 |      "output_type": "execute_result"
345 |     }
346 |    ],
347 |    "source": [
348 |     "%time learn.fit(3e-3, 1, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), wds=0.1, use_wd_sched=True)"
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "markdown",
353 |    "metadata": {},
354 |    "source": [
355 |     "Training is as fast as without mixup (23 minutes)."
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "markdown",
360 |    "metadata": {},
361 |    "source": [
362 |     "This second one is with a different alpha: here mixup_dl = MixUpDataLoader(data.trn_dl, 0.4)"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": 22,
368 |    "metadata": {},
369 |    "outputs": [
370 |     {
371 |      "data": {
372 |       "application/vnd.jupyter.widget-view+json": {
373 |        "model_id": "534c42bdc5864eb089ceb754de4dee00",
374 |        "version_major": 2,
375 |        "version_minor": 0
376 |       },
377 |       "text/plain": [
378 |        "HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))"
379 |       ]
380 |      },
381 |      "metadata": {},
382 |      "output_type": "display_data"
383 |     },
384 |     {
385 |      "name": "stdout",
386 |      "output_type": "stream",
387 |      "text": [
388 |       " 20%|██        | 79/391 [00:08<00:33,  9.32it/s, loss=1.88]\n",
389 |       "epoch      trn_loss   val_loss   accuracy                   \n",
390 |       "    0      1.472878   1.345358   0.5291    \n",
391 |       "    1      1.269906   0.84233    0.7231                     \n",
392 |       "    2      1.179544   0.839368   0.7125                     \n",
393 |       "    3      1.112833   0.738385   0.7596                     \n",
394 |       "    4      1.055841   0.98404    0.6708                     \n",
395 |       "    5      1.06348    0.654288   0.7998                     \n",
396 |       "    6      1.015946   0.515297   0.8349                     \n",
397 |       "    7      0.996191   0.493962   0.8435                      \n",
398 |       "    8      0.979429   0.525252   0.8424                      \n",
399 |       "    9      0.966817   0.509232   0.8455                      \n",
400 |       "    10     0.957474   0.571616   0.8228                      \n",
401 |       "    11     0.926887   0.495118   0.8441                      \n",
402 |       "    12     0.938132   0.487569   0.8488                      \n",
403 |       "    13     0.930551   0.472688   0.8567                      \n",
404 |       "    14     0.912599   0.442748   0.8683                      \n",
405 |       "    15     0.887375   0.443511   0.869                       \n",
406 |       "    16     0.869825   0.385284   0.881                       \n",
407 |       "    17     0.843568   0.39869    0.8834                      \n",
408 |       "    18     0.83644    0.40036    0.8781                      \n",
409 |       "    19     0.803586   0.317143   0.9071                      \n",
410 |       "    20     0.790489   0.316681   0.9058                      \n",
411 |       "    21     0.782296   0.290825   0.9189                      \n",
412 |       "    22     0.757262   0.280219   0.9208                      \n",
413 |       "    23     0.743585   0.247127   0.9333                      \n",
414 |       "    24     0.709734   0.253216   0.929                       \n",
415 |       "    25     0.698053   0.230306   0.9391                      \n",
416 |       "    26     0.692384   0.209626   0.9426                      \n",
417 |       "    27     0.673137   0.204203   0.9449                      \n",
418 |       "    28     0.653099   0.201423   0.9489                      \n",
419 |       "    29     0.650861   0.193649   0.9493                      \n",
420 |       "\n",
421 |       "CPU times: user 25min 34s, sys: 12min 41s, total: 38min 15s\n",
422 |       "Wall time: 23min 11s\n"
423 |      ]
424 |     },
425 |     {
426 |      "data": {
427 |       "text/plain": [
428 |        "[0.1936487452030182, 0.9493]"
429 |       ]
430 |      },
431 |      "execution_count": 22,
432 |      "metadata": {},
433 |      "output_type": "execute_result"
434 |     }
435 |    ],
436 |    "source": [
437 |     "%time learn.fit(3e-3, 1, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), wds=0.1, use_wd_sched=True)"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "markdown",
442 |    "metadata": {},
443 |    "source": [
444 |     "# Logs"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "markdown",
449 |    "metadata": {},
450 |    "source": [
451 |     "The regular traning with AdamW and using of LogResults callback."
452 |    ]
453 |   },
454 |   {
455 |    "cell_type": "code",
456 |    "execution_count": 27,
457 |    "metadata": {},
458 |    "outputs": [],
459 |    "source": [
460 |     "class LogResults(Callback):\n",
461 |     "    \"\"\"\n",
462 |     "    Callback to log all the results of the training:\n",
463 |     "    - at the end of each epoch: training loss, validation loss and metrics\n",
464 |     "    - at the end of the first batches then every epoch: deciles of the params and their gradients\n",
465 |     "    \"\"\"\n",
466 |     "    \n",
467 |     "    def __init__(self, learn, fname, init_text=''):\n",
468 |     "        super().__init__()\n",
469 |     "        self.learn, self.fname, self.init_text = learn, fname, init_text\n",
470 |     "        self.pcts = [0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999]\n",
471 |     "        self.pnames = {p:n for n,p in learn.model.named_parameters()}\n",
472 |     "        self.module_names = get_module_names(learn.model)\n",
473 |     "        \n",
474 |     "    def on_train_begin(self):\n",
475 |     "        self.logs, self.epoch, self.n = self.init_text + \"\\n\", 0, 0\n",
476 |     "        self.deciles = {}\n",
477 |     "        for name in self.pnames.values(): \n",
478 |     "            self.deciles[name] = collections.defaultdict(list)\n",
479 |     "            self.deciles[name + '.grad'] = collections.defaultdict(list)\n",
480 |     "        for name in self.module_names.values(): self.deciles[name] = collections.defaultdict(list)\n",
481 |     "        names = [\"epoch\", \"trn_loss\", \"val_loss\", \"metric\"]\n",
482 |     "        layout = \"{!s:10} \" * len(names)\n",
483 |     "        self.logs += layout.format(*names) + \"\\n\"\n",
484 |     "    \n",
485 |     "    def on_batch_begin(self):\n",
486 |     "        if self.n == 0 or (self.epoch == 0 and is_power_of_two(self.n+1)):\n",
487 |     "            self.hooks = []\n",
488 |     "            self.learn.model.apply(self.register_hook)\n",
489 |     "    \n",
490 |     "    def on_batch_end(self, metrics):\n",
491 |     "        self.loss = metrics\n",
492 |     "        if self.n == 0 or (self.epoch == 0 and is_power_of_two(self.n+1)):\n",
493 |     "            self.save_deciles()\n",
494 |     "        if len(self.hooks) != 0:\n",
495 |     "            for h in self.hooks: h.remove()\n",
496 |     "            self.hooks=[]\n",
497 |     "        self.n += 1\n",
498 |     "    \n",
499 |     "    def on_epoch_end(self, metrics):\n",
500 |     "        self.save_stats(self.epoch, [self.loss] + metrics)\n",
501 |     "        self.epoch += 1\n",
502 |     "        self.n=0\n",
503 |     "        \n",
504 |     "    def save_stats(self, epoch, values, decimals=6):\n",
505 |     "        layout = \"{!s:^10}\" + \" {!s:10}\" * len(values)\n",
506 |     "        values = [epoch] + list(np.round(values, decimals))\n",
507 |     "        self.logs += layout.format(*values) + \"\\n\"\n",
508 |     "    \n",
509 |     "    def save_deciles(self):\n",
510 |     "        for group_param in self.learn.sched.layer_opt.opt_params():\n",
511 |     "            for param in group_param['params']:\n",
512 |     "                self.add_deciles(self.pnames[param], to_np(param))\n",
513 |     "                self.add_deciles(self.pnames[param] + '.grad', to_np(param.grad))\n",
514 |     "    \n",
515 |     "    def separate_pcts(self,arr):\n",
516 |     "        n = len(arr.reshape(-1))\n",
517 |     "        pos, neg = arr[arr > 0], arr[arr < 0]\n",
518 |     "        pos_pcts = np.percentile(pos, self.pcts) if len(pos) > 0 else np.array([])\n",
519 |     "        neg_pcts = np.percentile(neg, self.pcts) if len(neg) > 0 else np.array([])\n",
520 |     "        return len(pos)/n, len(neg)/n, pos_pcts, neg_pcts\n",
521 |     "    \n",
522 |     "    def add_deciles(self, name, arr):\n",
523 |     "        pos, neg, pct_pos, pct_neg = self.separate_pcts(arr)\n",
524 |     "        self.deciles[name]['sgn'].append([pos, neg])\n",
525 |     "        self.deciles[name]['pos'].append(pct_pos)\n",
526 |     "        self.deciles[name]['neg'].append(pct_neg)\n",
527 |     "                                                        \n",
528 |     "    def on_train_end(self):\n",
529 |     "        with open(self.fname + '.txt', 'a') as f: f.write(self.logs)\n",
530 |     "        pickle.dump(self.deciles, open(self.fname + '.pkl', 'wb'))\n",
531 |     "        \n",
532 |     "    def register_hook(self, module):\n",
533 |     "        def hook_save_act(module, input, output):\n",
534 |     "            pos, neg, pct_pos, pct_neg = self.separate_pcts(to_np(output))\n",
535 |     "            m_name = self.module_names[module]\n",
536 |     "            self.deciles[m_name]['sgn'].append([pos, neg])\n",
537 |     "            self.deciles[m_name]['pos'].append(pct_pos)\n",
538 |     "            self.deciles[m_name]['neg'].append(pct_neg)\n",
539 |     "        if (not isinstance(module, nn.Sequential) and not isinstance(module, nn.ModuleList) and not (module == self.learn.model)):\n",
540 |     "            self.hooks.append(module.register_forward_hook(hook_save_act))\n",
541 |     "\n",
542 |     "def get_module_names(model):\n",
543 |     "    def register_names(module):\n",
544 |     "        if (not isinstance(module, nn.Sequential) and not isinstance(module, nn.ModuleList) and not (module == model)):\n",
545 |     "            class_name = str(module.__class__).split('.')[-1].split(\"'\")[0]\n",
546 |     "            m_name = f'{class_name}-{len(names)+1}'\n",
547 |     "            names[module] = m_name\n",
548 |     "    names = {}\n",
549 |     "    model.apply(register_names)\n",
550 |     "    return names\n",
551 |     "\n",
552 |     "def is_power_of_two(n):\n",
553 |     "    while n>1:\n",
554 |     "        if n%2 != 0: return False\n",
555 |     "        n = n//2\n",
556 |     "    return True"
557 |    ]
558 |   },
559 |   {
560 |    "cell_type": "code",
561 |    "execution_count": 31,
562 |    "metadata": {},
563 |    "outputs": [],
564 |    "source": [
565 |     "tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomCrop(sz), RandomFlip()], pad=sz//8)\n",
566 |     "data = ImageClassifierData.from_paths(PATH, tfms=tfms, bs=bs, val_name='test')"
567 |    ]
568 |   },
569 |   {
570 |    "cell_type": "code",
571 |    "execution_count": 32,
572 |    "metadata": {},
573 |    "outputs": [],
574 |    "source": [
575 |     "m = wrn_22()\n",
576 |     "opt_fn = partial(optim.Adam, betas=(0.95,0.99))\n",
577 |     "learn = ConvLearner.from_model_data(m, data, opt_fn=opt_fn)\n",
578 |     "learn.crit = nn.CrossEntropyLoss()\n",
579 |     "learn.metrics = [accuracy]\n",
580 |     "wd=0.1"
581 |    ]
582 |   },
583 |   {
584 |    "cell_type": "code",
585 |    "execution_count": 33,
586 |    "metadata": {},
587 |    "outputs": [],
588 |    "source": [
589 |     "log_cb = LogResults(learn, str(PATH/'cifar10'))"
590 |    ]
591 |   },
592 |   {
593 |    "cell_type": "code",
594 |    "execution_count": 34,
595 |    "metadata": {},
596 |    "outputs": [
597 |     {
598 |      "data": {
599 |       "application/vnd.jupyter.widget-view+json": {
600 |        "model_id": "736336f88e444b049c814aea37b8d588",
601 |        "version_major": 2,
602 |        "version_minor": 0
603 |       },
604 |       "text/plain": [
605 |        "HBox(children=(IntProgress(value=0, description='Epoch', max=30), HTML(value='')))"
606 |       ]
607 |      },
608 |      "metadata": {},
609 |      "output_type": "display_data"
610 |     },
611 |     {
612 |      "name": "stdout",
613 |      "output_type": "stream",
614 |      "text": [
615 |       "epoch      trn_loss   val_loss   accuracy                   \n",
616 |       "    0      1.099058   1.092296   0.6064    \n",
617 |       "    1      0.840501   0.879823   0.6933                      \n",
618 |       "    2      0.691204   0.750623   0.7482                      \n",
619 |       "    3      0.617986   0.590799   0.8001                      \n",
620 |       "    4      0.553649   0.562221   0.808                       \n",
621 |       "    5      0.503341   0.857572   0.7261                      \n",
622 |       "    6      0.460281   0.526698   0.8185                      \n",
623 |       "    7      0.440226   0.519977   0.8228                      \n",
624 |       "    8      0.413289   0.56235    0.8171                      \n",
625 |       "    9      0.393606   0.447091   0.8463                      \n",
626 |       "    10     0.370249   0.484564   0.8476                      \n",
627 |       "    11     0.353899   0.442357   0.8554                      \n",
628 |       "    12     0.34476    0.385429   0.8705                      \n",
629 |       "    13     0.343156   0.456771   0.8485                      \n",
630 |       "    14     0.304945   0.429652   0.8579                      \n",
631 |       "    15     0.274984   0.33127    0.8914                      \n",
632 |       "    16     0.253462   0.325269   0.8898                      \n",
633 |       "    17     0.227426   0.382997   0.8765                      \n",
634 |       "    18     0.209817   0.299588   0.8988                      \n",
635 |       "    19     0.194897   0.293427   0.9044                      \n",
636 |       "    20     0.169136   0.291551   0.9095                      \n",
637 |       "    21     0.148512   0.308182   0.904                       \n",
638 |       "    22     0.132138   0.256267   0.9215                      \n",
639 |       "    23     0.112354   0.259355   0.922                        \n",
640 |       "    24     0.08252    0.253447   0.923                        \n",
641 |       "    25     0.062529   0.237996   0.9301                       \n",
642 |       "    26     0.044835   0.221493   0.9358                       \n",
643 |       "    27     0.0276     0.231811   0.9398                       \n",
644 |       "    28     0.01912    0.236096   0.9399                       \n",
645 |       "    29     0.012238   0.226691   0.9409                       \n",
646 |       "\n",
647 |       "CPU times: user 44min 47s, sys: 13min 45s, total: 58min 33s\n",
648 |       "Wall time: 26min 36s\n"
649 |      ]
650 |     },
651 |     {
652 |      "data": {
653 |       "text/plain": [
654 |        "[0.22669122726917268, 0.9409]"
655 |       ]
656 |      },
657 |      "execution_count": 34,
658 |      "metadata": {},
659 |      "output_type": "execute_result"
660 |     }
661 |    ],
662 |    "source": [
663 |     "%time learn.fit(3e-3, 1, cycle_len=30, use_clr_beta=(10,7.5,0.95,0.85), wds=0.1, use_wd_sched=True, callbacks=[log_cb])"
664 |    ]
665 |   },
666 |   {
667 |    "cell_type": "markdown",
668 |    "metadata": {},
669 |    "source": [
670 |     "Loggging the results affects performance a bit (especially in the first epoch where we log results every 2^n batch), here we go from 23 minutes to 26min 36s."
671 |    ]
672 |   },
673 |   {
674 |    "cell_type": "code",
675 |    "execution_count": null,
676 |    "metadata": {},
677 |    "outputs": [],
678 |    "source": []
679 |   }
680 |  ],
681 |  "metadata": {
682 |   "kernelspec": {
683 |    "display_name": "Python 3",
684 |    "language": "python",
685 |    "name": "python3"
686 |   },
687 |   "language_info": {
688 |    "codemirror_mode": {
689 |     "name": "ipython",
690 |     "version": 3
691 |    },
692 |    "file_extension": ".py",
693 |    "mimetype": "text/x-python",
694 |    "name": "python",
695 |    "nbconvert_exporter": "python",
696 |    "pygments_lexer": "ipython3",
697 |    "version": "3.6.5"
698 |   },
699 |   "toc": {
700 |    "colors": {
701 |     "hover_highlight": "#DAA520",
702 |     "navigate_num": "#000000",
703 |     "navigate_text": "#333333",
704 |     "running_highlight": "#FF0000",
705 |     "selected_highlight": "#FFD700",
706 |     "sidebar_border": "#EEEEEE",
707 |     "wrapper_background": "#FFFFFF"
708 |    },
709 |    "moveMenuLeft": true,
710 |    "nav_menu": {
711 |     "height": "266px",
712 |     "width": "252px"
713 |    },
714 |    "navigate_menu": true,
715 |    "number_sections": true,
716 |    "sideBar": true,
717 |    "threshold": 4,
718 |    "toc_cell": false,
719 |    "toc_section_display": "block",
720 |    "toc_window_display": false,
721 |    "widenNotebook": false
722 |   }
723 |  },
724 |  "nbformat": 4,
725 |  "nbformat_minor": 2
726 | }
727 | 


--------------------------------------------------------------------------------
/Experiments/multiGPU/callbacks.py:
--------------------------------------------------------------------------------
 1 | from fastai.conv_learner import *
 2 | 
 3 | 
 4 | class LogResults(Callback):
 5 |     """
 6 |     Callback to log all the results of the training:
 7 |     - at the end of each epoch: training loss, validation loss and metrics
 8 |     """
 9 |     
10 |     def __init__(self, learn, fname):
11 |         super().__init__()
12 |         self.learn, self.fname = learn, fname
13 |         
14 |     def on_train_begin(self):
15 |         self.logs, self.epoch, self.n = "", 0, 0
16 |         names = ["epoch", "trn_loss", "val_loss", "accuracy"]
17 |         layout = "{!s:10} " * len(names)
18 |         self.logs += layout.format(*names) + "\n"
19 |     
20 |     def on_batch_end(self, metrics):
21 |         self.loss = metrics
22 |     
23 |     def on_epoch_end(self, metrics):
24 |         self.save_stats(self.epoch, [self.loss] + metrics)
25 |         self.epoch += 1
26 |         
27 |     def save_stats(self, epoch, values, decimals=6):
28 |         layout = "{!s:^10}" + " {!s:10}" * len(values)
29 |         values = [epoch] + list(np.round(values, decimals))
30 |         self.logs += layout.format(*values) + "\n"
31 | 
32 |     def on_train_end(self):
33 |         with open(self.fname, 'a') as f: f.write(self.logs)
34 | 


--------------------------------------------------------------------------------
/Experiments/multiGPU/databunch.py:
--------------------------------------------------------------------------------
  1 | from PIL import Image
  2 | from torch.utils.data import Dataset
  3 | import pickle, gzip, torch, math, random, numpy as np, torch.nn.functional as F
  4 | from pathlib import Path
  5 | from IPython.core.debugger import set_trace
  6 | from torch import nn, optim
  7 | from torch.utils.data import TensorDataset, DataLoader as DataLoader1
  8 | import torchvision
  9 | from collections import Iterable
 10 | from functools import reduce,partial
 11 | from tqdm import tqdm, tqdm_notebook, trange, tnrange
 12 | from sampler import DistributedSampler
 13 | 
 14 | def find_classes(folder):
 15 |     classes = [d for d in folder.iterdir()
 16 |                if d.is_dir() and not d.name.startswith('.')]
 17 |     classes.sort(key=lambda d: d.name)
 18 |     return classes
 19 | 
 20 | def get_image_files(c):
 21 |     return [o for o in list(c.iterdir())
 22 |             if not o.name.startswith('.') and not o.is_dir()]
 23 | 
 24 | class FilesDataset1(Dataset):#Renamed to avoid conflict with fastai FilesDataset
 25 |     def __init__(self, folder, tfms):
 26 |         cls_dirs = find_classes(folder)
 27 |         self.fns, self.y = [], []
 28 |         self.classes = [cls.name for cls in cls_dirs]
 29 |         for i, cls_dir in enumerate(cls_dirs):
 30 |             fnames = get_image_files(cls_dir)
 31 |             self.fns += fnames
 32 |             self.y += [i] * len(fnames)
 33 |         self.tfms = tfms
 34 |         
 35 |     def __len__(self): return len(self.fns)
 36 | 
 37 |     def __getitem__(self,i):
 38 |         x = Image.open(self.fns[i])
 39 |         x = torch.tensor(np.array(x, dtype=np.float32).transpose(2,0,1)).div_(255.)
 40 |         if self.tfms is not None: x = self.tfms(x)[0]
 41 |         return x,self.y[i]
 42 | 
 43 | 
 44 | def get_dataloader(ds, bs, shuffle, device, stats, sampler):
 45 |     return DeviceDataLoader(DataLoader1(ds, batch_size=bs, shuffle=shuffle,num_workers=8, sampler=sampler), device, stats)
 46 | 
 47 | class DeviceDataLoader():
 48 |     def __init__(self, dl, device, stats):
 49 |         self.dl,self.device = dl,device
 50 |         self.m, self.s = map(lambda x:torch.tensor(x, dtype=torch.float32, device=device), stats)
 51 |         
 52 |     def __iter__(self):
 53 |         for b in self.dl:
 54 |             x, y = b[0].to(self.device),b[1].to(self.device)
 55 |             x = (x - self.m[None,:,None,None]) / self.s[None,:,None,None]
 56 |             yield x,y
 57 |     
 58 |     def __len__(self): return (len(self.dl))
 59 | 
 60 | class DataBunch():
 61 |     def __init__(self, trn_ds, val_ds, stats, device, trn_sampler=None, bs=64):
 62 |         if hasattr(trn_ds, 'classes'): self.classes = trn_ds.classes
 63 |         if trn_sampler is not None:
 64 |             self.trn_dl = get_dataloader(trn_ds, bs, shuffle=False, device=device, stats=stats, sampler=trn_sampler)
 65 |         else: self.trn_dl = get_dataloader(trn_ds, bs, shuffle=True, device=device, stats=stats, sampler=None)
 66 |         self.val_dl = get_dataloader(val_ds, bs*2, shuffle=False, device=device, stats=stats, sampler=None)
 67 | 
 68 |     @classmethod
 69 |     def from_files(cls, Path, trn_tfms, val_tfms, stats, device, distrib=False, trn_name='train', val_name='valid', bs=64):
 70 |         trn_ds, val_ds = FilesDataset1(Path/trn_name, trn_tfms), FilesDataset1(Path/val_name, val_tfms)
 71 |         trn_sampler = DistributedSampler(trn_ds) if distrib else None
 72 |         return cls(trn_ds, val_ds, stats, device, trn_sampler, bs)
 73 | 
 74 | def interpolate(x, coords, padding='reflect'):
 75 |     if padding=='reflect':#Reflect padding isn't implemented in grid_sample yet
 76 |         coords[coords < -1] = coords[coords < -1].mul_(-1).add_(-2)
 77 |         coords[coords > 1] = coords[coords > 1].mul_(-1).add_(2)
 78 |         padding='zeros'
 79 |     return F.grid_sample(x, coords, padding_mode=padding)
 80 | 
 81 | def affine_transform(img, matrix, interpol=True, padding='reflect'):
 82 |     """
 83 |     Applies an affine transformation to an image.
 84 |     
 85 |     Optional: only computes the new coordinates without doing the interpolation to create the new images.
 86 |     Args:
 87 |     x: a batch of images
 88 |     matrix: a matrix of size 2 by 3 describing the transformation.
 89 |             if the transformation is Ax + b, the matrix is (A|b)
 90 |     interpol: if False, returns only the new coordinates
 91 |     padding: padding to apply during the interpolation. Supports zeros, border, reflect
 92 |     
 93 |     """
 94 |     coords = F.affine_grid(matrix[None], img[None].size())
 95 |     return interpolate(img[None],coords,padding) if interpol else coords
 96 | 
 97 | def get_random_rot_matrix(degrees):
 98 |     theta = random.uniform(-degrees,degrees) * math.pi / 180
 99 |     return torch.tensor([[math.cos(theta), -math.sin(theta), 0],
100 |                          [math.sin(theta), math.cos(theta),  0],
101 |                          [0,               0,                1]])
102 | 
103 | def get_random_scale_matrix(zoom_range):
104 |     scale = random.uniform(*zoom_range)
105 |     return torch.tensor([[scale, 0, 0],
106 |                          [0, scale, 0],
107 |                          [0,  0,    1]])
108 | 
109 | def get_random_flip(prob):
110 |     if np.random.rand() < prob:
111 |         return torch.tensor([[-1, 0, 0],
112 |                              [0,  1, 0],
113 |                              [0,  0, 1]]).float()
114 |     else: return torch.eye(3)
115 | 
116 | class CustomTfm():
117 |     
118 |     def __init__(self, p_flip, pad, size, size_mult):
119 |         self.p_flip,self.pad,self.size,self.size_mult = p_flip,pad,size,size_mult
120 |         
121 |     def __call__(self, x):
122 |         _, h, w = x.size()
123 |         #Resize the image so that the lower dimension is size * size_mult
124 |         ratio = (self.size * self.size_mult) / min(h,w)
125 |         h,w = int(h * ratio), int(w*ratio)
126 |         #Pads
127 |         x = F.pad(x[None], (self.pad,self.pad,self.pad,self.pad), 'reflect') #Symmetric not implemented in F.pad
128 |         #Affine transforms
129 |         matrix = get_random_flip(self.p_flip)
130 |         matrix = matrix[:2,:]
131 |         img_size = torch.Size([1,3,h+2*self.pad,w+2*self.pad])
132 |         coords = F.affine_grid(matrix[None], img_size)
133 |         #Coords transforms then crop
134 |         a = random.randint(0, h+2*self.pad-self.size) if h + 2*self.pad>= self.size else 0
135 |         b = random.randint(0, w+2*self.pad-self.size) if w + 2*self.pad>= self.size else 0
136 |         coords = coords[:,a:a+self.size,b:b+self.size,:]
137 |         #Interpolation
138 |         return interpolate(x, coords)


--------------------------------------------------------------------------------
/Experiments/multiGPU/sampler.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import torch
 3 | from torch.utils.data.sampler import Sampler
 4 | from torch.distributed import get_world_size, get_rank
 5 | 
 6 | 
 7 | class DistributedSampler(Sampler):
 8 |     """Sampler that restricts data loading to a subset of the dataset.
 9 | 
10 |     It is especially useful in conjunction with
11 |     :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
12 |     process can pass a DistributedSampler instance as a DataLoader sampler,
13 |     and load a subset of the original dataset that is exclusive to it.
14 | 
15 |     .. note::
16 |         Dataset is assumed to be of constant size.
17 | 
18 |     Arguments:
19 |         dataset: Dataset used for sampling.
20 |         num_replicas (optional): Number of processes participating in
21 |             distributed training.
22 |         rank (optional): Rank of the current process within num_replicas.
23 |     """
24 | 
25 |     def __init__(self, dataset, num_replicas=None, rank=None):
26 |         if num_replicas is None:
27 |             num_replicas = get_world_size()
28 |         print(num_replicas)
29 |         if rank is None:
30 |             rank = get_rank()
31 |         print(rank)
32 |         self.dataset = dataset
33 |         self.num_replicas = num_replicas
34 |         self.rank = rank
35 |         self.epoch = 0
36 |         self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
37 |         self.total_size = self.num_samples * self.num_replicas
38 | 
39 |     def __iter__(self):
40 |         # deterministically shuffle based on epoch
41 |         g = torch.Generator()
42 |         g.manual_seed(self.epoch)
43 |         indices = list(torch.randperm(len(self.dataset), generator=g))
44 | 
45 |         # add extra samples to make it evenly divisible
46 |         indices += indices[:(self.total_size - len(indices))]
47 |         assert len(indices) == self.total_size
48 | 
49 |         # subsample
50 |         offset = self.num_samples * self.rank
51 |         indices = indices[offset:offset + self.num_samples]
52 |         assert len(indices) == self.num_samples
53 | 
54 |         return iter(indices)
55 | 
56 |     def __len__(self):
57 |         return self.num_samples
58 | 
59 |     def set_epoch(self, epoch):
60 |         self.epoch = epoch


--------------------------------------------------------------------------------
/Experiments/multiGPU/train_cifar10.py:
--------------------------------------------------------------------------------
 1 | from fastai.conv_learner import *
 2 | from fastai.fp16 import *
 3 | from fastai.models.cifar10.wideresnet import wrn_22
 4 | from utils import get_opt_fn, get_phases, log_msg
 5 | from callbacks import *
 6 | import argparse
 7 | from databunch import DataBunch, CustomTfm
 8 | import torch.multiprocessing as mp
 9 | 
10 | def main_train(lr, bs, cuda_id, not_distrib, fp16, loss_scale):
11 |     """
12 |     Trains a Language Model
13 | 
14 |     lr (float): maximum learning rate
15 |     moms (float/tuple): value of the momentum/beta1. If tuple, cyclical momentums will be used
16 |     wd (float): weight decay to be used
17 |     wd_loss (bool): weight decay computed inside the loss if True (l2 reg) else outside (true wd)
18 |     opt_fn (optimizer): name of the optim function to use (should be SGD, RMSProp or Adam)
19 |     bs (int): batch size
20 |     cyc_len (int): length of the cycle
21 |     beta2 (float): beta2 parameter of Adam or alpha parameter of RMSProp
22 |     amsgrad (bool): for Adam, sues amsgrad or not
23 |     div (float): value to divide the maximum learning rate by
24 |     pct (float): percentage to leave for the annealing at the end
25 |     lin_end (bool): if True, the annealing phase goes from the minimum lr to 1/100th of it linearly
26 |                     if False, uses a cosine annealing to 0
27 |     tta (bool): if True, uses Test Time Augmentation to evaluate the model
28 |     """
29 |     torch.backends.cudnn.benchmark = True
30 |     if fp16: assert torch.backends.cudnn.enabled, "missing cudnn"
31 |     stats = (np.array([ 0.4914 ,  0.48216,  0.44653]), np.array([ 0.24703,  0.24349,  0.26159]))
32 |     sz=32
33 |     PATH = Path("../../data/cifar10/")
34 |     tfms = tfms_from_stats(stats, sz, aug_tfms=[RandomCrop(sz), RandomFlip()], pad=sz//8)
35 |     data1 = ImageClassifierData.from_paths(PATH, val_name='test', tfms=tfms, bs=bs)
36 |     m = wrn_22().cuda()
37 |     if not not_distrib: m = nn.parallel.DistributedDataParallel(m, device_ids=[cuda_id], output_device=cuda_id)
38 |     learn = ConvLearner.from_model_data(m, data1)
39 |     learn.crit = nn.CrossEntropyLoss()
40 |     learn.metrics = [accuracy]
41 |     trn_tfms = CustomTfm(0.5, 4, 32, 1)
42 |     val_tfms = None
43 |     data = DataBunch.from_files(PATH, trn_tfms, val_tfms, stats, torch.device('cuda', cuda_id), distrib=not not_distrib, val_name='test', bs=bs)
44 |     learn.data.trn_dl, learn.data.val_dl = data.trn_dl, data.val_dl
45 |     if fp16: learn.half()
46 |     x,y = next(iter(data.trn_dl))
47 |     opt_fn = get_opt_fn('Adam', 0.95, 0.99, False)
48 |     learn.opt_fn = opt_fn
49 |     cyc_len, pct = 30, 0.075
50 |     nbs = [cyc_len * (1-pct) / 2, cyc_len * (1-pct) / 2, cyc_len * pct]
51 |     phases = get_phases(lr, (0.95,0.85), opt_fn, 10, nbs, 0.1, True, False)
52 |     #print_lr = PrintLR(learn)
53 |     learn.fit_opt_sched(phases, loss_scale=loss_scale)
54 |    
55 | class PrintLR(Callback):
56 | 
57 |     def __init__(self, learner):
58 |         self.learner = learner
59 | 
60 |     def on_train_begin(self):
61 |         self.n = 0
62 |     
63 |     def on_batch_begin(self):
64 |         if self.n ==0:
65 |             print(self.learner.sched.layer_opt.opt.param_groups[0]['lr'])
66 |         self.n += 1
67 |     
68 |     def on_epoch_end(self, metrics):
69 |         self.n = 0
70 | 
71 | def main():
72 |     """
73 |     Launches the trainings.
74 | 
75 |     See main_train for the description of all the arguments.
76 |     name (string): name to be added to the log file
77 |     cuda_id (int): index of the GPU to use
78 |     nb_exp (int): number of experiments to run in a row
79 |     """
80 |     parser = argparse.ArgumentParser()
81 |     parser.add_argument("--lr", type=float, default=12e-3)
82 |     parser.add_argument("--bs", type=int, default=128)
83 |     parser.add_argument('--fp16', action='store_true', help='Run model fp16 mode.')
84 |     parser.add_argument('--not_distrib', action='store_true', help='Run model fp16 mode.')
85 |     parser.add_argument('--loss_scale', type=float, default=1)
86 |     parser.add_argument("--local_rank", type=int)
87 |     arg = parser.parse_args()
88 |     torch.cuda.set_device(arg.local_rank)
89 |     if not arg.not_distrib: torch.distributed.init_process_group('nccl', init_method='env://')
90 |     main_train(arg.lr, arg.bs, arg.local_rank, arg.not_distrib, arg.fp16, arg.loss_scale)
91 | 
92 | if __name__ == '__main__': 
93 |     #mp.set_start_method('spawn')
94 |     main()


--------------------------------------------------------------------------------
/Experiments/multiGPU/utils.py:
--------------------------------------------------------------------------------
 1 | from fastai.conv_learner import *
 2 | 
 3 | def log_msg(file, msg):
 4 |     print(msg)
 5 |     file.write('\n' + msg)
 6 | 
 7 | def get_opt_fn(opt_fn, mom, beta, amsgrad):
 8 |     """
 9 |     Helper function to return a proper optim function from its name
10 | 
11 |     opt_fn (string): name of the optim function (should be SGD, RMSProp or Adam)
12 |     mom (float): momentum to use (or beta1 in the case of Adam)
13 |     beta (float): alpha parameter in RMSProp and beta2 in Adam
14 |     amsgrad (bool): for Adam only, uses amsgrad or not
15 |     """
16 |     assert opt_fn in {'SGD', 'RMSProp', 'Adam'}, 'optim should be SGD, RMSProp or Adam'
17 |     if opt_fn=='SGD': res = optim.SGD
18 |     elif opt_fn=='RMSProp': res = optim.RMSprop if beta is None else partial(optim.RMSProp, alpha=beta)
19 |     else: res = partial(optim.Adam, amsgrad=amsgrad) if beta is None else partial(optim.Adam, betas=(mom,beta), amsgrad=amsgrad)
20 |     return res
21 | 
22 | def get_one_phase(nb, opt_fn, lr, lr_decay, moms, wd, wd_loss):
23 |     """
24 |     Helper function to create one training phase.
25 | 
26 |     nb (int): number of epochs
27 |     opt_fn (optimizer): the optim function to use
28 |     lr (float/tuple): the learning rate(s) to use. If tuple, going from the first to the second
29 |     lr_decay (DecayType): the decay type to go from lr1 to lr2
30 |     moms (float/tuple): the momentum(s) to use. If tuple, going from the first to the second linearly
31 |     wd (float): weight decay
32 |     wd_loss (bool): weight decay computed inside the loss if True (l2 reg) else outside (true wd)
33 |     """
34 |     if isinstance(moms, Iterable):
35 |         return TrainingPhase(nb, opt_fn, lr=lr, lr_decay=lr_decay, momentum=moms, 
36 |                              momentum_decay=DecayType.LINEAR, wds=wd, wd_loss=wd_loss)
37 |     else:
38 |         return TrainingPhase(nb, opt_fn, lr=lr, lr_decay=lr_decay, momentum=moms, 
39 |                              wds=wd, wd_loss=wd_loss)
40 | 
41 | def get_phases(lr, moms, opt_fn, div, nbs, wd, lin_end=False, wd_loss=True):
42 |     """
43 |     Creates the phases for a 1cycle policy (or a variant)
44 | 
45 |     lr (float): maximum learning rate
46 |     moms (float/tuple): value of the momentum/beta1. If tuple, cyclical momentums will be used
47 |     opt_fn (optimizer): the optim function to use
48 |     div (float): value to divide the maximum learning rate by
49 |     nbs (list): number of epochs for each phase (ascending, constant if len==4, descending, annealing)
50 |     wd (float): weight decay
51 |     lin_end (bool): if True, the annealing phase goes from the minimum lr to 1/100th of it linearly
52 |                     if False, uses a cosine annealing to 0
53 |     wd_loss (bool): weight decay computed inside the loss if True (l2 reg) else outside (true wd)
54 |     """
55 |     max_mom = moms[0] if isinstance(moms, Iterable) else moms
56 |     min_mom = moms[1] if isinstance(moms, Iterable) else moms
57 |     moms_r = (moms[1],moms[0]) if isinstance(moms, Iterable) else moms
58 |     phases = [get_one_phase(nbs[0], opt_fn, (lr/div,lr), DecayType.LINEAR, moms, wd, wd_loss)]
59 |     if len(nbs)==4:
60 |         phases.append(get_one_phase(nbs[1], opt_fn, lr, DecayType.NO, min_mom, wd, wd_loss))
61 |         nbs = [nbs[0]] + nbs[2:]
62 |     phases.append(get_one_phase(nbs[1], opt_fn, (lr,lr/div), DecayType.LINEAR, moms_r, wd, wd_loss))
63 |     if lin_end:
64 |         phases.append(get_one_phase(nbs[2], opt_fn, (lr/div,lr/(100*div)), DecayType.LINEAR, max_mom, wd, wd_loss))
65 |     else:
66 |         phases.append(get_one_phase(nbs[2], opt_fn, lr/div, DecayType.COSINE, max_mom, wd, wd_loss))
67 |     return phases


--------------------------------------------------------------------------------
/Experiments/record_logs.py:
--------------------------------------------------------------------------------
 1 | from fastai.conv_learner import *
 2 | 
 3 | class LogResults(Callback):
 4 |     """
 5 |     Callback to log all the results of the training:
 6 |     - at the end of each epoch: training loss, validation loss and metrics
 7 |     - at the end of the first batches then every epoch: deciles of the params and their gradients
 8 |     """
 9 |     
10 |     def __init__(self, learn, fname, init_text=''):
11 |         super().__init__()
12 |         self.learn, self.fname, self.init_text = learn, fname, init_text
13 |         self.pcts = [0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999]
14 |         self.pnames = {p:n for n,p in learn.model.named_parameters()}
15 |         self.module_names = get_module_names(learn.model)
16 |         
17 |     def on_train_begin(self):
18 |         self.logs, self.epoch, self.n = self.init_text + "\n", 0, 0
19 |         self.deciles = {}
20 |         for name in self.pnames.values(): 
21 |             self.deciles[name] = collections.defaultdict(list)
22 |             self.deciles[name + '.grad'] = collections.defaultdict(list)
23 |         for name in self.module_names.values(): self.deciles[name] = collections.defaultdict(list)
24 |         names = ["epoch", "trn_loss", "val_loss", "metric"]
25 |         layout = "{!s:10} " * len(names)
26 |         self.logs += layout.format(*names) + "\n"
27 |     
28 |     def on_batch_begin(self):
29 |         if self.n == 0 or (self.epoch == 0 and is_power_of_two(self.n+1)):
30 |             self.hooks = []
31 |             self.learn.model.apply(self.register_hook)
32 |     
33 |     def on_batch_end(self, metrics):
34 |         self.loss = metrics
35 |         if self.n == 0 or (self.epoch == 0 and is_power_of_two(self.n+1)):
36 |             self.save_deciles()
37 |         if len(self.hooks) != 0:
38 |             for h in self.hooks: h.remove()
39 |             self.hooks=[]
40 |         self.n += 1
41 |     
42 |     def on_epoch_end(self, metrics):
43 |         self.save_stats(self.epoch, [self.loss] + metrics)
44 |         self.epoch += 1
45 |         self.n=0
46 |         
47 |     def save_stats(self, epoch, values, decimals=6):
48 |         layout = "{!s:^10}" + " {!s:10}" * len(values)
49 |         values = [epoch] + list(np.round(values, decimals))
50 |         self.logs += layout.format(*values) + "\n"
51 |     
52 |     def save_deciles(self):
53 |         for group_param in self.learn.sched.layer_opt.opt_params():
54 |             for param in group_param['params']:
55 |                 self.add_deciles(self.pnames[param], to_np(param))
56 |                 self.add_deciles(self.pnames[param] + '.grad', to_np(param.grad))
57 |     
58 |     def separate_pcts(self,arr):
59 |         n = len(arr.reshape(-1))
60 |         pos, neg = arr[arr > 0], arr[arr < 0]
61 |         pos_pcts = np.percentile(pos, self.pcts) if len(pos) > 0 else np.array([])
62 |         neg_pcts = np.percentile(neg, self.pcts) if len(neg) > 0 else np.array([])
63 |         return len(pos)/n, len(neg)/n, pos_pcts, neg_pcts
64 |     
65 |     def add_deciles(self, name, arr):
66 |         pos, neg, pct_pos, pct_neg = self.separate_pcts(arr)
67 |         self.deciles[name]['sgn'].append([pos, neg])
68 |         self.deciles[name]['pos'].append(pct_pos)
69 |         self.deciles[name]['neg'].append(pct_neg)
70 |                                                         
71 |     def on_train_end(self):
72 |         with open(self.fname + '.txt', 'a') as f: f.write(self.logs)
73 |         pickle.dump(self.deciles, open(self.fname + '.pkl', 'wb'))
74 |         
75 |     def register_hook(self, module):
76 |         def hook_save_act(module, input, output):
77 |             pos, neg, pct_pos, pct_neg = self.separate_pcts(to_np(output))
78 |             m_name = self.module_names[module]
79 |             self.deciles[m_name]['sgn'].append([pos, neg])
80 |             self.deciles[m_name]['pos'].append(pct_pos)
81 |             self.deciles[m_name]['neg'].append(pct_neg)
82 |         if (not isinstance(module, nn.Sequential) and not isinstance(module, nn.ModuleList) and not (module == self.learn.model)):
83 |             self.hooks.append(module.register_forward_hook(hook_save_act))
84 | 
85 | def get_module_names(model):
86 |     def register_names(module):
87 |         if (not isinstance(module, nn.Sequential) and not isinstance(module, nn.ModuleList) and not (module == model)):
88 |             class_name = str(module.__class__).split('.')[-1].split("'")[0]
89 |             m_name = f'{class_name}-{len(names)+1}'
90 |             names[module] = m_name
91 |     names = {}
92 |     model.apply(register_names)
93 |     return names
94 | 
95 | def is_power_of_two(n):
96 |     while n>1:
97 |         if n%2 != 0: return False
98 |         n = n//2
99 |     return True


--------------------------------------------------------------------------------
/First neural net in pytorch.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "This notebook is intended to go with [this blog article](https://sgugger.github.io/a-neural-net-in-pytorch.html) which explains the theory behind it."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "First of all, the necessary libraries:"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 1,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "%matplotlib inline\n",
 24 |     "#pytorch packages \n",
 25 |     "import torch\n",
 26 |     "import torch.nn as nn\n",
 27 |     "import torch.nn.functional as F\n",
 28 |     "import torch.optim as optim\n",
 29 |     "from torch.autograd import Variable\n",
 30 |     "#To download the dataset for torchvision\n",
 31 |     "import torchvision\n",
 32 |     "from torchvision import datasets, transforms\n",
 33 |     "#For plots\n",
 34 |     "import matplotlib.pyplot as plt"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "# Download and look at the data"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "markdown",
 46 |    "metadata": {},
 47 |    "source": [
 48 |     "For this example, we will try to recognize hand-written digits, specifically the ones of the MNIST dataset, that contains overall 70,000 28-by-28-pixels pictures of hand-written digits. This dataset is easily accessible in pytorch via dataset.MNSIT. You just have to specify you want to download it if it's not already in the directory, and pytorch will process it to create a DataSet."
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 2,
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "#Change to the directory of your choice.\n",
 58 |     "PATH = '.'"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "Grab the training and the test set."
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 3,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "trn_set = datasets.MNIST(PATH, train=True, download=True)\n",
 75 |     "tst_set = datasets.MNIST(PATH, train=False, download=True)"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "markdown",
 80 |    "metadata": {},
 81 |    "source": [
 82 |     "Let's have a look at the data in the training set first."
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 4,
 88 |    "metadata": {},
 89 |    "outputs": [
 90 |     {
 91 |      "data": {
 92 |       "text/plain": [
 93 |        "(60000, 10000)"
 94 |       ]
 95 |      },
 96 |      "execution_count": 4,
 97 |      "metadata": {},
 98 |      "output_type": "execute_result"
 99 |     }
100 |    ],
101 |    "source": [
102 |     "len(trn_set.train_data), len(tst_set.test_data)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "Each image is represented by a tensor of size 28 by 28, each value represents the color of the corresponding pixel, from 0 (black) to 255 (white). Torch tensors are the equivalent of numpy ndarrays."
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 5,
115 |    "metadata": {},
116 |    "outputs": [
117 |     {
118 |      "data": {
119 |       "text/plain": [
120 |        "\n",
121 |        "\n",
122 |        "Columns 0 to 12 \n",
123 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
124 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
125 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
126 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
127 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
128 |        "    0     0     0     0     0     0     0     0     0     0     0     0     3\n",
129 |        "    0     0     0     0     0     0     0     0    30    36    94   154   170\n",
130 |        "    0     0     0     0     0     0     0    49   238   253   253   253   253\n",
131 |        "    0     0     0     0     0     0     0    18   219   253   253   253   253\n",
132 |        "    0     0     0     0     0     0     0     0    80   156   107   253   253\n",
133 |        "    0     0     0     0     0     0     0     0     0    14     1   154   253\n",
134 |        "    0     0     0     0     0     0     0     0     0     0     0   139   253\n",
135 |        "    0     0     0     0     0     0     0     0     0     0     0    11   190\n",
136 |        "    0     0     0     0     0     0     0     0     0     0     0     0    35\n",
137 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
138 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
139 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
140 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
141 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
142 |        "    0     0     0     0     0     0     0     0     0     0     0     0    39\n",
143 |        "    0     0     0     0     0     0     0     0     0     0    24   114   221\n",
144 |        "    0     0     0     0     0     0     0     0    23    66   213   253   253\n",
145 |        "    0     0     0     0     0     0    18   171   219   253   253   253   253\n",
146 |        "    0     0     0     0    55   172   226   253   253   253   253   244   133\n",
147 |        "    0     0     0     0   136   253   253   253   212   135   132    16     0\n",
148 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
149 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
150 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
151 |        "\n",
152 |        "Columns 13 to 25 \n",
153 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
154 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
155 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
156 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
157 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
158 |        "   18    18    18   126   136   175    26   166   255   247   127     0     0\n",
159 |        "  253   253   253   253   253   225   172   253   242   195    64     0     0\n",
160 |        "  253   253   253   253   251    93    82    82    56    39     0     0     0\n",
161 |        "  253   198   182   247   241     0     0     0     0     0     0     0     0\n",
162 |        "  205    11     0    43   154     0     0     0     0     0     0     0     0\n",
163 |        "   90     0     0     0     0     0     0     0     0     0     0     0     0\n",
164 |        "  190     2     0     0     0     0     0     0     0     0     0     0     0\n",
165 |        "  253    70     0     0     0     0     0     0     0     0     0     0     0\n",
166 |        "  241   225   160   108     1     0     0     0     0     0     0     0     0\n",
167 |        "   81   240   253   253   119    25     0     0     0     0     0     0     0\n",
168 |        "    0    45   186   253   253   150    27     0     0     0     0     0     0\n",
169 |        "    0     0    16    93   252   253   187     0     0     0     0     0     0\n",
170 |        "    0     0     0     0   249   253   249    64     0     0     0     0     0\n",
171 |        "    0    46   130   183   253   253   207     2     0     0     0     0     0\n",
172 |        "  148   229   253   253   253   250   182     0     0     0     0     0     0\n",
173 |        "  253   253   253   253   201    78     0     0     0     0     0     0     0\n",
174 |        "  253   253   198    81     2     0     0     0     0     0     0     0     0\n",
175 |        "  195    80     9     0     0     0     0     0     0     0     0     0     0\n",
176 |        "   11     0     0     0     0     0     0     0     0     0     0     0     0\n",
177 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
178 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
179 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
180 |        "    0     0     0     0     0     0     0     0     0     0     0     0     0\n",
181 |        "\n",
182 |        "Columns 26 to 27 \n",
183 |        "    0     0\n",
184 |        "    0     0\n",
185 |        "    0     0\n",
186 |        "    0     0\n",
187 |        "    0     0\n",
188 |        "    0     0\n",
189 |        "    0     0\n",
190 |        "    0     0\n",
191 |        "    0     0\n",
192 |        "    0     0\n",
193 |        "    0     0\n",
194 |        "    0     0\n",
195 |        "    0     0\n",
196 |        "    0     0\n",
197 |        "    0     0\n",
198 |        "    0     0\n",
199 |        "    0     0\n",
200 |        "    0     0\n",
201 |        "    0     0\n",
202 |        "    0     0\n",
203 |        "    0     0\n",
204 |        "    0     0\n",
205 |        "    0     0\n",
206 |        "    0     0\n",
207 |        "    0     0\n",
208 |        "    0     0\n",
209 |        "    0     0\n",
210 |        "    0     0\n",
211 |        "[torch.ByteTensor of size 28x28]"
212 |       ]
213 |      },
214 |      "execution_count": 5,
215 |      "metadata": {},
216 |      "output_type": "execute_result"
217 |     }
218 |    ],
219 |    "source": [
220 |     "trn_set.train_data[0]"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "It's easy to convert a torch tensor to a numpy array via the .numpy() command.\n",
228 |     "\n",
229 |     "Conversely, you can create a torch Tensor from a numpy array x via torch.Tensor(x)"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": 6,
235 |    "metadata": {},
236 |    "outputs": [
237 |     {
238 |      "data": {
239 |       "text/plain": [
240 |        "array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
241 |        "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
242 |        "          0,   0],\n",
243 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
244 |        "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
245 |        "          0,   0],\n",
246 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
247 |        "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
248 |        "          0,   0],\n",
249 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
250 |        "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
251 |        "          0,   0],\n",
252 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
253 |        "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
254 |        "          0,   0],\n",
255 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,\n",
256 |        "         18,  18,  18, 126, 136, 175,  26, 166, 255, 247, 127,   0,   0,\n",
257 |        "          0,   0],\n",
258 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,  30,  36,  94, 154, 170,\n",
259 |        "        253, 253, 253, 253, 253, 225, 172, 253, 242, 195,  64,   0,   0,\n",
260 |        "          0,   0],\n",
261 |        "       [  0,   0,   0,   0,   0,   0,   0,  49, 238, 253, 253, 253, 253,\n",
262 |        "        253, 253, 253, 253, 251,  93,  82,  82,  56,  39,   0,   0,   0,\n",
263 |        "          0,   0],\n",
264 |        "       [  0,   0,   0,   0,   0,   0,   0,  18, 219, 253, 253, 253, 253,\n",
265 |        "        253, 198, 182, 247, 241,   0,   0,   0,   0,   0,   0,   0,   0,\n",
266 |        "          0,   0],\n",
267 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,  80, 156, 107, 253, 253,\n",
268 |        "        205,  11,   0,  43, 154,   0,   0,   0,   0,   0,   0,   0,   0,\n",
269 |        "          0,   0],\n",
270 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  14,   1, 154, 253,\n",
271 |        "         90,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
272 |        "          0,   0],\n",
273 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 139, 253,\n",
274 |        "        190,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
275 |        "          0,   0],\n",
276 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  11, 190,\n",
277 |        "        253,  70,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
278 |        "          0,   0],\n",
279 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  35,\n",
280 |        "        241, 225, 160, 108,   1,   0,   0,   0,   0,   0,   0,   0,   0,\n",
281 |        "          0,   0],\n",
282 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
283 |        "         81, 240, 253, 253, 119,  25,   0,   0,   0,   0,   0,   0,   0,\n",
284 |        "          0,   0],\n",
285 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
286 |        "          0,  45, 186, 253, 253, 150,  27,   0,   0,   0,   0,   0,   0,\n",
287 |        "          0,   0],\n",
288 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
289 |        "          0,   0,  16,  93, 252, 253, 187,   0,   0,   0,   0,   0,   0,\n",
290 |        "          0,   0],\n",
291 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
292 |        "          0,   0,   0,   0, 249, 253, 249,  64,   0,   0,   0,   0,   0,\n",
293 |        "          0,   0],\n",
294 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
295 |        "          0,  46, 130, 183, 253, 253, 207,   2,   0,   0,   0,   0,   0,\n",
296 |        "          0,   0],\n",
297 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  39,\n",
298 |        "        148, 229, 253, 253, 253, 250, 182,   0,   0,   0,   0,   0,   0,\n",
299 |        "          0,   0],\n",
300 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  24, 114, 221,\n",
301 |        "        253, 253, 253, 253, 201,  78,   0,   0,   0,   0,   0,   0,   0,\n",
302 |        "          0,   0],\n",
303 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,  23,  66, 213, 253, 253,\n",
304 |        "        253, 253, 198,  81,   2,   0,   0,   0,   0,   0,   0,   0,   0,\n",
305 |        "          0,   0],\n",
306 |        "       [  0,   0,   0,   0,   0,   0,  18, 171, 219, 253, 253, 253, 253,\n",
307 |        "        195,  80,   9,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
308 |        "          0,   0],\n",
309 |        "       [  0,   0,   0,   0,  55, 172, 226, 253, 253, 253, 253, 244, 133,\n",
310 |        "         11,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
311 |        "          0,   0],\n",
312 |        "       [  0,   0,   0,   0, 136, 253, 253, 253, 212, 135, 132,  16,   0,\n",
313 |        "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
314 |        "          0,   0],\n",
315 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
316 |        "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
317 |        "          0,   0],\n",
318 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
319 |        "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
320 |        "          0,   0],\n",
321 |        "       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
322 |        "          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,\n",
323 |        "          0,   0]], dtype=uint8)"
324 |       ]
325 |      },
326 |      "execution_count": 6,
327 |      "metadata": {},
328 |      "output_type": "execute_result"
329 |     }
330 |    ],
331 |    "source": [
332 |     "trn_set.train_data[0].numpy()"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "markdown",
337 |    "metadata": {},
338 |    "source": [
339 |     "It's then easy to see the corresponding picture via plt."
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": 7,
345 |    "metadata": {},
346 |    "outputs": [
347 |     {
348 |      "data": {
349 |       "text/plain": [
350 |        "<matplotlib.image.AxesImage at 0x23085107240>"
351 |       ]
352 |      },
353 |      "execution_count": 7,
354 |      "metadata": {},
355 |      "output_type": "execute_result"
356 |     },
357 |     {
358 |      "data": {
359 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAD8CAYAAAC4nHJkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAADmVJREFUeJzt3X+MVPW5x/HPI4KoEIOyUGLxbtuouYakWx1JDWL2UiXUNAGCNSWxoZF0G63JxRBTs39Yf+QaYi6tGE2T7QXBpLVUAcHEtCgx8ZJodfxVRdSqWcteEJaoVIjSAM/9Yw/NijvfGWbOzBn2eb8SszPnOd89jwMfzsx858zX3F0A4jmt6AYAFIPwA0ERfiAowg8ERfiBoAg/EBThB4Ii/EBQhB8I6vRWHmzy5Mne2dnZykMCofT392v//v1Wy74Nhd/M5klaJWmMpP9x9xWp/Ts7O1Uulxs5JICEUqlU8751P+03szGSHpL0fUmXSFpsZpfU+/sAtFYjr/lnSnrP3T9w939K+oOk+fm0BaDZGgn/+ZJ2Dbs/kG37EjPrMbOymZUHBwcbOByAPDUS/pHeVPjK9cHu3ufuJXcvdXR0NHA4AHlqJPwDkqYPu/91SbsbawdAqzQS/pckXWhm3zCzcZJ+JGlLPm0BaLa6p/rc/YiZ3SLpzxqa6lvj7jty6wxAUzU0z+/uT0l6KqdeALQQH+8FgiL8QFCEHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeCIvxAUIQfCIrwA0ERfiAowg8ERfiBoAg/EBThB4Ii/EBQhB8IivADQRF+ICjCDwRF+IGgCD8QFOEHgiL8QFCEHwiK8ANBEX4gqIZW6TWzfkmfSToq6Yi7l/JoCvk5duxYsn748OGmHn/dunUVa4cOHUqOfeutt5L1+++/P1nv7e2tWHvwwQeTY88888xkfeXKlcn6TTfdlKy3g4bCn/kPd9+fw+8B0EI87QeCajT8Lmmrmb1sZj15NASgNRp92j/L3Xeb2RRJT5vZ2+7+3PAdsn8UeiTpggsuaPBwAPLS0Jnf3XdnP/dJ2iRp5gj79Ll7yd1LHR0djRwOQI7qDr+ZnW1mE4/fljRX0pt5NQaguRp52j9V0iYzO/57fu/uf8qlKwBNV3f43f0DSd/OsZdR68CBA8n60aNHk/XXX389Wd+6dWvF2qeffpoc29fXl6wXqbOzM1lfvnx5sr569eqKtXPOOSc5dvbs2cn6nDlzkvVTAVN9QFCEHwiK8ANBEX4gKMIPBEX4gaDyuKovvIGBgWS9q6srWf/kk0/ybOeUcdpp6XNPaqpOqn7Z7dKlSyvWpkyZkhw7YcKEZH00fFqVMz8QFOEHgiL8QFCEHwiK8ANBEX4gKMIPBMU8fw7OO++8ZH3q1KnJejvP88+dOzdZr/b/vnHjxoq1M844Izm2u7s7WUdjOPMDQRF+ICjCDwRF+IGgCD8QFOEHgiL8QFDM8+eg2nXla9euTdYff/zxZP2KK65I1hctWpSsp1x55ZXJ+ubNm5P1cePGJesfffRRxdqqVauSY9FcnPmBoAg/EBThB4Ii/EBQhB8IivADQRF+IChz9/QOZmsk/UDSPnefkW07V9J6SZ2S+iVd7+5VL0ovlUpeLpcbbHn0OXz4cLJebS69t7e3Yu2+++5Ljn322WeT9auuuipZR3splUoql8tWy761nPnXSpp3wrbbJW1z9wslbcvuAziFVA2/uz8n6eMTNs+XtC67vU7Sgpz7AtBk9b7mn+rueyQp+5le+whA22n6G35m1mNmZTMrDw4ONvtwAGpUb/j3mtk0Scp+7qu0o7v3uXvJ3UujYXFDYLSoN/xbJC3Jbi+RlL70C0DbqRp+M3tU0vOSLjazATNbKmmFpGvM7G+SrsnuAziFVL2e390XVyh9L+dewqr2/fXVTJo0qe6xDzzwQLI+e/bsZN2spilltCE+4QcERfiBoAg/EBThB4Ii/EBQhB8Iiq/uHgWWLVtWsfbiiy8mx27atClZ37FjR7I+Y8aMZB3tizM/EBThB4Ii/EBQhB8IivADQRF+ICjCDwTFPP8okPpq776+vuTYbdu2Jevz589P1hcsSH9366xZsyrWFi5cmBzL5cLNxZkfCIrwA0ERfiAowg8ERfiBoAg/EBThB4KqukR3nliiu/1Uu95/3rwTF2j+sgMHDtR97DVr1iTrixYtStYnTJhQ97FHq7yX6AYwChF+ICjCDwRF+IGgCD8QFOEHgiL8QFBVr+c3szWSfiBpn7vPyLbdKemnkgaz3Xrd/almNYnmmTlzZrJe7Xv7b7311mT9scceq1i78cYbk2Pff//9ZP22225L1idOnJisR1fLmX+tpJE+6fFrd+/K/iP4wCmmavjd/TlJH7egFwAt1Mhr/lvM7K9mtsbMJuXWEYCWqDf8v5H0LUldkvZIWllpRzPrMbOymZUHBwcr7QagxeoKv7vvdfej7n5M0m8lVXzXyN373L3k7qWOjo56+wSQs7rCb2bTht1dKOnNfNoB0Cq1TPU9Kqlb0mQzG5D0S0ndZtYlySX1S/pZE3sE0ARcz4+GfPHFF8n6Cy+8ULF29dVXJ8dW+7t53XXXJevr169P1kcjrucHUBXhB4Ii/EBQhB8IivADQRF+ICiW6EZDxo8fn6x3d3dXrI0ZMyY59siRI8n6E088kay/8847FWsXX3xxcmwEnPmBoAg/EBThB4Ii/EBQhB8IivADQRF+ICjm+ZG0e/fuZH3jxo3J+vPPP1+xVm0ev5rLL788Wb/ooosa+v2jHWd+ICjCDwRF+IGgCD8QFOEHgiL8QFCEHwiKef5RrtoSaQ899FCy/vDDDyfrAwMDJ91Trapd79/Z2Zmsm9X0DdZhceYHgiL8QFCEHwiK8ANBEX4gKMIPBEX4gaCqzvOb2XRJj0j6mqRjkvrcfZWZnStpvaROSf2Srnf3T5rXalwHDx5M1p988smKtbvvvjs59t13362rpzzMmTMnWV+xYkWyftlll+XZTji1nPmPSFru7v8u6buSfm5ml0i6XdI2d79Q0rbsPoBTRNXwu/sed38lu/2ZpJ2Szpc0X9K6bLd1khY0q0kA+Tup1/xm1inpO5L+Immqu++Rhv6BkDQl7+YANE/N4TezCZI2SFrm7v84iXE9ZlY2s3K1z5kDaJ2awm9mYzUU/N+5+/FvbNxrZtOy+jRJ+0Ya6+597l5y91JHR0cePQPIQdXw29ClUasl7XT3Xw0rbZG0JLu9RNLm/NsD0Cy1XNI7S9KPJb1hZq9l23olrZD0RzNbKunvkn7YnBZPfYcOHUrWd+3alazfcMMNyfqrr7560j3lZe7cucn6XXfdVbFW7au3uSS3uaqG3923S6r0p/C9fNsB0Cp8wg8IivADQRF+ICjCDwRF+IGgCD8QFF/dXaPPP/+8Ym3ZsmXJsdu3b0/W33777bp6ysO1116brN9xxx3JeldXV7I+duzYk+4JrcGZHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeCCjPP39/fn6zfe++9yfozzzxTsfbhhx/W01JuzjrrrIq1e+65Jzn25ptvTtbHjRtXV09of5z5gaAIPxAU4QeCIvxAUIQfCIrwA0ERfiCoMPP8GzZsSNZXr17dtGNfeumlyfrixYuT9dNPT/8x9fT0VKyNHz8+ORZxceYHgiL8QFCEHwiK8ANBEX4gKMIPBEX4gaDM3dM7mE2X9Iikr0k6JqnP3VeZ2Z2SfippMNu1192fSv2uUqnk5XK54aYBjKxUKqlcLlst+9byIZ8jkpa7+ytmNlHSy2b2dFb7tbv/d72NAihO1fC7+x5Je7Lbn5nZTknnN7sxAM11Uq/5zaxT0nck/SXbdIuZ/dXM1pjZpApjesysbGblwcHBkXYBUICaw29mEyRtkLTM3f8h6TeSviWpS0PPDFaONM7d+9y95O6ljo6OHFoGkIeawm9mYzUU/N+5+0ZJcve97n7U3Y9J+q2kmc1rE0DeqobfzEzSakk73f1Xw7ZPG7bbQklv5t8egGap5d3+WZJ+LOkNM3st29YrabGZdUlySf2SftaUDgE0RS3v9m+XNNK8YXJOH0B74xN+QFCEHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeCIvxAUIQfCIrwA0ERfiAowg8ERfiBoKp+dXeuBzMblPThsE2TJe1vWQMnp117a9e+JHqrV569/Zu71/R9eS0N/1cOblZ291JhDSS0a2/t2pdEb/Uqqjee9gNBEX4gqKLD31fw8VPatbd27Uuit3oV0luhr/kBFKfoMz+AghQSfjObZ2bvmNl7ZnZ7ET1UYmb9ZvaGmb1mZoUuKZwtg7bPzN4ctu1cM3vazP6W/RxxmbSCervTzP4ve+xeM7NrC+ptupk9a2Y7zWyHmf1ntr3Qxy7RVyGPW8uf9pvZGEnvSrpG0oCklyQtdve3WtpIBWbWL6nk7oXPCZvZVZIOSnrE3Wdk2+6T9LG7r8j+4Zzk7r9ok97ulHSw6JWbswVlpg1fWVrSAkk/UYGPXaKv61XA41bEmX+mpPfc/QN3/6ekP0iaX0Afbc/dn5P08Qmb50tal91ep6G/PC1Xobe24O573P2V7PZnko6vLF3oY5foqxBFhP98SbuG3R9Qey357ZK2mtnLZtZTdDMjmJotm358+fQpBfdzoqorN7fSCStLt81jV8+K13krIvwjrf7TTlMOs9z9Uknfl/Tz7OktalPTys2tMsLK0m2h3hWv81ZE+AckTR92/+uSdhfQx4jcfXf2c5+kTWq/1Yf3Hl8kNfu5r+B+/qWdVm4eaWVptcFj104rXhcR/pckXWhm3zCzcZJ+JGlLAX18hZmdnb0RIzM7W9Jctd/qw1skLcluL5G0ucBevqRdVm6utLK0Cn7s2m3F60I+5JNNZdwvaYykNe7+Xy1vYgRm9k0Nne2loUVMf19kb2b2qKRuDV31tVfSLyU9IemPki6Q9HdJP3T3lr/xVqG3bg09df3Xys3HX2O3uLcrJf2vpDckHcs292ro9XVhj12ir8Uq4HHjE35AUHzCDwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeCIvxAUP8PRZ8Vlgh2BcUAAAAASUVORK5CYII=\n",
360 |       "text/plain": [
361 |        "<matplotlib.figure.Figure at 0x23081997cc0>"
362 |       ]
363 |      },
364 |      "metadata": {},
365 |      "output_type": "display_data"
366 |     }
367 |    ],
368 |    "source": [
369 |     "plt.imshow(trn_set.train_data[0].numpy(), cmap='Greys')"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "markdown",
374 |    "metadata": {},
375 |    "source": [
376 |     "Let's have a look at the corresponding label..."
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "code",
381 |    "execution_count": 8,
382 |    "metadata": {},
383 |    "outputs": [
384 |     {
385 |      "data": {
386 |       "text/plain": [
387 |        "5"
388 |       ]
389 |      },
390 |      "execution_count": 8,
391 |      "metadata": {},
392 |      "output_type": "execute_result"
393 |     }
394 |    ],
395 |    "source": [
396 |     "trn_set.train_labels[0]"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "markdown",
401 |    "metadata": {},
402 |    "source": [
403 |     "# Preparing the data"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "markdown",
408 |    "metadata": {},
409 |    "source": [
410 |     "A pytorch neural network will expect the data to come in the form of minibatches of tensors. To do that, we use a pytorch object called DataLoader. It will randomly separate the pictures (with the associated label) in minibatches. If you have multiple GPUs, it also prepares the work to be parallelized between them (just change num_workers from 0 to your custom value). We only shuffle the data randomly for the training.\n",
411 |     "\n",
412 |     "First we need to explicitely ask our dataset to transform the images in tensors."
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "code",
417 |    "execution_count": 9,
418 |    "metadata": {},
419 |    "outputs": [],
420 |    "source": [
421 |     "tsfms = transforms.ToTensor()\n",
422 |     "trn_set = datasets.MNIST(PATH, train=True, download=True, transform=tsfms)\n",
423 |     "tst_set = datasets.MNIST(PATH, train=False, download=True, transform=tsfms)"
424 |    ]
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": 10,
429 |    "metadata": {},
430 |    "outputs": [],
431 |    "source": [
432 |     "trn_loader = torch.utils.data.DataLoader(trn_set, batch_size=64, shuffle=True, num_workers=0)\n",
433 |     "tst_loader = torch.utils.data.DataLoader(tst_set, batch_size=64, shuffle=False, num_workers=0)"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "markdown",
438 |    "metadata": {},
439 |    "source": [
440 |     "Let's have a look at an example. A data loader can be converted into an iterator and we can then ask him for a minibatch."
441 |    ]
442 |   },
443 |   {
444 |    "cell_type": "code",
445 |    "execution_count": 11,
446 |    "metadata": {},
447 |    "outputs": [],
448 |    "source": [
449 |     "mb_example = next(iter(trn_loader))"
450 |    ]
451 |   },
452 |   {
453 |    "cell_type": "markdown",
454 |    "metadata": {},
455 |    "source": [
456 |     "Such a minibacth containts two torch tensors: the first one contains the data (here our pictures) and the second one the expected labels."
457 |    ]
458 |   },
459 |   {
460 |    "cell_type": "code",
461 |    "execution_count": 12,
462 |    "metadata": {},
463 |    "outputs": [
464 |     {
465 |      "data": {
466 |       "text/plain": [
467 |        "(torch.Size([64, 1, 28, 28]), torch.Size([64]))"
468 |       ]
469 |      },
470 |      "execution_count": 12,
471 |      "metadata": {},
472 |      "output_type": "execute_result"
473 |     }
474 |    ],
475 |    "source": [
476 |     "mb_example[0].size(), mb_example[1].size()"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "markdown",
481 |    "metadata": {},
482 |    "source": [
483 |     "Note that pytorch has automatically added one dimension to our images (the 1 in second position). It would be 3 if we had had the three usual channels for the colors (RGB). Pytorch puts this channel in the second dimension and not the last because it simplifies some computation.\n",
484 |     "\n",
485 |     "Let's see the first tensor."
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "code",
490 |    "execution_count": 13,
491 |    "metadata": {},
492 |    "outputs": [
493 |     {
494 |      "data": {
495 |       "text/plain": [
496 |        "\n",
497 |        "\n",
498 |        "Columns 0 to 9 \n",
499 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
500 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
501 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
502 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
503 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
504 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0667  0.9059\n",
505 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0118  0.6824  0.9961\n",
506 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0941  0.9922  0.9961\n",
507 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0353  0.8235  1.0000\n",
508 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.3294  0.9961\n",
509 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.7961\n",
510 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0745\n",
511 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
512 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
513 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
514 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
515 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
516 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
517 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
518 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
519 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
520 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
521 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
522 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
523 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
524 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
525 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
526 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
527 |        "\n",
528 |        "Columns 10 to 19 \n",
529 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
530 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
531 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
532 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
533 |        " 0.4471  0.7961  0.9961  0.9961  0.7961  0.4706  0.0745  0.0000  0.0000  0.0000\n",
534 |        " 0.9843  0.9922  0.8588  0.4510  0.6471  0.9569  0.7961  0.4000  0.0000  0.0000\n",
535 |        " 0.5098  0.1804  0.0510  0.0000  0.0000  0.1490  0.4431  0.8549  0.0667  0.0000\n",
536 |        " 0.0902  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
537 |        " 0.1882  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
538 |        " 0.8784  0.0980  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
539 |        " 0.9922  0.6392  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0196  0.6824\n",
540 |        " 0.9255  0.9647  0.4627  0.0000  0.0000  0.0000  0.0000  0.3333  0.7725  0.6588\n",
541 |        " 0.2000  0.8824  0.9961  0.5333  0.0275  0.1255  0.6667  0.9255  0.6549  0.0667\n",
542 |        " 0.0000  0.1059  0.7765  0.9961  0.7804  0.9451  0.9255  0.1961  0.0000  0.0000\n",
543 |        " 0.0000  0.0000  0.0667  0.8627  0.9922  0.9922  0.3686  0.0000  0.0000  0.0000\n",
544 |        " 0.0000  0.0000  0.4000  0.9961  0.8588  0.9922  0.7608  0.1333  0.0000  0.0000\n",
545 |        " 0.0000  0.4235  0.9961  0.4980  0.0000  0.4902  0.9961  0.8980  0.1569  0.0000\n",
546 |        " 0.1686  0.9098  0.7255  0.0353  0.0000  0.0118  0.3529  0.9961  0.8784  0.3647\n",
547 |        " 0.8392  0.9490  0.1490  0.0000  0.0000  0.0000  0.0000  0.5333  0.9922  0.9412\n",
548 |        " 0.9020  0.9020  0.0000  0.0000  0.0000  0.0000  0.0000  0.0745  0.5294  0.9922\n",
549 |        " 0.9059  0.9294  0.0745  0.0000  0.0000  0.0000  0.0000  0.0000  0.0667  0.8824\n",
550 |        " 0.3451  0.9569  0.6627  0.0667  0.0000  0.0000  0.0000  0.0000  0.1020  0.9098\n",
551 |        " 0.0000  0.4784  0.9647  0.8824  0.6157  0.4549  0.4549  0.6824  0.8471  0.9176\n",
552 |        " 0.0000  0.0000  0.1961  0.8627  0.9922  0.9922  0.9922  0.9961  0.9255  0.2118\n",
553 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
554 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
555 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
556 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
557 |        "\n",
558 |        "Columns 20 to 27 \n",
559 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
560 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
561 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
562 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
563 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
564 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
565 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
566 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
567 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
568 |        " 0.0667  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
569 |        " 0.8784  0.5255  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
570 |        " 0.2706  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
571 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
572 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
573 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
574 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
575 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
576 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
577 |        " 0.1569  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
578 |        " 0.6275  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
579 |        " 0.8627  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
580 |        " 0.9255  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
581 |        " 0.2118  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
582 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
583 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
584 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
585 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
586 |        " 0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000\n",
587 |        "[torch.FloatTensor of size 28x28]"
588 |       ]
589 |      },
590 |      "execution_count": 13,
591 |      "metadata": {},
592 |      "output_type": "execute_result"
593 |     }
594 |    ],
595 |    "source": [
596 |     "mb_example[0][0,0]"
597 |    ]
598 |   },
599 |   {
600 |    "cell_type": "markdown",
601 |    "metadata": {},
602 |    "source": [
603 |     "Note that pytorch transformed the values that went from 0 to 255 into floats that go from 0. to 1.\n",
604 |     "\n",
605 |     "We can have a look at the first pictures and draw them."
606 |    ]
607 |   },
608 |   {
609 |    "cell_type": "code",
610 |    "execution_count": 14,
611 |    "metadata": {},
612 |    "outputs": [
613 |     {
614 |      "data": {
615 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAB4CAYAAADi1gmcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAADtZJREFUeJzt3Xuw1eMex/H3U7pIISRNdSq30WAqzsitGKZRjeNSoWGK45YIZciYOuYcVIhScY5bUmcaHISZmM4wLnMaUhmKxCl0O11cplGiUL/zx97Ps7/LXtl7rb3W/q31rM9rxng8e+/1e/rtX1/P7/vcXJIkiIhI+WuSdgNERKQwFNBFRCKhgC4iEgkFdBGRSCigi4hEQgFdRCQSCugiIpGoyIDunOvqnHvNObfVObfZOfewc26ftNsVE+fcUOfcSufcDufcF865Pmm3qdw55374zT+7nXMz0m5XDJxzLZxzM51za51z251zHzrnBqTdrlxVZEAH/g58DXQAegJnANen2qKIOOf6AfcBfwbaAH2BL1NtVASSJGnt/wHaAz8Bz6fcrFjsA6ynKhYcAPwF+JdzrmuKbcpZpfZKuwEPJ0myE9jsnFsAHJtym2LyN+CuJEkWVf/3/9JsTKSGUNUp+U/aDYlBkiQ7gL+aqvnOua+AE4E1abQpH5XaQ58GDHXOtXLOdQQGAAtSblMUnHNNgT8C7Zxzq51zG6pTWvum3bbIXA7MSbR3R1E459oDRwMr0m5LLio1oL9DVY98G7ABWAq8nGqL4tEeaEZVD7IPVSmtXsD4NBsVE+fcH6hKDcxOuy0xcs41A+YCs5Mk+Szt9uSi4gK6c64J8G9gHrAfcAjQlqqcrzTcT9X/npEkyaYkSb4FpgADU2xTbIYDC5Mk+SrthsSmOj78E/gZGJVyc3JWcQEdOAjoTFUOfVeSJN8Bs1DAKYgkSbZS9dajVEDxDEe984JzzjlgJlVvmYOTJPkl5SblrOICenWP8StgpHNuH+fcgVTlI5el27KozAJudM4d6pxrC4wG5qfcpig4504FOqLZLcXwD6A78KckSX6q65tLUcUF9GqDgP7AN8Bq4FdgTKotisvdwBLgv8BK4ENgQqotisflwLwkSban3ZCYOOe6ACOoGvPZbOb6X5Zy03LiNEguIhKHSu2hi4hERwFdRCQSCugiIpFQQBcRiYQCuohIJBp7cy5Nqamby/PndG/rlu+9Bd3f+tCzWzz1urfqoYuIREIBXUQkEgroIiKRUEAXEYlEpZ5YJEXy/fffA3D22WeHutNPPz2UH3rooUZvk0ilUA9dRCQSCugiIpFQykUKavLkyQB89NFHoW7//fdPqzkiWW3btg2AkSNHhrpnnnkGgJtvvjnU+ecZYOvWrQAceuihoW7s2LGhfN996R96ph66iEgkouyh+//TAjzxxBMAvP3227W+78wzzwzl4cOHh/Ill1wCwL776qD6+nj//fdDeeLEiQBUneZV5dlnn230Non8npUrVwKZz6Z/ZqdPnx7qmjZtGso+hjRpUrr94NJtmYiI5EQBXUQkEmWfctm5cycARx55ZKjbsmVLKO/ZsweA7t27h7px48YBsGjRolB34403hvKcOXMAuPXWW0PdwIEDC9nsKPz4448ADBs2LNT519arr7461NlBpEq3fv36UO7duzcA9hjIxYsXh3Lnzp0br2EV5sUXX6xV51MpPmYATJ069Xc/xz7npUA9dBGRSCigi4hEoixTLvaVaNSoUQBs3Lgx1NlR6kGDBgFw8MEHh7oWLVoAcOmll4a6O++8M5TfeOMNAG6//fZQd9ZZZ4Vyy5YtG/YHiIRf0v/FF1+EulatWgFwzz33pNKmUvfYY4+F8tdffw1kplxOPvnkUPbPtp0X7e+vNEy7du1q1Z144okALFmypN6fU2q/D/XQRUQi4WzvoBEU5GI//PBDKB9wwAFVH2z+HNu3bw/l/fbbrxCXbEwlfeqL71UCdOjQAcicc+57oFdddVVjNCdXqZ9YtGrVqlAeMWIEkLlGwt5L/0zbujvuuAOoWSsBcMQRR4RyPj1Gv6HaggULQp39/ByU9LNrbdiwAYAuXbrk/LN2sHr58uWhXOQV0TqxSESkkiigi4hEoixTLr/88kso9+nTB8hcfu5fpwA6duxYiEs2ppJ7bf35559D+YYbbgjlmTNnAplzcR9//PFiNaMQUk+5WH4e/+jRo0PdU089VXPBLCmXbHWHH354KHfq1AmASZMmhTo/331vxo8fX+sz77777nr+KTKU3LO7N7t37wZqfgcAc+fOBeC5554LdQsXLgxlOxnDW7duXSgXOdYo5SIiUknKsodu+dWep556aqizPRa/jWvr1q0LfemwShUKOpWx5Ho5dnVjt27dai5Y/eysWbMm1JX46saS6qFnYwedly5dCtT0HAHefPPNWt9n/w77XratW7t2bShn+/1cc801ABx22GGhLvYeen35QWiA+++/v9bX1UMXEZGiUEAXEYlEWa4UtfzKOru3+TvvvBPKfsCnoYcT+0EUyD5/2A8WjhkzpkHXKUV33XVXKNtXeT/nvKFpFps+6N+/PwDLli0LdT169AAyB76bNWvWoGuWKruRmd8Qzm4M5+eM+w3kAD7++ONQtoOq9WUHQyXTBx98UKvOrubNtuI0Teqhi4hEQgFdRCQSZZ9y8V599dVQ7tq1ayj7I+hOOOGEUGePm/s97733XijbUX8/0+OVV14Jdccee2xuDS4DPsVhX+Pt63lD9oK26ZN+/fqF8o4dO2pdx6df7O+jb9++eV+7nPmtLuzWCn6TNKhJieUye63U0gZps1uHrF69utbX/Vx/gObNmzdKm+pLPXQRkUhE00O3Bzr7+btQM1h6xRVXhLpvvvkGyNyW1G745eeezps3L9RddNFFofzAAw8AcW6ja+ecX3jhhUBmb6+hg8t+Zd4pp5wS6mxv3J8cdfzxx4c6P0/69ddfD3WV2kP37IC8HUD297JXr16hzs4vz+a6664D4OWXXy5gC8uXPTjazuH3bNwoNeqhi4hEQgFdRCQS0aRcLDsv2i/9v/jii0PdbbfdBmS+wttBOj/QYVM3Jb6kvWCynahjB5T963m+fPrEplmmTZsWyldeeWWtOv+9ee7RHRW/UZo9YSvbAOhrr70WynXN2ffPtj0ovZJ9+umnWesPOeQQIDOdVWrUQxcRiUSUPXSrTZs2ALz00kuhzvc4bQ/d8meKVkqvfG98z2/w4MGhLp8Vmnag1Q842V7l+eefH8rfffcdADNmzAh1Z5xxBgDHHXdczteOzaOPPgrUvHlC9qmkdsWp1I/flttv+PdbBx54IJA5AaPUqIcuIhIJBXQRkUhEn3Lx/D7SAJ999hkA3bt3r1UHNXOte/bsGeratm1b7CaWhGyrQocMGdKgz7TpLv+Z7du3D3X2cN3JkycDmRt2HXPMMQ26frmzp+o8/fTTQGbKyh4Mnc/h3H7DL5vG8WmumGzbti2U/fNl06p+3cnixYuz/vz06dOL2LrCUA9dRCQSCugiIpGIPuXiX6Ns2uCRRx4BYOTIkaHOHm7s51rblIudkx7zZkabN28OZZ8eOeqooxr0mdmOS7PL0WfPnh3KEydOrPV1uxS7kvhUy7nnnhvq/DL/vW2SdtJJJ+V8HX///ZYYEE/KxaaR7KZ8K1asADI31bNp13KlHrqISCSi7KHbXqZf/WZXGWbb9tXW+UE4ewqS3UzqySefrPX1WBRqe9y6Pt9uKHXLLbfU+rpdKVqpc6r9c2ZP4PJvOOedd16omzp1aoOus2XLFgCaNCnv/t2ePXtCedy4cUDNRnq//brne+r14U+OsoPQ119/fSjbgX7PP+f2jcevhi6G8v4NiohIoIAuIhIJl8vJJgXQKBezm0l9+eWXQObJI36Tnbo8+OCDoTx27NhQ9qfGfP7556GugAOl+Z7YW5B726FDh1D2g5l2sCifAdLly5eHst/YyD53Ns2zadMmoGhploachtwoz64dQPa/C3t//H2zKauGbolwwQUXAJl70NsTunKQ6rO7ZMmSUPYHOds0iD3ZyW/Gt2HDhkJcuk5HH310KK9cuTKfj6jXvVUPXUQkEgroIiKRiHKWy7fffhvKfll5fdMslh3Btq+9fj/1e++9N9TZ9Ew5s0vH/Z9vxIgRoW7+/Pmh7Ef77V7yBx10EAAvvPBCqLM7J9r76NkUWSXOaPFL7yFz7UO2dOi7774LFHbnyVgOOLdpUc9uO9G7d+9Qfv755wEYOnTo736m/fvgY0k+s4rsTKViUg9dRCQSUQ6KdurUKZR37doFwLp160JdPvsZ79y5M5R9j6ZFixahbm+nnOQh1YEl29vu168fANu3bw91dq6yf3ayDdrZOjtv189t9wdxQ6P2yktyUNTuF9+tW7eaC1bfSzsnf9SoUQW/vu892sHrPE8vSvXZPe2000LZ72lu56HbtSSjR48GMgdSPX8eAmRfa5JtPntd7N+bbG+p9aBBURGRSqKALiISiShTLgsXLgzlAQMGADBs2LBQN2XKFABatmxZ78+0aQe/NcCvv/4a6vzy6QJI9bXV8vtC+yXokLlferb0ik+f2DSBfW1NeWOzkkq5rFq1CsjclsLOL/epKr+WAlK/f3VJ9dm1kyGyLcOvi9+L3x9kDjVHWJYApVxERCpJlD10a+bMmQDcdNNNoc5vzWoPP+7Ro0etn33rrbdC2W6f63vj9jDZLl26FKjFpdNDj1BJ9dAnTZoEwPjx40OdfdvxUxTz2RI3Jak+u3awcsKECQDMmjUr1K1du7bWz1x77bWh7N8qmzdvXojmFJp66CIilUQBXUQkEtGnXDw71/ecc84BMjfXysbeGzsY5QcLC5hmsZRyKZ7UUy729KXLLrsMyEwV2M3RNm7cWIhLNiY9u8WjlIuISCVRQBcRiUSUm3Nl07lz51Au4DJ9kZx88sknoexntNg0i916QSRX6qGLiESiYgZFy4gGloon9UHRyOnZLR4NioqIVBIFdBGRSCigi4hEQgFdRCQSCugiIpFQQBcRiYQCuohIJBp7HrqIiBSJeugiIpFQQBcRiYQCuohIJBTQRUQioYAuIhIJBXQRkUgooIuIREIBXUQkEgroIiKRUEAXEYmEArqISCQU0EVEIqGALiISCQV0EZFIKKCLiERCAV1EJBIK6CIikVBAFxGJhAK6iEgkFNBFRCKhgC4iEgkFdBGRSCigi4hE4v+X4lpRbDynPQAAAABJRU5ErkJggg==\n",
616 |       "text/plain": [
617 |        "<matplotlib.figure.Figure at 0x230851494e0>"
618 |       ]
619 |      },
620 |      "metadata": {},
621 |      "output_type": "display_data"
622 |     }
623 |    ],
624 |    "source": [
625 |     "fig = plt.figure()\n",
626 |     "for i in range(0,4):\n",
627 |     "    sub_plot = fig.add_subplot(1,4,i+1)\n",
628 |     "    sub_plot.axis('Off')\n",
629 |     "    plt.imshow(mb_example[0][i,0].numpy(), cmap='Greys')\n",
630 |     "    sub_plot.set_title(mb_example[1][i])"
631 |    ]
632 |   },
633 |   {
634 |    "cell_type": "markdown",
635 |    "metadata": {},
636 |    "source": [
637 |     "Another usual transformation we do before feeing the pictures to our neural network is to normalize the input. This means subtracting the mean and dividing by the standard deviation. We can either search for the usual values on Google or compute them from scratch."
638 |    ]
639 |   },
640 |   {
641 |    "cell_type": "code",
642 |    "execution_count": 15,
643 |    "metadata": {},
644 |    "outputs": [
645 |     {
646 |      "data": {
647 |       "text/plain": [
648 |        "(0.1306604762738429, 0.30810780717887876)"
649 |       ]
650 |      },
651 |      "execution_count": 15,
652 |      "metadata": {},
653 |      "output_type": "execute_result"
654 |     }
655 |    ],
656 |    "source": [
657 |     "mean = torch.mean(trn_set.train_data.type(torch.FloatTensor))/255.\n",
658 |     "std = torch.std(trn_set.train_data.type(torch.FloatTensor))/255.\n",
659 |     "mean,std"
660 |    ]
661 |   },
662 |   {
663 |    "cell_type": "markdown",
664 |    "metadata": {},
665 |    "source": [
666 |     "We divide by 255 to get the means of our data when it's convereted into floats from 0. to 1.\n",
667 |     "\n",
668 |     "Then we go back to creating a transfrom and add the normalization. Note that we use the same mean and std for the test set. Afterward, we reload our datasets, adding this transform."
669 |    ]
670 |   },
671 |   {
672 |    "cell_type": "code",
673 |    "execution_count": 16,
674 |    "metadata": {},
675 |    "outputs": [],
676 |    "source": [
677 |     "tsfms = transforms.Compose([transforms.ToTensor(), transforms.Normalize((mean,), (std,))])\n",
678 |     "trn_set = datasets.MNIST(PATH, train=True, download=True, transform=tsfms)\n",
679 |     "tst_set = datasets.MNIST(PATH, train=False, download=True, transform=tsfms)"
680 |    ]
681 |   },
682 |   {
683 |    "cell_type": "code",
684 |    "execution_count": 17,
685 |    "metadata": {},
686 |    "outputs": [],
687 |    "source": [
688 |     "trn_loader = torch.utils.data.DataLoader(trn_set, batch_size=64, shuffle=True, num_workers=0)\n",
689 |     "tst_loader = torch.utils.data.DataLoader(tst_set, batch_size=64, shuffle=False, num_workers=0)"
690 |    ]
691 |   },
692 |   {
693 |    "cell_type": "markdown",
694 |    "metadata": {},
695 |    "source": [
696 |     "Now if we want to plot our digits, we will have to denormalize the images."
697 |    ]
698 |   },
699 |   {
700 |    "cell_type": "code",
701 |    "execution_count": 18,
702 |    "metadata": {},
703 |    "outputs": [],
704 |    "source": [
705 |     "mb_example = next(iter(trn_loader))"
706 |    ]
707 |   },
708 |   {
709 |    "cell_type": "code",
710 |    "execution_count": 19,
711 |    "metadata": {},
712 |    "outputs": [
713 |     {
714 |      "data": {
715 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAB4CAYAAADi1gmcAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAADclJREFUeJzt3XmMlEUax/FvcahcE8RjFUU5xMjCwIgKXgF0Af9YD9YrKEFNxFVYFRWIROMR1mjUCGrkiAoRRUAUlUMUUASBNQgeC1EJEJVDGSEqyBEU8d0/Zqummu6Z6Z7p6e63+vdJCE+qp9+pead5qLdOE0URIiISfw3yXQEREckOJXQRkUAooYuIBEIJXUQkEEroIiKBUEIXEQmEErqISCCKMqEbY1oZY94yxuwzxmw2xlyf7zqFxhjT0RhzwBgzLd91CYkxppMxZokxZrcxZpMx5h/5rlMoQri3RZnQgfHA78BfgEHARGNM5/xWKTjjgdX5rkRIjDGNgDnAfKAV8E9gmjHm9LxWLACh3NuiS+jGmGbAVcADURTtjaJoBTAXGJzfmoXDGDMQ2AV8kO+6BOYMoDUwLoqiQ1EULQFWos9uNgRxb4suoQOnA4eiKNrglf0XUAs9C4wxJcAYYES+6xIgU0VZl1xXJEBB3NtiTOjNgd2Hle0GWuShLiH6NzA5iqKt+a5IgNYDO4BRxpjGxpj+QG+gaX6rFYQg7m0xJvS9QMlhZSXAnjzUJSjGmDKgLzAu33UJURRFB4EBwN+BciqegmYB2/JZrxCEcm8b5bsCebABaGSM6RhF0cb/l3UDvsxjnULRB2gLbDHGQMXTUENjzF+jKOqex3oFI4qitVS0HAEwxvwHmJq/GoUjhHtrinH7XGPMTCAChgBlwALg/CiKlNTrwBjTlMSnn5FUJPihURTtzEulAmOM6UpFo6QBMAz4F3BGFEW/5bViAQjh3hZjlwtU/LKaUNFnNoOKhKNkXkdRFO2Poqjc/qGie+uAknlWDQa2U/HZ/RvQL04Jp8DF/t4WZQtdRCRExdpCFxEJjhK6iEgglNBFRAKhhC4iEggldBGRQOR6YZGm1NQs1Z4S6dC9rVlt7y3o/qZDn936k9a9VQtdRCQQSugiIoFQQhcRCYQSuohIIJTQRUQCoYQuIhIIJXQRkUAooYuIBEIJXUQkEEroIiKBUEIXEQmEErqISCByvTlXQfjxxx8BWLhwoSt7/PHHXfzDDz8AMH78eFd2/fXX56h28bF06VIAXnnlFVd21113AVBaWpqPKokUNbXQRUQCoYQuIhIIE0U53Yo45/seb926FYBzzz3Xle3btw+AX3/9Ne3rDBs2zMX33HMPAO3bt89GFQ9X0HtKv//++y6+9tprAdi1a5cra9WqFQAff/yxK+vYsWPSdf78808Xl5eXu/jll18GKrtuAI466qi6VtsKZj/0zZs3u/iWW25x8eLFiwE45phjXNmmTZtc3LJly/qsVkF/drNl7dq1Lva7G//44w8Ann76aVdmP8/23wrAkUceWZtvq/3QRUSKSZAtdL/10rNnTwB27NiRtevbVug333zjykpKSrJ1+YJr5axatcrFvXr1cvHBgwerfM+aNWtc3L1796TXDxw44OKmTZsmvT5o0CAX+62gOoplC91/mpk0aRIADzzwgCv75Zdfkt7TvHlzF3/00UcuLisrq48qWgX32a2rn3/+2cUPPvggUNnqBti7d29a13n77bddfPnll9emKmqhi4gUEyV0EZFABNPlsmfPHhd37tzZxdu2bUvr/f6jaIsWLQBYvnx5te/x566PGjUqre+ThoJ5bLXdIkOGDHFl06dPT/q6Cy+80MW33XYbANddd50rMyb5R7r99ttdPGHChKTX/YHQ/fv3Z1Lt6sSyy8XvMunTp0/S6/4g29y5cwE47rjjXNk555zj4gULFgDQr1+/bFcTCuizWxfr16938ZVXXplU3rZtW1d20003ufiaa65JupadRPDMM8+4Mn+QOgPqchERKSaxXylqpwpNnDjRldXUKrcDmFOmTHFll1xyiYuPOOIIAEaPHu3K/IGQn376CUhcSXr11VcD0K5du8x+gAJmn3pStcoBGjWq+PiMHDnSlaU74FPdgKpUeP3114HUq5T9lvq8efNc3KxZMwBeeOEFV3bo0CEX2yeor776ypXVchpdED7//HMX33///QAsWrTIlflTbu2gqP0boEGD6tvEn3zyCQDffvutK/OfuPxJBtmgFrqISCCU0EVEAhH7QVH7KNOhQ4dqv85/rFy2bBkAPXr0SPv7PP/88y62j62+K664AoC33nor7WtWoWAGluzPnOrnBTjttNMA2LBhQ8bXvvXWW13sdw9YxToo6ndF2e47u1kcQLdu3QBYsWKFK7PdLFDZBXniiSe6MttF6PNXSftz1uuoYD67qfifoxtvvBFI/Pdq5/v7n/cnn3zSxf59rs6bb77p4oEDBwKJay2+/PJLF5900klpXRMNioqIFBcldBGRQMR+lku67Ag2ZNbVUszeeeedal+fOXNmxte0S6X97RlSmT17dsbXDoG/pN92tfjdTy+++CKQ+Pjvz2IZOnQokLqbBeCiiy5KumbI/G4Wf9aa/Xwdf/zxruyNN94A4Mwzz3Rl6Xaz+L777jsX2zUAkydPdmUZdLNkTC10EZFAFE0L/dNPP3Xx77//DlTON5dK/oDNypUrk163g78AnTp1yvj6tvXiz/X12Zajv/1r6PyNymwr0eev1D322GOByk26IHEdQKoBZH+u9Lhx44DKNQSh+/rrr1383HPPJb1u54kDnHLKKdVeyw6arl692pX5W+nabYz9rZ+HDx8OQMOGDTOpdq2phS4iEggldBGRQMT+uWvdunVpfd2cOXNc/NtvvwHqcknF3zfe3wva8uc3N2nSJOPrz5o1q9rX7RYMxTRwbeeOQ+Ie+5bfVZCq26Aml156qYu7du2a8fvj7NVXX3Wx3/U0duxYANq0aZP0HpsfIHENwGuvvQbAfffd58pOPvlkF9sN6bI4rz9jaqGLiAQili10f+Dn0UcfTXrdH4Cwm2b17t3blaU6IUcq+Nve1ge/xSQV/AHK1q1bu9hvHdbFs88+m5XrxJH/ROOvFrdTB999911XZgc4/ffU9DvwTyyyky3ySS10EZFAKKGLiAQill0u/oki/jxSq7S01MUzZszISZ1CsXv37mpf91eP+qe1pKu8vDzj94TOX7XpHyZ87733AomDptbGjRtdnOqe+qcUFdOc/sPdeeedLrYDoZD6dKF0+QP2/mppe3h8PqmFLiISCCV0EZFAxLLLpSZjxozJynX8veJr6oooFlu3bnWxfyyfZMfZZ5/t4g8++KDKr3vkkUdc7B+JZtnDoKF2G0yF4rzzznNxy5YtXbxv3z4gsTvLrkvx56H7bNeYP7POPzC6EKiFLiISiCBb6HZFFySukkuXbZn7p8LYAaqqNG7cOOPvE4rOnTu7uEuXLkDinNwsnOIk/2dP6KpqsN+uXMznasVCctVVV7nY31ju+++/B2Dnzp2uzOaNp556KuW1li9fDsBZZ52V9Xpmi1roIiKBUEIXEQlEkF0u8+fPr9P79+zZAyRuF5BKWVmZi+1JMnG3dOlSF9tNiL744gtX1rdvXxfffffdALRv396V2RNg/MGmNWvWuHjAgAFA4iZgkj57ILe/z7e/1YXtLvCXuUsFf4uFU089FUicO/7ee+8lveexxx5zsX+SUaFSC11EJBBK6CIigTD+XOscyMo3s0dBQeUsllSPSwAvvfQSADfccEO11/zss89c/NBDDwE1H5LsL7n2D5utI1PL9+X0F1lbHTp0ACpnaxzOzkSop5kxtb23kMf7e/DgQRf36tULgFWrVrmyo48+2sVVHQ6dI7H57Nr71L17d1dm11j48/r92N9PPQ/SurdqoYuIBCKWg6L+/5Q1zf8eMWIEAIMGDXJldg/j2bNnu7I77rjDxXa/dWNS/6c4ZcoUoLg3PaovixcvBhIHUv3Vk8XIP+Dcb5lbdqBZqucfxj148GAgceWzba3bnAF5b5VnLF61FRGRKimhi4gEIpZdLr7Ro0cDMG/evJSv28GP/v37u7Lt27cDifuq+2xXS6dOnVzZ8OHDXWwHWOP2OBYHtrvLbp4kid1Pln9A980335zL6sSKP6C8cuVKF9tJFO3atXNlS5YsAaBFixY5ql32KSOJiAQi9i30E044Ia2v+/DDD9O+pt18xw7QQeLWmyK5YKcUT506Nek1f6D4/PPPz1md4mbZsmUu9p/SLTutGaCkpCQXVapXaqGLiARCCV1EJBCx73KxG+74c8JrWi1nTx65+OKLXZndaAoq56OqmyX7pk+fDiTuU5/n1Y0Fa8uWLUDiPHTriSeeyHV1YsWuNanqVK0hQ4YAcMEFF+SsTrmgFrqISCCU0EVEAhH7Lpc2bdoAiUt4/aXQixYtAuCyyy5zZQ8//DAQj/2NQ9OzZ08AFi5c6Mr8I71KS0sT/i5m69atAxL3O580aRIAPXr0yEud4sJ2U02bNs2V+bOBJkyYAIS3liSsn0ZEpIjFcvvcwMVmC9IYitX2uXYbZ3uCFsDYsWNzXY1MFMxn184/9w93t4c8QywPddf2uSIixUQJXUQkEOpyKTwF89gaoFh1ucSQPrv1R10uIiLFRAldRCQQSugiIoFQQhcRCUSuB0VFRKSeqIUuIhIIJXQRkUAooYuIBEIJXUQkEEroIiKBUEIXEQmEErqISCCU0EVEAqGELiISCCV0EZFAKKGLiARCCV1EJBBK6CIigVBCFxEJhBK6iEgglNBFRAKhhC4iEggldBGRQCihi4gEQgldRCQQSugiIoFQQhcRCYQSuohIIP4HKIMA+7Qjr00AAAAASUVORK5CYII=\n",
716 |       "text/plain": [
717 |        "<matplotlib.figure.Figure at 0x230847420f0>"
718 |       ]
719 |      },
720 |      "metadata": {},
721 |      "output_type": "display_data"
722 |     }
723 |    ],
724 |    "source": [
725 |     "fig = plt.figure()\n",
726 |     "for i in range(0,4):\n",
727 |     "    sub_plot = fig.add_subplot(1,4,i+1)\n",
728 |     "    sub_plot.axis('Off')\n",
729 |     "    plt.imshow(mb_example[0][i,0].numpy() * std + mean, cmap='Greys', interpolation=None)\n",
730 |     "    sub_plot.set_title(mb_example[1][i])"
731 |    ]
732 |   },
733 |   {
734 |    "cell_type": "markdown",
735 |    "metadata": {},
736 |    "source": [
737 |     "# Create a model"
738 |    ]
739 |   },
740 |   {
741 |    "cell_type": "markdown",
742 |    "metadata": {},
743 |    "source": [
744 |     "It's always a good idea to create a model as a subclass of nn.Module. That way, we can use all the features this class provides.\n",
745 |     "\n",
746 |     "We override the init function (but still call the init function of nn.Module) to define our custom layers (here two linear layers) and we have to define the forward function, which explains how to compute the output.\n",
747 |     "\n",
748 |     "The first line of the forward function is to flatten our input, since we saw it has four dimensions: minibatch by channel by height by width. We only keep the minibatch size as our first dimension (x.size(0)) and the -1 is to tell pytorch to determine the right number for the second dimension."
749 |    ]
750 |   },
751 |   {
752 |    "cell_type": "code",
753 |    "execution_count": 20,
754 |    "metadata": {},
755 |    "outputs": [],
756 |    "source": [
757 |     "class SimpleNeuralNet(nn.Module):\n",
758 |     "    def __init__(self, n_in, n_hidden, n_out):\n",
759 |     "        super().__init__()\n",
760 |     "        self.linear1 = nn.Linear(n_in, n_hidden)\n",
761 |     "        self.linear2 = nn.Linear(n_hidden, n_out)\n",
762 |     "    \n",
763 |     "    def forward(self,x):\n",
764 |     "        x = x.view(x.size(0),-1)\n",
765 |     "        x = F.relu(self.linear1(x))\n",
766 |     "        return F.log_softmax(self.linear2(x), dim=-1)"
767 |    ]
768 |   },
769 |   {
770 |    "cell_type": "markdown",
771 |    "metadata": {},
772 |    "source": [
773 |     "Then we can instanciate the class with our input size (28 * 28), an hidden size of 100 layers and 10 outputs (as many as digits).\n",
774 |     "\n",
775 |     "The optimizer will automatically do the Stochastic Gradient Descent for us (or any of its variant if we want)."
776 |    ]
777 |   },
778 |   {
779 |    "cell_type": "code",
780 |    "execution_count": 28,
781 |    "metadata": {},
782 |    "outputs": [],
783 |    "source": [
784 |     "net = SimpleNeuralNet(28*28,100,10)\n",
785 |     "optimizer = optim.SGD(net.parameters(),lr=1e-5)"
786 |    ]
787 |   },
788 |   {
789 |    "cell_type": "markdown",
790 |    "metadata": {},
791 |    "source": [
792 |     "Now we're ready to write our training loop. To compute the gradient automatically, pytorch requires us to put the torch tensors with our inputs and labels into Variable objects, that way it'll remember the transformation these go through until we arrive at our loss function. We then call loss.backward() to compute all the gradients (which will then be in the grad field of any variable).\n",
793 |     "\n",
794 |     "The optimizer takes care of the step of our gradient descent in the optimizer.step() function. Since the gradients are accumulated, we have to tell pytorch when to reinitialize them (which the purpose of the optimizer.zero_grad() command at the beginning)."
795 |    ]
796 |   },
797 |   {
798 |    "cell_type": "code",
799 |    "execution_count": 29,
800 |    "metadata": {},
801 |    "outputs": [],
802 |    "source": [
803 |     "def train(nb_epoch):\n",
804 |     "    for epoch in range(nb_epoch):\n",
805 |     "        running_loss = 0.\n",
806 |     "        corrects = 0\n",
807 |     "        print(f'Epoch {epoch+1}:')\n",
808 |     "        for data in trn_loader:\n",
809 |     "            #separate the inputs from the labels\n",
810 |     "            inputs,labels = data\n",
811 |     "            #wrap those into variables to keep track of how they are created and be able to compute their gradient.\n",
812 |     "            inputs, labels = Variable(inputs), Variable(labels)\n",
813 |     "            #Put the gradients back to zero\n",
814 |     "            optimizer.zero_grad()\n",
815 |     "            #Compute the outputs given by our model at this stage.\n",
816 |     "            outputs = net(inputs)\n",
817 |     "            _,preds = torch.max(outputs.data,1)\n",
818 |     "            #Compute the loss\n",
819 |     "            loss = F.nll_loss(outputs, labels)\n",
820 |     "            running_loss += loss.data[0] * inputs.size(0)\n",
821 |     "            corrects += torch.sum(labels.data == preds)\n",
822 |     "            #Backpropagate the computation of the gradients\n",
823 |     "            loss.backward()\n",
824 |     "            #Do the step of the SGD\n",
825 |     "            optimizer.step()\n",
826 |     "        print(f'Loss: {running_loss/len(trn_set)}  Accuracy: {100.*corrects/len(trn_set)}')"
827 |    ]
828 |   },
829 |   {
830 |    "cell_type": "markdown",
831 |    "metadata": {},
832 |    "source": [
833 |     "Now we're ready to train our model."
834 |    ]
835 |   },
836 |   {
837 |    "cell_type": "code",
838 |    "execution_count": 30,
839 |    "metadata": {},
840 |    "outputs": [
841 |     {
842 |      "name": "stdout",
843 |      "output_type": "stream",
844 |      "text": [
845 |       "Epoch 1:\n",
846 |       "Loss: 2.316405006790161  Accuracy: 10.841666666666667\n"
847 |      ]
848 |     }
849 |    ],
850 |    "source": [
851 |     "train(1)"
852 |    ]
853 |   },
854 |   {
855 |    "cell_type": "markdown",
856 |    "metadata": {},
857 |    "source": [
858 |     "96.23% accuracy is good, but that's on the training set and we may be overfitting. Let's try on the test set now to see if we're doing well or not."
859 |    ]
860 |   },
861 |   {
862 |    "cell_type": "code",
863 |    "execution_count": null,
864 |    "metadata": {},
865 |    "outputs": [],
866 |    "source": [
867 |     "def validate():\n",
868 |     "    running_loss = 0.\n",
869 |     "    corrects = 0\n",
870 |     "    for data in tst_loader:\n",
871 |     "        #separate the inputs from the labels\n",
872 |     "        inputs,labels = data\n",
873 |     "        #wrap those into variables to keep track of how they are created and be able to compute their gradient.\n",
874 |     "        #Even if we don't require the gradient here, a nn.Module expects a variable.\n",
875 |     "        inputs, labels = Variable(inputs), Variable(labels)\n",
876 |     "        #Compute the outputs given by our model at this stage.\n",
877 |     "        outputs = net(inputs)\n",
878 |     "        _,preds = torch.max(outputs.data,1)\n",
879 |     "        #Compute the loss\n",
880 |     "        loss = F.nll_loss(outputs, labels)\n",
881 |     "        running_loss += loss.data[0] * inputs.size(0)\n",
882 |     "        corrects += torch.sum(labels.data == preds)\n",
883 |     "    print(f'Loss: {running_loss/len(tst_set)}  Accuracy: {100.*corrects/len(tst_set)}')"
884 |    ]
885 |   },
886 |   {
887 |    "cell_type": "code",
888 |    "execution_count": null,
889 |    "metadata": {},
890 |    "outputs": [],
891 |    "source": [
892 |     "validate()"
893 |    ]
894 |   },
895 |   {
896 |    "cell_type": "markdown",
897 |    "metadata": {},
898 |    "source": [
899 |     "So we weren't overfitting!"
900 |    ]
901 |   }
902 |  ],
903 |  "metadata": {
904 |   "kernelspec": {
905 |    "display_name": "Python 3",
906 |    "language": "python",
907 |    "name": "python3"
908 |   },
909 |   "language_info": {
910 |    "codemirror_mode": {
911 |     "name": "ipython",
912 |     "version": 3
913 |    },
914 |    "file_extension": ".py",
915 |    "mimetype": "text/x-python",
916 |    "name": "python",
917 |    "nbconvert_exporter": "python",
918 |    "pygments_lexer": "ipython3",
919 |    "version": "3.6.4"
920 |   }
921 |  },
922 |  "nbformat": 4,
923 |  "nbformat_minor": 2
924 | }
925 | 


--------------------------------------------------------------------------------
/LM_wikitext_mixup.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 1,
   6 |    "metadata": {},
   7 |    "outputs": [],
   8 |    "source": [
   9 |     "%matplotlib inline\n",
  10 |     "%reload_ext autoreload\n",
  11 |     "%autoreload 2"
  12 |    ]
  13 |   },
  14 |   {
  15 |    "cell_type": "code",
  16 |    "execution_count": 2,
  17 |    "metadata": {},
  18 |    "outputs": [],
  19 |    "source": [
  20 |     "from fastai.text import *"
  21 |    ]
  22 |   },
  23 |   {
  24 |    "cell_type": "markdown",
  25 |    "metadata": {},
  26 |    "source": [
  27 |     "# Load the data"
  28 |    ]
  29 |   },
  30 |   {
  31 |    "cell_type": "markdown",
  32 |    "metadata": {},
  33 |    "source": [
  34 |     "Data can be downloaded from [here](https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset). as suggested by smerity, we only add the eos flag at each end of sentence. "
  35 |    ]
  36 |   },
  37 |   {
  38 |    "cell_type": "code",
  39 |    "execution_count": 3,
  40 |    "metadata": {},
  41 |    "outputs": [],
  42 |    "source": [
  43 |     "EOS = '<eos>'\n",
  44 |     "PATH=Path('../data/wikitext')"
  45 |    ]
  46 |   },
  47 |   {
  48 |    "cell_type": "code",
  49 |    "execution_count": 4,
  50 |    "metadata": {},
  51 |    "outputs": [],
  52 |    "source": [
  53 |     "def read_file(filename):\n",
  54 |     "    tokens = []\n",
  55 |     "    with open(PATH/filename, encoding='utf8') as f:\n",
  56 |     "        for line in f:\n",
  57 |     "            tokens.append(line.split() + [EOS])\n",
  58 |     "    return np.array(tokens)"
  59 |    ]
  60 |   },
  61 |   {
  62 |    "cell_type": "code",
  63 |    "execution_count": 5,
  64 |    "metadata": {},
  65 |    "outputs": [],
  66 |    "source": [
  67 |     "trn_tok = read_file('wiki.train.tokens')\n",
  68 |     "val_tok = read_file('wiki.valid.tokens')\n",
  69 |     "tst_tok = read_file('wiki.test.tokens')"
  70 |    ]
  71 |   },
  72 |   {
  73 |    "cell_type": "code",
  74 |    "execution_count": 6,
  75 |    "metadata": {},
  76 |    "outputs": [
  77 |     {
  78 |      "data": {
  79 |       "text/plain": [
  80 |        "(36718, 3760, 4358)"
  81 |       ]
  82 |      },
  83 |      "execution_count": 6,
  84 |      "metadata": {},
  85 |      "output_type": "execute_result"
  86 |     }
  87 |    ],
  88 |    "source": [
  89 |     "len(trn_tok), len(val_tok), len(tst_tok)"
  90 |    ]
  91 |   },
  92 |   {
  93 |    "cell_type": "markdown",
  94 |    "metadata": {},
  95 |    "source": [
  96 |     "Then we numericalize the tokens."
  97 |    ]
  98 |   },
  99 |   {
 100 |    "cell_type": "code",
 101 |    "execution_count": 7,
 102 |    "metadata": {},
 103 |    "outputs": [],
 104 |    "source": [
 105 |     "cnt = Counter(word for sent in trn_tok for word in sent)"
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "code",
 110 |    "execution_count": 8,
 111 |    "metadata": {},
 112 |    "outputs": [
 113 |     {
 114 |      "data": {
 115 |       "text/plain": [
 116 |        "[('the', 113161),\n",
 117 |        " (',', 99913),\n",
 118 |        " ('.', 73388),\n",
 119 |        " ('of', 56889),\n",
 120 |        " ('<unk>', 54625),\n",
 121 |        " ('and', 50603),\n",
 122 |        " ('in', 39453),\n",
 123 |        " ('to', 39190),\n",
 124 |        " ('<eos>', 36718),\n",
 125 |        " ('a', 34237)]"
 126 |       ]
 127 |      },
 128 |      "execution_count": 8,
 129 |      "metadata": {},
 130 |      "output_type": "execute_result"
 131 |     }
 132 |    ],
 133 |    "source": [
 134 |     "cnt.most_common(10)"
 135 |    ]
 136 |   },
 137 |   {
 138 |    "cell_type": "code",
 139 |    "execution_count": 9,
 140 |    "metadata": {},
 141 |    "outputs": [],
 142 |    "source": [
 143 |     "itos = [o for o,c in cnt.most_common()]\n",
 144 |     "itos.insert(0,'<pad>')"
 145 |    ]
 146 |   },
 147 |   {
 148 |    "cell_type": "code",
 149 |    "execution_count": 10,
 150 |    "metadata": {},
 151 |    "outputs": [
 152 |     {
 153 |      "data": {
 154 |       "text/plain": [
 155 |        "33279"
 156 |       ]
 157 |      },
 158 |      "execution_count": 10,
 159 |      "metadata": {},
 160 |      "output_type": "execute_result"
 161 |     }
 162 |    ],
 163 |    "source": [
 164 |     "vocab_size = len(itos); vocab_size"
 165 |    ]
 166 |   },
 167 |   {
 168 |    "cell_type": "code",
 169 |    "execution_count": 11,
 170 |    "metadata": {},
 171 |    "outputs": [],
 172 |    "source": [
 173 |     "stoi = collections.defaultdict(lambda : 5, {w:i for i,w in enumerate(itos)})"
 174 |    ]
 175 |   },
 176 |   {
 177 |    "cell_type": "code",
 178 |    "execution_count": 12,
 179 |    "metadata": {},
 180 |    "outputs": [],
 181 |    "source": [
 182 |     "trn_ids = np.array([([stoi[w] for w in s]) for s in trn_tok])\n",
 183 |     "val_ids = np.array([([stoi[w] for w in s]) for s in val_tok])\n",
 184 |     "tst_ids = np.array([([stoi[w] for w in s]) for s in tst_tok])"
 185 |    ]
 186 |   },
 187 |   {
 188 |    "cell_type": "markdown",
 189 |    "metadata": {},
 190 |    "source": [
 191 |     "# Model"
 192 |    ]
 193 |   },
 194 |   {
 195 |    "cell_type": "markdown",
 196 |    "metadata": {},
 197 |    "source": [
 198 |     "This is the usual AWD LSTM with three layers."
 199 |    ]
 200 |   },
 201 |   {
 202 |    "cell_type": "code",
 203 |    "execution_count": 13,
 204 |    "metadata": {},
 205 |    "outputs": [],
 206 |    "source": [
 207 |     "em_sz,nh,nl = 400,1150,3\n",
 208 |     "bptt, bs = 70, 100"
 209 |    ]
 210 |   },
 211 |   {
 212 |    "cell_type": "markdown",
 213 |    "metadata": {},
 214 |    "source": [
 215 |     "Training schedule: 1cycle with either a third phase with cosine annealing or linear decay at one hundreth of the lowest lr. The second one seems to be slightly betters, but by a hair."
 216 |    ]
 217 |   },
 218 |   {
 219 |    "cell_type": "code",
 220 |    "execution_count": 17,
 221 |    "metadata": {},
 222 |    "outputs": [],
 223 |    "source": [
 224 |     "def one_cycle(steps,lr,opt_fn, div,max_mom,min_mom, wd):\n",
 225 |     "    return [TrainingPhase(steps[0], opt_fn, lr=(lr/div,lr), lr_decay=DecayType.LINEAR, \n",
 226 |     "                          momentum=(max_mom,min_mom), momentum_decay=DecayType.LINEAR, wds=wd),\n",
 227 |     "           TrainingPhase(steps[1], opt_fn, lr=(lr,lr/div), lr_decay=DecayType.LINEAR, \n",
 228 |     "                          momentum=(\n",
 229 |     "                              min_mom,max_mom), momentum_decay=DecayType.LINEAR, wds=wd),\n",
 230 |     "           TrainingPhase(steps[2], opt_fn, lr=lr/div, lr_decay=DecayType.COSINE, \n",
 231 |     "                          momentum=max_mom, wds=wd)]"
 232 |    ]
 233 |   },
 234 |   {
 235 |    "cell_type": "code",
 236 |    "execution_count": 18,
 237 |    "metadata": {},
 238 |    "outputs": [],
 239 |    "source": [
 240 |     "def one_cycle_lin(steps,lr,div,max_mom,min_mom, wd):\n",
 241 |     "    return [TrainingPhase(epochs=steps[0], opt_fn=optim.SGD, lr=(lr/div,lr), lr_decay=DecayType.LINEAR, \n",
 242 |     "                          momentum=(max_mom,min_mom), momentum_decay=DecayType.LINEAR, wds=wd),\n",
 243 |     "           TrainingPhase(epochs=steps[1], opt_fn=optim.SGD, lr=(lr,lr/div), lr_decay=DecayType.LINEAR, \n",
 244 |     "                          momentum=(min_mom,max_mom), momentum_decay=DecayType.LINEAR, wds=wd),\n",
 245 |     "           TrainingPhase(epochs=steps[2], opt_fn=optim.SGD, lr=(lr/div,lr/(div*100)), lr_decay=DecayType.LINEAR, \n",
 246 |     "                          momentum=max_mom, wds=wd)]"
 247 |    ]
 248 |   },
 249 |   {
 250 |    "cell_type": "code",
 251 |    "execution_count": 19,
 252 |    "metadata": {},
 253 |    "outputs": [],
 254 |    "source": [
 255 |     "def custom_cycle(steps, lr, opt_fn, div, max_mom, min_mom, wd):\n",
 256 |     "    return [TrainingPhase(steps[0], opt_fn, lr=(lr/div,lr), lr_decay=DecayType.LINEAR, \n",
 257 |     "                          momentum=(max_mom,min_mom), momentum_decay=DecayType.LINEAR, wds=wd),\n",
 258 |     "            TrainingPhase(steps[1], opt_fn, lr=lr, momentum=min_mom, wds=wd),\n",
 259 |     "            TrainingPhase(steps[2], opt_fn, lr=(lr,lr/div), lr_decay=DecayType.LINEAR, \n",
 260 |     "                          momentum=(min_mom,max_mom), momentum_decay=DecayType.LINEAR, wds=wd),\n",
 261 |     "            TrainingPhase(steps[3], opt_fn, lr=lr/div, lr_decay=DecayType.COSINE, \n",
 262 |     "                          momentum=max_mom, wds=wd)]"
 263 |    ]
 264 |   },
 265 |   {
 266 |    "cell_type": "markdown",
 267 |    "metadata": {},
 268 |    "source": [
 269 |     "Helper functions for the evaluation of the model at the end. TextReader is rewritten from the LanguageModelLoader class to have a constant bptt and only one batch."
 270 |    ]
 271 |   },
 272 |   {
 273 |    "cell_type": "code",
 274 |    "execution_count": 20,
 275 |    "metadata": {},
 276 |    "outputs": [],
 277 |    "source": [
 278 |     "class TextReader():\n",
 279 |     "\n",
 280 |     "    def __init__(self, nums, bptt, backwards=False):\n",
 281 |     "        self.bptt,self.backwards = bptt,backwards\n",
 282 |     "        self.data = self.batchify(nums)\n",
 283 |     "        self.i,self.iter = 0,0\n",
 284 |     "        self.n = len(self.data)\n",
 285 |     "\n",
 286 |     "    def __iter__(self):\n",
 287 |     "        self.i,self.iter = 0,0\n",
 288 |     "        while self.i < self.n-1 and self.iter<len(self):\n",
 289 |     "            res = self.get_batch(self.i, self.bptt)\n",
 290 |     "            self.i += self.bptt\n",
 291 |     "            self.iter += 1\n",
 292 |     "            yield res\n",
 293 |     "\n",
 294 |     "    def __len__(self): return self.n // self.bptt \n",
 295 |     "\n",
 296 |     "    def batchify(self, data):\n",
 297 |     "        data = np.array(data)[:,None]\n",
 298 |     "        if self.backwards: data=data[::-1]\n",
 299 |     "        return T(data)\n",
 300 |     "\n",
 301 |     "    def get_batch(self, i, seq_len):\n",
 302 |     "        source = self.data\n",
 303 |     "        seq_len = min(seq_len, len(source) - 1 - i)\n",
 304 |     "        return source[i:i+seq_len], source[i+1:i+1+seq_len].view(-1)"
 305 |    ]
 306 |   },
 307 |   {
 308 |    "cell_type": "markdown",
 309 |    "metadata": {},
 310 |    "source": [
 311 |     "Validation without reinitializing the hidden state."
 312 |    ]
 313 |   },
 314 |   {
 315 |    "cell_type": "code",
 316 |    "execution_count": 21,
 317 |    "metadata": {},
 318 |    "outputs": [],
 319 |    "source": [
 320 |     "def my_validate(model, source, bptt=2000):\n",
 321 |     "    data_source = TextReader(source, bptt)\n",
 322 |     "    model.eval()\n",
 323 |     "    model.reset()\n",
 324 |     "    total_loss = 0.\n",
 325 |     "    for inputs, targets in tqdm(data_source):\n",
 326 |     "        outputs, raws, outs = model(V(inputs))\n",
 327 |     "        p_vocab = F.softmax(outputs,1)\n",
 328 |     "        for i, pv in enumerate(p_vocab):\n",
 329 |     "            targ_pred = pv[targets[i]]\n",
 330 |     "            total_loss -= torch.log(targ_pred.detach())\n",
 331 |     "    mean = total_loss / (bptt * len(data_source))\n",
 332 |     "    return mean, np.exp(mean)"
 333 |    ]
 334 |   },
 335 |   {
 336 |    "cell_type": "markdown",
 337 |    "metadata": {},
 338 |    "source": [
 339 |     "Cache pointer"
 340 |    ]
 341 |   },
 342 |   {
 343 |    "cell_type": "code",
 344 |    "execution_count": 22,
 345 |    "metadata": {},
 346 |    "outputs": [],
 347 |    "source": [
 348 |     "def one_hot(vec, size=vocab_size):\n",
 349 |     "    a = torch.zeros(len(vec), size)\n",
 350 |     "    for i,v in enumerate(vec):\n",
 351 |     "        a[i,v] = 1.\n",
 352 |     "    return V(a)\n",
 353 |     "\n",
 354 |     "def my_cache_pointer(model, source, scale=1., theta = 0.662, lambd = 0.1279, window=200, bptt=2000):\n",
 355 |     "    data_source = TextReader(source, bptt)\n",
 356 |     "    model.eval()\n",
 357 |     "    model.reset()\n",
 358 |     "    total_loss = 0.\n",
 359 |     "    targ_history = None\n",
 360 |     "    hid_history = None\n",
 361 |     "    for inputs, targets in tqdm(data_source):\n",
 362 |     "        outputs, raws, outs = model(V(inputs))\n",
 363 |     "        p_vocab = F.softmax(outputs * scale,1)\n",
 364 |     "        start = 0 if targ_history is None else targ_history.size(0)\n",
 365 |     "        targ_history = one_hot(targets) if targ_history is None else torch.cat([targ_history, one_hot(targets)])\n",
 366 |     "        hiddens = raws[-1].squeeze() #results of the last layer + remove the batch size.\n",
 367 |     "        hid_history = scale * hiddens if hid_history is None else torch.cat([hid_history, scale * hiddens])\n",
 368 |     "        for i, pv in enumerate(p_vocab):\n",
 369 |     "            #Get the cached values\n",
 370 |     "            p = pv\n",
 371 |     "            if start + i > 0:\n",
 372 |     "                targ_cache = targ_history[:start+i] if start + i <= window else targ_history[start+i-window:start+i]\n",
 373 |     "                hid_cache = hid_history[:start+i] if start + i <= window else hid_history[start+i-window:start+i]\n",
 374 |     "                all_dot_prods = torch.mv(theta * hid_cache, hiddens[i])\n",
 375 |     "                exp_dot_prods = F.softmax(all_dot_prods).unsqueeze(1)\n",
 376 |     "                p_cache = (exp_dot_prods.expand_as(targ_cache) * targ_cache).sum(0).squeeze()\n",
 377 |     "                p = (1-lambd) * pv + lambd * p_cache\n",
 378 |     "            targ_pred = p[targets[i]]\n",
 379 |     "            total_loss -= torch.log(targ_pred.detach())\n",
 380 |     "        targ_history = targ_history[-window:]\n",
 381 |     "        hid_history = hid_history[-window:]\n",
 382 |     "    mean = total_loss / (bptt * len(data_source))\n",
 383 |     "    return mean, np.exp(mean)"
 384 |    ]
 385 |   },
 386 |   {
 387 |    "cell_type": "code",
 388 |    "execution_count": 24,
 389 |    "metadata": {},
 390 |    "outputs": [],
 391 |    "source": [
 392 |     "from fastai.rnn_reg import dropout_mask\n",
 393 |     "\n",
 394 |     "class EmbeddingDropout1(nn.Module):\n",
 395 |     "\n",
 396 |     "    \"\"\" Rewritten from EmbeddingDropout. \n",
 397 |     "    \n",
 398 |     "    Does the same thing but accept either a regular input, or an array of the form\n",
 399 |     "    [input1, input2, lambda] to mixup.\n",
 400 |     "    \"\"\"\n",
 401 |     "\n",
 402 |     "    def __init__(self, embed):\n",
 403 |     "        super().__init__()\n",
 404 |     "        self.embed = embed\n",
 405 |     "\n",
 406 |     "    def forward(self, words, dropout=0.1, scale=None):\n",
 407 |     "        if dropout:\n",
 408 |     "            size = (self.embed.weight.size(0),1)\n",
 409 |     "            mask = Variable(dropout_mask(self.embed.weight.data, size, dropout))\n",
 410 |     "            masked_embed_weight = mask * self.embed.weight\n",
 411 |     "        else: masked_embed_weight = self.embed.weight\n",
 412 |     "\n",
 413 |     "        if scale: masked_embed_weight = scale * masked_embed_weight\n",
 414 |     "\n",
 415 |     "        padding_idx = self.embed.padding_idx\n",
 416 |     "        if padding_idx is None: padding_idx = -1\n",
 417 |     "\n",
 418 |     "        if IS_TORCH_04:\n",
 419 |     "            #New here: if the input is a list, take the embeddings for the first two args, then mix them up.\n",
 420 |     "            if isinstance(words, list):\n",
 421 |     "                X1 = F.embedding(words[0], masked_embed_weight, padding_idx, self.embed.max_norm,\n",
 422 |     "                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)\n",
 423 |     "                X2 = F.embedding(words[1], masked_embed_weight, padding_idx, self.embed.max_norm,\n",
 424 |     "                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)\n",
 425 |     "                lambd = words[2].view(1,-1,1)\n",
 426 |     "                X = X1 * lambd + X2 * (1-lambd)\n",
 427 |     "            else:\n",
 428 |     "                X = F.embedding(words, masked_embed_weight, padding_idx, self.embed.max_norm,\n",
 429 |     "                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)\n",
 430 |     "        else:\n",
 431 |     "            #New here: if the input is a list, take the embeddings for the first two args, then mix them up.\n",
 432 |     "            if isinstance(words, list):\n",
 433 |     "                X1 = self.embed._backend.Embedding.apply(words, masked_embed_weight, padding_idx, self.embed.max_norm,\n",
 434 |     "                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)\n",
 435 |     "                X2 = self.embed._backend.Embedding.apply(words, masked_embed_weight, padding_idx, self.embed.max_norm,\n",
 436 |     "                   self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)\n",
 437 |     "                lambd = words[2].view(1,-1,1)\n",
 438 |     "                X = X1 * lambd + X2 * (1-lambd)\n",
 439 |     "            else:\n",
 440 |     "                X = self.embed._backend.Embedding.apply(words, masked_embed_weight, padding_idx, self.embed.max_norm,\n",
 441 |     "                  self.embed.norm_type, self.embed.scale_grad_by_freq, self.embed.sparse)\n",
 442 |     "\n",
 443 |     "        return X"
 444 |    ]
 445 |   },
 446 |   {
 447 |    "cell_type": "code",
 448 |    "execution_count": 25,
 449 |    "metadata": {},
 450 |    "outputs": [],
 451 |    "source": [
 452 |     "class RNN_Encoder1(nn.Module):\n",
 453 |     "\n",
 454 |     "    \"\"\"Rewritten from RNN_Encoder to accept multiple inputs for mixup.\n",
 455 |     "    \"\"\"\n",
 456 |     "\n",
 457 |     "    initrange=0.1\n",
 458 |     "\n",
 459 |     "    def __init__(self, ntoken, emb_sz, nhid, nlayers, pad_token, bidir=False,\n",
 460 |     "                 dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5, qrnn=False):\n",
 461 |     "        \"\"\" Default constructor for the RNN_Encoder class\n",
 462 |     "\n",
 463 |     "            Args:\n",
 464 |     "                bs (int): batch size of input data\n",
 465 |     "                ntoken (int): number of vocabulary (or tokens) in the source dataset\n",
 466 |     "                emb_sz (int): the embedding size to use to encode each token\n",
 467 |     "                nhid (int): number of hidden activation per LSTM layer\n",
 468 |     "                nlayers (int): number of LSTM layers to use in the architecture\n",
 469 |     "                pad_token (int): the int value used for padding text.\n",
 470 |     "                dropouth (float): dropout to apply to the activations going from one LSTM layer to another\n",
 471 |     "                dropouti (float): dropout to apply to the input layer.\n",
 472 |     "                dropoute (float): dropout to apply to the embedding layer.\n",
 473 |     "                wdrop (float): dropout used for a LSTM's internal (or hidden) recurrent weights.\n",
 474 |     "\n",
 475 |     "            Returns:\n",
 476 |     "                None\n",
 477 |     "          \"\"\"\n",
 478 |     "\n",
 479 |     "        super().__init__()\n",
 480 |     "        self.ndir = 2 if bidir else 1\n",
 481 |     "        self.bs, self.qrnn = 1, qrnn\n",
 482 |     "        self.encoder = nn.Embedding(ntoken, emb_sz, padding_idx=pad_token)\n",
 483 |     "        self.encoder_with_dropout = EmbeddingDropout1(self.encoder)\n",
 484 |     "        if self.qrnn:\n",
 485 |     "            #Using QRNN requires cupy: https://github.com/cupy/cupy\n",
 486 |     "            from .torchqrnn.qrnn import QRNNLayer\n",
 487 |     "            self.rnns = [QRNNLayer(emb_sz if l == 0 else nhid, (nhid if l != nlayers - 1 else emb_sz)//self.ndir,\n",
 488 |     "                save_prev_x=True, zoneout=0, window=2 if l == 0 else 1, output_gate=True) for l in range(nlayers)]\n",
 489 |     "            if wdrop:\n",
 490 |     "                for rnn in self.rnns:\n",
 491 |     "                    rnn.linear = WeightDrop(rnn.linear, wdrop, weights=['weight'])\n",
 492 |     "        else:\n",
 493 |     "            self.rnns = [nn.LSTM(emb_sz if l == 0 else nhid, (nhid if l != nlayers - 1 else emb_sz)//self.ndir,\n",
 494 |     "                1, bidirectional=bidir) for l in range(nlayers)]\n",
 495 |     "            if wdrop: self.rnns = [WeightDrop(rnn, wdrop) for rnn in self.rnns]\n",
 496 |     "        self.rnns = torch.nn.ModuleList(self.rnns)\n",
 497 |     "        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)\n",
 498 |     "\n",
 499 |     "        self.emb_sz,self.nhid,self.nlayers,self.dropoute = emb_sz,nhid,nlayers,dropoute\n",
 500 |     "        self.dropouti = LockedDropout(dropouti)\n",
 501 |     "        self.dropouths = nn.ModuleList([LockedDropout(dropouth) for l in range(nlayers)])\n",
 502 |     "\n",
 503 |     "    def forward(self, input):\n",
 504 |     "        \"\"\" Invoked during the forward propagation of the RNN_Encoder module.\n",
 505 |     "        Args:\n",
 506 |     "            input (Tensor): input of shape (sentence length x batch_size)\n",
 507 |     "\n",
 508 |     "        Returns:\n",
 509 |     "            raw_outputs (tuple(list (Tensor), list(Tensor)): list of tensors evaluated from each RNN layer without using\n",
 510 |     "            dropouth, list of tensors evaluated from each RNN layer using dropouth,\n",
 511 |     "        \"\"\"\n",
 512 |     "        \n",
 513 |     "        sl,bs = input[0].size() if isinstance(input,list) else input.size()\n",
 514 |     "        if bs!=self.bs:\n",
 515 |     "            self.bs=bs\n",
 516 |     "            self.reset()\n",
 517 |     "        #New line here: if the 4-th element of the input is 1, then reset the hidden state.\n",
 518 |     "        if is_listy(input) and input[3] == 1: self.reset()\n",
 519 |     "        with set_grad_enabled(self.training):\n",
 520 |     "            emb = self.encoder_with_dropout(input, dropout=self.dropoute if self.training else 0)\n",
 521 |     "            raw_output = self.dropouti(emb)\n",
 522 |     "            new_hidden,raw_outputs,outputs = [],[],[]\n",
 523 |     "            for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)):\n",
 524 |     "                current_input = raw_output\n",
 525 |     "                with warnings.catch_warnings():\n",
 526 |     "                    warnings.simplefilter(\"ignore\")\n",
 527 |     "                    raw_output, new_h = rnn(raw_output, self.hidden[l])\n",
 528 |     "                new_hidden.append(new_h)\n",
 529 |     "                raw_outputs.append(raw_output)\n",
 530 |     "                if l != self.nlayers - 1: raw_output = drop(raw_output)\n",
 531 |     "                outputs.append(raw_output)\n",
 532 |     "\n",
 533 |     "            self.hidden = repackage_var(new_hidden)\n",
 534 |     "        return raw_outputs, outputs\n",
 535 |     "\n",
 536 |     "    def one_hidden(self, l):\n",
 537 |     "        nh = (self.nhid if l != self.nlayers - 1 else self.emb_sz)//self.ndir\n",
 538 |     "        if IS_TORCH_04: return Variable(self.weights.new(self.ndir, self.bs, nh).zero_())\n",
 539 |     "        else: return Variable(self.weights.new(self.ndir, self.bs, nh).zero_(), volatile=not self.training)\n",
 540 |     "\n",
 541 |     "    def reset(self):\n",
 542 |     "        if self.qrnn: [r.reset() for r in self.rnns]\n",
 543 |     "        self.weights = next(self.parameters()).data\n",
 544 |     "        if self.qrnn: self.hidden = [self.one_hidden(l) for l in range(self.nlayers)]\n",
 545 |     "        else: self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.nlayers)]"
 546 |    ]
 547 |   },
 548 |   {
 549 |    "cell_type": "code",
 550 |    "execution_count": 26,
 551 |    "metadata": {},
 552 |    "outputs": [],
 553 |    "source": [
 554 |     "def get_language_model1(n_tok, emb_sz, nhid, nlayers, pad_token,\n",
 555 |     "                 dropout=0.4, dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, tie_weights=True, qrnn=False, bias=False):\n",
 556 |     "    \"\"\"\n",
 557 |     "    Same as get_language_model but creates RNN_Encoder1\n",
 558 |     "    \"\"\"\n",
 559 |     "\n",
 560 |     "    rnn_enc = RNN_Encoder1(n_tok, emb_sz, nhid=nhid, nlayers=nlayers, pad_token=pad_token,\n",
 561 |     "                 dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop, qrnn=qrnn)\n",
 562 |     "    enc = rnn_enc.encoder if tie_weights else None\n",
 563 |     "    return SequentialRNN(rnn_enc, LinearDecoder(n_tok, emb_sz, dropout, tie_encoder=enc, bias=bias))"
 564 |    ]
 565 |   },
 566 |   {
 567 |    "cell_type": "code",
 568 |    "execution_count": 28,
 569 |    "metadata": {},
 570 |    "outputs": [],
 571 |    "source": [
 572 |     "class MixUpDataLoader():\n",
 573 |     "    \n",
 574 |     "    def __init__(self, nums, bs, bptt, n_keep, alpha, backwards=False):\n",
 575 |     "        \"\"\"\n",
 576 |     "        Create an instance of a mixup dataloader.\n",
 577 |     "        \n",
 578 |     "        Args:\n",
 579 |     "        nums (np.array): the corpus numericalized\n",
 580 |     "        bs (int): batch size\n",
 581 |     "        bptt (int): bptt, the number of steps taken into account into backprop\n",
 582 |     "        n_keep (int): we reset the model every chunk of n_keep batches\n",
 583 |     "        alpha (float): parameter for the beta distribution when picking the lambdas\n",
 584 |     "        \"\"\"\n",
 585 |     "        self.bs,self.bptt, self.n_keep, self.backwards, self.alpha = bs,bptt,n_keep,backwards,alpha\n",
 586 |     "        self.data = self.batchify(nums)\n",
 587 |     "        self.n = len(self.data)\n",
 588 |     "        \n",
 589 |     "    def __iter__(self):\n",
 590 |     "        self.idx = 0\n",
 591 |     "        #Shuffle and predraw the pairs of chunks\n",
 592 |     "        self.shuffle_chunks()\n",
 593 |     "        while self.idx < len(self.chunks1):\n",
 594 |     "            #Go through the chunks of batches\n",
 595 |     "            self.i = 0\n",
 596 |     "            self.len_chunk = self.chunks1[self.idx][1] - self.chunks1[self.idx][0]\n",
 597 |     "            while self.i < self.len_chunk:\n",
 598 |     "                #Then through the batches\n",
 599 |     "                res = self.get_batch() \n",
 600 |     "                yield res\n",
 601 |     "            self.idx += 1\n",
 602 |     "\n",
 603 |     "    def __len__(self): \n",
 604 |     "        return (self.n-1) // (self.bptt) if (self.n-1) % self.bptt == 0 else (self.n-1) // (self.bptt) + 1\n",
 605 |     "\n",
 606 |     "    def batchify(self, data):\n",
 607 |     "        nb = data.shape[0] // self.bs\n",
 608 |     "        data = np.array(data[:nb*self.bs])\n",
 609 |     "        data = data.reshape(self.bs, -1).T\n",
 610 |     "        if self.backwards: data=data[::-1]\n",
 611 |     "        return T(data)\n",
 612 |     "    \n",
 613 |     "    def shuffle_chunks(self):\n",
 614 |     "        #Number of chunks: roughly (n-1) / (bptt * n_keep)\n",
 615 |     "        n_chunks = (self.n-1) // (self.bptt * self.n_keep) + 1\n",
 616 |     "        if (self.n-1) % (self.bptt * self.n_keep) == 0: n_chunks -= 1\n",
 617 |     "        n_res = self.n - 1 - (n_chunks-1) * (self.bptt * self.n_keep)\n",
 618 |     "        self.chunks1, self.chunks2 = [], []\n",
 619 |     "        #Randomly draw where we will pick the chunk of batches with a length lower than the others.\n",
 620 |     "        put_res = np.random.randint(n_chunks)\n",
 621 |     "        start = 0\n",
 622 |     "        for k in range(n_chunks):\n",
 623 |     "            #Split the data into chunks of n_keep batches. \n",
 624 |     "            if k != put_res:\n",
 625 |     "                self.chunks1.append([start, start + (self.bptt * self.n_keep)])\n",
 626 |     "                start += (self.bptt * self.n_keep)\n",
 627 |     "            else:\n",
 628 |     "                self.chunks1.append([start, start + n_res])\n",
 629 |     "                start += n_res\n",
 630 |     "        #Remove the chunk with a length different from the others because it needs to be at the same position\n",
 631 |     "        #in our two lists of chunks.\n",
 632 |     "        res = self.chunks1.pop(put_res)\n",
 633 |     "        #Shuffle the chunks\n",
 634 |     "        self.chunks1 = np.random.permutation(self.chunks1)\n",
 635 |     "        self.chunks2 = np.random.permutation(self.chunks1)\n",
 636 |     "        #Add the one with a lower length at the end.\n",
 637 |     "        self.chunks1 = np.concatenate([self.chunks1, np.array([res])])\n",
 638 |     "        self.chunks2 = np.concatenate([self.chunks1, np.array([res])])\n",
 639 |     "\n",
 640 |     "    def get_batch(self):\n",
 641 |     "        source, i = self.data, self.i\n",
 642 |     "        seq_len = min(bptt, self.chunks1[self.idx][1] - self.chunks1[self.idx][0] - self.i)\n",
 643 |     "        if self.i == 0:\n",
 644 |     "            #At the beggining of a new chunk, draw the lambdas and a pemutation of the batches.\n",
 645 |     "            self.lambd = np.random.beta(self.alpha, self.alpha, self.bs)\n",
 646 |     "            self.lambd = to_gpu(VV(self.lambd))\n",
 647 |     "            self.shuffle = to_gpu(torch.Tensor(np.random.permutation(range(self.bs))).long())\n",
 648 |     "            reinit=True\n",
 649 |     "        else: reinit = False\n",
 650 |     "        #Start indexes for each chunks.\n",
 651 |     "        start1, start2 = self.chunks1[self.idx][0] + i, self.chunks2[self.idx][0] + i\n",
 652 |     "        #Input: source1, source2, lambda, reinit\n",
 653 |     "        res1 = [source[start1:start1+seq_len], source[start2:start2+seq_len,self.shuffle], self.lambd, np.array(reinit).astype(np.int8)]\n",
 654 |     "        #Target: source1 shifted, source2 shifted, lambda\n",
 655 |     "        targ1 = source[start1+1:start1+1+seq_len].contiguous().view(-1)\n",
 656 |     "        targ2 = source[start1+1:start1+1+seq_len,self.shuffle].contiguous().view(-1)\n",
 657 |     "        res2 = [targ1,targ2, self.lambd]\n",
 658 |     "        self.i += seq_len\n",
 659 |     "        return (res1, res2)"
 660 |    ]
 661 |   },
 662 |   {
 663 |    "cell_type": "code",
 664 |    "execution_count": 29,
 665 |    "metadata": {},
 666 |    "outputs": [],
 667 |    "source": [
 668 |     "class MixUpLoss(nn.Module):\n",
 669 |     "    \"\"\"\n",
 670 |     "    A new loss function that accepts targets of the form target1, target2, lambda\n",
 671 |     "    \"\"\"\n",
 672 |     "    def __init__(self, crit):\n",
 673 |     "        super().__init__()\n",
 674 |     "        self.crit = crit()\n",
 675 |     "        \n",
 676 |     "    def forward(self, output, target):\n",
 677 |     "        if not isinstance(target, list): return self.crit(output, target).mean()\n",
 678 |     "        loss1, loss2 = self.crit(output,target[0]), self.crit(output,target[1])\n",
 679 |     "        loss1, loss2 = loss1.view(-1,target[2].size(0)), loss2.view(-1,target[2].size(0))\n",
 680 |     "        return (loss1 * target[2].unsqueeze(0) + loss2 * (1-target[2].unsqueeze(0))).mean()"
 681 |    ]
 682 |   },
 683 |   {
 684 |    "cell_type": "markdown",
 685 |    "metadata": {},
 686 |    "source": [
 687 |     "The train dataloader is a mixup, the validation dataloader a regular one. The parameters that gave he best results are 7/0.7."
 688 |    ]
 689 |   },
 690 |   {
 691 |    "cell_type": "code",
 692 |    "execution_count": 30,
 693 |    "metadata": {},
 694 |    "outputs": [],
 695 |    "source": [
 696 |     "trn_dl = MixUpDataLoader1(np.concatenate(trn_ids), bs, bptt, 7, 0.6)\n",
 697 |     "val_dl = LanguageModelLoader(np.concatenate(val_ids), bs, bptt)\n",
 698 |     "md = LanguageModelData(PATH, 0, vocab_size, trn_dl, val_dl, bs=bs, bptt=bptt)"
 699 |    ]
 700 |   },
 701 |   {
 702 |    "cell_type": "markdown",
 703 |    "metadata": {},
 704 |    "source": [
 705 |     "Dropouts. Using mixup allows us to lower this regularization."
 706 |    ]
 707 |   },
 708 |   {
 709 |    "cell_type": "code",
 710 |    "execution_count": 14,
 711 |    "metadata": {},
 712 |    "outputs": [],
 713 |    "source": [
 714 |     "drops = np.array([0.6,0.4,0.5,0.1,0.2]) #Smerity's dropouts from the github repo"
 715 |    ]
 716 |   },
 717 |   {
 718 |    "cell_type": "code",
 719 |    "execution_count": 32,
 720 |    "metadata": {},
 721 |    "outputs": [],
 722 |    "source": [
 723 |     "drops = drops * 0.7"
 724 |    ]
 725 |   },
 726 |   {
 727 |    "cell_type": "markdown",
 728 |    "metadata": {},
 729 |    "source": [
 730 |     "The rest of the parameters are the same, with the exception of weight decay that can be divided by 2."
 731 |    ]
 732 |   },
 733 |   {
 734 |    "cell_type": "code",
 735 |    "execution_count": 33,
 736 |    "metadata": {},
 737 |    "outputs": [],
 738 |    "source": [
 739 |     "opt_fn = partial(optim.Adam, betas=(0.8,0.99))\n",
 740 |     "m = get_language_model1(vocab_size, em_sz, nh, nl, 0,\n",
 741 |     "    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4], bias=True)\n",
 742 |     "model = LanguageModel(to_gpu(m))\n",
 743 |     "learner = RNN_Learner(md, model, opt_fn=opt_fn)\n",
 744 |     "learner.crit = MixUpLoss(partial(nn.CrossEntropyLoss, reduce=False))\n",
 745 |     "learner.metrics = [accuracy]\n",
 746 |     "learner.clip=0.12\n",
 747 |     "learner.unfreeze()\n",
 748 |     "learner.reg_fn=partial(seq2seq_reg, alpha=2, beta=1)\n",
 749 |     "wd = 6e-7"
 750 |    ]
 751 |   },
 752 |   {
 753 |    "cell_type": "code",
 754 |    "execution_count": 59,
 755 |    "metadata": {},
 756 |    "outputs": [
 757 |     {
 758 |      "data": {
 759 |       "application/vnd.jupyter.widget-view+json": {
 760 |        "model_id": "367906930f9d480fae51fc7952f9938c",
 761 |        "version_major": 2,
 762 |        "version_minor": 0
 763 |       },
 764 |       "text/plain": [
 765 |        "HBox(children=(IntProgress(value=0, description='Epoch', max=90), HTML(value='')))"
 766 |       ]
 767 |      },
 768 |      "metadata": {},
 769 |      "output_type": "display_data"
 770 |     },
 771 |     {
 772 |      "name": "stdout",
 773 |      "output_type": "stream",
 774 |      "text": [
 775 |       "epoch      trn_loss   val_loss   accuracy                   \n",
 776 |       "    0      6.626329   6.030295   0.178525  \n",
 777 |       "    1      6.275505   5.589827   0.214941                   \n",
 778 |       "    2      6.027988   5.325422   0.228585                   \n",
 779 |       "    3      5.894091   5.133265   0.237486                   \n",
 780 |       "    4      5.759592   4.989663   0.246637                   \n",
 781 |       "    5      5.645917   4.893019   0.253144                   \n",
 782 |       "    6      5.589076   4.811414   0.257452                   \n",
 783 |       "    7      5.516833   4.762544   0.26024                    \n",
 784 |       "    8      5.443729   4.710184   0.261971                   \n",
 785 |       "    9      5.396975   4.677733   0.265007                   \n",
 786 |       "    10     5.379718   4.633137   0.268217                   \n",
 787 |       "    11     5.334224   4.610691   0.270078                   \n",
 788 |       "    12     5.288874   4.59802    0.269693                   \n",
 789 |       "    13     5.265066   4.561562   0.272778                   \n",
 790 |       "    14     5.273233   4.562297   0.271641                   \n",
 791 |       "    15     5.246758   4.542312   0.272835                   \n",
 792 |       "    16     5.216068   4.532898   0.274529                   \n",
 793 |       "    17     5.229719   4.532675   0.273187                   \n",
 794 |       "    18     5.192524   4.523911   0.274853                   \n",
 795 |       "    19     5.187457   4.511364   0.275774                   \n",
 796 |       "    20     5.195516   4.499867   0.276583                   \n",
 797 |       "    21     5.171666   4.503291   0.275934                   \n",
 798 |       "    22     5.177153   4.499515   0.275969                   \n",
 799 |       "    23     5.182805   4.512135   0.274146                   \n",
 800 |       "    24     5.154941   4.494557   0.277097                   \n",
 801 |       "    25     5.184615   4.502384   0.27478                    \n",
 802 |       "    26     5.170996   4.490614   0.276997                   \n",
 803 |       "    27     5.159199   4.488336   0.27764                    \n",
 804 |       "    28     5.174464   4.493053   0.277192                   \n",
 805 |       "    29     5.189454   4.50579    0.278318                   \n",
 806 |       "    30     5.157193   4.499749   0.277637                   \n",
 807 |       "    31     5.184397   4.491399   0.278034                   \n",
 808 |       "    32     5.177114   4.493066   0.277881                   \n",
 809 |       "    33     5.18206    4.488541   0.277623                   \n",
 810 |       "    34     5.178177   4.502771   0.276033                   \n",
 811 |       "    35     5.199843   4.500518   0.277119                   \n",
 812 |       "    36     5.183623   4.515243   0.275506                   \n",
 813 |       "    37     5.189091   4.508058   0.275959                   \n",
 814 |       "    38     5.197131   4.502707   0.27669                    \n",
 815 |       "    39     5.207819   4.510042   0.27562                    \n",
 816 |       "    40     5.210927   4.519082   0.276757                   \n",
 817 |       "    41     5.197964   4.514505   0.2768                     \n",
 818 |       "    42     5.177861   4.524699   0.277477                   \n",
 819 |       "    43     5.185628   4.513407   0.2754                     \n",
 820 |       "    44     5.205359   4.496628   0.277902                   \n",
 821 |       "    45     5.172986   4.484609   0.27843                    \n",
 822 |       "    46     5.164484   4.480022   0.279834                   \n",
 823 |       "    47     5.160003   4.455373   0.279106                   \n",
 824 |       "    48     5.16558    4.466043   0.279338                   \n",
 825 |       "    49     5.138186   4.43981    0.281377                   \n",
 826 |       "    50     5.110756   4.432053   0.28153                    \n",
 827 |       "    51     5.077271   4.433214   0.2801                     \n",
 828 |       "    52     5.064951   4.426848   0.281818                   \n",
 829 |       "    53     5.056215   4.415006   0.282988                   \n",
 830 |       "    54     5.054645   4.402987   0.284088                   \n",
 831 |       "    55     5.028499   4.39995    0.285488                   \n",
 832 |       "    56     5.019729   4.400858   0.28399                    \n",
 833 |       "    57     4.982164   4.3844     0.284593                   \n",
 834 |       "    58     4.979679   4.372939   0.285211                   \n",
 835 |       "    59     4.969687   4.368094   0.28614                    \n",
 836 |       "    60     4.981891   4.354716   0.287106                   \n",
 837 |       "    61     4.911375   4.351488   0.287914                   \n",
 838 |       "    62     4.95209    4.345455   0.287359                   \n",
 839 |       "    63     4.8968     4.338401   0.288977                   \n",
 840 |       "    64     4.920658   4.319134   0.289492                   \n",
 841 |       "    65     4.86294    4.315365   0.290067                   \n",
 842 |       "    66     4.88185    4.31132    0.290526                   \n",
 843 |       "    67     4.823708   4.310569   0.290439                   \n",
 844 |       "    68     4.80722    4.303623   0.290308                   \n",
 845 |       "    69     4.79474    4.289685   0.292336                   \n",
 846 |       "    70     4.807685   4.285055   0.293254                   \n",
 847 |       "    71     4.781553   4.278372   0.293524                   \n",
 848 |       "    72     4.765378   4.270534   0.29311                    \n",
 849 |       "    73     4.745676   4.269522   0.293928                   \n",
 850 |       "    74     4.683143   4.273508   0.293781                   \n",
 851 |       "    75     4.668473   4.258957   0.294176                   \n",
 852 |       "    76     4.67758    4.253821   0.295069                   \n",
 853 |       "    77     4.65209    4.246094   0.295221                   \n",
 854 |       "    78     4.646799   4.242568   0.295861                   \n",
 855 |       "    79     4.609731   4.235199   0.29571                    \n",
 856 |       "    80     4.567756   4.236626   0.296649                   \n",
 857 |       "    81     4.563991   4.21913    0.298206                   \n",
 858 |       "    82     4.549918   4.226132   0.297025                   \n",
 859 |       "    83     4.551016   4.218874   0.298011                   \n",
 860 |       "    84     4.540176   4.218165   0.298074                   \n",
 861 |       "    85     4.531996   4.215041   0.298828                   \n",
 862 |       "    86     4.503565   4.213403   0.298601                   \n",
 863 |       "    87     4.495955   4.214094   0.298183                   \n",
 864 |       "    88     4.428206   4.212115   0.29813                    \n",
 865 |       "    89     4.452531   4.212716   0.298399                   \n",
 866 |       "\n"
 867 |      ]
 868 |     },
 869 |     {
 870 |      "data": {
 871 |       "text/plain": [
 872 |        "[4.212716070810954, 0.2983988434076309]"
 873 |       ]
 874 |      },
 875 |      "execution_count": 59,
 876 |      "metadata": {},
 877 |      "output_type": "execute_result"
 878 |     }
 879 |    ],
 880 |    "source": [
 881 |     "learner.fit(1e-2, 1, cycle_len=90, wds=wd, use_clr_beta=(10,7.5,0.8,0.7))"
 882 |    ]
 883 |   },
 884 |   {
 885 |    "cell_type": "code",
 886 |    "execution_count": 60,
 887 |    "metadata": {},
 888 |    "outputs": [
 889 |     {
 890 |      "name": "stdout",
 891 |      "output_type": "stream",
 892 |      "text": [
 893 |       "100%|██████████| 108/108 [00:44<00:00,  2.41it/s]\n"
 894 |      ]
 895 |     },
 896 |     {
 897 |      "data": {
 898 |       "text/plain": [
 899 |        "(\n",
 900 |        "  4.2080\n",
 901 |        " [torch.cuda.FloatTensor of size () (GPU 0)], \n",
 902 |        "  67.2209\n",
 903 |        " [torch.FloatTensor of size ()])"
 904 |       ]
 905 |      },
 906 |      "execution_count": 60,
 907 |      "metadata": {},
 908 |      "output_type": "execute_result"
 909 |     }
 910 |    ],
 911 |    "source": [
 912 |     "my_validate(learner.model, np.concatenate(val_ids))"
 913 |    ]
 914 |   },
 915 |   {
 916 |    "cell_type": "markdown",
 917 |    "metadata": {},
 918 |    "source": [
 919 |     "Smerity's best with finetuning: 67.2"
 920 |    ]
 921 |   },
 922 |   {
 923 |    "cell_type": "code",
 924 |    "execution_count": 61,
 925 |    "metadata": {},
 926 |    "outputs": [
 927 |     {
 928 |      "name": "stdout",
 929 |      "output_type": "stream",
 930 |      "text": [
 931 |       "100%|██████████| 108/108 [07:52<00:00,  4.37s/it]\n"
 932 |      ]
 933 |     },
 934 |     {
 935 |      "data": {
 936 |       "text/plain": [
 937 |        "(\n",
 938 |        "  3.9554\n",
 939 |        " [torch.cuda.FloatTensor of size () (GPU 0)], \n",
 940 |        "  52.2156\n",
 941 |        " [torch.FloatTensor of size ()])"
 942 |       ]
 943 |      },
 944 |      "execution_count": 61,
 945 |      "metadata": {},
 946 |      "output_type": "execute_result"
 947 |     }
 948 |    ],
 949 |    "source": [
 950 |     "my_cache_pointer(learner.model, np.concatenate(val_ids), window=3785)"
 951 |    ]
 952 |   },
 953 |   {
 954 |    "cell_type": "markdown",
 955 |    "metadata": {},
 956 |    "source": [
 957 |     "Smerity's best: 52.2"
 958 |    ]
 959 |   },
 960 |   {
 961 |    "cell_type": "markdown",
 962 |    "metadata": {},
 963 |    "source": [
 964 |     "For the QRNNs, the best parameters for the mixup dataloader are 8/0.4"
 965 |    ]
 966 |   },
 967 |   {
 968 |    "cell_type": "code",
 969 |    "execution_count": null,
 970 |    "metadata": {},
 971 |    "outputs": [],
 972 |    "source": [
 973 |     "em_sz,nh,nl = 400,1550,4\n",
 974 |     "bptt, bs = 70, 100"
 975 |    ]
 976 |   },
 977 |   {
 978 |    "cell_type": "code",
 979 |    "execution_count": 37,
 980 |    "metadata": {},
 981 |    "outputs": [],
 982 |    "source": [
 983 |     "trn_dl = MixUpDataLoader1(np.concatenate(trn_ids), bs, bptt, 8, 0.4)\n",
 984 |     "val_dl = LanguageModelLoader(np.concatenate(val_ids), bs, bptt)\n",
 985 |     "md = LanguageModelData(PATH, 0, vocab_size, trn_dl, val_dl, bs=bs, bptt=bptt)"
 986 |    ]
 987 |   },
 988 |   {
 989 |    "cell_type": "code",
 990 |    "execution_count": 38,
 991 |    "metadata": {},
 992 |    "outputs": [],
 993 |    "source": [
 994 |     "drops = np.array([0.4,0.4,0.1,0.1,0.2]) #Smerity's dropouts from the github repo"
 995 |    ]
 996 |   },
 997 |   {
 998 |    "cell_type": "code",
 999 |    "execution_count": 39,
1000 |    "metadata": {},
1001 |    "outputs": [],
1002 |    "source": [
1003 |     "drops = drops * 0.75"
1004 |    ]
1005 |   },
1006 |   {
1007 |    "cell_type": "code",
1008 |    "execution_count": 40,
1009 |    "metadata": {},
1010 |    "outputs": [],
1011 |    "source": [
1012 |     "opt_fn = partial(optim.Adam, betas=(0.8,0.99))\n",
1013 |     "m = get_language_model1(vocab_size, em_sz, nh, nl, 0,\n",
1014 |     "    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4], qrnn=True, bias=True)\n",
1015 |     "model = LanguageModel(to_gpu(m))\n",
1016 |     "learner = RNN_Learner(md, model, opt_fn=opt_fn)\n",
1017 |     "learner.crit = MixUpLoss(partial(nn.CrossEntropyLoss, reduce=False))\n",
1018 |     "learner.metrics = [accuracy]\n",
1019 |     "learner.clip=0.12\n",
1020 |     "learner.unfreeze()\n",
1021 |     "learner.reg_fn=partial(seq2seq_reg, alpha=2, beta=1)\n",
1022 |     "wd = 1e-6"
1023 |    ]
1024 |   },
1025 |   {
1026 |    "cell_type": "code",
1027 |    "execution_count": 57,
1028 |    "metadata": {},
1029 |    "outputs": [
1030 |     {
1031 |      "data": {
1032 |       "application/vnd.jupyter.widget-view+json": {
1033 |        "model_id": "4a52dda730ac4a95815e791250b0d7ec",
1034 |        "version_major": 2,
1035 |        "version_minor": 0
1036 |       },
1037 |       "text/plain": [
1038 |        "HBox(children=(IntProgress(value=0, description='Epoch', max=90), HTML(value='')))"
1039 |       ]
1040 |      },
1041 |      "metadata": {},
1042 |      "output_type": "display_data"
1043 |     },
1044 |     {
1045 |      "name": "stdout",
1046 |      "output_type": "stream",
1047 |      "text": [
1048 |       "epoch      trn_loss   val_loss   accuracy                   \n",
1049 |       "    0      6.645041   6.111343   0.168837  \n",
1050 |       "    1      6.241145   5.621353   0.209918                   \n",
1051 |       "    2      6.006832   5.365736   0.221883                   \n",
1052 |       "    3      5.804309   5.169753   0.23302                    \n",
1053 |       "    4      5.672886   5.02983    0.243427                   \n",
1054 |       "    5      5.572516   4.927614   0.249757                   \n",
1055 |       "    6      5.480185   4.840228   0.251659                   \n",
1056 |       "    7      5.405964   4.775793   0.256458                   \n",
1057 |       "    8      5.323781   4.719518   0.260135                   \n",
1058 |       "    9      5.301296   4.684981   0.26099                    \n",
1059 |       "    10     5.259113   4.659498   0.262679                   \n",
1060 |       "    11     5.24911    4.629782   0.263006                   \n",
1061 |       "    12     5.196324   4.599059   0.268283                   \n",
1062 |       "    13     5.169601   4.592118   0.269083                   \n",
1063 |       "    14     5.131697   4.576978   0.269349                   \n",
1064 |       "    15     5.130156   4.57565    0.268847                   \n",
1065 |       "    16     5.109228   4.544824   0.270566                   \n",
1066 |       "    17     5.08938    4.567359   0.270374                   \n",
1067 |       "    18     5.085055   4.557732   0.269943                   \n",
1068 |       "    19     5.092073   4.529551   0.269761                   \n",
1069 |       "    20     5.050129   4.535354   0.269281                   \n",
1070 |       "    21     5.061608   4.534164   0.270514                   \n",
1071 |       "    22     5.042799   4.525334   0.271235                   \n",
1072 |       "    23     5.093311   4.533935   0.271791                   \n",
1073 |       "    24     5.090526   4.504567   0.273157                   \n",
1074 |       "    25     5.061009   4.514853   0.272238                   \n",
1075 |       "    26     5.06793    4.515023   0.27298                    \n",
1076 |       "    27     5.098586   4.519208   0.272268                   \n",
1077 |       "    28     5.083001   4.522001   0.272778                   \n",
1078 |       "    29     5.083461   4.51691    0.270516                   \n",
1079 |       "    30     5.090954   4.52026    0.272058                   \n",
1080 |       "    31     5.065465   4.532643   0.269444                   \n",
1081 |       "    32     5.079646   4.53429    0.269287                   \n",
1082 |       "    33     5.066051   4.534481   0.272314                   \n",
1083 |       "    34     5.095714   4.537878   0.271023                   \n",
1084 |       "    35     5.091392   4.530611   0.272417                   \n",
1085 |       "    36     5.129543   4.542377   0.269377                   \n",
1086 |       "    37     5.138392   4.545216   0.269866                   \n",
1087 |       "    38     5.165642   4.567279   0.270425                   \n",
1088 |       "    39     5.138128   4.557847   0.270514                   \n",
1089 |       "    40     5.135564   4.554789   0.269561                   \n",
1090 |       "    41     5.138917   4.545438   0.268686                   \n",
1091 |       "    42     5.095217   4.536446   0.267453                   \n",
1092 |       "    43     5.08435    4.523819   0.273534                   \n",
1093 |       "    44     5.136052   4.517914   0.271485                   \n",
1094 |       "    45     5.069285   4.496474   0.273144                   \n",
1095 |       "    46     5.052997   4.490711   0.273823                   \n",
1096 |       "    47     5.062989   4.480856   0.274575                   \n",
1097 |       "    48     5.031988   4.488932   0.274731                   \n",
1098 |       "    49     5.005546   4.460372   0.276215                   \n",
1099 |       "    50     5.013542   4.466787   0.277582                   \n",
1100 |       "    51     4.97171    4.446997   0.278847                   \n",
1101 |       "    52     4.960214   4.435676   0.279613                   \n",
1102 |       "    53     4.921822   4.436192   0.278903                   \n",
1103 |       "    54     4.932337   4.409701   0.280189                   \n",
1104 |       "    55     4.920174   4.400001   0.28084                    \n",
1105 |       "    56     4.898736   4.401653   0.280625                   \n",
1106 |       "    57     4.880563   4.377549   0.282495                   \n",
1107 |       "    58     4.876972   4.386669   0.279066                   \n",
1108 |       "    59     4.832559   4.369939   0.282924                   \n",
1109 |       "    60     4.842274   4.366953   0.284977                   \n",
1110 |       "    61     4.783952   4.354818   0.285994                   \n",
1111 |       "    62     4.78493    4.344323   0.286564                   \n",
1112 |       "    63     4.760014   4.33985    0.287427                   \n",
1113 |       "    64     4.781122   4.330417   0.287477                   \n",
1114 |       "    65     4.705451   4.317031   0.288281                   \n",
1115 |       "    66     4.7138     4.316244   0.288743                   \n",
1116 |       "    67     4.715738   4.302091   0.288728                   \n",
1117 |       "    68     4.679006   4.29817    0.289345                   \n",
1118 |       "    69     4.622593   4.287198   0.291783                   \n",
1119 |       "    70     4.649553   4.292628   0.290838                   \n",
1120 |       "    71     4.61578    4.281664   0.291115                   \n",
1121 |       "    72     4.595557   4.274585   0.291354                   \n",
1122 |       "    73     4.574758   4.267881   0.292458                   \n",
1123 |       "    74     4.539172   4.264382   0.29204                    \n",
1124 |       "    75     4.537903   4.259481   0.293912                   \n",
1125 |       "    76     4.477458   4.248281   0.294409                   \n",
1126 |       "    77     4.477541   4.241148   0.294697                   \n",
1127 |       "    78     4.43493    4.24016    0.295156                   \n",
1128 |       "    79     4.419557   4.231896   0.296171                   \n",
1129 |       "    80     4.404289   4.232758   0.296381                   \n",
1130 |       "    81     4.375007   4.231765   0.296446                   \n",
1131 |       "    82     4.363857   4.22911    0.296851                   \n",
1132 |       "    83     4.308866   4.232265   0.296502                   \n",
1133 |       "    84     4.297019   4.228879   0.29685                    \n",
1134 |       "    85     4.319711   4.227801   0.297055                   \n",
1135 |       "    86     4.31708    4.224775   0.297235                   \n",
1136 |       "    87     4.284512   4.224898   0.297354                   \n",
1137 |       "    88     4.277074   4.224936   0.297453                   \n",
1138 |       "    89     4.26808    4.220731   0.297647                   \n",
1139 |       "\n"
1140 |      ]
1141 |     },
1142 |     {
1143 |      "data": {
1144 |       "text/plain": [
1145 |        "[4.220731449127197, 0.29764690498511]"
1146 |       ]
1147 |      },
1148 |      "execution_count": 57,
1149 |      "metadata": {},
1150 |      "output_type": "execute_result"
1151 |     }
1152 |    ],
1153 |    "source": [
1154 |     "learner.fit(1e-2, 1, cycle_len=90, wds=wd, use_clr_beta=(10,10,0.8,0.7))"
1155 |    ]
1156 |   },
1157 |   {
1158 |    "cell_type": "code",
1159 |    "execution_count": 58,
1160 |    "metadata": {},
1161 |    "outputs": [
1162 |     {
1163 |      "name": "stdout",
1164 |      "output_type": "stream",
1165 |      "text": [
1166 |       "100%|██████████| 108/108 [00:18<00:00,  5.92it/s]\n"
1167 |      ]
1168 |     },
1169 |     {
1170 |      "data": {
1171 |       "text/plain": [
1172 |        "(\n",
1173 |        "  4.2194\n",
1174 |        " [torch.cuda.FloatTensor of size () (GPU 2)], \n",
1175 |        "  67.9927\n",
1176 |        " [torch.FloatTensor of size ()])"
1177 |       ]
1178 |      },
1179 |      "execution_count": 58,
1180 |      "metadata": {},
1181 |      "output_type": "execute_result"
1182 |     }
1183 |    ],
1184 |    "source": [
1185 |     "my_validate(learner.model, np.concatenate(val_ids))"
1186 |    ]
1187 |   },
1188 |   {
1189 |    "cell_type": "markdown",
1190 |    "metadata": {},
1191 |    "source": [
1192 |     "Smerity's best with finetuning: 68.5"
1193 |    ]
1194 |   },
1195 |   {
1196 |    "cell_type": "code",
1197 |    "execution_count": 59,
1198 |    "metadata": {},
1199 |    "outputs": [
1200 |     {
1201 |      "name": "stdout",
1202 |      "output_type": "stream",
1203 |      "text": [
1204 |       "100%|██████████| 108/108 [07:26<00:00,  4.13s/it]\n"
1205 |      ]
1206 |     },
1207 |     {
1208 |      "data": {
1209 |       "text/plain": [
1210 |        "(\n",
1211 |        "  3.9860\n",
1212 |        " [torch.cuda.FloatTensor of size () (GPU 2)], \n",
1213 |        "  53.8380\n",
1214 |        " [torch.FloatTensor of size ()])"
1215 |       ]
1216 |      },
1217 |      "execution_count": 59,
1218 |      "metadata": {},
1219 |      "output_type": "execute_result"
1220 |     }
1221 |    ],
1222 |    "source": [
1223 |     "my_cache_pointer(learner.model, np.concatenate(val_ids), window=3785)"
1224 |    ]
1225 |   },
1226 |   {
1227 |    "cell_type": "markdown",
1228 |    "metadata": {},
1229 |    "source": [
1230 |     "Smerity's best: 53.6"
1231 |    ]
1232 |   },
1233 |   {
1234 |    "cell_type": "code",
1235 |    "execution_count": null,
1236 |    "metadata": {},
1237 |    "outputs": [],
1238 |    "source": []
1239 |   }
1240 |  ],
1241 |  "metadata": {
1242 |   "kernelspec": {
1243 |    "display_name": "Python 3",
1244 |    "language": "python",
1245 |    "name": "python3"
1246 |   },
1247 |   "language_info": {
1248 |    "codemirror_mode": {
1249 |     "name": "ipython",
1250 |     "version": 3
1251 |    },
1252 |    "file_extension": ".py",
1253 |    "mimetype": "text/x-python",
1254 |    "name": "python",
1255 |    "nbconvert_exporter": "python",
1256 |    "pygments_lexer": "ipython3",
1257 |    "version": "3.6.4"
1258 |   }
1259 |  },
1260 |  "nbformat": 4,
1261 |  "nbformat_minor": 2
1262 | }
1263 | 


--------------------------------------------------------------------------------
/Lesson 9 loss function/overlaps0.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/Lesson 9 loss function/overlaps0.npy


--------------------------------------------------------------------------------
/Lesson 9 loss function/overlaps4.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/Lesson 9 loss function/overlaps4.npy


--------------------------------------------------------------------------------
/Lesson 9 loss function/pred_bb.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/Lesson 9 loss function/pred_bb.npy


--------------------------------------------------------------------------------
/Lesson 9 loss function/pred_bb1.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/Lesson 9 loss function/pred_bb1.npy


--------------------------------------------------------------------------------
/Lesson 9 loss function/pred_cls.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/Lesson 9 loss function/pred_cls.npy


--------------------------------------------------------------------------------
/Lesson 9 loss function/pred_cls1.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/Lesson 9 loss function/pred_cls1.npy


--------------------------------------------------------------------------------
/Lesson 9 loss function/targ_bb.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/Lesson 9 loss function/targ_bb.npy


--------------------------------------------------------------------------------
/Lesson 9 loss function/targ_cls.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/Lesson 9 loss function/targ_cls.npy


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Deep Learning Notebooks
 2 | 
 3 | Here are my jupyter notebooks on deep learning. All of them are written in pytorch and most of them use the [fastai library](https://github.com/fastai/fastai).
 4 | 
 5 | * 
 6 | 
 7 | ----
 8 | ## Tutorials
 9 | 
10 | * [Using the new training API in fastai](https://github.com/sgugger/Deep-Learning/blob/master/Understanding%20the%20new%20fastai%20API%20for%20scheduling%20training.ipynb).
11 | * [Building a simple neural net in pytorch](https://github.com/sgugger/Deep-Learning/blob/master/First%20neural%20net%20in%20pytorch.ipynb).
12 | * [Explore the fastai callback system](https://github.com/sgugger/Deep-Learning/blob/master/Using%20the%20callback%20system%20in%20fastai.ipynb).
13 | 
14 | ----
15 | ## Implementation of articles
16 | 
17 | * [Neural cache pointer](https://github.com/sgugger/Deep-Learning/blob/master/Cache%20pointer.ipynb) introduce by [Grave et al.](https://arxiv.org/abs/1612.04426)
18 | * [Superconvergence on cifar10](https://github.com/sgugger/Deep-Learning/blob/master/Cyclical%20LR%20and%20momentums.ipynb) using the 1cycle policy introduce by [Leslie Smith](https://arxiv.org/abs/1803.09820)
19 | * [Deep painterly harmonization](https://github.com/sgugger/Deep-Learning/blob/master/DeepPainterlyHarmonization.ipynb) from [this article](https://arxiv.org/abs/1804.03189)
20 | * [Adam and weight decay](https://github.com/sgugger/Deep-Learning/tree/master/Adam%20and%20weight%20decay) for the correction preoposed in [this article](https://arxiv.org/abs/1711.05101)
21 | 
22 | ----
23 | The rest is just a bun of random stuff shared with fellow fastai students.
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/img/FPN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/img/FPN.png


--------------------------------------------------------------------------------
/img/RetinaHead.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/img/RetinaHead.png


--------------------------------------------------------------------------------
/mAP/focus-4b.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sgugger/Deep-Learning/95c2307697a764e191470aca1fca07de6fdaa23b/mAP/focus-4b.h5


--------------------------------------------------------------------------------
/wikitext_103.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "%load_ext autoreload\n",
 10 |     "%autoreload 2"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "from fastai import *\n",
 20 |     "from fastai.text import *"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "# Wikitext 103"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 3,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "PATH = Path('../data/wikitext-103/60k')"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 4,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "def read_file(filename):\n",
 46 |     "    with open(PATH/f'{filename}.txt', encoding='utf8') as f:\n",
 47 |     "        text = f.readlines()\n",
 48 |     "    df = pd.DataFrame({'text':np.array(text), 'labels':np.zeros(len(text))}, columns=['labels', 'text'])\n",
 49 |     "    df.to_csv(PATH/f'{filename}.csv', header=False, index=False)"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 9,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "read_file('valid')\n",
 59 |     "#read_file('test')\n",
 60 |     "read_file('train')"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 10,
 66 |    "metadata": {},
 67 |    "outputs": [
 68 |     {
 69 |      "name": "stdout",
 70 |      "output_type": "stream",
 71 |      "text": [
 72 |       "Tokenizing train. This might take a while so you should grab a coffee.\n"
 73 |      ]
 74 |     },
 75 |     {
 76 |      "data": {
 77 |       "application/vnd.jupyter.widget-view+json": {
 78 |        "model_id": "",
 79 |        "version_major": 2,
 80 |        "version_minor": 0
 81 |       },
 82 |       "text/plain": [
 83 |        "HBox(children=(IntProgress(value=0, max=181), HTML(value='')))"
 84 |       ]
 85 |      },
 86 |      "metadata": {},
 87 |      "output_type": "display_data"
 88 |     },
 89 |     {
 90 |      "name": "stdout",
 91 |      "output_type": "stream",
 92 |      "text": [
 93 |       "Numericalizing train.\n",
 94 |       "Tokenizing valid. This might take a while so you should grab a coffee.\n"
 95 |      ]
 96 |     },
 97 |     {
 98 |      "data": {
 99 |       "application/vnd.jupyter.widget-view+json": {
100 |        "model_id": "",
101 |        "version_major": 2,
102 |        "version_minor": 0
103 |       },
104 |       "text/plain": [
105 |        "HBox(children=(IntProgress(value=0, max=1), HTML(value='')))"
106 |       ]
107 |      },
108 |      "metadata": {},
109 |      "output_type": "display_data"
110 |     },
111 |     {
112 |      "name": "stdout",
113 |      "output_type": "stream",
114 |      "text": [
115 |       "Numericalizing valid.\n"
116 |      ]
117 |     }
118 |    ],
119 |    "source": [
120 |     "tokenizer = Tokenizer(rules=rules, special_cases=[BOS, FLD, 'xxunk', 'xxpad'])\n",
121 |     "train_ds, valid_ds = TextDataset.from_csv(PATH, tokenizer, max_vocab=60000)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 11,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "bs,bptt = 80,70\n",
131 |     "train_dl = LanguageModelLoader(np.concatenate(train_ds.ids), bs, bptt)\n",
132 |     "valid_dl = LanguageModelLoader(np.concatenate(valid_ds.ids), bs, bptt)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 12,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "vocab_size = len(train_ds.vocab.itos)\n",
142 |     "data = DataBunch(train_dl, valid_dl)"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": 13,
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "emb_sz, nh, nl = 400, 1150, 3\n",
152 |     "dps = np.array([0.25, 0.1, 0.2, 0.02, 0.15]) * 0.1\n",
153 |     "model = get_language_model(vocab_size, emb_sz, nh, nl, 0, input_p=dps[0], output_p=dps[1], weight_p=dps[2], \n",
154 |     "                           embed_p=dps[3], hidden_p=dps[4])\n",
155 |     "learn = Learner(data, model)"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 14,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "learn.opt_fn = partial(optim.Adam, betas=(0.8,0.99))\n",
165 |     "learn.callbacks.append(RNNTrainer(learn, bptt, alpha=2, beta=1))\n",
166 |     "learn.callback_fns.append(partial(GradientClipping, clip=0.12))\n",
167 |     "learn.metrics = [accuracy]\n",
168 |     "learn.true_wd=False"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 15,
174 |    "metadata": {},
175 |    "outputs": [
176 |     {
177 |      "data": {
178 |       "application/vnd.jupyter.widget-view+json": {
179 |        "model_id": "",
180 |        "version_major": 2,
181 |        "version_minor": 0
182 |       },
183 |       "text/plain": [
184 |        "VBox(children=(HBox(children=(IntProgress(value=0, max=10), HTML(value=''))), HTML(value='epoch  train loss  v…"
185 |       ]
186 |      },
187 |      "metadata": {},
188 |      "output_type": "display_data"
189 |     },
190 |     {
191 |      "name": "stdout",
192 |      "output_type": "stream",
193 |      "text": [
194 |       "Total time: 10:28:58\n",
195 |       "epoch  train loss  valid loss  accuracy\n",
196 |       "0      3.867067    3.917060    0.357719  (1:03:08)\n",
197 |       "1      3.736901    3.824207    0.363141  (1:03:02)\n",
198 |       "2      3.734958    3.847070    0.360801  (1:02:49)\n",
199 |       "3      3.690017    3.810396    0.364572  (1:02:50)\n",
200 |       "4      3.615047    3.704101    0.374547  (1:02:58)\n",
201 |       "5      3.555377    3.589375    0.387541  (1:02:58)\n",
202 |       "6      3.526703    3.495668    0.399273  (1:02:42)\n",
203 |       "7      3.531244    3.480701    0.401607  (1:02:47)\n",
204 |       "8      3.505409    3.472463    0.402647  (1:02:46)\n",
205 |       "9      3.501862    3.468110    0.403324  (1:02:53)\n",
206 |       "\n"
207 |      ]
208 |     }
209 |    ],
210 |    "source": [
211 |     "fit_one_cycle(learn, 10, 5e-3, (0.8,0.7), wd=1e-7)"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 16,
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "learn.save('lstm_wt103')"
221 |    ]
222 |   }
223 |  ],
224 |  "metadata": {
225 |   "kernelspec": {
226 |    "display_name": "Python 3",
227 |    "language": "python",
228 |    "name": "python3"
229 |   },
230 |   "language_info": {
231 |    "codemirror_mode": {
232 |     "name": "ipython",
233 |     "version": 3
234 |    },
235 |    "file_extension": ".py",
236 |    "mimetype": "text/x-python",
237 |    "name": "python",
238 |    "nbconvert_exporter": "python",
239 |    "pygments_lexer": "ipython3",
240 |    "version": "3.7.0"
241 |   },
242 |   "toc": {
243 |    "base_numbering": 1,
244 |    "nav_menu": {},
245 |    "number_sections": false,
246 |    "sideBar": true,
247 |    "skip_h1_title": false,
248 |    "title_cell": "Table of Contents",
249 |    "title_sidebar": "Contents",
250 |    "toc_cell": false,
251 |    "toc_position": {},
252 |    "toc_section_display": true,
253 |    "toc_window_display": false
254 |   }
255 |  },
256 |  "nbformat": 4,
257 |  "nbformat_minor": 2
258 | }
259 | 


--------------------------------------------------------------------------------