├── .gitignore ├── 01_foundations ├── Code.ipynb └── math.ipynb ├── 02_fundamentals ├── Code.ipynb ├── __pycache__ │ └── helper.cpython-36.pyc └── math.ipynb ├── 03_dlfs └── Code.ipynb ├── 04_extensions ├── Code.ipynb └── Math.ipynb ├── 05_convolutions ├── Code.ipynb ├── Math.ipynb └── Numpy_Convolution_Demos.ipynb ├── 06_rnns ├── Autograd_Simple.ipynb ├── Math.ipynb ├── RNN_DLFS.ipynb └── input.txt ├── 07_PyTorch └── Code.ipynb ├── LICENSE ├── README.md └── lincoln ├── .gitignore ├── LICENSE ├── README.md ├── lincoln.png ├── lincoln ├── activations.py ├── base.py ├── conv.py ├── dense.py ├── layers.py ├── losses.py ├── network.py ├── optimizers.py ├── pytorch │ ├── layers.py │ ├── model.py │ ├── preprocessor.py │ ├── train.py │ └── utils.py ├── reshape.py ├── train.py └── utils │ ├── mnist.py │ └── np_utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__* 2 | *.pyc* 3 | *.ipynb_checkpoints* 4 | *.DS_Store* 5 | *.c 6 | *.so 7 | *.o 8 | *.txt 9 | 10 | *data/* 11 | *.pkl* 12 | 13 | *.pt 14 | *ubyte 15 | 16 | */utils/data* 17 | 18 | *.vscode -------------------------------------------------------------------------------- /01_foundations/math.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "$$ f'(x) $$ is a function\n", 8 | "\n", 9 | "$$ f'(a) $$ is a number when $f$ is a function of one variable" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Derivative math" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "$$ \\frac{df}{dx}(a) = \n", 24 | "\\lim_{\\Delta \\to 0} \\frac{{f \\left( {a + \\Delta } \\right) - f\\left( a - \\Delta \\right)}}{2 * \\Delta } $$\n", 25 | "\n", 26 | "$$ \\frac{df}{dx}\\bigr\\rvert_{x=a} = \n", 27 | "\\lim_{\\Delta \\to 0} \\frac{{f \\left( {a + \\Delta } \\right) - f\\left( a - \\Delta \\right)}}{2 * \\Delta } $$" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "$$ f_2(f_1(x)) = y $$\n", 35 | "\n", 36 | "$$ f_1(x) = u $$\n", 37 | "\n", 38 | "$$ \\frac{df_2}{dx}(x) = \\frac{df_2}{du}(f_1(x)) * \\frac{df_1}{dx}(x) $$" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "$$ \\frac{df_3}{dx}(x) = \\frac{df_3}{dv}(f_2(f_1(x))) * \\frac{df_2}{du}(f_1(x)) * \\frac{df_1}{dx}(x) $$" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "$$ \\frac{df}{dx}\\bigr\\rvert_{x=a} = \n", 53 | "\\lim_{\\Delta \\to 0} \\frac{{f \\left( {a + \\Delta } \\right) - f\\left( a - \\Delta \\right)}}{2 * \\Delta } $$" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "# Function with multiple inputs example" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "$$ f(x, y) = s$$\n", 68 | "\n", 69 | "$$ a = a(x, y) = x + y $$\n", 70 | "\n", 71 | "$$ s = \\sigma(a) $$" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "$$ f(x, y) = s(a(x, y)) $$\n", 79 | "\n", 80 | "$$ \\frac{\\partial f}{\\partial x} = \\frac{\\partial \\sigma}{\\partial u}(a(x, y)) * \\frac{\\partial a}{\\partial x}(x, y) \\\\ = \\frac{\\partial \\sigma}{\\partial u}(x + y) * \\frac{\\partial a}{\\partial x}(x, y)$$" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86
| "source": [ 87 | "# Matrix multiplication example" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "$$ X = \\begin{bmatrix}\n", 95 | "x_{11} & x_{12} & x_{13} \\\\\n", 96 | "x_{21} & x_{22} & x_{23} \\\\\n", 97 | "x_{31} & x_{32} & x_{33}\n", 98 | "\\end{bmatrix} $$\n", 99 | "\n", 100 | "$$ W = \\begin{bmatrix}\n", 101 | "w_{11} & w_{12} \\\\\n", 102 | "w_{21} & w_{22} \\\\\n", 103 | "w_{31} & w_{32} \\\\\n", 104 | "\\end{bmatrix} $$" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "$$ \\nu(X, W) = X * W = \\begin{bmatrix}\n", 112 | "x_{11} * w_{11} + x_{12} * w_{21} + x_{13} * w_{31} &\n", 113 | "x_{11} * w_{12} + x_{12} * w_{22} + x_{13} * w_{32}\n", 114 | "\\\\\n", 115 | "x_{21} * w_{11} + x_{22} * w_{21} + x_{23} * w_{31} &\n", 116 | "x_{21} * w_{12} + x_{22} * w_{22} + x_{23} * w_{32}\n", 117 | "\\\\\n", 118 | "x_{31} * w_{11} + x_{32} * w_{21} + x_{33} * w_{31} &\n", 119 | "x_{31} * w_{12} + x_{32} * w_{22} + x_{33} * w_{32}\n", 120 | "\\end{bmatrix} = \n", 121 | "\\begin{bmatrix}\n", 122 | "XW_{11} &\n", 123 | "XW_{12}\n", 124 | "\\\\\n", 125 | "XW_{21} &\n", 126 | "XW_{22}\n", 127 | "\\\\\n", 128 | "XW_{31} &\n", 129 | "XW_{32}\n", 130 | "\\end{bmatrix}\n", 131 | "$$" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "$$\n", 139 | "\\sigma(XW_{11}) = \\sigma(x_{11} * w_{11} + x_{12} * w_{21} + x_{13} * w_{31}) \\\\\n", 140 | "\\sigma(XW_{12}) = \\sigma(x_{11} * w_{12} + x_{12} * w_{22} + x_{13} * w_{32}) \\\\\n", 141 | "\\cdots\n", 142 | "$$" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "$$ \\sigma(X * W) = \\begin{bmatrix}\n", 150 | "\\sigma(x_{11} * w_{11} + x_{12} * w_{21} + x_{13} * w_{31}) &\n", 151 | "\\sigma(x_{11} * w_{12} + x_{12} * w_{22} + x_{13} * w_{32})\n", 152 | "\\\\\n", 153 | "\\sigma(x_{21} * w_{11} + x_{22} * w_{21} + x_{23} * w_{31}) &\n", 154 | "\\sigma(x_{21} * w_{12} + x_{22} * w_{22} + x_{23} * w_{32})\n", 155 | "\\\\\n", 156 | "\\sigma(x_{31} * w_{11} + x_{32} * w_{21} + x_{33} * w_{31}) &\n", 157 | "\\sigma(x_{31} * w_{12} + x_{32} * w_{22} + x_{33} * w_{32})\n", 158 | "\\end{bmatrix} = \n", 159 | "\\begin{bmatrix}\n", 160 | "\\sigma(XW_{11}) & \\sigma(XW_{12})\\\\\n", 161 | "\\sigma(XW_{21}) & \\sigma(XW_{22})\\\\\n", 162 | "\\sigma(XW_{31}) & \\sigma(XW_{32})\n", 163 | "\\end{bmatrix}\n", 164 | "$$" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "$$ L = \\Lambda(\\sigma(X * W)) = \\Lambda(\\begin{bmatrix}\n", 172 | "\\sigma(XW_{11}) & \\sigma(XW_{12})\\\\\n", 173 | "\\sigma(XW_{21}) & \\sigma(XW_{22})\\\\\n", 174 | "\\sigma(XW_{31}) & \\sigma(XW_{32})\n", 175 | "\\end{bmatrix}) = \\sigma(XW_{11}) + \\sigma(XW_{12}) + \\sigma(XW_{21}) + \\sigma(XW_{22}) + \\sigma(XW_{31}) + \\sigma(XW_{32})\n", 176 | "$$" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "$$ \\frac{\\partial \\Lambda}{\\partial u}(X) = \n", 184 | "\\begin{bmatrix}\n", 185 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{11}) & \n", 186 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{12}) & \n", 187 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{13}) \\\\\n", 188 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{21}) & \n", 189 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{22}) & \n", 190 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{23}) \\\\\n", 191 | "\\frac{\\partial 
\\Lambda}{\\partial u}(x_{31}) & \n", 192 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{32}) & \n", 193 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{33}) \n", 194 | "\\end{bmatrix} $$" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "$$ S = \\begin{bmatrix}\n", 202 | "s_{11} & s_{12} \\\\\n", 203 | "s_{21} & s_{22} \\\\\n", 204 | "s_{31} & s_{32} \\\\\n", 205 | "\\end{bmatrix} $$\n", 206 | "\n", 207 | "$$ \\frac{\\partial \\Lambda}{\\partial u}(S) = \\begin{bmatrix}\n", 208 | "1 & 1\\\\\n", 209 | "1 & 1\\\\\n", 210 | "1 & 1\n", 211 | "\\end{bmatrix}) $$" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "$$ \\frac{\\partial \\sigma}{\\partial u}(N) = \\begin{bmatrix}\n", 219 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) &\n", 220 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{12}) \\\\\n", 221 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) &\n", 222 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{22}) \\\\\n", 223 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) &\n", 224 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32})\n", 225 | "\\end{bmatrix} $$" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "$ L = \\Lambda(\\sigma(\\nu(X, W))) $" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "$ \\frac{\\partial \\Lambda}{\\partial u}(X) = \n", 240 | "\\frac{\\partial \\nu}{\\partial X}(X, W) *\n", 241 | "\\frac{\\partial \\sigma}{\\partial u}(N) *\n", 242 | "\\frac{\\partial \\Lambda}{\\partial u}(S) $" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "$$ \\frac{\\partial \\Lambda}{\\partial u}(N) = \\frac{\\partial \\Lambda}{\\partial u}(N) $$" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "$$ \\frac{\\partial \\Lambda}{\\partial X}(X) = \n", 257 | "\\frac{\\partial \\Lambda}{\\partial u}(S) * ? = \n", 258 | "\\begin{bmatrix}\n", 259 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) &\n", 260 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{12}) \\\\\n", 261 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) &\n", 262 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{22}) \\\\\n", 263 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) &\n", 264 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32})\n", 265 | "\\end{bmatrix} * ? 
$$" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "$$ \\sigma(XW_{11}) = \\sigma(x_{11} * w_{11} + x_{12} * w_{21} + x_{13} * w_{31}) $$" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "$$ \\frac{\\partial \\Lambda}{\\partial X}(X) = \n", 280 | "\\begin{bmatrix}\n", 281 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{11}) & \n", 282 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{12}) & \n", 283 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{13}) \\\\\n", 284 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{21}) & \n", 285 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{22}) & \n", 286 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{23}) \\\\\n", 287 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{31}) & \n", 288 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{32}) & \n", 289 | "\\frac{\\partial \\Lambda}{\\partial u}(x_{33}) \n", 290 | "\\end{bmatrix} $$" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "$$ \\sigma(XW_{11}) = \\sigma(x_{11} * w_{11} + x_{12} * w_{21} + x_{13} * w_{31}) $$" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "$$ \\frac{\\partial \\sigma(XW_{11})}{\\partial X} = \\begin{bmatrix}\n", 305 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) * w_{11} & \n", 306 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) * w_{21} & \n", 307 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) * w_{31} \\\\\n", 308 | "0 &\n", 309 | "0 & \n", 310 | "0 \\\\\n", 311 | "0 & \n", 312 | "0 & \n", 313 | "0 \n", 314 | "\\end{bmatrix} $$" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "$$ \\sigma(XW_{32}) = \\sigma(x_{31} * w_{12} + x_{32} * w_{22} + x_{33} * w_{32}) $$" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "$$ \\frac{\\partial \\sigma(XW_{32})}{\\partial X} = \\begin{bmatrix}\n", 329 | "0 & \n", 330 | "0 & \n", 331 | "0 \\\\\n", 332 | "0 &\n", 333 | "0 & \n", 334 | "0 \\\\\n", 335 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32}) * w_{12} & \n", 336 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32}) * w_{22} & \n", 337 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32}) * w_{32} \n", 338 | "\\end{bmatrix} $$" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "# Derivative calculation for matrix multiplication example" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "This calculation is in the appendix of the book; it may be easier to follow here than it is to follow there." 
353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "$$ \\frac{\\partial \\Lambda}{\\partial X}(S) = \n", 360 | "\\frac{\\partial \\sigma(XW_{11})}{\\partial X} + \n", 361 | "\\frac{\\partial \\sigma(XW_{12})}{\\partial X} + \n", 362 | "\\frac{\\partial \\sigma(XW_{21})}{\\partial X} + \n", 363 | "\\frac{\\partial \\sigma(XW_{22})}{\\partial X} + \n", 364 | "\\frac{\\partial \\sigma(XW_{31})}{\\partial X} + \n", 365 | "\\frac{\\partial \\sigma(XW_{32})}{\\partial X} = \n", 366 | "\\begin{bmatrix}\n", 367 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) * w_{11} & \n", 368 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) * w_{21} & \n", 369 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) * w_{31} \\\\\n", 370 | "0 &\n", 371 | "0 & \n", 372 | "0 \\\\\n", 373 | "0 & \n", 374 | "0 & \n", 375 | "0 \\end{bmatrix} +\n", 376 | "\\begin{bmatrix}\n", 377 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{12}) * w_{12} & \n", 378 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{12}) * w_{22} & \n", 379 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{12}) * w_{32} \\\\\n", 380 | "0 &\n", 381 | "0 & \n", 382 | "0 \\\\\n", 383 | "0 & \n", 384 | "0 & \n", 385 | "0 \\end{bmatrix} + \n", 386 | "\\begin{bmatrix}\n", 387 | "0 & \n", 388 | "0 & \n", 389 | "0 \\\\\n", 390 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) * w_{11} &\n", 391 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) * w_{21} & \n", 392 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) * w_{31} \\\\\n", 393 | "0 & \n", 394 | "0 & \n", 395 | "0 \\end{bmatrix} + \n", 396 | "\\begin{bmatrix}\n", 397 | "0 & \n", 398 | "0 & \n", 399 | "0 \\\\\n", 400 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{22}) * w_{12} &\n", 401 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{22}) * w_{22} & \n", 402 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{22}) * w_{32} \\\\\n", 403 | "0 & \n", 404 | "0 & \n", 405 | "0 \\end{bmatrix} +\n", 406 | "\\begin{bmatrix}\n", 407 | "0 & \n", 408 | "0 &\n", 409 | "0 \\\\\n", 410 | "0 &\n", 411 | "0 & \n", 412 | "0 \\\\\n", 413 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) * w_{11} &\n", 414 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) * w_{21} & \n", 415 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) * w_{31} \\end{bmatrix} +\n", 416 | "\\begin{bmatrix}\n", 417 | "0 &\n", 418 | "0 &\n", 419 | "0 \\\\\n", 420 | "0 &\n", 421 | "0 & \n", 422 | "0 \\\\\n", 423 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32}) * w_{12} & \n", 424 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32}) * w_{22} & \n", 425 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32}) * w_{32} \\end{bmatrix}\n", 426 | "$$" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "$$ \\frac{\\partial \\Lambda}{\\partial X}(S) = \n", 434 | "\\begin{bmatrix}\n", 435 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) * w_{11} + \\frac{\\partial \\sigma}{\\partial u}(XW_{12}) * w_{12} & \n", 436 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) * w_{21} + \\frac{\\partial \\sigma}{\\partial u}(XW_{12}) * w_{22} & \n", 437 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) * w_{31} + \\frac{\\partial \\sigma}{\\partial u}(XW_{12}) * w_{32} \\\\ \n", 438 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) * w_{11} + \\frac{\\partial \\sigma}{\\partial u}(XW_{22}) * w_{12} & \n", 439 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) * w_{21} + \\frac{\\partial \\sigma}{\\partial u}(XW_{22}) * w_{22} & \n", 440 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) * 
w_{31} + \\frac{\\partial \\sigma}{\\partial u}(XW_{22}) * w_{32} \\\\ \n", 441 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) * w_{11} + \\frac{\\partial \\sigma}{\\partial u}(XW_{32}) * w_{12} & \n", 442 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) * w_{21} + \\frac{\\partial \\sigma}{\\partial u}(XW_{32}) * w_{22} & \n", 443 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) * w_{31} + \\frac{\\partial \\sigma}{\\partial u}(XW_{32}) * w_{32} \\end{bmatrix} \n", 444 | "$$" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "$$ W = \\begin{bmatrix}\n", 452 | "w_{11} & w_{12} \\\\\n", 453 | "w_{21} & w_{22} \\\\\n", 454 | "w_{31} & w_{32} \\end{bmatrix} $$" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "$$ \\frac{\\partial \\Lambda}{\\partial X}(X) = \n", 462 | "\\frac{\\partial \\Lambda}{\\partial u}(N) * \\frac{\\partial \\nu}{\\partial X}(X, W) = \n", 463 | "\\frac{\\partial \\Lambda}{\\partial u}(N) * W^T\n", 464 | "$$" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "$$ \n", 472 | "\\frac{\\partial \\Lambda}{\\partial u}(N) = \n", 473 | "\\begin{bmatrix}\n", 474 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) &\n", 475 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{12}) \\\\\n", 476 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) &\n", 477 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{22}) \\\\\n", 478 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) &\n", 479 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32})\n", 480 | "\\end{bmatrix}\n", 481 | "$$" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "$$ \\frac{\\partial \\Lambda}{\\partial X}(X) = \n", 489 | "\\begin{bmatrix}\n", 490 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{11}) &\n", 491 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{12}) \\\\\n", 492 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{21}) &\n", 493 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{22}) \\\\\n", 494 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{31}) &\n", 495 | "\\frac{\\partial \\sigma}{\\partial u}(XW_{32})\n", 496 | "\\end{bmatrix} * \n", 497 | "\\begin{bmatrix}\n", 498 | "w_{11} & w_{21} & w_{31} \\\\\n", 499 | "w_{12} & w_{22} & w_{32} \\\\\n", 500 | "\\end{bmatrix} = \\frac{\\partial \\Lambda}{\\partial u}(N) * W^T\n", 501 | "$$" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "Like meat off the bone!"
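The closed form above is easy to verify numerically. A minimal sketch (added here; the sum `L` plays the role of $\Lambda$ and `sigma` is the logistic function, neither taken from the book's code):

```python
import numpy as np

np.random.seed(0)
X = np.random.randn(3, 3)
W = np.random.randn(3, 2)
sigma = lambda u: 1.0 / (1.0 + np.exp(-u))

L = lambda X: sigma(X @ W).sum()  # Lambda(sigma(nu(X, W)))

# analytic gradient: elementwise sigma'(N), then matrix-multiply by W^T
N = X @ W
analytic = (sigma(N) * (1 - sigma(N))) @ W.T

# central-difference estimate, one entry of X at a time
eps, numeric = 1e-6, np.zeros_like(X)
for i in range(3):
    for j in range(3):
        Xp, Xm = X.copy(), X.copy()
        Xp[i, j] += eps
        Xm[i, j] -= eps
        numeric[i, j] = (L(Xp) - L(Xm)) / (2 * eps)

assert np.allclose(analytic, numeric, atol=1e-6)
```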
509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "$$ \\frac{\\partial \\sigma}{\\partial X}(X, W) = \\frac{\\partial \\sigma}{\\partial u}(N) * W^T $$ \n", 516 | "\n", 517 | "$$ \\frac{\\partial \\sigma}{\\partial W}(X, W) = X^T * \\frac{\\partial \\sigma}{\\partial u}(N) $$ " 518 | ] 519 | } 520 | ], 521 | "metadata": { 522 | "kernelspec": { 523 | "display_name": "Python 3", 524 | "language": "python", 525 | "name": "python3" 526 | }, 527 | "language_info": { 528 | "codemirror_mode": { 529 | "name": "ipython", 530 | "version": 3 531 | }, 532 | "file_extension": ".py", 533 | "mimetype": "text/x-python", 534 | "name": "python", 535 | "nbconvert_exporter": "python", 536 | "pygments_lexer": "ipython3", 537 | "version": "3.7.4" 538 | } 539 | }, 540 | "nbformat": 4, 541 | "nbformat_minor": 2 542 | } 543 | -------------------------------------------------------------------------------- /02_fundamentals/__pycache__/helper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SethHWeidman/DLFS_code/f4ec4de43049ef990d0f4ddece81223cef3a0e91/02_fundamentals/__pycache__/helper.cpython-36.pyc -------------------------------------------------------------------------------- /02_fundamentals/math.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Forward matrix multiplication math for linear regression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "$$ p_{batch} = X_{batch} * W = \\begin{bmatrix}\n", 15 | "x_{11} & x_{12} & x_{13} & \\ldots & x_{1k} \\\\\n", 16 | "x_{21} & x_{22} & x_{23} & \\ldots & x_{2k} \\\\\n", 17 | "x_{31} & x_{32} & x_{33} & \\ldots & x_{3k}\n", 18 | "\\end{bmatrix} * \\begin{bmatrix} w_1 \\\\ w_2 \\\\ w_3 \\\\ \\vdots \\\\ w_k \\end{bmatrix}\n", 19 | "=\n", 20 | "\\begin{bmatrix}\n", 21 | "x_{11} * w_1 + x_{12} * w_2 + x_{13} * w_3 + \\ldots + & x_{1k} * w_k \\\\\n", 22 | "x_{21} * w_1 + x_{22} * w_2 + x_{23} * w_3 + \\ldots + & x_{2k} * w_k \\\\\n", 23 | "x_{31} * w_1 + x_{32} * w_2 + x_{33} * w_3 + \\ldots + & x_{3k} * w_k\n", 24 | "\\end{bmatrix}\n", 25 | "=\n", 26 | "\\begin{bmatrix}\n", 27 | "p_1 \\\\\n", 28 | "p_2 \\\\\n", 29 | "p_3\n", 30 | "\\end{bmatrix} $$" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "$$ p_{batch\\_with\\_bias} = \n", 38 | "\\begin{bmatrix}\n", 39 | "x_{11} * w_1 + x_{12} * w_2 + x_{13} * w_3 + \\ldots + & x_{1k} * w_k + b \\\\\n", 40 | "x_{21} * w_1 + x_{22} * w_2 + x_{23} * w_3 + \\ldots + & x_{2k} * w_k + b\\\\\n", 41 | "x_{31} * w_1 + x_{32} * w_2 + x_{33} * w_3 + \\ldots + & x_{3k} * w_k + b\n", 42 | "\\end{bmatrix}\n", 43 | "=\n", 44 | "\\begin{bmatrix}\n", 45 | "p_1 \\\\\n", 46 | "p_2 \\\\\n", 47 | "p_3\n", 48 | "\\end{bmatrix} $$" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "$$ MSE(p_{batch}, y_{batch}) = MSE(\\begin{bmatrix}\n", 56 | "p_1 \\\\\n", 57 | "p_2 \\\\\n", 58 | "p_3\n", 59 | "\\end{bmatrix}, \\begin{bmatrix}\n", 60 | "y_1 \\\\\n", 61 | "y_2 \\\\\n", 62 | "y_3\n", 63 | "\\end{bmatrix}) = \\frac{(y_1 - p_1)^2 + (y_2 - p_2)^2 + (y_3 - p_3)^2}{3} $$" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "$$ p_i = x_i * W + b = w_1 * x_{i1} + w_2 * x_{i2} + \\ldots + w_k * x_{ik} + b $$" 71 | 
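These formulas translate directly into NumPy. A small added sketch (the shapes here are illustrative, not from the book): a batch of 3 observations with $k = 4$ features:

```python
import numpy as np

np.random.seed(0)
X_batch = np.random.randn(3, 4)   # 3 observations, k = 4 features
W = np.random.randn(4, 1)         # one weight per feature
b = 0.5                           # bias, added to every observation
y_batch = np.random.randn(3, 1)

p_batch = X_batch @ W + b                # predictions p_1, p_2, p_3
mse = np.mean((y_batch - p_batch) ** 2)  # mean squared error over the batch
```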
] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "# Sigmoid derivative" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "$$ \\frac{\\partial \\sigma}{\\partial u}(x) = \\sigma(x) * (1 - \\sigma(x)) $$ " 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.7.4" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 2 109 | } 110 | -------------------------------------------------------------------------------- /03_dlfs/Code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "from numpy import ndarray\n", 11 | "\n", 12 | "from typing import List" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "def assert_same_shape(array: ndarray,\n", 22 | " array_grad: ndarray):\n", 23 | " assert array.shape == array_grad.shape, \\\n", 24 | " '''\n", 25 | " Two ndarrays should have the same shape;\n", 26 | " instead, first ndarray's shape is {0}\n", 27 | " and second ndarray's shape is {1}.\n", 28 | " '''.format(tuple(array.shape), tuple(array_grad.shape))\n", 29 | " return None" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# `Operation` and `ParamOperation`" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "class Operation(object):\n", 46 | " '''\n", 47 | " Base class for an \"operation\" in a neural network.\n", 48 | " '''\n", 49 | " def __init__(self):\n", 50 | " pass\n", 51 | "\n", 52 | " def forward(self, input_: ndarray):\n", 53 | " '''\n", 54 | " Stores input in the self.input_ instance variable\n", 55 | " Calls the self._output() function.\n", 56 | " '''\n", 57 | " self.input_ = input_\n", 58 | "\n", 59 | " self.output = self._output()\n", 60 | "\n", 61 | " return self.output\n", 62 | "\n", 63 | "\n", 64 | " def backward(self, output_grad: ndarray) -> ndarray:\n", 65 | " '''\n", 66 | " Calls the self._input_grad() function.\n", 67 | " Checks that the appropriate shapes match.\n", 68 | " '''\n", 69 | " assert_same_shape(self.output, output_grad)\n", 70 | "\n", 71 | " self.input_grad = self._input_grad(output_grad)\n", 72 | "\n", 73 | " assert_same_shape(self.input_, self.input_grad)\n", 74 | " return self.input_grad\n", 75 | "\n", 76 | "\n", 77 | " def _output(self) -> ndarray:\n", 78 | " '''\n", 79 | " The _output method must be defined for each Operation\n", 80 | " '''\n", 81 | " raise NotImplementedError()\n", 82 | "\n", 83 | "\n", 84 | " def _input_grad(self, output_grad: ndarray) -> ndarray:\n", 85 | " '''\n", 86 | " The _input_grad method must be defined for each Operation\n", 87 | " '''\n", 88 | " raise NotImplementedError()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "class ParamOperation(Operation):\n",
98 | " '''\n", 99 | " An Operation with parameters.\n", 100 | " '''\n", 101 | "\n", 102 | " def __init__(self, param: ndarray) -> ndarray:\n", 103 | " '''\n", 104 | " The ParamOperation method\n", 105 | " '''\n", 106 | " super().__init__()\n", 107 | " self.param = param\n", 108 | "\n", 109 | " def backward(self, output_grad: ndarray) -> ndarray:\n", 110 | " '''\n", 111 | " Calls self._input_grad and self._param_grad.\n", 112 | " Checks appropriate shapes.\n", 113 | " '''\n", 114 | "\n", 115 | " assert_same_shape(self.output, output_grad)\n", 116 | "\n", 117 | " self.input_grad = self._input_grad(output_grad)\n", 118 | " self.param_grad = self._param_grad(output_grad)\n", 119 | "\n", 120 | " assert_same_shape(self.input_, self.input_grad)\n", 121 | " assert_same_shape(self.param, self.param_grad)\n", 122 | "\n", 123 | " return self.input_grad\n", 124 | "\n", 125 | " def _param_grad(self, output_grad: ndarray) -> ndarray:\n", 126 | " '''\n", 127 | " Every subclass of ParamOperation must implement _param_grad.\n", 128 | " '''\n", 129 | " raise NotImplementedError()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## Specific `Operation`s" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 5, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "class WeightMultiply(ParamOperation):\n", 146 | " '''\n", 147 | " Weight multiplication operation for a neural network.\n", 148 | " '''\n", 149 | "\n", 150 | " def __init__(self, W: ndarray):\n", 151 | " '''\n", 152 | " Initialize Operation with self.param = W.\n", 153 | " '''\n", 154 | " super().__init__(W)\n", 155 | "\n", 156 | " def _output(self) -> ndarray:\n", 157 | " '''\n", 158 | " Compute output.\n", 159 | " '''\n", 160 | " return np.dot(self.input_, self.param)\n", 161 | "\n", 162 | " def _input_grad(self, output_grad: ndarray) -> ndarray:\n", 163 | " '''\n", 164 | " Compute input gradient.\n", 165 | " '''\n", 166 | " return np.dot(output_grad, np.transpose(self.param, (1, 0)))\n", 167 | "\n", 168 | " def _param_grad(self, output_grad: ndarray) -> ndarray:\n", 169 | " '''\n", 170 | " Compute parameter gradient.\n", 171 | " ''' \n", 172 | " return np.dot(np.transpose(self.input_, (1, 0)), output_grad)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "class BiasAdd(ParamOperation):\n", 182 | " '''\n", 183 | " Compute bias addition.\n", 184 | " '''\n", 185 | "\n", 186 | " def __init__(self,\n", 187 | " B: ndarray):\n", 188 | " '''\n", 189 | " Initialize Operation with self.param = B.\n", 190 | " Check appropriate shape.\n", 191 | " '''\n", 192 | " assert B.shape[0] == 1\n", 193 | " \n", 194 | " super().__init__(B)\n", 195 | "\n", 196 | " def _output(self) -> ndarray:\n", 197 | " '''\n", 198 | " Compute output.\n", 199 | " '''\n", 200 | " return self.input_ + self.param\n", 201 | "\n", 202 | " def _input_grad(self, output_grad: ndarray) -> ndarray:\n", 203 | " '''\n", 204 | " Compute input gradient.\n", 205 | " '''\n", 206 | " return np.ones_like(self.input_) * output_grad\n", 207 | "\n", 208 | " def _param_grad(self, output_grad: ndarray) -> ndarray:\n", 209 | " '''\n", 210 | " Compute parameter gradient.\n", 211 | " '''\n", 212 | " param_grad = np.ones_like(self.param) * output_grad\n", 213 | " return np.sum(param_grad, axis=0).reshape(1, param_grad.shape[1])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 7, 219 | 
"metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "class Sigmoid(Operation):\n", 223 | " '''\n", 224 | " Sigmoid activation function.\n", 225 | " '''\n", 226 | "\n", 227 | " def __init__(self) -> None:\n", 228 | " '''Pass'''\n", 229 | " super().__init__()\n", 230 | "\n", 231 | " def _output(self) -> ndarray:\n", 232 | " '''\n", 233 | " Compute output.\n", 234 | " '''\n", 235 | " return 1.0/(1.0+np.exp(-1.0 * self.input_))\n", 236 | "\n", 237 | " def _input_grad(self, output_grad: ndarray) -> ndarray:\n", 238 | " '''\n", 239 | " Compute input gradient.\n", 240 | " '''\n", 241 | " sigmoid_backward = self.output * (1.0 - self.output)\n", 242 | " input_grad = sigmoid_backward * output_grad\n", 243 | " return input_grad" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 8, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "class Linear(Operation):\n", 253 | " '''\n", 254 | " \"Identity\" activation function\n", 255 | " '''\n", 256 | "\n", 257 | " def __init__(self) -> None:\n", 258 | " '''Pass''' \n", 259 | " super().__init__()\n", 260 | "\n", 261 | " def _output(self) -> ndarray:\n", 262 | " '''Pass through'''\n", 263 | " return self.input_\n", 264 | "\n", 265 | " def _input_grad(self, output_grad: ndarray) -> ndarray:\n", 266 | " '''Pass through'''\n", 267 | " return output_grad" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "# `Layer` and `Dense`" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 9, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "class Layer(object):\n", 284 | " '''\n", 285 | " A \"layer\" of neurons in a neural network.\n", 286 | " '''\n", 287 | "\n", 288 | " def __init__(self,\n", 289 | " neurons: int):\n", 290 | " '''\n", 291 | " The number of \"neurons\" roughly corresponds to the \"breadth\" of the layer\n", 292 | " '''\n", 293 | " self.neurons = neurons\n", 294 | " self.first = True\n", 295 | " self.params: List[ndarray] = []\n", 296 | " self.param_grads: List[ndarray] = []\n", 297 | " self.operations: List[Operation] = []\n", 298 | "\n", 299 | " def _setup_layer(self, num_in: int) -> None:\n", 300 | " '''\n", 301 | " The _setup_layer function must be implemented for each layer\n", 302 | " '''\n", 303 | " raise NotImplementedError()\n", 304 | "\n", 305 | " def forward(self, input_: ndarray) -> ndarray:\n", 306 | " '''\n", 307 | " Passes input forward through a series of operations\n", 308 | " ''' \n", 309 | " if self.first:\n", 310 | " self._setup_layer(input_)\n", 311 | " self.first = False\n", 312 | "\n", 313 | " self.input_ = input_\n", 314 | "\n", 315 | " for operation in self.operations:\n", 316 | "\n", 317 | " input_ = operation.forward(input_)\n", 318 | "\n", 319 | " self.output = input_\n", 320 | "\n", 321 | " return self.output\n", 322 | "\n", 323 | " def backward(self, output_grad: ndarray) -> ndarray:\n", 324 | " '''\n", 325 | " Passes output_grad backward through a series of operations\n", 326 | " Checks appropriate shapes\n", 327 | " '''\n", 328 | "\n", 329 | " assert_same_shape(self.output, output_grad)\n", 330 | "\n", 331 | " for operation in reversed(self.operations):\n", 332 | " output_grad = operation.backward(output_grad)\n", 333 | "\n", 334 | " input_grad = output_grad\n", 335 | " \n", 336 | " self._param_grads()\n", 337 | "\n", 338 | " return input_grad\n", 339 | "\n", 340 | " def _param_grads(self) -> ndarray:\n", 341 | " '''\n", 342 | " Extracts the _param_grads from a layer's operations\n", 
343 | " '''\n", 344 | "\n", 345 | " self.param_grads = []\n", 346 | " for operation in self.operations:\n", 347 | " if issubclass(operation.__class__, ParamOperation):\n", 348 | " self.param_grads.append(operation.param_grad)\n", 349 | "\n", 350 | " def _params(self) -> ndarray:\n", 351 | " '''\n", 352 | " Extracts the _params from a layer's operations\n", 353 | " '''\n", 354 | "\n", 355 | " self.params = []\n", 356 | " for operation in self.operations:\n", 357 | " if issubclass(operation.__class__, ParamOperation):\n", 358 | " self.params.append(operation.param)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 10, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "class Dense(Layer):\n", 368 | " '''\n", 369 | " A fully connected layer which inherits from \"Layer\"\n", 370 | " '''\n", 371 | " def __init__(self,\n", 372 | " neurons: int,\n", 373 | " activation: Operation = Sigmoid()):\n", 374 | " '''\n", 375 | " Requires an activation function upon initialization\n", 376 | " '''\n", 377 | " super().__init__(neurons)\n", 378 | " self.activation = activation\n", 379 | "\n", 380 | " def _setup_layer(self, input_: ndarray) -> None:\n", 381 | " '''\n", 382 | " Defines the operations of a fully connected layer.\n", 383 | " '''\n", 384 | " if self.seed:\n", 385 | " np.random.seed(self.seed)\n", 386 | "\n", 387 | " self.params = []\n", 388 | "\n", 389 | " # weights\n", 390 | " self.params.append(np.random.randn(input_.shape[1], self.neurons))\n", 391 | "\n", 392 | " # bias\n", 393 | " self.params.append(np.random.randn(1, self.neurons))\n", 394 | "\n", 395 | " self.operations = [WeightMultiply(self.params[0]),\n", 396 | " BiasAdd(self.params[1]),\n", 397 | " self.activation]\n", 398 | "\n", 399 | " return None" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "# `Loss` and `MeanSquaredError`" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 11, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "class Loss(object):\n", 416 | " '''\n", 417 | " The \"loss\" of a neural network\n", 418 | " '''\n", 419 | "\n", 420 | " def __init__(self):\n", 421 | " '''Pass'''\n", 422 | " pass\n", 423 | "\n", 424 | " def forward(self, prediction: ndarray, target: ndarray) -> float:\n", 425 | " '''\n", 426 | " Computes the actual loss value\n", 427 | " '''\n", 428 | " assert_same_shape(prediction, target)\n", 429 | "\n", 430 | " self.prediction = prediction\n", 431 | " self.target = target\n", 432 | "\n", 433 | " loss_value = self._output()\n", 434 | "\n", 435 | " return loss_value\n", 436 | "\n", 437 | " def backward(self) -> ndarray:\n", 438 | " '''\n", 439 | " Computes gradient of the loss value with respect to the input to the loss function\n", 440 | " '''\n", 441 | " self.input_grad = self._input_grad()\n", 442 | "\n", 443 | " assert_same_shape(self.prediction, self.input_grad)\n", 444 | "\n", 445 | " return self.input_grad\n", 446 | "\n", 447 | " def _output(self) -> float:\n", 448 | " '''\n", 449 | " Every subclass of \"Loss\" must implement the _output function.\n", 450 | " '''\n", 451 | " raise NotImplementedError()\n", 452 | "\n", 453 | " def _input_grad(self) -> ndarray:\n", 454 | " '''\n", 455 | " Every subclass of \"Loss\" must implement the _input_grad function.\n", 456 | " '''\n", 457 | " raise NotImplementedError()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 12, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | 
"class MeanSquaredError(Loss):\n", 467 | "\n", 468 | " def __init__(self) -> None:\n", 469 | " '''Pass'''\n", 470 | " super().__init__()\n", 471 | "\n", 472 | " def _output(self) -> float:\n", 473 | " '''\n", 474 | " Computes the per-observation squared error loss\n", 475 | " '''\n", 476 | " loss = (\n", 477 | " np.sum(np.power(self.prediction - self.target, 2)) / \n", 478 | " self.prediction.shape[0]\n", 479 | " )\n", 480 | "\n", 481 | " return loss\n", 482 | "\n", 483 | " def _input_grad(self) -> ndarray:\n", 484 | " '''\n", 485 | " Computes the loss gradient with respect to the input for MSE loss\n", 486 | " ''' \n", 487 | "\n", 488 | " return 2.0 * (self.prediction - self.target) / self.prediction.shape[0]" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "# `NeuralNetwork`" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 13, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "class NeuralNetwork(object):\n", 505 | " '''\n", 506 | " The class for a neural network.\n", 507 | " '''\n", 508 | " def __init__(self, \n", 509 | " layers: List[Layer],\n", 510 | " loss: Loss,\n", 511 | " seed: int = 1) -> None:\n", 512 | " '''\n", 513 | " Neural networks need layers, and a loss.\n", 514 | " '''\n", 515 | " self.layers = layers\n", 516 | " self.loss = loss\n", 517 | " self.seed = seed\n", 518 | " if seed:\n", 519 | " for layer in self.layers:\n", 520 | " setattr(layer, \"seed\", self.seed) \n", 521 | "\n", 522 | " def forward(self, x_batch: ndarray) -> ndarray:\n", 523 | " '''\n", 524 | " Passes data forward through a series of layers.\n", 525 | " '''\n", 526 | " x_out = x_batch\n", 527 | " for layer in self.layers:\n", 528 | " x_out = layer.forward(x_out)\n", 529 | "\n", 530 | " return x_out\n", 531 | "\n", 532 | " def backward(self, loss_grad: ndarray) -> None:\n", 533 | " '''\n", 534 | " Passes data backward through a series of layers.\n", 535 | " '''\n", 536 | "\n", 537 | " grad = loss_grad\n", 538 | " for layer in reversed(self.layers):\n", 539 | " grad = layer.backward(grad)\n", 540 | "\n", 541 | " return None\n", 542 | "\n", 543 | " def train_batch(self,\n", 544 | " x_batch: ndarray,\n", 545 | " y_batch: ndarray) -> float:\n", 546 | " '''\n", 547 | " Passes data forward through the layers.\n", 548 | " Computes the loss.\n", 549 | " Passes data backward through the layers.\n", 550 | " '''\n", 551 | " \n", 552 | " predictions = self.forward(x_batch)\n", 553 | "\n", 554 | " loss = self.loss.forward(predictions, y_batch)\n", 555 | "\n", 556 | " self.backward(self.loss.backward())\n", 557 | "\n", 558 | " return loss\n", 559 | " \n", 560 | " def params(self):\n", 561 | " '''\n", 562 | " Gets the parameters for the network.\n", 563 | " '''\n", 564 | " for layer in self.layers:\n", 565 | " yield from layer.params\n", 566 | "\n", 567 | " def param_grads(self):\n", 568 | " '''\n", 569 | " Gets the gradient of the loss with respect to the parameters for the network.\n", 570 | " '''\n", 571 | " for layer in self.layers:\n", 572 | " yield from layer.param_grads " 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "# `Optimizer` and `SGD`" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 14, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "class Optimizer(object):\n", 589 | " '''\n", 590 | " Base class for a neural network optimizer.\n", 591 | " '''\n", 592 | " def __init__(self,\n", 593 | " lr: float = 
0.01):\n", 594 | " '''\n", 595 | " Every optimizer must have an initial learning rate.\n", 596 | " '''\n", 597 | " self.lr = lr\n", 598 | "\n", 599 | " def step(self) -> None:\n", 600 | " '''\n", 601 | " Every optimizer must implement the \"step\" function.\n", 602 | " '''\n", 603 | " pass" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 15, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "class SGD(Optimizer):\n", 613 | " '''\n", 614 | " Stochasitc gradient descent optimizer.\n", 615 | " ''' \n", 616 | " def __init__(self,\n", 617 | " lr: float = 0.01) -> None:\n", 618 | " '''Pass'''\n", 619 | " super().__init__(lr)\n", 620 | "\n", 621 | " def step(self):\n", 622 | " '''\n", 623 | " For each parameter, adjust in the appropriate direction, with the magnitude of the adjustment \n", 624 | " based on the learning rate.\n", 625 | " '''\n", 626 | " for (param, param_grad) in zip(self.net.params(),\n", 627 | " self.net.param_grads()):\n", 628 | "\n", 629 | " param -= self.lr * param_grad" 630 | ] 631 | }, 632 | { 633 | "cell_type": "markdown", 634 | "metadata": {}, 635 | "source": [ 636 | "# `Trainer`" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 16, 642 | "metadata": {}, 643 | "outputs": [], 644 | "source": [ 645 | "from copy import deepcopy\n", 646 | "from typing import Tuple\n", 647 | "\n", 648 | "class Trainer(object):\n", 649 | " '''\n", 650 | " Trains a neural network\n", 651 | " '''\n", 652 | " def __init__(self,\n", 653 | " net: NeuralNetwork,\n", 654 | " optim: Optimizer) -> None:\n", 655 | " '''\n", 656 | " Requires a neural network and an optimizer in order for training to occur. \n", 657 | " Assign the neural network as an instance variable to the optimizer.\n", 658 | " '''\n", 659 | " self.net = net\n", 660 | " self.optim = optim\n", 661 | " self.best_loss = 1e9\n", 662 | " setattr(self.optim, 'net', self.net)\n", 663 | " \n", 664 | " def generate_batches(self,\n", 665 | " X: ndarray,\n", 666 | " y: ndarray,\n", 667 | " size: int = 32) -> Tuple[ndarray]:\n", 668 | " '''\n", 669 | " Generates batches for training \n", 670 | " '''\n", 671 | " assert X.shape[0] == y.shape[0], \\\n", 672 | " '''\n", 673 | " features and target must have the same number of rows, instead\n", 674 | " features has {0} and target has {1}\n", 675 | " '''.format(X.shape[0], y.shape[0])\n", 676 | "\n", 677 | " N = X.shape[0]\n", 678 | "\n", 679 | " for ii in range(0, N, size):\n", 680 | " X_batch, y_batch = X[ii:ii+size], y[ii:ii+size]\n", 681 | "\n", 682 | " yield X_batch, y_batch\n", 683 | "\n", 684 | " \n", 685 | " def fit(self, X_train: ndarray, y_train: ndarray,\n", 686 | " X_test: ndarray, y_test: ndarray,\n", 687 | " epochs: int=100,\n", 688 | " eval_every: int=10,\n", 689 | " batch_size: int=32,\n", 690 | " seed: int = 1,\n", 691 | " restart: bool = True)-> None:\n", 692 | " '''\n", 693 | " Fits the neural network on the training data for a certain number of epochs.\n", 694 | " Every \"eval_every\" epochs, it evaluated the neural network on the testing data.\n", 695 | " '''\n", 696 | "\n", 697 | " np.random.seed(seed)\n", 698 | " if restart:\n", 699 | " for layer in self.net.layers:\n", 700 | " layer.first = True\n", 701 | "\n", 702 | " self.best_loss = 1e9\n", 703 | "\n", 704 | " for e in range(epochs):\n", 705 | "\n", 706 | " if (e+1) % eval_every == 0:\n", 707 | " \n", 708 | " # for early stopping\n", 709 | " last_model = deepcopy(self.net)\n", 710 | "\n", 711 | " X_train, y_train = permute_data(X_train, y_train)\n", 712 
| "\n", 713 | " batch_generator = self.generate_batches(X_train, y_train,\n", 714 | " batch_size)\n", 715 | "\n", 716 | " for ii, (X_batch, y_batch) in enumerate(batch_generator):\n", 717 | "\n", 718 | " self.net.train_batch(X_batch, y_batch)\n", 719 | "\n", 720 | " self.optim.step()\n", 721 | "\n", 722 | " if (e+1) % eval_every == 0:\n", 723 | "\n", 724 | " test_preds = self.net.forward(X_test)\n", 725 | " loss = self.net.loss.forward(test_preds, y_test)\n", 726 | "\n", 727 | " if loss < self.best_loss:\n", 728 | " print(f\"Validation loss after {e+1} epochs is {loss:.3f}\")\n", 729 | " self.best_loss = loss\n", 730 | " else:\n", 731 | " print(f\"\"\"Loss increased after epoch {e+1}, final loss was {self.best_loss:.3f}, using the model from epoch {e+1-eval_every}\"\"\")\n", 732 | " self.net = last_model\n", 733 | " # ensure self.optim is still updating self.net\n", 734 | " setattr(self.optim, 'net', self.net)\n", 735 | " break" 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "metadata": {}, 741 | "source": [ 742 | "#### Evaluation metrics" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": 17, 748 | "metadata": {}, 749 | "outputs": [], 750 | "source": [ 751 | "def mae(y_true: ndarray, y_pred: ndarray):\n", 752 | " '''\n", 753 | " Compute mean absolute error for a neural network.\n", 754 | " ''' \n", 755 | " return np.mean(np.abs(y_true - y_pred))\n", 756 | "\n", 757 | "def rmse(y_true: ndarray, y_pred: ndarray):\n", 758 | " '''\n", 759 | " Compute root mean squared error for a neural network.\n", 760 | " '''\n", 761 | " return np.sqrt(np.mean(np.power(y_true - y_pred, 2)))\n", 762 | "\n", 763 | "def eval_regression_model(model: NeuralNetwork,\n", 764 | " X_test: ndarray,\n", 765 | " y_test: ndarray):\n", 766 | " '''\n", 767 | " Compute mae and rmse for a neural network.\n", 768 | " '''\n", 769 | " preds = model.forward(X_test)\n", 770 | " preds = preds.reshape(-1, 1)\n", 771 | " print(\"Mean absolute error: {:.2f}\".format(mae(preds, y_test)))\n", 772 | " print()\n", 773 | " print(\"Root mean squared error {:.2f}\".format(rmse(preds, y_test)))" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 18, 779 | "metadata": {}, 780 | "outputs": [], 781 | "source": [ 782 | "lr = NeuralNetwork(\n", 783 | " layers=[Dense(neurons=1,\n", 784 | " activation=Linear())],\n", 785 | " loss=MeanSquaredError(),\n", 786 | " seed=20190501\n", 787 | ")\n", 788 | "\n", 789 | "nn = NeuralNetwork(\n", 790 | " layers=[Dense(neurons=13,\n", 791 | " activation=Sigmoid()),\n", 792 | " Dense(neurons=1,\n", 793 | " activation=Linear())],\n", 794 | " loss=MeanSquaredError(),\n", 795 | " seed=20190501\n", 796 | ")\n", 797 | "\n", 798 | "dl = NeuralNetwork(\n", 799 | " layers=[Dense(neurons=13,\n", 800 | " activation=Sigmoid()),\n", 801 | " Dense(neurons=13,\n", 802 | " activation=Sigmoid()),\n", 803 | " Dense(neurons=1,\n", 804 | " activation=Linear())],\n", 805 | " loss=MeanSquaredError(),\n", 806 | " seed=20190501\n", 807 | ")" 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "metadata": {}, 813 | "source": [ 814 | "### Read in the data, train-test split etc." 
815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": 19, 820 | "metadata": {}, 821 | "outputs": [], 822 | "source": [ 823 | "from sklearn.datasets import load_boston\n", 824 | "\n", 825 | "boston = load_boston()\n", 826 | "data = boston.data\n", 827 | "target = boston.target\n", 828 | "features = boston.feature_names" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 20, 834 | "metadata": {}, 835 | "outputs": [], 836 | "source": [ 837 | "# Scaling the data\n", 838 | "from sklearn.preprocessing import StandardScaler\n", 839 | "s = StandardScaler()\n", 840 | "data = s.fit_transform(data)" 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 21, 846 | "metadata": {}, 847 | "outputs": [], 848 | "source": [ 849 | "def to_2d_np(a: ndarray, \n", 850 | " type: str=\"col\") -> ndarray:\n", 851 | " '''\n", 852 | " Turns a 1D Tensor into 2D\n", 853 | " '''\n", 854 | "\n", 855 | " assert a.ndim == 1, \\\n", 856 | " \"Input tensors must be 1 dimensional\"\n", 857 | " \n", 858 | " if type == \"col\": \n", 859 | " return a.reshape(-1, 1)\n", 860 | " elif type == \"row\":\n", 861 | " return a.reshape(1, -1)" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": 22, 867 | "metadata": {}, 868 | "outputs": [], 869 | "source": [ 870 | "from sklearn.model_selection import train_test_split\n", 871 | "X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=80718)\n", 872 | "\n", 873 | "# make target 2d array\n", 874 | "y_train, y_test = to_2d_np(y_train), to_2d_np(y_test)" 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": {}, 880 | "source": [ 881 | "### Train the three models" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": 23, 887 | "metadata": {}, 888 | "outputs": [], 889 | "source": [ 890 | "# helper function\n", 891 | "\n", 892 | "def permute_data(X, y):\n", 893 | " perm = np.random.permutation(X.shape[0])\n", 894 | " return X[perm], y[perm]" 895 | ] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "execution_count": 24, 900 | "metadata": {}, 901 | "outputs": [ 902 | { 903 | "name": "stdout", 904 | "output_type": "stream", 905 | "text": [ 906 | "Validation loss after 10 epochs is 30.293\n", 907 | "Validation loss after 20 epochs is 28.469\n", 908 | "Validation loss after 30 epochs is 26.293\n", 909 | "Validation loss after 40 epochs is 25.541\n", 910 | "Validation loss after 50 epochs is 25.087\n", 911 | "\n", 912 | "Mean absolute error: 3.52\n", 913 | "\n", 914 | "Root mean squared error 5.01\n" 915 | ] 916 | } 917 | ], 918 | "source": [ 919 | "trainer = Trainer(lr, SGD(lr=0.01))\n", 920 | "\n", 921 | "trainer.fit(X_train, y_train, X_test, y_test,\n", 922 | " epochs = 50,\n", 923 | " eval_every = 10,\n", 924 | " seed=20190501);\n", 925 | "print()\n", 926 | "eval_regression_model(lr, X_test, y_test)" 927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": 25, 932 | "metadata": {}, 933 | "outputs": [ 934 | { 935 | "name": "stdout", 936 | "output_type": "stream", 937 | "text": [ 938 | "Validation loss after 10 epochs is 27.435\n", 939 | "Validation loss after 20 epochs is 21.839\n", 940 | "Validation loss after 30 epochs is 18.918\n", 941 | "Validation loss after 40 epochs is 17.195\n", 942 | "Validation loss after 50 epochs is 16.215\n", 943 | "\n", 944 | "Mean absolute error: 2.60\n", 945 | "\n", 946 | "Root mean squared error 4.03\n" 947 | ] 948 | } 949 | ], 950 | "source": [ 951 | "trainer = 
Trainer(nn, SGD(lr=0.01))\n", 952 | "\n", 953 | "trainer.fit(X_train, y_train, X_test, y_test,\n", 954 | " epochs = 50,\n", 955 | " eval_every = 10,\n", 956 | " seed=20190501);\n", 957 | "print()\n", 958 | "eval_regression_model(nn, X_test, y_test)" 959 | ] 960 | }, 961 | { 962 | "cell_type": "code", 963 | "execution_count": 26, 964 | "metadata": {}, 965 | "outputs": [ 966 | { 967 | "name": "stdout", 968 | "output_type": "stream", 969 | "text": [ 970 | "Validation loss after 10 epochs is 44.143\n", 971 | "Validation loss after 20 epochs is 25.278\n", 972 | "Validation loss after 30 epochs is 22.339\n", 973 | "Validation loss after 40 epochs is 16.500\n", 974 | "Validation loss after 50 epochs is 14.655\n", 975 | "\n", 976 | "Mean absolute error: 2.45\n", 977 | "\n", 978 | "Root mean squared error 3.83\n" 979 | ] 980 | } 981 | ], 982 | "source": [ 983 | "trainer = Trainer(dl, SGD(lr=0.01))\n", 984 | "\n", 985 | "trainer.fit(X_train, y_train, X_test, y_test,\n", 986 | " epochs = 50,\n", 987 | " eval_every = 10,\n", 988 | " seed=20190501);\n", 989 | "print()\n", 990 | "eval_regression_model(dl, X_test, y_test)" 991 | ] 992 | } 993 | ], 994 | "metadata": { 995 | "kernelspec": { 996 | "display_name": "Python 3", 997 | "language": "python", 998 | "name": "python3" 999 | }, 1000 | "language_info": { 1001 | "codemirror_mode": { 1002 | "name": "ipython", 1003 | "version": 3 1004 | }, 1005 | "file_extension": ".py", 1006 | "mimetype": "text/x-python", 1007 | "name": "python", 1008 | "nbconvert_exporter": "python", 1009 | "pygments_lexer": "ipython3", 1010 | "version": "3.7.4" 1011 | } 1012 | }, 1013 | "nbformat": 4, 1014 | "nbformat_minor": 2 1015 | } 1016 | -------------------------------------------------------------------------------- /04_extensions/Code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook contains experiments for:\n", 8 | "\n", 9 | "* Loss functions\n", 10 | "* Learning rate decay\n", 11 | "* Optimizers\n", 12 | "* Weight initialization" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# `lincoln` imports" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np\n", 29 | "\n", 30 | "from lincoln import activations\n", 31 | "from lincoln import layers\n", 32 | "from lincoln import losses\n", 33 | "from lincoln import optimizers\n", 34 | "from lincoln import network\n", 35 | "from lincoln import train\n", 36 | "from lincoln.utils import mnist\n", 37 | "\n", 38 | "RANDOM_SEED = 190119" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "X_train, y_train, X_test, y_test = mnist.load()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "60000" 59 | ] 60 | }, 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "num_labels = len(y_train)\n", 68 | "num_labels" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# one-hot encode\n", 78 | "num_labels = len(y_train)\n", 79 | "train_labels = np.zeros((num_labels, 10))\n", 80 | "for i in 
range(num_labels):\n", 81 | " train_labels[i][y_train[i]] = 1\n", 82 | "\n", 83 | "num_labels = len(y_test)\n", 84 | "test_labels = np.zeros((num_labels, 10))\n", 85 | "for i in range(num_labels):\n", 86 | " test_labels[i][y_test[i]] = 1" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "# MNIST Demos" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "# Scale data to mean 0, variance 1" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "X_train, X_test = X_train - np.mean(X_train), X_test - np.mean(X_train)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "(-33.318421449829934,\n", 121 | " 221.68157855017006,\n", 122 | " -33.318421449829934,\n", 123 | " 221.68157855017006)" 124 | ] 125 | }, 126 | "execution_count": 6, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "np.min(X_train), np.max(X_train), np.min(X_test), np.max(X_test)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "X_train, X_test = X_train / np.std(X_train), X_test / np.std(X_train)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "(-0.424073894391566, 2.821543345689335, -0.424073894391566, 2.821543345689335)" 153 | ] 154 | }, 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "np.min(X_train), np.max(X_train), np.min(X_test), np.max(X_test)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 9, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "def calc_accuracy_model(model, test_set):\n", 171 | " return print(\n", 172 | " '''The model validation accuracy is: {0:.2f}%'''.format(\n", 173 | " np.equal(np.argmax(model.forward(test_set), axis=1), y_test).sum()\n", 174 | " * 100.0\n", 175 | " / test_set.shape[0]\n", 176 | " )\n", 177 | " )" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Softmax cross entropy" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Trying sigmoid activation" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 10, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "Validation loss after 5 epochs is 0.836\n", 204 | "Validation loss after 10 epochs is 0.718\n", 205 | "Validation loss after 15 epochs is 0.659\n", 206 | "Validation loss after 20 epochs is 0.638\n", 207 | "Validation loss after 25 epochs is 0.627\n", 208 | "Validation loss after 30 epochs is 0.619\n", 209 | "Validation loss after 35 epochs is 0.558\n", 210 | "Validation loss after 40 epochs is 0.506\n", 211 | "Validation loss after 45 epochs is 0.499\n", 212 | "Validation loss after 50 epochs is 0.495\n", 213 | "The model validation accuracy is: 57.16%\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "model = network.NeuralNetwork(\n", 219 | " layers=[\n", 220 | " layers.Dense(neurons=89, activation=activations.Sigmoid()),\n", 
221 | " layers.Dense(neurons=10, activation=activations.Sigmoid()),\n", 222 | " ],\n", 223 | " loss=losses.MeanSquaredError(normalize=False),\n", 224 | " seed=RANDOM_SEED,\n", 225 | ")\n", 226 | "\n", 227 | "trainer = train.Trainer(model, optimizers.SGD(0.1))\n", 228 | "trainer.fit(\n", 229 | " X_train,\n", 230 | " train_labels,\n", 231 | " X_test,\n", 232 | " test_labels,\n", 233 | " epochs=50,\n", 234 | " eval_every=5,\n", 235 | " seed=RANDOM_SEED,\n", 236 | " batch_size=60,\n", 237 | ")\n", 238 | "\n", 239 | "calc_accuracy_model(model, X_test)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Note: even if we normalize the outputs of a classification model with mean squared error loss, it still doesn't help:" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 11, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "Validation loss after 5 epochs is 0.573\n", 259 | "\n", 260 | "Loss increased after epoch 10, final loss was 0.573, \n", 261 | "using the model from epoch 5\n", 262 | "The model validation accuracy is: 62.54%\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "model = network.NeuralNetwork(\n", 268 | " layers=[\n", 269 | " layers.Dense(neurons=89, activation=activations.Sigmoid()),\n", 270 | " layers.Dense(neurons=10, activation=activations.Sigmoid()),\n", 271 | " ],\n", 272 | " loss=losses.MeanSquaredError(normalize=True),\n", 273 | " seed=RANDOM_SEED,\n", 274 | ")\n", 275 | "\n", 276 | "trainer = train.Trainer(model, optimizers.SGD(0.1))\n", 277 | "trainer.fit(\n", 278 | " X_train,\n", 279 | " train_labels,\n", 280 | " X_test,\n", 281 | " test_labels,\n", 282 | " epochs=50,\n", 283 | " eval_every=5,\n", 284 | " seed=RANDOM_SEED,\n", 285 | " batch_size=60,\n", 286 | ")\n", 287 | "\n", 288 | "calc_accuracy_model(model, X_test)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "The reason is that we should be using softmax cross entropy loss!" 
296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 12, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "Validation loss after 5 epochs is 0.719\n", 308 | "Validation loss after 10 epochs is 0.611\n", 309 | "Validation loss after 15 epochs is 0.559\n", 310 | "Validation loss after 20 epochs is 0.530\n", 311 | "Validation loss after 25 epochs is 0.505\n", 312 | "Validation loss after 30 epochs is 0.488\n", 313 | "Validation loss after 35 epochs is 0.475\n", 314 | "Validation loss after 40 epochs is 0.467\n", 315 | "Validation loss after 45 epochs is 0.459\n", 316 | "Validation loss after 50 epochs is 0.453\n", 317 | "\n", 318 | "The model validation accuracy is: 92.61%\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "model = network.NeuralNetwork(\n", 324 | "    layers=[\n", 325 | "        layers.Dense(neurons=89, activation=activations.Sigmoid()),\n", 326 | "        layers.Dense(neurons=10, activation=activations.Linear()),\n", 327 | "    ],\n", 328 | "    loss=losses.SoftmaxCrossEntropy(),\n", 329 | "    seed=RANDOM_SEED,\n", 330 | ")\n", 331 | "\n", 332 | "trainer = train.Trainer(model, optimizers.SGD(0.1))\n", 333 | "trainer.fit(\n", 334 | "    X_train,\n", 335 | "    train_labels,\n", 336 | "    X_test,\n", 337 | "    test_labels,\n", 338 | "    epochs=50,\n", 339 | "    eval_every=5,\n", 340 | "    seed=RANDOM_SEED,\n", 341 | "    batch_size=60,\n", 342 | ")\n", 343 | "print()\n", 344 | "calc_accuracy_model(model, X_test)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "## SGD Momentum" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 13, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "Validation loss after 5 epochs is 0.413\n", 364 | "Validation loss after 10 epochs is 0.361\n", 365 | "\n", 366 | "Loss increased after epoch 15, final loss was 0.361, \n", 367 | "using the model from epoch 10\n", 368 | "The model validation accuracy is: 94.10%\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "model = network.NeuralNetwork(\n", 374 | "    layers=[\n", 375 | "        layers.Dense(neurons=89, activation=activations.Sigmoid()),\n", 376 | "        layers.Dense(neurons=10, activation=activations.Linear()),\n", 377 | "    ],\n", 378 | "    loss=losses.SoftmaxCrossEntropy(),\n", 379 | "    seed=RANDOM_SEED,\n", 380 | ")\n", 381 | "\n", 382 | "optim = optimizers.SGDMomentum(0.1, momentum=0.9)\n", 383 | "\n", 384 | "trainer = train.Trainer(model, optim)\n", 385 | "trainer.fit(\n", 386 | "    X_train,\n", 387 | "    train_labels,\n", 388 | "    X_test,\n", 389 | "    test_labels,\n", 390 | "    epochs=50,\n", 391 | "    eval_every=5,\n", 392 | "    seed=RANDOM_SEED,\n", 393 | "    batch_size=60,\n", 394 | ")\n", 395 | "\n", 396 | "calc_accuracy_model(model, X_test)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "## Different learning rate decay" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 14, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "Validation loss after 5 epochs is 0.376\n", 416 | "Validation loss after 10 epochs is 0.328\n", 417 | "\n", 418 | "Loss increased after epoch 15, final loss was 0.328, \n", 419 | "using the model from epoch 10\n", 420 | "The model validation accuracy is: 94.76%\n" 421 | ] 422 | } 423 | ], 424 | "source": [ 425 | "model = 
network.NeuralNetwork(\n", 426 | " layers=[\n", 427 | " layers.Dense(neurons=89, activation=activations.Sigmoid()),\n", 428 | " layers.Dense(neurons=10, activation=activations.Linear()),\n", 429 | " ],\n", 430 | " loss=losses.SoftmaxCrossEntropy(),\n", 431 | " seed=RANDOM_SEED,\n", 432 | ")\n", 433 | "\n", 434 | "optimizer = optimizers.SGDMomentum(0.15, momentum=0.9, final_lr=0.05, decay_type='linear')\n", 435 | "\n", 436 | "trainer = train.Trainer(model, optimizer)\n", 437 | "trainer.fit(\n", 438 | " X_train,\n", 439 | " train_labels,\n", 440 | " X_test,\n", 441 | " test_labels,\n", 442 | " epochs=25,\n", 443 | " eval_every=5,\n", 444 | " seed=RANDOM_SEED,\n", 445 | " batch_size=60,\n", 446 | ")\n", 447 | "\n", 448 | "calc_accuracy_model(model, X_test)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 15, 454 | "metadata": {}, 455 | "outputs": [ 456 | { 457 | "name": "stdout", 458 | "output_type": "stream", 459 | "text": [ 460 | "Validation loss after 5 epochs is 0.387\n", 461 | "Validation loss after 10 epochs is 0.336\n", 462 | "\n", 463 | "Loss increased after epoch 15, final loss was 0.336, \n", 464 | "using the model from epoch 10\n", 465 | "The model validation accuracy is: 94.81%\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "model = network.NeuralNetwork(\n", 471 | " layers=[\n", 472 | " layers.Dense(neurons=89, activation=activations.Sigmoid()),\n", 473 | " layers.Dense(neurons=10, activation=activations.Linear()),\n", 474 | " ],\n", 475 | " loss=losses.SoftmaxCrossEntropy(),\n", 476 | " seed=RANDOM_SEED,\n", 477 | ")\n", 478 | "\n", 479 | "optimizer = optimizers.SGDMomentum(0.2, momentum=0.9, final_lr=0.05, decay_type='exponential')\n", 480 | "\n", 481 | "trainer = train.Trainer(model, optimizer)\n", 482 | "trainer.fit(\n", 483 | " X_train,\n", 484 | " train_labels,\n", 485 | " X_test,\n", 486 | " test_labels,\n", 487 | " epochs=25,\n", 488 | " eval_every=5,\n", 489 | " seed=RANDOM_SEED,\n", 490 | " batch_size=60,\n", 491 | ")\n", 492 | "\n", 493 | "calc_accuracy_model(model, X_test)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "metadata": {}, 499 | "source": [ 500 | "## Changing weight init" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 16, 506 | "metadata": {}, 507 | "outputs": [ 508 | { 509 | "name": "stdout", 510 | "output_type": "stream", 511 | "text": [ 512 | "Validation loss after 5 epochs is 0.169\n", 513 | "Validation loss after 10 epochs is 0.160\n", 514 | "\n", 515 | "Loss increased after epoch 15, final loss was 0.160, \n", 516 | "using the model from epoch 10\n", 517 | "The model validation accuracy is: 97.46%\n" 518 | ] 519 | } 520 | ], 521 | "source": [ 522 | "model = network.NeuralNetwork(\n", 523 | " layers=[\n", 524 | " layers.Dense(neurons=89, activation=activations.Sigmoid(), weight_init=\"glorot\"),\n", 525 | " layers.Dense(neurons=10, activation=activations.Linear(), weight_init=\"glorot\"),\n", 526 | " ],\n", 527 | " loss=losses.SoftmaxCrossEntropy(),\n", 528 | " seed=RANDOM_SEED,\n", 529 | ")\n", 530 | "\n", 531 | "optimizer = optimizers.SGDMomentum(0.2, momentum=0.9, final_lr=0.05, decay_type='exponential')\n", 532 | "\n", 533 | "trainer = train.Trainer(model, optimizer)\n", 534 | "trainer.fit(\n", 535 | " X_train,\n", 536 | " train_labels,\n", 537 | " X_test,\n", 538 | " test_labels,\n", 539 | " epochs=25,\n", 540 | " eval_every=5,\n", 541 | " seed=RANDOM_SEED,\n", 542 | " batch_size=60,\n", 543 | ")\n", 544 | "\n", 545 | "calc_accuracy_model(model, X_test)" 546 | ] 547 | 
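
The "glorot" initialization requested above has a standard textbook definition: weights drawn with variance scaled by the layer's fan-in and fan-out. A minimal sketch of that definition (assuming numpy imported as np; the function name below is our own, and lincoln's weight_init="glorot" may differ in detail):

    def glorot_normal(n_in, n_out, rng):
        # scaling the standard deviation by fan-in + fan-out keeps the variance
        # of activations and gradients roughly constant from layer to layer
        scale = np.sqrt(2.0 / (n_in + n_out))
        return rng.normal(loc=0.0, scale=scale, size=(n_in, n_out))

    rng = np.random.RandomState(190119)  # any fixed seed
    W = glorot_normal(784, 89, rng)
    print(W.std())  # close to sqrt(2 / (784 + 89)), about 0.048
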
} 548 | ], 549 | "metadata": { 550 | "kernelspec": { 551 | "display_name": "Python 3", 552 | "language": "python", 553 | "name": "python3" 554 | }, 555 | "language_info": { 556 | "codemirror_mode": { 557 | "name": "ipython", 558 | "version": 3 559 | }, 560 | "file_extension": ".py", 561 | "mimetype": "text/x-python", 562 | "name": "python", 563 | "nbconvert_exporter": "python", 564 | "pygments_lexer": "ipython3", 565 | "version": "3.11.4" 566 | } 567 | }, 568 | "nbformat": 4, 569 | "nbformat_minor": 2 570 | } 571 | -------------------------------------------------------------------------------- /05_convolutions/Code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Imports" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "from numpy import ndarray" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Helpers" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def assert_same_shape(output: ndarray, \n", 34 | "                      output_grad: ndarray):\n", 35 | "    assert output.shape == output_grad.shape, \\\n", 36 | "        '''\n", 37 | "        Two ndarrays should have the same shape; instead, first ndarray's shape is {0}\n", 38 | "        and second ndarray's shape is {1}.\n", 39 | "        '''.format(tuple(output.shape), tuple(output_grad.shape))\n", 40 | "    return None" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "def assert_dim(t: ndarray,\n", 50 | "               dim: int):\n", 51 | "    assert len(t.shape) == dim, \\\n", 52 | "        '''\n", 53 | "        Tensor expected to have dimension {0}, instead has dimension {1}\n", 54 | "        '''.format(dim, len(t.shape))\n", 55 | "    return None" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# 1D Convolution" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "1 input, 1 output" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## Padding" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "input_1d = np.array([1,2,3,4,5])\n", 86 | "param_1d = np.array([1,1,1])" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "def _pad_1d(inp: ndarray,\n", 96 | "            num: int) -> ndarray:\n", 97 | "    z = np.array([0])\n", 98 | "    z = np.repeat(z, num)\n", 99 | "    return np.concatenate([z, inp, z])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "array([0, 1, 2, 3, 4, 5, 0])" 111 | ] 112 | }, 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "_pad_1d(input_1d, 1)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Forward" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 7, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "def conv_1d(inp: ndarray, \n", 136 | 
" param: ndarray) -> ndarray:\n", 137 | " \n", 138 | " # assert correct dimensions\n", 139 | " assert_dim(inp, 1)\n", 140 | " assert_dim(param, 1)\n", 141 | " \n", 142 | " # pad the input\n", 143 | " param_len = param.shape[0]\n", 144 | " param_mid = param_len // 2\n", 145 | " inp_pad = _pad_1d(inp, param_mid)\n", 146 | " \n", 147 | " # initialize the output\n", 148 | " out = np.zeros(inp.shape)\n", 149 | " \n", 150 | " # perform the 1d convolution\n", 151 | " for o in range(out.shape[0]):\n", 152 | " for p in range(param_len):\n", 153 | " out[o] += param[p] * inp_pad[o+p]\n", 154 | "\n", 155 | " # ensure shapes didn't change \n", 156 | " assert_same_shape(inp, out)\n", 157 | "\n", 158 | " return out" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "def conv_1d_sum(inp: ndarray, \n", 168 | " param: ndarray) -> ndarray:\n", 169 | " out = conv_1d(inp, param)\n", 170 | " return np.sum(out)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 9, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "39.0" 182 | ] 183 | }, 184 | "execution_count": 9, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "conv_1d_sum(input_1d, param_1d)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "## Testing gradients" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 10, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "4\n", 210 | "0\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "np.random.seed(190220)\n", 216 | "print(np.random.randint(0, input_1d.shape[0]))\n", 217 | "print(np.random.randint(0, param_1d.shape[0]))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 11, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "input_1d_2 = np.array([1,2,3,4,6])\n", 227 | "param_1d = np.array([1,1,1])" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 12, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "0.0\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "print(conv_1d_sum(input_1d_2, param_1d) - conv_1d_sum(input_1d_2, param_1d))" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 13, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "10.0\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "input_1d = np.array([1,2,3,4,5])\n", 262 | "param_1d_2 = np.array([2,1,1])\n", 263 | "\n", 264 | "print(conv_1d_sum(input_1d, param_1d_2) - conv_1d_sum(input_1d, param_1d))" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Gradients" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 14, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "def _param_grad_1d(inp: ndarray, \n", 281 | " param: ndarray, \n", 282 | " output_grad: ndarray = None) -> ndarray:\n", 283 | " \n", 284 | " param_len = param.shape[0]\n", 285 | " param_mid = param_len // 2\n", 286 | " input_pad = _pad_1d(inp, param_mid)\n", 287 | " \n", 288 | " if output_grad is None:\n", 289 | " output_grad = np.ones_like(inp)\n", 290 | " else:\n", 291 | " 
assert_same_shape(inp, output_grad)\n", 292 | "\n", 293 | "    # Zero padded 1 dimensional convolution\n", 294 | "    param_grad = np.zeros_like(param)\n", 295 | "    input_grad = np.zeros_like(inp)\n", 296 | "\n", 297 | "    for o in range(inp.shape[0]):\n", 298 | "        for p in range(param.shape[0]):\n", 299 | "            param_grad[p] += input_pad[o+p] * output_grad[o]\n", 300 | "    \n", 301 | "    assert_same_shape(param_grad, param)\n", 302 | "    \n", 303 | "    return param_grad" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 15, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "def _input_grad_1d(inp: ndarray, \n", 313 | "                   param: ndarray, \n", 314 | "                   output_grad: ndarray = None) -> ndarray:\n", 315 | "    \n", 316 | "    param_len = param.shape[0]\n", 317 | "    param_mid = param_len // 2\n", 318 | "    inp_pad = _pad_1d(inp, param_mid)\n", 319 | "    \n", 320 | "    if output_grad is None:\n", 321 | "        output_grad = np.ones_like(inp)\n", 322 | "    else:\n", 323 | "        assert_same_shape(inp, output_grad)\n", 324 | "    \n", 325 | "    output_pad = _pad_1d(output_grad, param_mid)\n", 326 | "    \n", 327 | "    # Zero padded 1 dimensional convolution\n", 328 | "    param_grad = np.zeros_like(param)\n", 329 | "    input_grad = np.zeros_like(inp)\n", 330 | "\n", 331 | "    for o in range(inp.shape[0]):\n", 332 | "        for f in range(param.shape[0]):\n", 333 | "            input_grad[o] += output_pad[o+param_len-f-1] * param[f]\n", 334 | "    \n", 335 | "    assert_same_shape(input_grad, inp)\n", 336 | "    \n", 337 | "    return input_grad" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 16, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "array([2, 3, 3, 3, 2])" 349 | ] 350 | }, 351 | "execution_count": 16, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "_input_grad_1d(input_1d, param_1d)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 17, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "array([10, 15, 14])" 369 | ] 370 | }, 371 | "execution_count": 17, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "_param_grad_1d(input_1d, param_1d)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "Works!"
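
The one-off checks above can be automated with a small central-difference checker. A sketch that reuses conv_1d_sum and _input_grad_1d exactly as defined in this notebook (the helper name is our own):

    def grad_check_input_1d(inp, param, delta=1e-4):
        analytic = _input_grad_1d(inp, param)
        for i in range(inp.shape[0]):
            plus, minus = inp.astype(float), inp.astype(float)
            plus[i] += delta
            minus[i] -= delta
            numeric = (conv_1d_sum(plus, param) - conv_1d_sum(minus, param)) / (2 * delta)
            # the convolution is linear in its input, so these agree essentially exactly
            assert abs(numeric - analytic[i]) < 1e-6, (i, numeric, analytic[i])
        print("input gradients match at every position")

    grad_check_input_1d(input_1d, param_1d)
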
385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "## Batch size of 2" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "### Pad" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 18, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "input_1d_batch = np.array([[0,1,2,3,4,5,6], \n", 408 | " [1,2,3,4,5,6,7]])" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 19, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "def _pad_1d(inp: ndarray,\n", 418 | " num: int) -> ndarray:\n", 419 | " z = np.array([0])\n", 420 | " z = np.repeat(z, num)\n", 421 | " return np.concatenate([z, inp, z])" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 20, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "def _pad_1d_batch(inp: ndarray, \n", 431 | " num: int) -> ndarray:\n", 432 | " outs = [_pad_1d(obs, num) for obs in inp]\n", 433 | " return np.stack(outs)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 21, 439 | "metadata": { 440 | "scrolled": true 441 | }, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "array([[0, 0, 1, 2, 3, 4, 5, 6, 0],\n", 447 | " [0, 1, 2, 3, 4, 5, 6, 7, 0]])" 448 | ] 449 | }, 450 | "execution_count": 21, 451 | "metadata": {}, 452 | "output_type": "execute_result" 453 | } 454 | ], 455 | "source": [ 456 | "_pad_1d_batch(input_1d_batch, 1)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "### Forward" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 22, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "def conv_1d_batch(inp: ndarray, \n", 473 | " param: ndarray) -> ndarray:\n", 474 | "\n", 475 | " outs = [conv_1d(obs, param) for obs in inp]\n", 476 | " return np.stack(outs)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 23, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/plain": [ 487 | "array([[ 1., 3., 6., 9., 12., 15., 11.],\n", 488 | " [ 3., 6., 9., 12., 15., 18., 13.]])" 489 | ] 490 | }, 491 | "execution_count": 23, 492 | "metadata": {}, 493 | "output_type": "execute_result" 494 | } 495 | ], 496 | "source": [ 497 | "conv_1d_batch(input_1d_batch, param_1d)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "### Gradient" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 24, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "def input_grad_1d_batch(inp: ndarray, \n", 514 | " param: ndarray) -> ndarray:\n", 515 | "\n", 516 | " out = conv_1d_batch(inp, param)\n", 517 | " \n", 518 | " out_grad = np.ones_like(out)\n", 519 | " \n", 520 | " batch_size = out_grad.shape[0]\n", 521 | " \n", 522 | " grads = [_input_grad_1d(inp[i], param, out_grad[i]) for i in range(batch_size)] \n", 523 | "\n", 524 | " return np.stack(grads)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 25, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "def param_grad_1d_batch(inp: ndarray, \n", 534 | " param: ndarray) -> ndarray:\n", 535 | "\n", 536 | " output_grad = np.ones_like(inp)\n", 537 | " \n", 538 | " inp_pad = _pad_1d_batch(inp, 1)\n", 539 | " out_pad = _pad_1d_batch(inp, 1)\n", 540 | "\n", 541 | " param_grad = np.zeros_like(param) 
\n", 542 | " \n", 543 | " for i in range(inp.shape[0]):\n", 544 | " for o in range(inp.shape[1]):\n", 545 | " for p in range(param.shape[0]):\n", 546 | " param_grad[p] += inp_pad[i][o+p] * output_grad[i][o] \n", 547 | "\n", 548 | " return param_grad" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "## Checking gradients for `conv_1d_batch`" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 26, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "def conv_1d_batch_sum(inp: ndarray, \n", 565 | " fil: ndarray) -> ndarray:\n", 566 | " out = conv_1d_batch(inp, fil)\n", 567 | " return np.sum(out)" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": 27, 573 | "metadata": {}, 574 | "outputs": [ 575 | { 576 | "data": { 577 | "text/plain": [ 578 | "133.0" 579 | ] 580 | }, 581 | "execution_count": 27, 582 | "metadata": {}, 583 | "output_type": "execute_result" 584 | } 585 | ], 586 | "source": [ 587 | "conv_1d_batch_sum(input_1d_batch, param_1d)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 28, 593 | "metadata": {}, 594 | "outputs": [ 595 | { 596 | "name": "stdout", 597 | "output_type": "stream", 598 | "text": [ 599 | "0\n", 600 | "2\n" 601 | ] 602 | } 603 | ], 604 | "source": [ 605 | "print(np.random.randint(0, input_1d_batch.shape[0]))\n", 606 | "print(np.random.randint(0, input_1d_batch.shape[1]))" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 29, 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "data": { 616 | "text/plain": [ 617 | "3.0" 618 | ] 619 | }, 620 | "execution_count": 29, 621 | "metadata": {}, 622 | "output_type": "execute_result" 623 | } 624 | ], 625 | "source": [ 626 | "input_1d_batch_2 = input_1d_batch.copy()\n", 627 | "input_1d_batch_2[0][2] += 1\n", 628 | "conv_1d_batch_sum(input_1d_batch_2, param_1d) - conv_1d_batch_sum(input_1d_batch, param_1d)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 30, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "array([[2, 3, 3, 3, 3, 3, 2],\n", 640 | " [2, 3, 3, 3, 3, 3, 2]])" 641 | ] 642 | }, 643 | "execution_count": 30, 644 | "metadata": {}, 645 | "output_type": "execute_result" 646 | } 647 | ], 648 | "source": [ 649 | "input_grad_1d_batch(input_1d_batch, param_1d)" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 31, 655 | "metadata": {}, 656 | "outputs": [ 657 | { 658 | "name": "stdout", 659 | "output_type": "stream", 660 | "text": [ 661 | "2\n" 662 | ] 663 | } 664 | ], 665 | "source": [ 666 | "print(np.random.randint(0, param_1d.shape[0]))" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 32, 672 | "metadata": {}, 673 | "outputs": [ 674 | { 675 | "data": { 676 | "text/plain": [ 677 | "48.0" 678 | ] 679 | }, 680 | "execution_count": 32, 681 | "metadata": {}, 682 | "output_type": "execute_result" 683 | } 684 | ], 685 | "source": [ 686 | "param_1d_2 = param_1d.copy()\n", 687 | "param_1d_2[2] += 1\n", 688 | "conv_1d_batch_sum(input_1d_batch, param_1d_2) - conv_1d_batch_sum(input_1d_batch, param_1d) " 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 33, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/plain": [ 699 | "array([36, 49, 48])" 700 | ] 701 | }, 702 | "execution_count": 33, 703 | "metadata": {}, 704 | "output_type": "execute_result" 705 | } 706 | ], 707 | "source": [ 
708 | "param_grad_1d_batch(input_1d_batch, param_1d)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "markdown", 713 | "metadata": {}, 714 | "source": [ 715 | "# 2D Convolutions" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 34, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "imgs_2d_batch = np.random.randn(3, 28, 28)" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 35, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "param_2d = np.random.randn(3, 3)" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "## Padding" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 36, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "def _pad_2d(inp: ndarray, \n", 750 | " num: int):\n", 751 | " '''\n", 752 | " Input is a 3 dimensional tensor, first dimension batch size\n", 753 | " '''\n", 754 | " outs = [_pad_2d_obs(obs, num) for obs in inp]\n", 755 | "\n", 756 | " return np.stack(outs)" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": 37, 762 | "metadata": {}, 763 | "outputs": [], 764 | "source": [ 765 | "def _pad_2d_obs(inp: ndarray, \n", 766 | " num: int):\n", 767 | " '''\n", 768 | " Input is a 2 dimensional, square, 2D Tensor\n", 769 | " '''\n", 770 | " inp_pad = _pad_1d_batch(inp, num)\n", 771 | "\n", 772 | " other = np.zeros((num, inp.shape[0] + num * 2))\n", 773 | "\n", 774 | " return np.concatenate([other, inp_pad, other])" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 38, 780 | "metadata": {}, 781 | "outputs": [ 782 | { 783 | "data": { 784 | "text/plain": [ 785 | "(3, 30, 30)" 786 | ] 787 | }, 788 | "execution_count": 38, 789 | "metadata": {}, 790 | "output_type": "execute_result" 791 | } 792 | ], 793 | "source": [ 794 | "_pad_2d(imgs_2d_batch, 1).shape" 795 | ] 796 | }, 797 | { 798 | "cell_type": "markdown", 799 | "metadata": {}, 800 | "source": [ 801 | "## Compute output" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": 39, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "def _compute_output_obs_2d(obs: ndarray, \n", 811 | " param: ndarray):\n", 812 | " '''\n", 813 | " Obs is a 2d square Tensor, so is param\n", 814 | " '''\n", 815 | " param_mid = param.shape[0] // 2\n", 816 | " \n", 817 | " obs_pad = _pad_2d_obs(obs, param_mid)\n", 818 | " \n", 819 | " out = np.zeros_like(obs)\n", 820 | " \n", 821 | " for o_w in range(out.shape[0]):\n", 822 | " for o_h in range(out.shape[1]):\n", 823 | " for p_w in range(param.shape[0]):\n", 824 | " for p_h in range(param.shape[1]):\n", 825 | " out[o_w][o_h] += param[p_w][p_h] * obs_pad[o_w+p_w][o_h+p_h]\n", 826 | " return out" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": 40, 832 | "metadata": {}, 833 | "outputs": [], 834 | "source": [ 835 | "def _compute_output_2d(img_batch: ndarray,\n", 836 | " param: ndarray):\n", 837 | " \n", 838 | " assert_dim(img_batch, 3)\n", 839 | " \n", 840 | " outs = [_compute_output_obs_2d(obs, param) for obs in img_batch]\n", 841 | " \n", 842 | " return np.stack(outs)" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": 41, 848 | "metadata": {}, 849 | "outputs": [ 850 | { 851 | "data": { 852 | "text/plain": [ 853 | "(3, 28, 28)" 854 | ] 855 | }, 856 | "execution_count": 41, 857 | "metadata": {}, 858 | "output_type": "execute_result" 859 | } 860 | ], 861 | "source": [ 862 | 
"_compute_output_2d(imgs_2d_batch, param_2d).shape" 863 | ] 864 | }, 865 | { 866 | "cell_type": "markdown", 867 | "metadata": {}, 868 | "source": [ 869 | "### Param grads" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": 42, 875 | "metadata": {}, 876 | "outputs": [], 877 | "source": [ 878 | "def _compute_grads_obs_2d(input_obs: ndarray,\n", 879 | " output_grad_obs: ndarray, \n", 880 | " param: ndarray) -> ndarray:\n", 881 | " '''\n", 882 | " input_obs: 2D Tensor representing the input observation\n", 883 | " output_grad_obs: 2D Tensor representing the output gradient \n", 884 | " param: 2D filter\n", 885 | " '''\n", 886 | " \n", 887 | " param_size = param.shape[0]\n", 888 | " output_obs_pad = _pad_2d_obs(output_grad_obs, param_size // 2)\n", 889 | " input_grad = np.zeros_like(input_obs)\n", 890 | "\n", 891 | " for i_w in range(input_obs.shape[0]):\n", 892 | " for i_h in range(input_obs.shape[1]):\n", 893 | " for p_w in range(param_size):\n", 894 | " for p_h in range(param_size):\n", 895 | " input_grad[i_w][i_h] += output_obs_pad[i_w+param_size-p_w-1][i_h+param_size-p_h-1] \\\n", 896 | " * param[p_w][p_h]\n", 897 | "\n", 898 | " return input_grad\n", 899 | "\n", 900 | "def _compute_grads_2d(inp: ndarray,\n", 901 | " output_grad: ndarray, \n", 902 | " param: ndarray) -> ndarray:\n", 903 | "\n", 904 | " grads = [_compute_grads_obs_2d(inp[i], output_grad[i], param) for i in range(output_grad.shape[0])] \n", 905 | "\n", 906 | " return np.stack(grads)\n", 907 | "\n", 908 | "\n", 909 | "def _param_grad_2d(inp: ndarray,\n", 910 | " output_grad: ndarray, \n", 911 | " param: ndarray) -> ndarray:\n", 912 | "\n", 913 | " param_size = param.shape[0]\n", 914 | " inp_pad = _pad_2d(inp, param_size // 2)\n", 915 | "\n", 916 | " param_grad = np.zeros_like(param)\n", 917 | " img_shape = output_grad.shape[1:]\n", 918 | " \n", 919 | " for i in range(inp.shape[0]):\n", 920 | " for o_w in range(img_shape[0]):\n", 921 | " for o_h in range(img_shape[1]):\n", 922 | " for p_w in range(param_size):\n", 923 | " for p_h in range(param_size):\n", 924 | " param_grad[p_w][p_h] += inp_pad[i][o_w+p_w][o_h+p_h] \\\n", 925 | " * output_grad[i][o_w][o_h]\n", 926 | " return param_grad" 927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": 43, 932 | "metadata": {}, 933 | "outputs": [], 934 | "source": [ 935 | "img_grads = _compute_grads_2d(imgs_2d_batch, \n", 936 | " np.ones_like(imgs_2d_batch),\n", 937 | " param_2d)" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": 44, 943 | "metadata": {}, 944 | "outputs": [ 945 | { 946 | "data": { 947 | "text/plain": [ 948 | "(3, 28, 28)" 949 | ] 950 | }, 951 | "execution_count": 44, 952 | "metadata": {}, 953 | "output_type": "execute_result" 954 | } 955 | ], 956 | "source": [ 957 | "img_grads.shape" 958 | ] 959 | }, 960 | { 961 | "cell_type": "code", 962 | "execution_count": 45, 963 | "metadata": {}, 964 | "outputs": [ 965 | { 966 | "data": { 967 | "text/plain": [ 968 | "(3, 3)" 969 | ] 970 | }, 971 | "execution_count": 45, 972 | "metadata": {}, 973 | "output_type": "execute_result" 974 | } 975 | ], 976 | "source": [ 977 | "param_grad = _param_grad_2d(imgs_2d_batch, \n", 978 | " np.ones_like(imgs_2d_batch),\n", 979 | " param_2d)\n", 980 | "param_grad.shape" 981 | ] 982 | }, 983 | { 984 | "cell_type": "markdown", 985 | "metadata": {}, 986 | "source": [ 987 | "## Testing gradients" 988 | ] 989 | }, 990 | { 991 | "cell_type": "markdown", 992 | "metadata": {}, 993 | "source": [ 994 | "### Input" 995 | ] 996 | }, 997 | 
{ 998 | "cell_type": "code", 999 | "execution_count": 46, 1000 | "metadata": {}, 1001 | "outputs": [ 1002 | { 1003 | "name": "stdout", 1004 | "output_type": "stream", 1005 | "text": [ 1006 | "0\n", 1007 | "6\n", 1008 | "18\n" 1009 | ] 1010 | } 1011 | ], 1012 | "source": [ 1013 | "print(np.random.randint(0, imgs_2d_batch.shape[0]))\n", 1014 | "print(np.random.randint(0, imgs_2d_batch.shape[1]))\n", 1015 | "print(np.random.randint(0, imgs_2d_batch.shape[2]))" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": 47, 1021 | "metadata": {}, 1022 | "outputs": [], 1023 | "source": [ 1024 | "imgs_2d_batch_2 = imgs_2d_batch.copy()\n", 1025 | "imgs_2d_batch_2[0][6][18] += 1" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 48, 1031 | "metadata": {}, 1032 | "outputs": [], 1033 | "source": [ 1034 | "def _compute_output_2d_sum(img_batch: ndarray,\n", 1035 | " param: ndarray):\n", 1036 | " \n", 1037 | " out = _compute_output_2d(img_batch, param)\n", 1038 | " \n", 1039 | " return out.sum()" 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": 49, 1045 | "metadata": {}, 1046 | "outputs": [ 1047 | { 1048 | "data": { 1049 | "text/plain": [ 1050 | "-3.1843477398599163" 1051 | ] 1052 | }, 1053 | "execution_count": 49, 1054 | "metadata": {}, 1055 | "output_type": "execute_result" 1056 | } 1057 | ], 1058 | "source": [ 1059 | "_compute_output_2d_sum(imgs_2d_batch_2, param_2d) - \\\n", 1060 | "_compute_output_2d_sum(imgs_2d_batch, param_2d)" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": 50, 1066 | "metadata": {}, 1067 | "outputs": [ 1068 | { 1069 | "data": { 1070 | "text/plain": [ 1071 | "-3.184347739859924" 1072 | ] 1073 | }, 1074 | "execution_count": 50, 1075 | "metadata": {}, 1076 | "output_type": "execute_result" 1077 | } 1078 | ], 1079 | "source": [ 1080 | "img_grads[0][6][18]" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "metadata": {}, 1086 | "source": [ 1087 | "### Param" 1088 | ] 1089 | }, 1090 | { 1091 | "cell_type": "code", 1092 | "execution_count": 51, 1093 | "metadata": {}, 1094 | "outputs": [ 1095 | { 1096 | "name": "stdout", 1097 | "output_type": "stream", 1098 | "text": [ 1099 | "0\n", 1100 | "2\n" 1101 | ] 1102 | } 1103 | ], 1104 | "source": [ 1105 | "print(np.random.randint(0, param_2d.shape[0]))\n", 1106 | "print(np.random.randint(0, param_2d.shape[1]))" 1107 | ] 1108 | }, 1109 | { 1110 | "cell_type": "code", 1111 | "execution_count": 52, 1112 | "metadata": {}, 1113 | "outputs": [], 1114 | "source": [ 1115 | "param_2d_2 = param_2d.copy()\n", 1116 | "param_2d_2[0][2] += 1" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": 53, 1122 | "metadata": {}, 1123 | "outputs": [ 1124 | { 1125 | "data": { 1126 | "text/plain": [ 1127 | "5.53349015923007" 1128 | ] 1129 | }, 1130 | "execution_count": 53, 1131 | "metadata": {}, 1132 | "output_type": "execute_result" 1133 | } 1134 | ], 1135 | "source": [ 1136 | "_compute_output_2d_sum(imgs_2d_batch, param_2d_2) - _compute_output_2d_sum(imgs_2d_batch, param_2d)" 1137 | ] 1138 | }, 1139 | { 1140 | "cell_type": "code", 1141 | "execution_count": 54, 1142 | "metadata": {}, 1143 | "outputs": [ 1144 | { 1145 | "data": { 1146 | "text/plain": [ 1147 | "5.533490159230001" 1148 | ] 1149 | }, 1150 | "execution_count": 54, 1151 | "metadata": {}, 1152 | "output_type": "execute_result" 1153 | } 1154 | ], 1155 | "source": [ 1156 | "param_grad[0][2]" 1157 | ] 1158 | }, 1159 | { 1160 | "cell_type": 
"markdown", 1161 | "metadata": {}, 1162 | "source": [ 1163 | "## With channels + batch size" 1164 | ] 1165 | }, 1166 | { 1167 | "cell_type": "markdown", 1168 | "metadata": {}, 1169 | "source": [ 1170 | "### Helper" 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "code", 1175 | "execution_count": 55, 1176 | "metadata": {}, 1177 | "outputs": [], 1178 | "source": [ 1179 | "def _pad_2d_channel(inp: ndarray, \n", 1180 | " num: int):\n", 1181 | " '''\n", 1182 | " inp has dimension [num_channels, image_width, image_height] \n", 1183 | " '''\n", 1184 | " return np.stack([_pad_2d_obs(channel, num) for channel in inp])\n", 1185 | "\n", 1186 | "def _pad_conv_input(inp: ndarray,\n", 1187 | " num: int): \n", 1188 | " '''\n", 1189 | " inp has dimension [batch_size, num_channels, image_width, image_height]\n", 1190 | " ''' \n", 1191 | " return np.stack([_pad_2d_channel(obs, num) for obs in inp])" 1192 | ] 1193 | }, 1194 | { 1195 | "cell_type": "markdown", 1196 | "metadata": {}, 1197 | "source": [ 1198 | "### Forward" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "code", 1203 | "execution_count": 56, 1204 | "metadata": {}, 1205 | "outputs": [], 1206 | "source": [ 1207 | "def _compute_output_obs(obs: ndarray, \n", 1208 | " param: ndarray):\n", 1209 | " '''\n", 1210 | " obs: [channels, img_width, img_height]\n", 1211 | " param: [in_channels, out_channels, fil_width, fil_height] \n", 1212 | " '''\n", 1213 | " assert_dim(obs, 3)\n", 1214 | " assert_dim(param, 4)\n", 1215 | " \n", 1216 | " param_size = param.shape[2]\n", 1217 | " param_mid = param_size // 2\n", 1218 | " obs_pad = _pad_2d_channel(obs, param_mid)\n", 1219 | " \n", 1220 | " in_channels = param.shape[0]\n", 1221 | " out_channels = param.shape[1]\n", 1222 | " img_size = obs.shape[1]\n", 1223 | " \n", 1224 | " out = np.zeros((out_channels,) + obs.shape[1:])\n", 1225 | " for c_in in range(in_channels):\n", 1226 | " for c_out in range(out_channels):\n", 1227 | " for o_w in range(img_size):\n", 1228 | " for o_h in range(img_size):\n", 1229 | " for p_w in range(param_size):\n", 1230 | " for p_h in range(param_size):\n", 1231 | " out[c_out][o_w][o_h] += \\\n", 1232 | " param[c_in][c_out][p_w][p_h] * obs_pad[c_in][o_w+p_w][o_h+p_h]\n", 1233 | " return out \n", 1234 | "\n", 1235 | "def _output(inp: ndarray,\n", 1236 | " param: ndarray) -> ndarray:\n", 1237 | " '''\n", 1238 | " obs: [batch_size, channels, img_width, img_height]\n", 1239 | " fil: [in_channels, out_channels, fil_width, fil_height] \n", 1240 | " '''\n", 1241 | " outs = [_compute_output_obs(obs, param) for obs in inp] \n", 1242 | "\n", 1243 | " return np.stack(outs)" 1244 | ] 1245 | }, 1246 | { 1247 | "cell_type": "markdown", 1248 | "metadata": {}, 1249 | "source": [ 1250 | "### Backward" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "code", 1255 | "execution_count": 57, 1256 | "metadata": {}, 1257 | "outputs": [], 1258 | "source": [ 1259 | "def _compute_grads_obs(input_obs: ndarray,\n", 1260 | " output_grad_obs: ndarray,\n", 1261 | " param: ndarray) -> ndarray:\n", 1262 | " '''\n", 1263 | " input_obs: [in_channels, img_width, img_height]\n", 1264 | " output_grad_obs: [out_channels, img_width, img_height]\n", 1265 | " param: [in_channels, out_channels, img_width, img_height] \n", 1266 | " '''\n", 1267 | " input_grad = np.zeros_like(input_obs) \n", 1268 | " param_size = param.shape[2]\n", 1269 | " param_mid = param_size // 2\n", 1270 | " img_size = input_obs.shape[1]\n", 1271 | " in_channels = input_obs.shape[0]\n", 1272 | " out_channels = param.shape[1]\n", 1273 | " output_obs_pad = 
_pad_2d_channel(output_grad_obs, param_mid)\n", 1274 | " \n", 1275 | " for c_in in range(in_channels):\n", 1276 | " for c_out in range(out_channels):\n", 1277 | " for i_w in range(input_obs.shape[1]):\n", 1278 | " for i_h in range(input_obs.shape[2]):\n", 1279 | " for p_w in range(param_size):\n", 1280 | " for p_h in range(param_size):\n", 1281 | " input_grad[c_in][i_w][i_h] += \\\n", 1282 | " output_obs_pad[c_out][i_w+param_size-p_w-1][i_h+param_size-p_h-1] \\\n", 1283 | " * param[c_in][c_out][p_w][p_h]\n", 1284 | " return input_grad\n", 1285 | "\n", 1286 | "def _input_grad(inp: ndarray,\n", 1287 | " output_grad: ndarray, \n", 1288 | " param: ndarray) -> ndarray:\n", 1289 | "\n", 1290 | " grads = [_compute_grads_obs(inp[i], output_grad[i], param) for i in range(output_grad.shape[0])] \n", 1291 | "\n", 1292 | " return np.stack(grads)" 1293 | ] 1294 | }, 1295 | { 1296 | "cell_type": "code", 1297 | "execution_count": 58, 1298 | "metadata": {}, 1299 | "outputs": [], 1300 | "source": [ 1301 | "def _param_grad(inp: ndarray,\n", 1302 | " output_grad: ndarray, \n", 1303 | " param: ndarray) -> ndarray:\n", 1304 | " '''\n", 1305 | " inp: [in_channels, img_width, img_height]\n", 1306 | " output_grad_obs: [out_channels, img_width, img_height]\n", 1307 | " param: [in_channels, out_channels, img_width, img_height] \n", 1308 | " '''\n", 1309 | " param_grad = np.zeros_like(param) \n", 1310 | " param_size = param.shape[2]\n", 1311 | " param_mid = param_size // 2\n", 1312 | " img_size = inp.shape[2]\n", 1313 | " in_channels = inp.shape[1]\n", 1314 | " out_channels = output_grad.shape[1] \n", 1315 | "\n", 1316 | " inp_pad = _pad_conv_input(inp, param_mid)\n", 1317 | " img_shape = output_grad.shape[2:]\n", 1318 | "\n", 1319 | " for i in range(inp.shape[0]):\n", 1320 | " for c_in in range(in_channels):\n", 1321 | " for c_out in range(out_channels):\n", 1322 | " for o_w in range(img_shape[0]):\n", 1323 | " for o_h in range(img_shape[1]):\n", 1324 | " for p_w in range(param_size):\n", 1325 | " for p_h in range(param_size):\n", 1326 | " param_grad[c_in][c_out][p_w][p_h] += \\\n", 1327 | " inp_pad[i][c_in][o_w+p_w][o_h+p_h] \\\n", 1328 | " * output_grad[i][c_out][o_w][o_h]\n", 1329 | " return param_grad" 1330 | ] 1331 | }, 1332 | { 1333 | "cell_type": "markdown", 1334 | "metadata": {}, 1335 | "source": [ 1336 | "## Testing gradients" 1337 | ] 1338 | }, 1339 | { 1340 | "cell_type": "code", 1341 | "execution_count": 59, 1342 | "metadata": {}, 1343 | "outputs": [], 1344 | "source": [ 1345 | "cifar_imgs = np.random.randn(10, 3, 32, 32)\n", 1346 | "cifar_param = np.random.randn(3, 16, 5, 5)" 1347 | ] 1348 | }, 1349 | { 1350 | "cell_type": "code", 1351 | "execution_count": 60, 1352 | "metadata": {}, 1353 | "outputs": [ 1354 | { 1355 | "name": "stdout", 1356 | "output_type": "stream", 1357 | "text": [ 1358 | "3\n", 1359 | "1\n", 1360 | "2\n", 1361 | "19\n", 1362 | "\n", 1363 | "0\n", 1364 | "8\n", 1365 | "0\n", 1366 | "2\n" 1367 | ] 1368 | } 1369 | ], 1370 | "source": [ 1371 | "print(np.random.randint(0, cifar_imgs.shape[0]))\n", 1372 | "print(np.random.randint(0, cifar_imgs.shape[1]))\n", 1373 | "print(np.random.randint(0, cifar_imgs.shape[2]))\n", 1374 | "print(np.random.randint(0, cifar_imgs.shape[3]))\n", 1375 | "print()\n", 1376 | "print(np.random.randint(0, cifar_param.shape[0]))\n", 1377 | "print(np.random.randint(0, cifar_param.shape[1]))\n", 1378 | "print(np.random.randint(0, cifar_param.shape[2]))\n", 1379 | "print(np.random.randint(0, cifar_param.shape[3]))" 1380 | ] 1381 | }, 1382 | { 1383 | "cell_type": 
"code", 1384 | "execution_count": 61, 1385 | "metadata": {}, 1386 | "outputs": [], 1387 | "source": [ 1388 | "def _compute_output_sum(imgs: ndarray,\n", 1389 | " param: ndarray):\n", 1390 | " return _output(imgs, param).sum()" 1391 | ] 1392 | }, 1393 | { 1394 | "cell_type": "markdown", 1395 | "metadata": {}, 1396 | "source": [ 1397 | "### Input grad" 1398 | ] 1399 | }, 1400 | { 1401 | "cell_type": "code", 1402 | "execution_count": 62, 1403 | "metadata": {}, 1404 | "outputs": [], 1405 | "source": [ 1406 | "cifar_imgs_2 = cifar_imgs.copy()\n", 1407 | "cifar_imgs_2[3][1][2][19] += 1" 1408 | ] 1409 | }, 1410 | { 1411 | "cell_type": "code", 1412 | "execution_count": 63, 1413 | "metadata": {}, 1414 | "outputs": [ 1415 | { 1416 | "data": { 1417 | "text/plain": [ 1418 | "2.345298758707486" 1419 | ] 1420 | }, 1421 | "execution_count": 63, 1422 | "metadata": {}, 1423 | "output_type": "execute_result" 1424 | } 1425 | ], 1426 | "source": [ 1427 | "_compute_output_sum(cifar_imgs_2, cifar_param) - _compute_output_sum(cifar_imgs, cifar_param)" 1428 | ] 1429 | }, 1430 | { 1431 | "cell_type": "code", 1432 | "execution_count": 64, 1433 | "metadata": {}, 1434 | "outputs": [ 1435 | { 1436 | "data": { 1437 | "text/plain": [ 1438 | "2.3452987587074423" 1439 | ] 1440 | }, 1441 | "execution_count": 64, 1442 | "metadata": {}, 1443 | "output_type": "execute_result" 1444 | } 1445 | ], 1446 | "source": [ 1447 | "_input_grad(cifar_imgs,\n", 1448 | " np.ones((10, 16, 32, 32)),\n", 1449 | " cifar_param)[3][1][2][19]" 1450 | ] 1451 | }, 1452 | { 1453 | "cell_type": "markdown", 1454 | "metadata": {}, 1455 | "source": [ 1456 | "### Param grad" 1457 | ] 1458 | }, 1459 | { 1460 | "cell_type": "code", 1461 | "execution_count": 65, 1462 | "metadata": {}, 1463 | "outputs": [], 1464 | "source": [ 1465 | "cifar_param_2 = cifar_param.copy()\n", 1466 | "cifar_param_2[0][8][0][2] += 1" 1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "execution_count": 66, 1472 | "metadata": {}, 1473 | "outputs": [ 1474 | { 1475 | "data": { 1476 | "text/plain": [ 1477 | "-47.09123124155292" 1478 | ] 1479 | }, 1480 | "execution_count": 66, 1481 | "metadata": {}, 1482 | "output_type": "execute_result" 1483 | } 1484 | ], 1485 | "source": [ 1486 | "_compute_output_sum(cifar_imgs, cifar_param_2) - _compute_output_sum(cifar_imgs, cifar_param)" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "code", 1491 | "execution_count": 67, 1492 | "metadata": {}, 1493 | "outputs": [ 1494 | { 1495 | "data": { 1496 | "text/plain": [ 1497 | "-47.0912312415532" 1498 | ] 1499 | }, 1500 | "execution_count": 67, 1501 | "metadata": {}, 1502 | "output_type": "execute_result" 1503 | } 1504 | ], 1505 | "source": [ 1506 | "_param_grad(cifar_imgs,\n", 1507 | " np.ones((10, 16, 32, 32)),\n", 1508 | " cifar_param)[0][8][0][2]" 1509 | ] 1510 | } 1511 | ], 1512 | "metadata": { 1513 | "kernelspec": { 1514 | "display_name": "Python 3", 1515 | "language": "python", 1516 | "name": "python3" 1517 | }, 1518 | "language_info": { 1519 | "codemirror_mode": { 1520 | "name": "ipython", 1521 | "version": 3 1522 | }, 1523 | "file_extension": ".py", 1524 | "mimetype": "text/x-python", 1525 | "name": "python", 1526 | "nbconvert_exporter": "python", 1527 | "pygments_lexer": "ipython3", 1528 | "version": "3.7.4" 1529 | } 1530 | }, 1531 | "nbformat": 4, 1532 | "nbformat_minor": 2 1533 | } 1534 | -------------------------------------------------------------------------------- /05_convolutions/Math.ipynb: -------------------------------------------------------------------------------- 1 | { 
2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "np.random.seed(20190420)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 5, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "array([[5, 1, 0, 6],\n", 22 | " [3, 5, 5, 6],\n", 23 | " [9, 0, 3, 2],\n", 24 | " [4, 7, 0, 7]])" 25 | ] 26 | }, 27 | "execution_count": 5, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "np.random.randint(0, 10, size=(4,4))" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "$$ A = \\begin{bmatrix}5 & 1 & 0 & 6 \\\\ \n", 41 | "3 & 5 & 5 & 6 \\\\ \n", 42 | "9 & 0 & 3 & 2 \\\\ \n", 43 | "4 & 7 & 0 & 7 \\end{bmatrix} $$" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "$$ \\text{MaxPool}(A) = \\begin{bmatrix}5 & 6 \\\\ \n", 51 | "9 & 7 \\end{bmatrix} $$" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "(3.5, 4.25, 5.0, 3.0)" 63 | ] 64 | }, 65 | "execution_count": 8, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "np.mean([5,1,3,5]), np.mean([0,5,6,6]), np.mean([9,0,4,7]), np.mean([3,0,2,7])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [] 80 | } 81 | ], 82 | "metadata": { 83 | "kernelspec": { 84 | "display_name": "Python 3", 85 | "language": "python", 86 | "name": "python3" 87 | }, 88 | "language_info": { 89 | "codemirror_mode": { 90 | "name": "ipython", 91 | "version": 3 92 | }, 93 | "file_extension": ".py", 94 | "mimetype": "text/x-python", 95 | "name": "python", 96 | "nbconvert_exporter": "python", 97 | "pygments_lexer": "ipython3", 98 | "version": "3.6.6" 99 | } 100 | }, 101 | "nbformat": 4, 102 | "nbformat_minor": 2 103 | } 104 | -------------------------------------------------------------------------------- /05_convolutions/Numpy_Convolution_Demos.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Convolution demos" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this notebook, we use the batch, multi-channel convolution operation implemented in Numpy (that you can find [here](../lincoln/lincoln/conv.py)) to train a small convolutional neural network to more than 90% accuracy on MNIST." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "\n", 25 | "import lincoln\n", 26 | "from lincoln.layers import Dense\n", 27 | "from lincoln.losses import SoftmaxCrossEntropy, MeanSquaredError\n", 28 | "from lincoln.optimizers import Optimizer, SGD, SGDMomentum\n", 29 | "from lincoln.activations import Sigmoid, Tanh, Linear, ReLU\n", 30 | "from lincoln.network import NeuralNetwork\n", 31 | "from lincoln.train import Trainer\n", 32 | "from lincoln.utils import mnist\n", 33 | "from lincoln.layers import Conv2D\n", 34 | "\n", 35 | "X_train, y_train, X_test, y_test = mnist.load()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "%load_ext autoreload\n", 45 | "%autoreload 2" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "X_train, X_test = X_train - np.mean(X_train), X_test - np.mean(X_train)\n", 55 | "X_train, X_test = X_train / np.std(X_train), X_test / np.std(X_train)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "X_train_conv, X_test_conv = X_train.reshape(-1, 1, 28, 28), X_test.reshape(-1, 1, 28, 28)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "num_labels = len(y_train)\n", 74 | "train_labels = np.zeros((num_labels, 10))\n", 75 | "for i in range(num_labels):\n", 76 | " train_labels[i][y_train[i]] = 1\n", 77 | "\n", 78 | "num_labels = len(y_test)\n", 79 | "test_labels = np.zeros((num_labels, 10))\n", 80 | "for i in range(num_labels):\n", 81 | " test_labels[i][y_test[i]] = 1" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "def calc_accuracy_model(model, test_set):\n", 91 | " return print(f'''The model validation accuracy is: \n", 92 | " {np.equal(np.argmax(model.forward(test_set, inference=True), axis=1), y_test).sum() * 100.0 / test_set.shape[0]:.2f}%''')" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "# CNN from scratch" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 7, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Validation accuracy after 100 batches is 86.85%\n", 112 | "Validation accuracy after 200 batches is 83.78%\n", 113 | "Validation accuracy after 300 batches is 90.42%\n", 114 | "Validation accuracy after 400 batches is 89.08%\n", 115 | "Validation accuracy after 500 batches is 90.01%\n", 116 | "Validation accuracy after 600 batches is 90.57%\n", 117 | "Validation accuracy after 700 batches is 84.27%\n", 118 | "Validation accuracy after 800 batches is 91.85%\n", 119 | "Validation accuracy after 900 batches is 92.50%\n", 120 | "Validation loss after 1 epochs is 3.615\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "model = NeuralNetwork(\n", 126 | " layers=[Conv2D(out_channels=16,\n", 127 | " param_size=5,\n", 128 | " dropout=0.8,\n", 129 | " weight_init=\"glorot\",\n", 130 | " flatten=True,\n", 131 | " activation=Tanh()),\n", 132 | " Dense(neurons=10, \n", 133 | " activation=Linear())],\n", 134 | " loss = SoftmaxCrossEntropy(), \n", 135 | 
"seed=20190402)\n", 136 | "\n", 137 | "trainer = Trainer(model, SGDMomentum(lr = 0.1, momentum=0.9))\n", 138 | "trainer.fit(X_train_conv, train_labels, X_test_conv, test_labels,\n", 139 | " epochs = 1,\n", 140 | " eval_every = 1,\n", 141 | " seed=20190402,\n", 142 | " batch_size=60,\n", 143 | " conv_testing=True);" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 8, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "The model validation accuracy is: \n", 156 | " 90.31%\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "calc_accuracy_model(model, X_test_conv)" 162 | ] 163 | } 164 | ], 165 | "metadata": { 166 | "kernelspec": { 167 | "display_name": "Python 3", 168 | "language": "python", 169 | "name": "python3" 170 | }, 171 | "language_info": { 172 | "codemirror_mode": { 173 | "name": "ipython", 174 | "version": 3 175 | }, 176 | "file_extension": ".py", 177 | "mimetype": "text/x-python", 178 | "name": "python", 179 | "nbconvert_exporter": "python", 180 | "pygments_lexer": "ipython3", 181 | "version": "3.7.4" 182 | } 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 2 186 | } 187 | -------------------------------------------------------------------------------- /06_rnns/Autograd_Simple.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simple automatic differentiation illustration" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from typing import Union, List\n", 17 | "\n", 18 | "import numpy as np\n", 19 | "\n", 20 | "np.set_printoptions(precision=4)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "7" 32 | ] 33 | }, 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "a = 3\n", 41 | "a.__add__(4)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "[2 3 1 0]\n", 54 | "Addition using '__add__': [6 7 5 4]\n", 55 | "Addition using '+': [6 7 5 4]\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "a = np.array([2,3,1,0])\n", 61 | "\n", 62 | "print(a)\n", 63 | "print(\"Addition using '__add__':\", a.__add__(4))\n", 64 | "print(\"Addition using '+':\", a + 4)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "Numberable = Union[float, int]\n", 74 | "\n", 75 | "def ensure_number(num: Numberable):\n", 76 | " if isinstance(num, NumberWithGrad):\n", 77 | " return num\n", 78 | " else:\n", 79 | " return NumberWithGrad(num) \n", 80 | "\n", 81 | "class NumberWithGrad(object):\n", 82 | " \n", 83 | " def __init__(self, \n", 84 | " num: Numberable,\n", 85 | " depends_on: List[Numberable] = None,\n", 86 | " creation_op: str = ''):\n", 87 | " self.num = num\n", 88 | " self.grad = None\n", 89 | " self.depends_on = depends_on or []\n", 90 | " self.creation_op = creation_op\n", 91 | "\n", 92 | " def __add__(self, \n", 93 | " other: Numberable):\n", 94 | " return NumberWithGrad(self.num + ensure_number(other).num,\n", 95 | " depends_on = [self, ensure_number(other)],\n", 96 | " 
creation_op = 'add')\n", 97 | " \n", 98 | " def __mul__(self,\n", 99 | " other: Numberable = None):\n", 100 | "\n", 101 | " return NumberWithGrad(self.num * ensure_number(other).num,\n", 102 | " depends_on = [self, ensure_number(other)],\n", 103 | " creation_op = 'mul')\n", 104 | " \n", 105 | " def backward(self, backward_grad: Numberable = None):\n", 106 | " if backward_grad is None: # first time calling backward\n", 107 | " self.grad = 1\n", 108 | " else: \n", 109 | " # These lines allow gradients to accumulate.\n", 110 | " # If the gradient doesn't exist yet, simply set it equal\n", 111 | " # to backward_grad\n", 112 | " if self.grad is None:\n", 113 | " self.grad = backward_grad\n", 114 | " # Otherwise, simply add backward_grad to the existing gradient\n", 115 | " else:\n", 116 | " self.grad += backward_grad\n", 117 | " \n", 118 | " if self.creation_op == \"add\":\n", 119 | " # Simply send backward self.grad, since increasing either of these \n", 120 | " # elements will increase the output by that same amount\n", 121 | " self.depends_on[0].backward(self.grad)\n", 122 | " self.depends_on[1].backward(self.grad) \n", 123 | "\n", 124 | " if self.creation_op == \"mul\":\n", 125 | "\n", 126 | " # Calculate the derivative with respect to the first element\n", 127 | " new = self.depends_on[1] * self.grad\n", 128 | " # Send backward the derivative with respect to that element\n", 129 | " self.depends_on[0].backward(new.num)\n", 130 | "\n", 131 | " # Calculate the derivative with respect to the second element\n", 132 | " new = self.depends_on[0] * self.grad\n", 133 | " # Send backward the derivative with respect to that element\n", 134 | " self.depends_on[1].backward(new.num)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "4\n", 147 | "1\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "a = NumberWithGrad(3)\n", 153 | "b = a * 4\n", 154 | "c = b + 3\n", 155 | "c.backward()\n", 156 | "print(a.grad) # as expected\n", 157 | "print(b.grad) # as expected" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 6, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "a = NumberWithGrad(3)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 7, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "b = a * 4\n", 176 | "c = b + 3\n", 177 | "d = (a + 2)\n", 178 | "e = c * d \n", 179 | "e.backward() " 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 8, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "35" 191 | ] 192 | }, 193 | "execution_count": 8, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "a.grad # as expected" 200 | ] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Python 3", 206 | "language": "python", 207 | "name": "python3" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.7.4" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 2 224 | } 225 | -------------------------------------------------------------------------------- /06_rnns/Math.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "$$ abcdb \\rightarrow \n", 8 | "\\begin{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\\\ 0 \\\\ 0 \\end{bmatrix} & \\begin{bmatrix} 0 \\\\ 1 \\\\ 0 \\\\ 0 \\end{bmatrix} & \\begin{bmatrix} 0 \\\\ 0 \\\\ 1 \\\\ 0 \\end{bmatrix} & \\begin{bmatrix} 0 \\\\ 0 \\\\ 0 \\\\ 1 \\end{bmatrix} & \\begin{bmatrix} 0 \\\\ 1 \\\\ 0 \\\\ 0 \\end{bmatrix} \\end{bmatrix} = \\begin{bmatrix} 1 & 0 & 0 & 0 & 0 \\\\ 0 & 1 & 0 & 0 & 1 \\\\ 0 & 0 & 1 & 0 & 0 \\\\ 0 & 0 & 0 & 1 & 0 \\end{bmatrix} $$\n", 9 | "\n", 10 | "$$ bcdba \\rightarrow \\begin{bmatrix} \n", 11 | "\\begin{bmatrix} 0 \\\\ 1 \\\\ 0 \\\\ 0 \\end{bmatrix} & \\begin{bmatrix} 0 \\\\ 0 \\\\ 1 \\\\ 0 \\end{bmatrix} & \\begin{bmatrix} 0 \\\\ 0 \\\\ 0 \\\\ 1 \\end{bmatrix} & \\begin{bmatrix} 0 \\\\ 1 \\\\ 0 \\\\ 0 \\end{bmatrix} & \\begin{bmatrix} 1 \\\\ 0 \\\\ 0 \\\\ 0 \\end{bmatrix} \\end{bmatrix} = \\begin{bmatrix} 0 & 0 & 0 & 0 & 1 \\\\ 1 & 0 & 0 & 1 & 0 \\\\ 0 & 1 & 0 & 0 & 0 \\\\ 0 & 0 & 1 & 0 & 0 \\end{bmatrix} $$" 12 | ] 13 | } 14 | ], 15 | "metadata": { 16 | "kernelspec": { 17 | "display_name": "Python 3", 18 | "language": "python", 19 | "name": "python3" 20 | }, 21 | "language_info": { 22 | "codemirror_mode": { 23 | "name": "ipython", 24 | "version": 3 25 | }, 26 | "file_extension": ".py", 27 | "mimetype": "text/x-python", 28 | "name": "python", 29 | "nbconvert_exporter": "python", 30 | "pygments_lexer": "ipython3", 31 | "version": "3.7.4" 32 | } 33 | }, 34 | "nbformat": 4, 35 | "nbformat_minor": 2 36 | } 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Seth Weidman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning From Scratch code 2 | 3 | This repo contains all the code from the book [Deep Learning From Scratch](https://www.amazon.com/Deep-Learning-Scratch-Building-Principles/dp/1492041416), published by O'Reilly in September 2019. 
4 | 5 | It was mostly for me to keep the code I was writing for the book organized, but my hope is that readers can clone this repo and step through the code systematically themselves to better understand the concepts. 6 | 7 | ## Structure 8 | 9 | Each chapter has two notebooks: a `Code` notebook and a `Math` notebook. Each `Code` notebook contains the Python code for the corresponding chapter and can be run start to finish to generate the results from the chapters. The `Math` notebooks were just for me to store the LaTeX equations used in the book, taking advantage of Jupyter's LaTeX rendering functionality. 10 | 11 | ### `lincoln` 12 | 13 | In the notebooks in the Chapters 4, 5, and 7 folders, I import classes from `lincoln`, rather than putting those classes in the Jupyter Notebook itself. `lincoln` is not currently a `pip` installable library; the way I'd recommend to be able to `import` it and run these notebooks is to add a line like the following to your `.bashrc` file: 14 | 15 | ```bash 16 | export PYTHONPATH=$PYTHONPATH:/Users/seth/development/DLFS_code/lincoln 17 | ``` 18 | 19 | This will cause Python to search this path for a module called `lincoln` when you run the `import` command (of course, you'll have to replace the path above with the relevant path on your machine once you clone this repo). Then, simply `source` your `.bashrc` file before running the `jupyter notebook` command and you should be good to go. 20 | 21 | ### Chapter 5: Numpy Convolution Demos 22 | 23 | While I don't spend much time delving into the details in the main text of the book, I have implemented the batch, multi-channel convolution operation in pure Numpy (I do describe how to do this and share the code in the book's Appendix). In [this notebook](05_convolutions/Numpy_Convolution_Demos.ipynb), I demonstrate using this operation to train a single-layer CNN from scratch in pure Numpy to get over 90% accuracy on MNIST. 24 | -------------------------------------------------------------------------------- /lincoln/.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__* 2 | *.pyc* 3 | *.ipynb_checkpoints* 4 | *.DS_Store* 5 | *.c 6 | *.so 7 | *.o 8 | *.txt 9 | 10 | *data/* 11 | *.pkl* 12 | 13 | *.pt 14 | *ubyte 15 | -------------------------------------------------------------------------------- /lincoln/LICENSE: -------------------------------------------------------------------------------- 1 | =========== 2 | MIT License 3 | =========== 4 | 5 | Copyright (c) 2018, Seth Weidman & Mat Leonard 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. -------------------------------------------------------------------------------- /lincoln/README.md: -------------------------------------------------------------------------------- 1 | # Lincoln 2 | 3 | "A Deep Learning library by the people, for the people." 4 | 5 | ![](lincoln.png) 6 | 7 | ## Description 8 | 9 | "Lincoln" is a minimal Deep Learning library accompanying the book "Deep Learning From Scratch", published by O'Reilly in September 2019. 10 | 11 | It is intended for beginners who want to understand the key components of how Deep Learning works by walking through a clean, minimal implementation. 12 | -------------------------------------------------------------------------------- /lincoln/lincoln.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SethHWeidman/DLFS_code/f4ec4de43049ef990d0f4ddece81223cef3a0e91/lincoln/lincoln.png -------------------------------------------------------------------------------- /lincoln/lincoln/activations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lincoln import base 3 | 4 | 5 | class Linear(base.Operation): 6 | """ 7 | Linear activation function 8 | """ 9 | 10 | def __init__(self) -> None: 11 | super().__init__() 12 | 13 | def _output(self) -> np.ndarray: 14 | return self.input_ 15 | 16 | def _input_grad(self, output_grad: np.ndarray) -> np.ndarray: 17 | return output_grad 18 | 19 | 20 | class Sigmoid(base.Operation): 21 | """ 22 | Sigmoid activation function 23 | """ 24 | 25 | def __init__(self) -> None: 26 | super().__init__() 27 | 28 | def _output(self) -> np.ndarray: 29 | return 1.0 / (1.0 + np.exp(-1.0 * self.input_)) 30 | 31 | def _input_grad(self, output_grad: np.ndarray) -> np.ndarray: 32 | sigmoid_backward = self.output * (1.0 - self.output) 33 | input_grad = sigmoid_backward * output_grad 34 | return input_grad 35 | 36 | 37 | class Tanh(base.Operation): 38 | """ 39 | Hyperbolic tangent activation function 40 | """ 41 | 42 | def __init__(self) -> None: 43 | super().__init__() 44 | 45 | def _output(self) -> np.ndarray: 46 | return np.tanh(self.input_) 47 | 48 | def _input_grad(self, output_grad: np.ndarray) -> np.ndarray: 49 | 50 | return output_grad * (1 - self.output * self.output) 51 | 52 | 53 | class ReLU(base.Operation): 54 | """ 55 | ReLU activation function 56 | """ 57 | 58 | def __init__(self) -> None: 59 | super().__init__() 60 | 61 | def _output(self) -> np.ndarray: 62 | return np.clip(self.input_, 0, None) 63 | 64 | def _input_grad(self, output_grad: np.ndarray) -> np.ndarray: 65 | # pass gradients through only where the unit was active 66 | mask = self.output > 0 67 | return output_grad * mask 68 | -------------------------------------------------------------------------------- /lincoln/lincoln/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lincoln.utils import np_utils 4 | 5 | 6 | class Operation(object): 7 | 8 | def __init__(self): 9 | pass 10 | 11 | def forward(self, input_: np.ndarray) -> np.ndarray: 12 | 13 | self.input_ = input_ 14 | 15 | self.output = self._output() 16 | 17 | return self.output 18 | 19 | def backward(self, output_grad: np.ndarray) -> 
np.ndarray: 20 | 21 | np_utils.assert_same_shape(self.output, output_grad) 22 | 23 | self.input_grad = self._input_grad(output_grad) 24 | 25 | np_utils.assert_same_shape(self.input_, self.input_grad) 26 | 27 | return self.input_grad 28 | 29 | def _output(self) -> np.ndarray: 30 | raise NotImplementedError() 31 | 32 | def _input_grad(self, output_grad: np.ndarray) -> np.ndarray: 33 | raise NotImplementedError() 34 | 35 | 36 | class ParamOperation(Operation): 37 | 38 | def __init__(self, param: np.ndarray) -> np.ndarray: 39 | super().__init__() 40 | self.param = param 41 | 42 | def backward(self, output_grad: np.ndarray) -> np.ndarray: 43 | 44 | np_utils.assert_same_shape(self.output, output_grad) 45 | 46 | self.input_grad = self._input_grad(output_grad) 47 | self.param_grad = self._param_grad(output_grad) 48 | 49 | np_utils.assert_same_shape(self.input_, self.input_grad) 50 | 51 | return self.input_grad 52 | 53 | def _param_grad(self, output_grad: np.ndarray) -> np.ndarray: 54 | raise NotImplementedError() 55 | -------------------------------------------------------------------------------- /lincoln/lincoln/conv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lincoln import base 4 | 5 | 6 | class Conv2D_Op(base.ParamOperation): 7 | 8 | def __init__(self, W: np.ndarray): 9 | super().__init__(W) 10 | self.param_size = W.shape[2] 11 | self.param_pad = self.param_size // 2 12 | 13 | def _pad_1d(self, inp: np.ndarray) -> np.ndarray: 14 | z = np.array([0]) 15 | z = np.repeat(z, self.param_pad) 16 | return np.concatenate([z, inp, z]) 17 | 18 | def _pad_1d_batch(self, inp: np.ndarray) -> np.ndarray: 19 | outs = [self._pad_1d(obs) for obs in inp] 20 | return np.stack(outs) 21 | 22 | def _pad_2d_obs(self, inp: np.ndarray): 23 | """ 24 | Input is a 2 dimensional, square, 2D Tensor 25 | """ 26 | inp_pad = self._pad_1d_batch(inp) 27 | 28 | other = np.zeros((self.param_pad, inp.shape[0] + self.param_pad * 2)) 29 | 30 | return np.concatenate([other, inp_pad, other]) 31 | 32 | def _pad_2d_channel(self, inp: np.ndarray): 33 | """ 34 | inp has dimension [num_channels, image_width, image_height] 35 | """ 36 | return np.stack([self._pad_2d_obs(channel) for channel in inp]) 37 | 38 | def _get_image_patches(self, input_: np.ndarray): 39 | imgs_batch_pad = np.stack([self._pad_2d_channel(obs) for obs in input_]) 40 | patches = [] 41 | img_height = imgs_batch_pad.shape[2] 42 | for h in range(img_height - self.param_size + 1): 43 | for w in range(img_height - self.param_size + 1): 44 | patch = imgs_batch_pad[:, :, h : h + self.param_size, w : w + self.param_size] 45 | patches.append(patch) 46 | return np.stack(patches) 47 | 48 | def _output(self): 49 | """ 50 | conv_in: [batch_size, channels, img_width, img_height] 51 | param: [in_channels, out_channels, fil_width, fil_height] 52 | """ 53 | # assert_dim(obs, 4) 54 | # assert_dim(param, 4) 55 | batch_size = self.input_.shape[0] 56 | img_height = self.input_.shape[2] 57 | img_size = self.input_.shape[2] * self.input_.shape[3] 58 | patch_size = self.param.shape[0] * self.param.shape[2] * self.param.shape[3] 59 | 60 | patches = self._get_image_patches(self.input_) 61 | 62 | patches_reshaped = patches.transpose(1, 0, 2, 3, 4).reshape(batch_size, img_size, -1) 63 | 64 | param_reshaped = self.param.transpose(0, 2, 3, 1).reshape(patch_size, -1) 65 | 66 | output_reshaped = ( 67 | np.matmul(patches_reshaped, param_reshaped) 68 | .reshape(batch_size, img_height, img_height, -1) 69 | .transpose(0, 3, 1, 2) 
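# back to [batch_size, out_channels, img_height, img_width]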
70 | ) 71 | 72 | return output_reshaped 73 | 74 | def _input_grad(self, output_grad: np.ndarray) -> np.ndarray: 75 | 76 | batch_size = self.input_.shape[0] 77 | img_size = self.input_.shape[2] * self.input_.shape[3] 78 | img_height = self.input_.shape[2] 79 | 80 | output_patches = ( 81 | self._get_image_patches(output_grad) 82 | .transpose(1, 0, 2, 3, 4) 83 | .reshape(batch_size * img_size, -1) 84 | ) 85 | 86 | param_reshaped = self.param.reshape(self.param.shape[0], -1).transpose(1, 0) 87 | 88 | return ( 89 | np.matmul(output_patches, param_reshaped) 90 | .reshape(batch_size, img_height, img_height, self.param.shape[0]) 91 | .transpose(0, 3, 1, 2) 92 | ) 93 | 94 | def _param_grad(self, output_grad: np.ndarray) -> np.ndarray: 95 | 96 | batch_size = self.input_.shape[0] 97 | img_size = self.input_.shape[2] * self.input_.shape[3] 98 | in_channels = self.param.shape[0] 99 | out_channels = self.param.shape[1] 100 | 101 | in_patches_reshape = ( 102 | self._get_image_patches(self.input_).reshape(batch_size * img_size, -1).transpose(1, 0) 103 | ) 104 | 105 | out_grad_reshape = output_grad.transpose(0, 2, 3, 1).reshape(batch_size * img_size, -1) 106 | 107 | return ( 108 | np.matmul(in_patches_reshape, out_grad_reshape) 109 | .reshape(in_channels, self.param_size, self.param_size, out_channels) 110 | .transpose(0, 3, 1, 2) 111 | ) 112 | -------------------------------------------------------------------------------- /lincoln/lincoln/dense.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lincoln import base 4 | 5 | 6 | class WeightMultiply(base.ParamOperation): 7 | 8 | def __init__(self, W: np.ndarray): 9 | super().__init__(W) 10 | 11 | def _output(self) -> np.ndarray: 12 | return np.matmul(self.input_, self.param) 13 | 14 | def _input_grad(self, output_grad: np.ndarray) -> np.ndarray: 15 | return np.matmul(output_grad, self.param.transpose(1, 0)) 16 | 17 | def _param_grad(self, output_grad: np.ndarray) -> np.ndarray: 18 | return np.matmul(self.input_.transpose(1, 0), output_grad) 19 | 20 | 21 | class BiasAdd(base.ParamOperation): 22 | 23 | def __init__(self, B: np.ndarray): 24 | super().__init__(B) 25 | 26 | def _output(self) -> np.ndarray: 27 | return self.input_ + self.param 28 | 29 | def _input_grad(self, output_grad: np.ndarray) -> np.ndarray: 30 | return np.ones_like(self.input_) * output_grad 31 | 32 | def _param_grad(self, output_grad: np.ndarray) -> np.ndarray: 33 | output_grad_reshape = np.sum(output_grad, axis=0).reshape(1, -1) 34 | param_grad = np.ones_like(self.param) 35 | return param_grad * output_grad_reshape 36 | -------------------------------------------------------------------------------- /lincoln/lincoln/layers.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | 5 | from lincoln import activations 6 | from lincoln import base 7 | from lincoln import conv 8 | from lincoln import dense 9 | from lincoln import reshape 10 | from lincoln.utils import np_utils 11 | 12 | 13 | class Layer(object): 14 | 15 | def __init__(self, neurons: int) -> None: 16 | self.neurons = neurons 17 | self.first = True 18 | self.params: List[np.ndarray] = [] 19 | self.param_grads: List[np.ndarray] = [] 20 | self.operations: List[base.Operation] = [] 21 | 22 | def _setup_layer(self, input_: np.ndarray) -> None: 23 | pass 24 | 25 | def forward(self, input_: np.ndarray) -> np.ndarray: 26 | 27 | if self.first: 28 | self._setup_layer(input_) 29 | 
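# params are built from the first input's shape; after this, setup is skipped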
self.first = False 30 | 31 | self.input_ = input_ 32 | 33 | for operation in self.operations: 34 | 35 | input_ = operation.forward(input_) 36 | 37 | self.output = input_ 38 | 39 | return self.output 40 | 41 | def backward(self, output_grad: np.ndarray) -> np.ndarray: 42 | 43 | np_utils.assert_same_shape(self.output, output_grad) 44 | 45 | for operation in self.operations[::-1]: 46 | output_grad = operation.backward(output_grad) 47 | 48 | input_grad = output_grad 49 | 50 | np_utils.assert_same_shape(self.input_, input_grad) 51 | 52 | self._param_grads() 53 | 54 | return input_grad 55 | 56 | def _param_grads(self) -> None: 57 | 58 | self.param_grads = [] 59 | for operation in self.operations: 60 | if issubclass(operation.__class__, base.ParamOperation): 61 | self.param_grads.append(operation.param_grad) 62 | 63 | def _params(self) -> None: 64 | 65 | self.params = [] 66 | for operation in self.operations: 67 | if issubclass(operation.__class__, base.ParamOperation): 68 | self.params.append(operation.param) 69 | 70 | 71 | class Dense(Layer): 72 | 73 | def __init__( 74 | self, 75 | neurons: int, 76 | activation: base.Operation = activations.Linear(), 77 | conv_in: bool = False, 78 | dropout: float = 1.0, weight_init: str = "standard", 79 | ) -> None: 80 | super().__init__(neurons) 81 | self.activation = activation 82 | self.conv_in = conv_in 83 | self.dropout = dropout 84 | self.weight_init = weight_init 85 | 86 | def _setup_layer(self, input_: np.ndarray) -> None: 87 | np.random.seed(self.seed) 88 | num_in = input_.shape[1] 89 | 90 | if self.weight_init == "glorot": 91 | scale = np.sqrt(2 / (num_in + self.neurons)) 92 | else: 93 | scale = 1.0 94 | 95 | # weights 96 | self.params = [] 97 | self.params.append(np.random.normal(loc=0, scale=scale, size=(num_in, self.neurons))) 98 | 99 | # bias 100 | self.params.append(np.random.normal(loc=0, scale=scale, size=(1, self.neurons))) 101 | 102 | self.operations = [ 103 | dense.WeightMultiply(self.params[0]), 104 | dense.BiasAdd(self.params[1]), 105 | self.activation, 106 | ] 107 | 108 | return None 109 | 110 | 111 | class Conv2D(Layer): 112 | """ 113 | Once we define all the Operations and the outline of a layer, 114 | all that remains to implement here is the _setup_layer function! 
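The filter bank is created lazily in _setup_layer, once the number of input channels is known.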
115 | """ 116 | 117 | def __init__( 118 | self, 119 | out_channels: int, 120 | param_size: int, 121 | dropout: int = 1.0, 122 | weight_init: str = "normal", 123 | activation: base.Operation = activations.Linear(), 124 | flatten: bool = False, 125 | ) -> None: 126 | super().__init__(out_channels) 127 | self.param_size = param_size 128 | self.activation = activation 129 | self.flatten = flatten 130 | self.dropout = dropout 131 | self.weight_init = weight_init 132 | self.out_channels = out_channels 133 | 134 | def _setup_layer(self, input_: np.ndarray) -> np.ndarray: 135 | 136 | self.params = [] 137 | in_channels = input_.shape[1] 138 | 139 | if self.weight_init == "glorot": 140 | scale = 2 / (in_channels + self.out_channels) 141 | else: 142 | scale = 1.0 143 | 144 | conv_param = np.random.normal( 145 | loc=0, 146 | scale=scale, 147 | size=( 148 | input_.shape[1], # input channels 149 | self.out_channels, 150 | self.param_size, 151 | self.param_size, 152 | ), 153 | ) 154 | 155 | self.params.append(conv_param) 156 | 157 | self.operations = [] 158 | self.operations.append(conv.Conv2D_Op(conv_param)) 159 | self.operations.append(self.activation) 160 | 161 | if self.flatten: 162 | self.operations.append(reshape.Flatten()) 163 | 164 | if self.dropout < 1.0: 165 | self.operations.append(dropout.Dropout(self.dropout)) 166 | 167 | return None 168 | -------------------------------------------------------------------------------- /lincoln/lincoln/losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lincoln.utils import np_utils 4 | 5 | 6 | class Loss(object): 7 | 8 | def __init__(self): 9 | pass 10 | 11 | def forward(self, prediction: np.ndarray, target: np.ndarray) -> float: 12 | 13 | # batch size x num_classes 14 | np_utils.assert_same_shape(prediction, target) 15 | 16 | self.prediction = prediction 17 | self.target = target 18 | 19 | self.output = self._output() 20 | 21 | return self.output 22 | 23 | def backward(self) -> np.ndarray: 24 | 25 | self.input_grad = self._input_grad() 26 | 27 | np_utils.assert_same_shape(self.prediction, self.input_grad) 28 | 29 | return self.input_grad 30 | 31 | def _output(self) -> float: 32 | raise NotImplementedError() 33 | 34 | def _input_grad(self) -> np.ndarray: 35 | raise NotImplementedError() 36 | 37 | 38 | class MeanSquaredError(Loss): 39 | 40 | def __init__(self, normalize: bool = False) -> None: 41 | super().__init__() 42 | self.normalize = normalize 43 | 44 | def _output(self) -> float: 45 | 46 | if self.normalize: 47 | self.prediction = self.prediction / self.prediction.sum(axis=1, keepdims=True) 48 | 49 | loss = np.sum(np.power(self.prediction - self.target, 2)) / self.prediction.shape[0] 50 | 51 | return loss 52 | 53 | def _input_grad(self) -> np.ndarray: 54 | 55 | return 2.0 * (self.prediction - self.target) / self.prediction.shape[0] 56 | 57 | 58 | class SoftmaxCrossEntropy(Loss): 59 | def __init__(self, eps: float = 1e-9) -> None: 60 | super().__init__() 61 | self.eps = eps 62 | self.single_class = False 63 | 64 | def _output(self) -> float: 65 | 66 | # if the network is just outputting probabilities 67 | # of just belonging to one class: 68 | if self.target.shape[1] == 0: 69 | self.single_class = True 70 | 71 | # if "single_class", apply the "normalize" operation defined above: 72 | if self.single_class: 73 | self.prediction, self.target = np_utils.normalize(self.prediction), np_utils.normalize( 74 | self.target 75 | ) 76 | 77 | # applying the softmax function to each row 
(observation) 78 | softmax_preds = np_utils.softmax(self.prediction, axis=1) 79 | 80 | # clipping the softmax output to prevent numeric instability 81 | self.softmax_preds = np.clip(softmax_preds, self.eps, 1 - self.eps) 82 | 83 | # actual loss computation 84 | softmax_cross_entropy_loss = -1.0 * self.target * np.log(self.softmax_preds) - ( 85 | 1.0 - self.target 86 | ) * np.log(1 - self.softmax_preds) 87 | 88 | return np.sum(softmax_cross_entropy_loss) / self.prediction.shape[0] 89 | 90 | def _input_grad(self) -> np.ndarray: 91 | 92 | # if "single_class", "un-normalize" probabilities before returning gradient: 93 | if self.single_class: 94 | return np_utils.unnormalize(self.softmax_preds - self.target) 95 | else: 96 | return (self.softmax_preds - self.target) / self.prediction.shape[0] 97 | -------------------------------------------------------------------------------- /lincoln/lincoln/network.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import numpy as np 3 | 4 | from lincoln import layers 5 | from lincoln import losses 6 | 7 | 8 | class LayerBlock(object): 9 | 10 | def __init__(self, layers: typing.List[layers.Layer]): 11 | super().__init__() 12 | self.layers = layers 13 | 14 | def forward(self, X_batch: np.ndarray) -> np.ndarray: 15 | 16 | X_out = X_batch 17 | for layer in self.layers: 18 | X_out = layer.forward(X_out) 19 | 20 | return X_out 21 | 22 | def backward(self, loss_grad: np.ndarray) -> np.ndarray: 23 | 24 | grad = loss_grad 25 | for layer in reversed(self.layers): 26 | grad = layer.backward(grad) 27 | 28 | return grad 29 | 30 | def params(self): 31 | for layer in self.layers: 32 | yield from layer.params 33 | 34 | def param_grads(self): 35 | for layer in self.layers: 36 | yield from layer.param_grads 37 | 38 | def __iter__(self): 39 | return iter(self.layers) 40 | 41 | def __repr__(self): 42 | layer_strs = [str(layer) for layer in self.layers] 43 | return f"{self.__class__.__name__}(\n " + ",\n ".join(layer_strs) + ")" 44 | 45 | 46 | class NeuralNetwork(LayerBlock): 47 | """ 48 | Just a list of layers that runs forwards and backwards 49 | """ 50 | 51 | def __init__( 52 | self, 53 | layers: typing.List[layers.Layer], 54 | loss: losses.Loss = losses.MeanSquaredError, 55 | seed: int = 1, 56 | ): 57 | super().__init__(layers) 58 | self.loss = loss 59 | self.seed = seed 60 | if seed: 61 | for layer in self.layers: 62 | setattr(layer, "seed", self.seed) 63 | 64 | def forward_loss(self, X_batch: np.ndarray, y_batch: np.ndarray) -> float: 65 | 66 | prediction = self.forward(X_batch) 67 | return self.loss.forward(prediction, y_batch) 68 | 69 | def train_batch(self, X_batch: np.ndarray, y_batch: np.ndarray) -> float: 70 | 71 | prediction = self.forward(X_batch) 72 | 73 | batch_loss = self.loss.forward(prediction, y_batch) 74 | loss_grad = self.loss.backward() 75 | 76 | self.backward(loss_grad) 77 | 78 | return batch_loss 79 | -------------------------------------------------------------------------------- /lincoln/lincoln/optimizers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Optimizer(object): 5 | def __init__(self, lr: float = 0.01, final_lr: float = 0, decay_type: str = None) -> None: 6 | self.lr = lr 7 | self.final_lr = final_lr 8 | self.decay_type = decay_type 9 | self.first = True 10 | 11 | def _setup_decay(self) -> None: 12 | 13 | if not self.decay_type: 14 | return 15 | elif self.decay_type == "exponential": 16 | self.decay_per_epoch = 
np.power(self.final_lr / self.lr, 1.0 / (self.max_epochs - 1)) 17 | elif self.decay_type == "linear": 18 | self.decay_per_epoch = (self.lr - self.final_lr) / (self.max_epochs - 1) 19 | 20 | def _decay_lr(self) -> None: 21 | 22 | if not self.decay_type: 23 | return 24 | 25 | if self.decay_type == "exponential": 26 | self.lr *= self.decay_per_epoch 27 | 28 | elif self.decay_type == "linear": 29 | self.lr -= self.decay_per_epoch 30 | 31 | def step(self) -> None: 32 | 33 | for param, param_grad in zip(self.net.params(), self.net.param_grads()): 34 | self._update_rule(param=param, grad=param_grad) 35 | 36 | def _update_rule(self, **kwargs) -> None: 37 | raise NotImplementedError() 38 | 39 | 40 | class SGD(Optimizer): 41 | def __init__(self, lr: float = 0.01, final_lr: float = 0, decay_type: str = None) -> None: 42 | super().__init__(lr, final_lr, decay_type) 43 | 44 | def _update_rule(self, **kwargs) -> None: 45 | 46 | update = self.lr * kwargs["grad"] 47 | kwargs["param"] -= update 48 | 49 | 50 | class SGDMomentum(Optimizer): 51 | def __init__( 52 | self, lr: float = 0.01, final_lr: float = 0, decay_type: str = None, momentum: float = 0.9 53 | ) -> None: 54 | super().__init__(lr, final_lr, decay_type) 55 | self.momentum = momentum 56 | 57 | def step(self) -> None: 58 | if self.first: 59 | self.velocities = [np.zeros_like(param) for param in self.net.params()] 60 | self.first = False 61 | 62 | for param, param_grad, velocity in zip( 63 | self.net.params(), self.net.param_grads(), self.velocities 64 | ): 65 | self._update_rule(param=param, grad=param_grad, velocity=velocity) 66 | 67 | def _update_rule(self, **kwargs) -> None: 68 | 69 | # Update velocity 70 | kwargs["velocity"] *= self.momentum 71 | kwargs["velocity"] += self.lr * kwargs["grad"] 72 | 73 | # Use this to update parameters 74 | kwargs["param"] -= kwargs["velocity"] 75 | 76 | 77 | class AdaGrad(Optimizer): 78 | def __init__( 79 | self, lr: float = 0.01, final_lr_exp: float = 0, final_lr_linear: float = 0 80 | ) -> None: 81 | super().__init__(lr, final_lr_exp, final_lr_linear) 82 | self.eps = 1e-7 83 | 84 | def step(self) -> None: 85 | if self.first: 86 | self.sum_squares = [np.zeros_like(param) for param in self.net.params()] 87 | self.first = False 88 | 89 | for param, param_grad, sum_square in zip( 90 | self.net.params(), self.net.param_grads(), self.sum_squares 91 | ): 92 | self._update_rule(param=param, grad=param_grad, sum_square=sum_square) 93 | 94 | def _update_rule(self, **kwargs) -> None: 95 | 96 | # Update running sum of squares 97 | kwargs["sum_square"] += self.eps + np.power(kwargs["grad"], 2) 98 | 99 | # Scale learning rate by the running sum of squares 100 | lr = np.divide(self.lr, np.sqrt(kwargs["sum_square"])) 101 | 102 | # Use this to update parameters 103 | kwargs["param"] -= lr * kwargs["grad"] 104 | 105 | 106 | class RegularizedSGD(Optimizer): 107 | def __init__(self, lr: float = 0.01, alpha: float = 0.1) -> None: 108 | super().__init__() 109 | self.lr = lr 110 | self.alpha = alpha 111 | 112 | def step(self) -> None: 113 | 114 | for param, param_grad in zip(self.net.params(), self.net.param_grads()): 115 | 116 | self._update_rule(param=param, grad=param_grad) 117 | 118 | def _update_rule(self, **kwargs) -> None: 119 | 120 | # Use this to update parameters 121 | kwargs["param"] -= self.lr * kwargs["grad"] + self.alpha * kwargs["param"] 122 | -------------------------------------------------------------------------------- /lincoln/lincoln/pytorch/layers.py: 
-------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch import Tensor 3 | 4 | 5 | def inference_mode(m: nn.Module): 6 | m.eval() 7 | 8 | 9 | class PyTorchLayer(nn.Module): 10 | 11 | def __init__(self) -> None: 12 | super().__init__() 13 | 14 | def forward(self, x: Tensor, inference: bool = False) -> Tensor: 15 | raise NotImplementedError() 16 | 17 | 18 | class DenseLayer(PyTorchLayer): 19 | def __init__( 20 | self, 21 | input_size: int, 22 | neurons: int, 23 | dropout: float = 1.0, 24 | activation: nn.Module = None, 25 | ) -> None: 26 | 27 | super().__init__() 28 | self.linear = nn.Linear(input_size, neurons) 29 | self.activation = activation 30 | if dropout < 1.0: 31 | self.dropout = nn.Dropout(1 - dropout) 32 | 33 | def forward(self, x: Tensor, inference: bool = False) -> Tensor: 34 | if inference: 35 | self.apply(inference_mode) 36 | 37 | x = self.linear(x) # does weight multiplication + bias 38 | if self.activation: 39 | x = self.activation(x) 40 | if hasattr(self, "dropout"): 41 | x = self.dropout(x) 42 | 43 | return x 44 | -------------------------------------------------------------------------------- /lincoln/lincoln/pytorch/model.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | from torch import nn, Tensor 4 | 5 | 6 | class PyTorchModel(nn.Module): 7 | 8 | def __init__(self) -> None: 9 | super().__init__() 10 | 11 | def forward(self, x: Tensor) -> Tuple[Tensor]: 12 | raise NotImplementedError() 13 | -------------------------------------------------------------------------------- /lincoln/lincoln/pytorch/preprocessor.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | 3 | 4 | class PyTorchPreprocessor(): 5 | def __init__(self): 6 | pass 7 | 8 | def transform(self, x: Tensor) -> Tensor: 9 | raise NotImplementedError() 10 | 11 | 12 | class ConvNetPreprocessor(PyTorchPreprocessor): 13 | def __init__(self): 14 | pass 15 | 16 | def transform(self, x: Tensor) -> Tensor: 17 | return x.permute(0, 3, 1, 2) 18 | -------------------------------------------------------------------------------- /lincoln/lincoln/pytorch/train.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import torch 4 | from torch import Tensor 5 | from torch.optim import Optimizer 6 | from torch.optim import lr_scheduler 7 | from torch.nn.modules.loss import _Loss 8 | from torch.utils.data import DataLoader 9 | 10 | from .utils import permute_data 11 | from .model import PyTorchModel 12 | 13 | 14 | class PyTorchTrainer(object): 15 | def __init__(self, 16 | model: PyTorchModel, 17 | optim: Optimizer, 18 | criterion: _Loss): 19 | self.model = model 20 | self.optim = optim 21 | self.loss = criterion 22 | self._check_optim_net_aligned() 23 | 24 | def _check_optim_net_aligned(self): 25 | assert self.optim.param_groups[0]['params']\ 26 | == list(self.model.parameters()) 27 | 28 | def _generate_batches(self, 29 | X: Tensor, 30 | y: Tensor, 31 | size: int = 32) -> Tuple[Tensor]: 32 | 33 | N = X.shape[0] 34 | 35 | for ii in range(0, N, size): 36 | X_batch, y_batch = X[ii:ii+size], y[ii:ii+size] 37 | 38 | yield X_batch, y_batch 39 | 40 | def fit(self, X_train: Tensor = None, 41 | y_train: Tensor = None, 42 | X_test: Tensor = None, 43 | y_test: Tensor = None, 44 | train_dataloader: DataLoader = None, 45 | test_dataloader: DataLoader = None, 46 | 
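# either raw tensors or DataLoaders may be passed in; raw tensors are batched manually below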
epochs: int=100, 47 | eval_every: int=10, 48 | batch_size: int=32, 49 | final_lr_exp: float = None): 50 | 51 | init_lr = self.optim.param_groups[0]['lr'] 52 | if final_lr_exp: 53 | decay = (final_lr_exp / init_lr) ** (1.0 / (epochs + 1)) 54 | scheduler = lr_scheduler.ExponentialLR(self.optim, gamma=decay) 55 | for e in range(epochs): 56 | 57 | if final_lr_exp: 58 | scheduler.step() 59 | 60 | if not train_dataloader: 61 | X_train, y_train = permute_data(X_train, y_train) 62 | 63 | batch_generator = self._generate_batches(X_train, y_train, 64 | batch_size) 65 | 66 | self.model.train() 67 | 68 | for ii, (X_batch, y_batch) in enumerate(batch_generator): 69 | 70 | self.optim.zero_grad() # zero the gradient buffers 71 | 72 | output = self.model(X_batch)[0] 73 | 74 | loss = self.loss(output, y_batch) 75 | loss.backward() 76 | self.optim.step() 77 | 78 | if e % eval_every == 0: 79 | with torch.no_grad(): 80 | self.model.eval() 81 | output = self.model(X_test)[0] 82 | loss = self.loss(output, y_test) 83 | print("The loss after", e+1, "epochs was", loss.item()) 84 | 85 | else: 86 | for X_batch, y_batch in train_dataloader: 87 | 88 | self.optim.zero_grad() 89 | 90 | output = self.model(X_batch)[0] 91 | 92 | loss = self.loss(output, y_batch) 93 | loss.backward() 94 | self.optim.step() 95 | 96 | if e % eval_every == 0: 97 | with torch.no_grad(): 98 | self.model.eval() 99 | losses = [] 100 | for X_batch, y_batch in test_dataloader: 101 | output = self.model(X_batch)[0] 102 | loss = self.loss(output, y_batch) 103 | losses.append(loss.item()) 104 | print("The loss after", e+1, "epochs was", 105 | round(torch.Tensor(losses).mean().item(), 4)) 106 | -------------------------------------------------------------------------------- /lincoln/lincoln/pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | from typing import Tuple 5 | 6 | 7 | def permute_data(X: Tensor, y: Tensor, seed=1) -> Tuple[Tensor]: 8 | perm = torch.randperm(X.shape[0]) 9 | return X[perm], y[perm] 10 | 11 | 12 | def assert_dim(t: Tensor, 13 | dim: int): 14 | assert len(t.shape) == dim, \ 15 | ''' 16 | Tensor expected to have dimension {0}, instead has dimension {1} 17 | '''.format(dim, len(t.shape)) 18 | return None 19 | -------------------------------------------------------------------------------- /lincoln/lincoln/reshape.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lincoln import base 4 | 5 | 6 | class Flatten(base.Operation): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def _output(self) -> np.ndarray: 11 | return self.input_.reshape(self.input_.shape[0], -1) 12 | 13 | def _input_grad(self, output_grad: np.ndarray) -> np.ndarray: 14 | return output_grad.reshape(self.input_.shape) 15 | -------------------------------------------------------------------------------- /lincoln/lincoln/train.py: -------------------------------------------------------------------------------- 1 | import typing 2 | from copy import deepcopy 3 | 4 | import numpy as np 5 | 6 | from lincoln import network 7 | from lincoln import optimizers 8 | from lincoln.utils import np_utils 9 | 10 | 11 | class Trainer(object): 12 | """ 13 | Trains a neural network, given data and an optimizer 14 | """ 15 | 16 | def __init__(self, net: network.NeuralNetwork, optim: optimizers.Optimizer) -> None: 17 | self.net = net 18 | self.optim = optim 19 | self.best_loss = 1e9 20 | setattr(self.optim, 
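# give the optimizer a reference to the network whose parameters it will update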
"net", self.net) 21 | 22 | def fit( 23 | self, 24 | X_train: np.ndarray, 25 | y_train: np.ndarray, 26 | X_test: np.ndarray, 27 | y_test: np.ndarray, 28 | epochs: int = 100, 29 | eval_every: int = 10, 30 | batch_size: int = 32, 31 | seed: int = 1, 32 | restart: bool = True, 33 | early_stopping: bool = True, 34 | conv_testing: bool = False, 35 | ) -> None: 36 | 37 | setattr(self.optim, "max_epochs", epochs) 38 | self.optim._setup_decay() 39 | 40 | np.random.seed(seed) 41 | if restart: 42 | for layer in self.net.layers: 43 | layer.first = True 44 | 45 | self.best_loss = 1e9 46 | 47 | for e in range(epochs): 48 | 49 | if (e + 1) % eval_every == 0: 50 | 51 | last_model = deepcopy(self.net) 52 | 53 | X_train, y_train = np_utils.permute_data(X_train, y_train) 54 | 55 | batch_generator = self.generate_batches(X_train, y_train, batch_size) 56 | 57 | for ii, (X_batch, y_batch) in enumerate(batch_generator): 58 | 59 | self.net.train_batch(X_batch, y_batch) 60 | 61 | self.optim.step() 62 | 63 | if conv_testing: 64 | if ii % 10 == 0: 65 | test_preds = self.net.forward(X_batch) 66 | batch_loss = self.net.loss.forward(test_preds, y_batch) 67 | print("batch", ii, "loss", batch_loss) 68 | 69 | if ii % 100 == 0 and ii > 0: 70 | print( 71 | "Validation accuracy after", 72 | ii, 73 | "batches is", 74 | """{0:.2f}%""".format( 75 | np.equal( 76 | np.argmax(self.net.forward(X_test), axis=1), 77 | np.argmax(y_test, axis=1), 78 | ).sum() 79 | * 100.0 80 | / X_test.shape[0] 81 | ), 82 | ) 83 | 84 | if (e + 1) % eval_every == 0: 85 | 86 | test_preds = self.net.forward(X_test) 87 | loss = self.net.loss.forward(test_preds, y_test) 88 | 89 | if early_stopping: 90 | if loss < self.best_loss: 91 | print(f"Validation loss after {e+1} epochs is {loss:.3f}") 92 | self.best_loss = loss 93 | else: 94 | print() 95 | print( 96 | "Loss increased after epoch {0}, final loss was {1:.3f},".format( 97 | e + 1, self.best_loss 98 | ), 99 | "\nusing the model from epoch {0}".format(e + 1 - eval_every), 100 | ) 101 | self.net = last_model 102 | # ensure self.optim is still updating self.net 103 | setattr(self.optim, "net", self.net) 104 | break 105 | else: 106 | print(f"Validation loss after {e+1} epochs is {loss:.3f}") 107 | 108 | if self.optim.final_lr: 109 | self.optim._decay_lr() 110 | 111 | def generate_batches( 112 | self, X: np.ndarray, y: np.ndarray, size: int = 32 113 | ) -> typing.Generator[typing.Tuple[np.ndarray]]: 114 | 115 | assert ( 116 | X.shape[0] == y.shape[0] 117 | ), """ 118 | features and target must have the same number of rows, instead 119 | features has {0} and target has {1} 120 | """.format( 121 | X.shape[0], y.shape[0] 122 | ) 123 | 124 | N = X.shape[0] 125 | 126 | for ii in range(0, N, size): 127 | X_batch, y_batch = X[ii : ii + size], y[ii : ii + size] 128 | 129 | yield X_batch, y_batch 130 | -------------------------------------------------------------------------------- /lincoln/lincoln/utils/mnist.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Get the directory where mnist.py is located 4 | MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | import pickle 7 | 8 | from torchvision import datasets 9 | 10 | 11 | def download_mnist(): 12 | # Downloads to './data' by default 13 | train_dataset = datasets.MNIST("./data", train=True, download=True) 14 | test_dataset = datasets.MNIST("./data", train=False, download=True) 15 | 16 | # Convert to numpy arrays in the same format as the original code 17 | train_images = 
train_dataset.data.numpy().reshape(-1, 28 * 28) 18 | train_labels = train_dataset.targets.numpy() 19 | test_images = test_dataset.data.numpy().reshape(-1, 28 * 28) 20 | test_labels = test_dataset.targets.numpy() 21 | 22 | # Save in the same format as the original code 23 | mnist = { 24 | "training_images": train_images, 25 | "training_labels": train_labels, 26 | "test_images": test_images, 27 | "test_labels": test_labels, 28 | } 29 | 30 | # Use absolute path for saving 31 | pkl_path = os.path.join(MODULE_DIR, "mnist.pkl") 32 | with open(pkl_path, "wb") as f: 33 | pickle.dump(mnist, f) 34 | print("Save complete.") 35 | 36 | 37 | def load(): 38 | # Use absolute path for loading 39 | pkl_path = os.path.join(MODULE_DIR, "mnist.pkl") 40 | with open(pkl_path, "rb") as f: 41 | mnist = pickle.load(f) 42 | return ( 43 | mnist["training_images"], 44 | mnist["training_labels"], 45 | mnist["test_images"], 46 | mnist["test_labels"], 47 | ) 48 | 49 | 50 | if __name__ == "__main__": 51 | download_mnist() 52 | -------------------------------------------------------------------------------- /lincoln/lincoln/utils/np_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import numpy as np 3 | from scipy import special 4 | 5 | 6 | def to_2d(a: np.ndarray, type: str = "col") -> np.ndarray: 7 | """ 8 | Turns a 1D Tensor into 2D 9 | """ 10 | 11 | assert a.ndim == 1, "Input tensors must be 1 dimensional" 12 | 13 | if type == "col": 14 | return a.reshape(-1, 1) 15 | elif type == "row": 16 | return a.reshape(1, -1) 17 | 18 | 19 | def normalize(a: np.ndarray): 20 | other = 1 - a 21 | return np.concatenate([a, other], axis=1) 22 | 23 | 24 | def unnormalize(a: np.ndarray): 25 | return a[np.newaxis, 0] 26 | 27 | 28 | def permute_data(X: np.ndarray, y: np.ndarray): 29 | perm = np.random.permutation(X.shape[0]) 30 | return X[perm], y[perm] 31 | 32 | 33 | Batch = Tuple[np.ndarray, np.ndarray] 34 | 35 | 36 | def generate_batch( 37 | X: np.ndarray, y: np.ndarray, start: int = 0, batch_size: int = 10 38 | ) -> Batch: 39 | 40 | assert (X.ndim == 2) and (y.ndim == 2), "X and Y must be 2 dimensional" 41 | 42 | if start + batch_size > X.shape[0]: 43 | batch_size = X.shape[0] - start 44 | 45 | X_batch, y_batch = X[start : start + batch_size], y[start : start + batch_size] 46 | 47 | return X_batch, y_batch 48 | 49 | 50 | def assert_same_shape(output: np.ndarray, output_grad: np.ndarray): 51 | assert ( 52 | output.shape == output_grad.shape 53 | ), """ 54 | Two tensors should have the same shape; 55 | instead, first Tensor's shape is {0} 56 | and second Tensor's shape is {1}. 
57 | """.format( 58 | tuple(output_grad.shape), tuple(output.shape) 59 | ) 60 | return None 61 | 62 | 63 | def assert_dim(t: np.ndarray, dim: int): 64 | assert ( 65 | t.ndim == dim 66 | ), """ 67 | Tensor expected to have dimension {0}, instead has dimension {1} 68 | """.format( 69 | dim, len(t.shape) 70 | ) 71 | return None 72 | 73 | 74 | def softmax(x: np.ndarray, axis=None) -> np.ndarray: 75 | return np.exp(x - special.logsumexp(x, axis=axis, keepdims=True)) 76 | -------------------------------------------------------------------------------- /lincoln/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | import numpy 4 | 5 | setup( 6 | ext_modules = cythonize("lincoln/operations/conv_cy.pyx"), 7 | include_dirs=[numpy.get_include()] 8 | ) 9 | --------------------------------------------------------------------------------