├── .gitignore ├── .notebooks-setup └── get-started.bash ├── Exercise-02.ipynb ├── Exercise-03.ipynb ├── Exercise-04.ipynb ├── Exercise-05.ipynb ├── Exercise-06.ipynb ├── Exercise-07.ipynb ├── Exercise-08.ipynb ├── Exercise-09.ipynb ├── Exercise-10.ipynb ├── Exercise-11.ipynb ├── Exercise-12.ipynb ├── Exercise-13.ipynb ├── Exercise-14.ipynb ├── Extra-01.ipynb ├── Extra-02.ipynb ├── Extra-03.ipynb ├── LICENSE ├── README.md ├── SETUP.md └── pml_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | /day1/.ipynb_checkpoints/ 3 | /day2/.ipynb_checkpoints/ 4 | /.ipynb_checkpoints/ 5 | /MNIST/ 6 | __pycache__ 7 | -------------------------------------------------------------------------------- /.notebooks-setup/get-started.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## Script that downloads the code for doing the Deep Learning course exercises 3 | cd /home/jovyan 4 | 5 | # git reflog requires a name and email if user is not in passwd 6 | # even if you're only cloning 7 | export GIT_COMMITTER_NAME=anonymous 8 | export GIT_COMMITTER_EMAIL=anon@localhost 9 | 10 | git clone https://github.com/csc-training/python-introduction 11 | git clone https://github.com/csc-training/intro-to-ml 12 | 13 | rmdir work 14 | rm get-started.bash 15 | 16 | # pip install ipython --upgrade 17 | # pip install xgboost scikit-image graphviz 18 | -------------------------------------------------------------------------------- /Exercise-02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "ein.tags": "worksheet-0", 7 | "slideshow": { 8 | "slide_type": "-" 9 | } 10 | }, 11 | "source": [ 12 | "# MNIST handwritten digits classification with linear methods\n", 13 | "\n", 14 | "In this notebook, we'll classify handwritten digits using linear classifiers and [scikit-learn](https://scikit-learn.org/).\n", 15 | "\n", 16 | "First, the needed imports. " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "autoscroll": false, 24 | "collapsed": false, 25 | "ein.hycell": false, 26 | "ein.tags": "worksheet-0", 27 | "jupyter": { 28 | "outputs_hidden": false 29 | }, 30 | "slideshow": { 31 | "slide_type": "-" 32 | } 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "%matplotlib inline\n", 37 | "\n", 38 | "from pml_utils import get_mnist\n", 39 | "\n", 40 | "import sklearn\n", 41 | "from sklearn import svm\n", 42 | "from sklearn.linear_model import LogisticRegression\n", 43 | "from sklearn.multiclass import OneVsRestClassifier\n", 44 | "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", 45 | "\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "import seaborn as sns\n", 48 | "sns.set()\n", 49 | "\n", 50 | "from packaging.version import Version\n", 51 | "assert(Version(sklearn.__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## MNIST digit data" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "ein.tags": "worksheet-0", 65 | "slideshow": { 66 | "slide_type": "-" 67 | } 68 | }, 69 | "source": [ 70 | "Then we will load the MNIST data. The first time, it will download the data over the network, which can take a while." 
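,
    "\n",
    "(A side note: `get_mnist()` is a small helper defined in `pml_utils.py` in this repository. As a rough sketch of what it provides -- the exact download, caching and preprocessing in the helper may differ -- an equivalent dataset could also be fetched directly with scikit-learn, assuming a recent version:)\n",
    "\n",
    "```\n",
    "from sklearn.datasets import fetch_openml\n",
    "\n",
    "mnist = fetch_openml('mnist_784', version=1, as_frame=False)  # as_frame needs a newer sklearn\n",
    "X, y = mnist.data / 255.0, mnist.target     # pixel values scaled to [0, 1], labels as strings\n",
    "X_train, X_test = X[:60000], X[60000:]      # the standard 60000/10000 MNIST split\n",
    "y_train, y_test = y[:60000], y[60000:]\n",
    "```\n"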
71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "autoscroll": false, 78 | "collapsed": false, 79 | "ein.hycell": false, 80 | "ein.tags": "worksheet-0", 81 | "jupyter": { 82 | "outputs_hidden": false 83 | }, 84 | "slideshow": { 85 | "slide_type": "-" 86 | } 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 91 | "\n", 92 | "print('MNIST data loaded: train:',len(X_train),'test:',len(X_test))\n", 93 | "print('X_train:', X_train.shape)\n", 94 | "print('y_train:', y_train.shape)\n", 95 | "print('X_test', X_test.shape)\n", 96 | "print('y_test', y_test.shape)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "ein.tags": "worksheet-0", 103 | "slideshow": { 104 | "slide_type": "-" 105 | } 106 | }, 107 | "source": [ 108 | "The training data `X_train` is a matrix of size 60000x784, i.e., it consists of 60000 images expressed as vectors of length 784. These vectors are in fact \"flattened\" 28x28 images, where each component corresponds the gray scale value of a pixel (0=black, 0.5=middle gray, 1=white, etc.).\n", 109 | "\n", 110 | "`y_train` is a 60000-dimensional vector containing the correct classes (\"0\", \"1\", ..., \"9\") for each training sample.\n" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "ein.tags": "worksheet-0", 117 | "slideshow": { 118 | "slide_type": "-" 119 | } 120 | }, 121 | "source": [ 122 | "### Plotting images\n", 123 | "\n", 124 | "Let's take a closer look at the MNIST images. Here are the first 10 training digits plotted as images together with the correct class label:" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "autoscroll": false, 132 | "collapsed": false, 133 | "ein.hycell": false, 134 | "ein.tags": "worksheet-0", 135 | "jupyter": { 136 | "outputs_hidden": false 137 | }, 138 | "slideshow": { 139 | "slide_type": "-" 140 | } 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "pltsize=1\n", 145 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 146 | "\n", 147 | "for i in range(10):\n", 148 | " plt.subplot(1,10,i+1)\n", 149 | " plt.axis('off')\n", 150 | " plt.imshow(X_train[i,:].reshape(28,28), cmap=\"gray\")\n", 151 | " plt.title('Class: '+str(y_train[i]))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "ein.tags": "worksheet-0", 158 | "slideshow": { 159 | "slide_type": "-" 160 | } 161 | }, 162 | "source": [ 163 | "Note that for each digit we use `reshape(28,28)` to transform the 768-size vector into a 28x28 size image matrix." 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## Using scikit-learn\n", 171 | "\n", 172 | "In this course we will be mostly relying on [scikit-learn, a machine learning framework for Python](https://scikit-learn.org/stable/index.html). \n", 173 | "\n", 174 | "In scikit-learn all machine learning models follow the same pattern:\n", 175 | "\n", 176 | "1. First create a model object with the appropriate constructor for the method you are using. Here you can also specify _hyperparameters_ for the method:\n", 177 | "```\n", 178 | "clf = SomeModel(param1=a, param2=b)\n", 179 | "```\n", 180 | "\n", 181 | "\n", 182 | "2. Next, fit your model to the training set (e.g., train your classifier):\n", 183 | "```\n", 184 | "clf.fit(X_train, y_train)\n", 185 | "```\n", 186 | "\n", 187 | "\n", 188 | "3. 
Finally, for the inference stage (e.g., predict the classes of new unseen items with your trained classifier):\n", 189 | "```\n", 190 | "y_predicted_test = clf.predict(X_test)\n", 191 | "```\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "ein.tags": "worksheet-0", 198 | "slideshow": { 199 | "slide_type": "-" 200 | } 201 | }, 202 | "source": [ 203 | "## Logistic regression\n", 204 | "\n", 205 | "Let's start by trying logistic regression with a stochastic gradient descent algorithm. The corresponding scikit-learn class is [LogisticRegression](http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression).\n", 206 | "\n", 207 | "### Learning\n", 208 | "\n", 209 | "We'll actually just use the first 10,000 samples as the method is rather slow. We are using the \"sag\" solver (which is a variant of SGD), and the one-versus-rest strategy for doing multi-class classification." 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "autoscroll": false, 217 | "collapsed": false, 218 | "ein.hycell": false, 219 | "ein.tags": "worksheet-0", 220 | "jupyter": { 221 | "outputs_hidden": false 222 | }, 223 | "slideshow": { 224 | "slide_type": "-" 225 | } 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "%%time\n", 230 | "\n", 231 | "clf_lr = OneVsRestClassifier(LogisticRegression(solver='sag'))\n", 232 | "clf_lr.fit(X_train[:10000,:], y_train[:10000])" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "ein.tags": "worksheet-0", 239 | "slideshow": { 240 | "slide_type": "-" 241 | } 242 | }, 243 | "source": [ 244 | "### Inference\n", 245 | "\n", 246 | "As the decision boundaries are linear, prediction with logistic regression is fast:" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "autoscroll": false, 254 | "collapsed": false, 255 | "ein.hycell": false, 256 | "ein.tags": "worksheet-0", 257 | "jupyter": { 258 | "outputs_hidden": false 259 | }, 260 | "slideshow": { 261 | "slide_type": "-" 262 | } 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "%%time \n", 267 | "\n", 268 | "pred_lr = clf_lr.predict(X_test)\n", 269 | "print('Predicted', len(pred_lr), 'digits with accuracy:', accuracy_score(y_test, pred_lr))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "#### Confusion matrix\n", 277 | "\n", 278 | "We can compute the confusion matrix to see which digits get mixed the most:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": false, 286 | "jupyter": { 287 | "outputs_hidden": false 288 | } 289 | }, 290 | "outputs": [], 291 | "source": [ 292 | "labels=[str(i) for i in range(10)]\n", 293 | "print('Confusion matrix (rows: true classes; columns: predicted classes):'); print()\n", 294 | "cm=confusion_matrix(y_test, pred_lr, labels=labels)\n", 295 | "print(cm); print()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "If we plot it as an image, we can see it more visually. The matrix looks quite good as most image are on the diagonal, meaning they were classified correctly." 
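,
    "\n",
    "(Optionally, since seaborn is already imported, the same matrix can also be drawn as an annotated heatmap -- just an alternative to the plain `matshow` plot below:)\n",
    "\n",
    "```\n",
    "sns.heatmap(cm, annot=True, fmt='d', cbar=False,\n",
    "            xticklabels=labels, yticklabels=labels)\n",
    "plt.xlabel('predicted class'); plt.ylabel('true class')\n",
    "plt.show()\n",
    "```\n"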
303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "plt.matshow(cm, cmap=plt.cm.gray)\n", 312 | "plt.xticks(range(10))\n", 313 | "plt.yticks(range(10))\n", 314 | "plt.grid(None)\n", 315 | "plt.show()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "#### Accuracy, precision and recall\n", 323 | "\n", 324 | "Classification accuracy for each class:" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": { 331 | "collapsed": false, 332 | "jupyter": { 333 | "outputs_hidden": false 334 | } 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "for i,j in enumerate(cm.diagonal()/cm.sum(axis=1)): print(\"%d: %.4f\" % (i,j))" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "Precision and recall for each class:" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "print(classification_report(y_test, pred_lr, labels=labels))" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "ein.tags": "worksheet-0", 361 | "slideshow": { 362 | "slide_type": "-" 363 | } 364 | }, 365 | "source": [ 366 | "## Linear SVM\n", 367 | "\n", 368 | "### Learning\n", 369 | "\n", 370 | "Next we'll try linear SVM. Let's use the [`LinearSVC`](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html) class, as it is a specialized in linear SVMs. `C` is the penalty parameter." 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "autoscroll": false, 378 | "ein.hycell": false, 379 | "ein.tags": "worksheet-0", 380 | "slideshow": { 381 | "slide_type": "-" 382 | } 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "%%time\n", 387 | "\n", 388 | "C = 1.0\n", 389 | "clf_lsvm = svm.LinearSVC(C=C, multi_class='ovr')\n", 390 | "clf_lsvm.fit(X_train[:10000,:], y_train[:10000])" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "ein.tags": "worksheet-0", 397 | "slideshow": { 398 | "slide_type": "-" 399 | } 400 | }, 401 | "source": [ 402 | "The training of a Linear SVM is rather fast, so it seems more data could easily be used in the training.\n", 403 | "\n", 404 | "### Inference\n", 405 | "\n", 406 | "Again, prediction with linear functions is fast:" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "autoscroll": false, 414 | "collapsed": false, 415 | "ein.hycell": false, 416 | "ein.tags": "worksheet-0", 417 | "jupyter": { 418 | "outputs_hidden": false 419 | }, 420 | "slideshow": { 421 | "slide_type": "-" 422 | } 423 | }, 424 | "outputs": [], 425 | "source": [ 426 | "%%time\n", 427 | "\n", 428 | "pred_lsvm = clf_lsvm.predict(X_test)\n", 429 | "print('Predicted', len(pred_lsvm), 'digits with accuracy:', accuracy_score(y_test, pred_lsvm))" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "#### Confusion matrix\n", 437 | "\n", 438 | "We can compute the confusion matrix to see which digits get mixed the most, and look at classification accuracies separately for each class:" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": { 445 | "collapsed": false, 446 | "jupyter": { 447 | "outputs_hidden": false 448 | 
} 449 | }, 450 | "outputs": [], 451 | "source": [ 452 | "labels=[str(i) for i in range(10)]\n", 453 | "print('Confusion matrix (rows: true classes; columns: predicted classes):'); print()\n", 454 | "cm=confusion_matrix(y_test, pred_lsvm, labels=labels)\n", 455 | "print(cm); print()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "If we plot it as an image, we can see it more visually. The matrix looks quite good as most image are on the diagonal, meaning they were classified correctly." 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "plt.matshow(cm, cmap=plt.cm.gray)\n", 472 | "plt.xticks(range(10))\n", 473 | "plt.yticks(range(10))\n", 474 | "plt.grid(None)\n", 475 | "plt.show()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "#### Accuracy, precision and recall\n", 483 | "\n", 484 | "Classification accuracy for each class:" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "collapsed": false, 492 | "jupyter": { 493 | "outputs_hidden": false 494 | } 495 | }, 496 | "outputs": [], 497 | "source": [ 498 | "for i,j in enumerate(cm.diagonal()/cm.sum(axis=1)): print(\"%d: %.4f\" % (i,j))" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "Precision and recall for each class:" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": { 512 | "collapsed": false, 513 | "jupyter": { 514 | "outputs_hidden": false 515 | } 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "print(classification_report(y_test, pred_lsvm, labels=labels))" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "ein.tags": "worksheet-0", 526 | "slideshow": { 527 | "slide_type": "-" 528 | } 529 | }, 530 | "source": [ 531 | "## Model tuning" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": { 537 | "ein.tags": "worksheet-0", 538 | "slideshow": { 539 | "slide_type": "-" 540 | } 541 | }, 542 | "source": [ 543 | "Study the scikit-learn documentation of [LogisticRegression](http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression) and [LinearSVC](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html). Experiment with different hyperparameter values. You can also try [SGDClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html) which does pure SGD (one sample at a time).\n", 544 | "\n", 545 | "Can you improve on the accuracy or make training faster?\n", 546 | "\n", 547 | "Report the highest classification accuracy you manage to obtain. 
Also mark down the parameters you used, so others can try to reproduce your results.\n" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": { 554 | "autoscroll": false, 555 | "collapsed": false, 556 | "ein.hycell": false, 557 | "ein.tags": "worksheet-0", 558 | "jupyter": { 559 | "outputs_hidden": false 560 | }, 561 | "slideshow": { 562 | "slide_type": "-" 563 | } 564 | }, 565 | "outputs": [], 566 | "source": [] 567 | } 568 | ], 569 | "metadata": { 570 | "kernelspec": { 571 | "display_name": "Python 3 (ipykernel)", 572 | "language": "python", 573 | "name": "python3" 574 | }, 575 | "language_info": { 576 | "codemirror_mode": { 577 | "name": "ipython", 578 | "version": 3 579 | }, 580 | "file_extension": ".py", 581 | "mimetype": "text/x-python", 582 | "name": "python", 583 | "nbconvert_exporter": "python", 584 | "pygments_lexer": "ipython3", 585 | "version": "3.11.9" 586 | }, 587 | "name": "sklearn-mnist-lc.ipynb" 588 | }, 589 | "nbformat": 4, 590 | "nbformat_minor": 4 591 | } 592 | -------------------------------------------------------------------------------- /Exercise-03.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits classification with nearest neighbors \n", 8 | "\n", 9 | "In this notebook, we'll use [nearest-neighbor classifiers](http://scikit-learn.org/stable/modules/neighbors.html#nearest-neighbors-classification) to classify MNIST digits using scikit-learn (version 0.20 or later required).\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from pml_utils import get_mnist, show_failures\n", 23 | "\n", 24 | "import numpy as np\n", 25 | "from sklearn.model_selection import train_test_split\n", 26 | "from sklearn import neighbors, __version__\n", 27 | "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", 28 | "\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import seaborn as sns\n", 31 | "sns.set()\n", 32 | "\n", 33 | "from packaging.version import Version\n", 34 | "assert(Version(__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Then we load the MNIST data. First time we need to download the data, which can take a while." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 51 | "\n", 52 | "print('MNIST data loaded: train:',len(X_train),'test:',len(X_test))\n", 53 | "print('X_train:', X_train.shape)\n", 54 | "print('y_train:', y_train.shape)\n", 55 | "print('X_test', X_test.shape)\n", 56 | "print('y_test', y_test.shape)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "The training data (`X_train`) is a matrix of size (60000, 784), i.e. it consists of 60000 digits expressed as 784 sized vectors (28x28 images flattened to 1D). `y_train` is a 60000-dimensional vector containing the correct classes (\"0\", \"1\", ..., \"9\") for each training digit.\n", 64 | "\n", 65 | "Let's take a closer look. 
Here are the first 10 training digits:" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false, 73 | "jupyter": { 74 | "outputs_hidden": false 75 | } 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "pltsize=1\n", 80 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 81 | "\n", 82 | "for i in range(10):\n", 83 | " plt.subplot(1,10,i+1)\n", 84 | " plt.axis('off')\n", 85 | " plt.imshow(X_train[i,:].reshape(28, 28), cmap=\"gray\")\n", 86 | " plt.title('Class: '+y_train[i])" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## 1-NN classifier\n", 94 | "\n", 95 | "### Initialization\n", 96 | "\n", 97 | "Let's create first a 1-NN classifier. Note that with nearest-neighbor classifiers there is no internal (parameterized) model and therefore no learning required. Instead, calling the `fit()` function simply stores the samples of the training data in a suitable data structure." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false, 105 | "jupyter": { 106 | "outputs_hidden": false 107 | } 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "%%time\n", 112 | "\n", 113 | "n_neighbors = 1\n", 114 | "clf_nn = neighbors.KNeighborsClassifier(n_neighbors)\n", 115 | "clf_nn.fit(X_train, y_train)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### Inference\n", 123 | "\n", 124 | "And try to classify some test samples with it." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "collapsed": false, 132 | "jupyter": { 133 | "outputs_hidden": false 134 | } 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "%%time\n", 139 | "\n", 140 | "pred_nn = clf_nn.predict(X_test[:200,:])" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "We observe that the classifier is rather slow, and classifying the whole test set would take quite some time. What is the reason for this?\n", 148 | "\n", 149 | "The accuracy of the classifier:" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false, 157 | "jupyter": { 158 | "outputs_hidden": false 159 | } 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "print('Predicted', len(pred_nn), 'digits with accuracy:',\n", 164 | " accuracy_score(y_test[:len(pred_nn)], pred_nn))" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Faster 1-NN classifier\n", 172 | "\n", 173 | "### Initialization\n", 174 | "\n", 175 | "One way to make our 1-NN classifier faster is to use less training data:" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": false, 183 | "jupyter": { 184 | "outputs_hidden": false 185 | } 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "%%time\n", 190 | "\n", 191 | "n_neighbors = 1\n", 192 | "n_data = 1024\n", 193 | "clf_nn_fast = neighbors.KNeighborsClassifier(n_neighbors)\n", 194 | "clf_nn_fast.fit(X_train[:n_data,:], y_train[:n_data])" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "### Inference\n", 202 | "\n", 203 | "Now we can use the classifier created with reduced data to classify our whole test set in a reasonable amount of time." 
204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": false, 211 | "jupyter": { 212 | "outputs_hidden": false 213 | } 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "%%time\n", 218 | "\n", 219 | "pred_nn_fast = clf_nn_fast.predict(X_test)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "The classification accuracy is however now not as good:" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": false, 234 | "jupyter": { 235 | "outputs_hidden": false 236 | } 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "print('Predicted', len(pred_nn_fast), 'digits with accuracy:',\n", 241 | " accuracy_score(y_test, pred_nn_fast))" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "#### Confusion matrix\n", 249 | "\n", 250 | "We can compute the confusion matrix to see which digits get mixed the most:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "collapsed": false, 258 | "jupyter": { 259 | "outputs_hidden": false 260 | } 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "labels=[str(i) for i in range(10)]\n", 265 | "print('Confusion matrix (rows: true classes; columns: predicted classes):'); print()\n", 266 | "cm=confusion_matrix(y_test, pred_nn_fast, labels=labels)\n", 267 | "print(cm); print()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "Plotted as an image:" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "plt.matshow(cm, cmap=plt.cm.gray)\n", 284 | "plt.xticks(range(10))\n", 285 | "plt.yticks(range(10))\n", 286 | "plt.grid(None)\n", 287 | "plt.show()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "#### Accuracy, precision and recall\n", 295 | "\n", 296 | "Classification accuracy for each class:" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "collapsed": false, 304 | "jupyter": { 305 | "outputs_hidden": false 306 | } 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "for i,j in enumerate(cm.diagonal()/cm.sum(axis=1)): print(\"%d: %.4f\" % (i,j))" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "Precision and recall for each class:" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "collapsed": false, 325 | "jupyter": { 326 | "outputs_hidden": false 327 | } 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "print(classification_report(y_test, pred_nn_fast, labels=labels))" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "#### Failure analysis\n", 339 | "\n", 340 | "We can also inspect the results in more detail. 
Let's use the `show_failures()` helper function (defined in `pml_utils.py`) to show the wrongly classified test digits.\n", 341 | "\n", 342 | "The helper function is defined as:\n", 343 | "\n", 344 | "```\n", 345 | "show_failures(predictions, y_test, X_test, trueclass=None, predictedclass=None, maxtoshow=10)\n", 346 | "```\n", 347 | "\n", 348 | "where:\n", 349 | "- `predictions` is a vector with the predicted classes for each test set image\n", 350 | "- `y_test` the _correct_ classes for the test set images\n", 351 | "- `X_test` the test set images\n", 352 | "- `trueclass` can be set to show only images for a given correct (true) class\n", 353 | "- `predictedclass` can be set to show only images which were predicted as a given class\n", 354 | "- `maxtoshow` specifies how many items to show\n" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": { 361 | "collapsed": false, 362 | "jupyter": { 363 | "outputs_hidden": false 364 | } 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "show_failures(pred_nn_fast, y_test, X_test)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "We can use `show_failures()` to inspect failures in more detail. For example:\n", 376 | "\n", 377 | "* show failures in which the true class was \"5\":" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "collapsed": false, 385 | "jupyter": { 386 | "outputs_hidden": false 387 | } 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "show_failures(pred_nn_fast, y_test, X_test, trueclass='5')" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "* show failures in which the prediction was \"0\":" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": { 405 | "collapsed": false, 406 | "jupyter": { 407 | "outputs_hidden": false 408 | } 409 | }, 410 | "outputs": [], 411 | "source": [ 412 | "show_failures(pred_nn_fast, y_test, X_test, predictedclass='0')" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "* show failures in which the true class was \"0\" and the prediction was \"2\":" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "collapsed": false, 427 | "jupyter": { 428 | "outputs_hidden": false 429 | } 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "show_failures(pred_nn_fast, y_test, X_test, trueclass='0', predictedclass='2')" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "We can observe that the classifier makes rather \"easy\" mistakes, and there might thus be room for improvement." 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## Model tuning\n", 448 | "\n", 449 | "Try to improve the accuracy of the nearest-neighbor classifier while preserving a reasonable runtime to classify the whole test set. Things to try include using more than one neighbor (with or without weights) or increasing the amount of training data. See the documentation for [KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn-neighbors-kneighborsclassifier).\n", 450 | "\n", 451 | "See also http://scikit-learn.org/stable/modules/neighbors.html#nearest-neighbors-classification for more information." 
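,
    "\n",
    "One possible starting point is sketched below -- the values of `n_neighbors`, `weights` and the amount of training data are just examples, not a reference solution:\n",
    "\n",
    "```\n",
    "clf_knn = neighbors.KNeighborsClassifier(n_neighbors=5, weights='distance', n_jobs=-1)\n",
    "clf_knn.fit(X_train[:10000,:], y_train[:10000])\n",
    "pred_knn = clf_knn.predict(X_test)\n",
    "print('Accuracy:', accuracy_score(y_test, pred_knn))\n",
    "```\n"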
452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [] 460 | } 461 | ], 462 | "metadata": { 463 | "kernelspec": { 464 | "display_name": "Python 3 (ipykernel)", 465 | "language": "python", 466 | "name": "python3" 467 | }, 468 | "language_info": { 469 | "codemirror_mode": { 470 | "name": "ipython", 471 | "version": 3 472 | }, 473 | "file_extension": ".py", 474 | "mimetype": "text/x-python", 475 | "name": "python", 476 | "nbconvert_exporter": "python", 477 | "pygments_lexer": "ipython3", 478 | "version": "3.11.9" 479 | } 480 | }, 481 | "nbformat": 4, 482 | "nbformat_minor": 4 483 | } 484 | -------------------------------------------------------------------------------- /Exercise-04.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# California housing dataset with linear and polynomial regression \n", 8 | "\n", 9 | "In this notebook, we'll use [linear regression](https://scikit-learn.org/stable/modules/linear_model.html#ordinary-least-squares), [regularized linear regression](https://scikit-learn.org/stable/modules/linear_model.html#ridge-regression), and [polynomial regression](https://scikit-learn.org/stable/modules/linear_model.html#polynomial-regression-extending-linear-models-with-basis-functions) to estimate median house values on Californian housing districts.\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import numpy as np\n", 23 | "from sklearn import datasets, __version__\n", 24 | "from sklearn.model_selection import train_test_split\n", 25 | "from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet\n", 26 | "from sklearn.metrics import mean_squared_error\n", 27 | "from sklearn.preprocessing import StandardScaler, PolynomialFeatures\n", 28 | "from sklearn.pipeline import Pipeline\n", 29 | "import pandas as pd\n", 30 | "\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "import seaborn as sns\n", 33 | "sns.set()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## Data\n", 41 | "\n", 42 | "Then we load the California housing data. First time we need to download the data, which can take a while." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "chd = datasets.fetch_california_housing()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "Let's first convert the data into a Pandas DataFrame to inspect some basic statistics:" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": false, 66 | "jupyter": { 67 | "outputs_hidden": false 68 | } 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "df = pd.DataFrame(data=chd.data, columns=chd.feature_names)\n", 73 | "df['Target'] = pd.Series(chd.target, index=df.index)\n", 74 | "df.describe()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "We see that the data consists of 20640 housing districts, each characterized with 8 attributes: *MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude*. 
We also defined a target value (median house value) for each housing district.\n", 82 | " \n", 83 | "Let's then plot all attributes against the target value:" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "plt.figure(figsize=(15,10))\n", 93 | "for i in range(8):\n", 94 | " plt.subplot(4,2,i+1)\n", 95 | " plt.scatter(chd.data[:,i], chd.target, s=2, label=chd.feature_names[i])\n", 96 | " plt.legend(loc='best')" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "We'll now split the data into a training and a test set. Let's use 5000 samples as test data.\n", 104 | "\n", 105 | "Let's also select a single attribute to start the analysis with, for example *MedInc*. This way we can plot the regression functions against the target value. Later we will use all attributes in the regression." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "test_size = 5000\n", 115 | "single_attribute = 'MedInc'\n", 116 | "\n", 117 | "X_train_all, X_test_all, y_train, y_test = train_test_split(\n", 118 | " chd.data, chd.target, test_size=test_size, shuffle=True)\n", 119 | "\n", 120 | "attribute_index = chd.feature_names.index(single_attribute)\n", 121 | "X_train_single = X_train_all[:, attribute_index].reshape(-1, 1)\n", 122 | "X_test_single = X_test_all[:, attribute_index].reshape(-1, 1)\n", 123 | " \n", 124 | "print()\n", 125 | "print('California housing data: train:',len(X_train_all),'test:',len(X_test_all))\n", 126 | "print()\n", 127 | "print('X_train_all:', X_train_all.shape)\n", 128 | "print('X_train_single:', X_train_single.shape)\n", 129 | "print('y_train:', y_train.shape)\n", 130 | "print()\n", 131 | "print('X_test_all', X_test_all.shape)\n", 132 | "print('X_test_single', X_test_single.shape)\n", 133 | "print('y_test', y_test.shape)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "The training data matrix `X_train_all` is a matrix of size (`n_train`, 8), and `X_train_single` contains only the first attribute (*MedInc* by default) of each housing district. The vector `y_train` contains the target value (median house value) for each housing district in the training set.\n", 141 | "\n", 142 | "Let's start our analysis with the single attribute. Later, you can set `only_single_attribute = False` to use all eight attributes in the regression. \n", 143 | "\n", 144 | "As the final step, let's scale our input data to zero mean and unit variance." 
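,
    "\n",
    "(Concretely, each attribute $x$ is transformed as $z = (x - \\mu) / \\sigma$, where the mean $\\mu$ and standard deviation $\\sigma$ are computed from the training set only and then applied to both the training and the test set.)\n"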
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "only_single_attribute = True\n", 154 | "\n", 155 | "if only_single_attribute:\n", 156 | " X_train = X_train_single\n", 157 | " X_test = X_test_single\n", 158 | "else:\n", 159 | " X_train = X_train_all\n", 160 | " X_test = X_test_all\n", 161 | "\n", 162 | "scaler = StandardScaler().fit(X_train)\n", 163 | "X_train = scaler.transform(X_train)\n", 164 | "X_test = scaler.transform(X_test)\n", 165 | "print('X_train: shape:', X_train.shape, 'mean:', X_train.mean(axis=0), 'std:', X_train.std(axis=0))\n", 166 | "print('X_test: shape:', X_test.shape, 'mean:', X_test.mean(axis=0), 'std:', X_test.std(axis=0))" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## Linear regression\n", 174 | "\n", 175 | "We begin with linear regression:\n", 176 | "\n", 177 | "$$J(w) = \\|y - Xw\\|^2_2$$\n", 178 | "\n", 179 | "### Learning\n", 180 | "\n", 181 | "The parameters of linear regression can be solved in closed form as:\n", 182 | "\n", 183 | "$$\\hat{w} = (X^TX)^{-1}X^Ty$$" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false, 191 | "jupyter": { 192 | "outputs_hidden": false 193 | } 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "%%time\n", 198 | "\n", 199 | "lin_reg = LinearRegression()\n", 200 | "lin_reg.fit(X_train, y_train)\n", 201 | "print('coefficients:', lin_reg.coef_)\n", 202 | "print('intercept:', lin_reg.intercept_)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "We can visualize the results if we are using only a single attribute:" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "if X_train.shape[1] == 1:\n", 219 | " plt.figure(figsize=(10, 10))\n", 220 | " plt.scatter(X_train, y_train, s=5)\n", 221 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 222 | " plt.scatter(reg_x, lin_reg.predict(reg_x), s=8, label='linear')\n", 223 | " plt.legend(loc='best');" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Inference\n", 231 | "\n", 232 | "We use *mean squared error* as the performance measure for our regression algorihm: " 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false, 240 | "jupyter": { 241 | "outputs_hidden": false 242 | } 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "%%time\n", 247 | "\n", 248 | "predictions = lin_reg.predict(X_test)\n", 249 | "print(\"Mean squared error: %.3f\"\n", 250 | " % mean_squared_error(y_test, predictions))" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "## Regularized linear regression: Ridge\n", 258 | "\n", 259 | "Ridge regression adds $L_2$ regularization: \n", 260 | "\n", 261 | "$$J(w) = \\|y - Xw\\|^2_2 + \\alpha \\|w\\|^2_2$$\n", 262 | "\n", 263 | "where $\\alpha \\ge 0$ is the penalty parameter for the weights. 
You can experiment with different values of $\\alpha$.\n", 264 | "\n", 265 | "You can also try out [`Lasso()`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn-linear-model-lasso) or [`ElasticNet()`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn-linear-model-elasticnet). Note that Elastic net has also a second parameter `l1_ratio`. \n", 266 | "\n", 267 | "### Learning" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "%%time\n", 277 | "\n", 278 | "alpha = 1e3\n", 279 | "\n", 280 | "rdg_reg = Ridge(alpha=alpha)\n", 281 | "rdg_reg.fit(X_train, y_train)\n", 282 | "print('coefficients:', rdg_reg.coef_)\n", 283 | "print('intercept:', rdg_reg.intercept_)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "if X_train.shape[1] == 1:\n", 293 | " plt.figure(figsize=(10, 10))\n", 294 | " plt.scatter(X_train, y_train, s=5)\n", 295 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 296 | " plt.scatter(reg_x, lin_reg.predict(reg_x), s=8, label='linear');\n", 297 | " plt.scatter(reg_x, rdg_reg.predict(reg_x), s=8, label=r'ridge, $\\alpha=${}'.format(alpha))\n", 298 | " plt.legend(loc='best');" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "### Inference" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "%%time\n", 315 | "\n", 316 | "predictions = rdg_reg.predict(X_test)\n", 317 | "print(\"Mean squared error: %.3f\"\n", 318 | " % mean_squared_error(y_test, predictions))" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "## Polynomial regression\n", 326 | "\n", 327 | "Polynomial regression can be performed by constructing polynomial features, e.g.:\n", 328 | "\n", 329 | "$$z=[1,\\,x_1,\\,x_2,\\,x_1x_2,\\,x_1^2,\\,x_2^2]$$\n", 330 | "\n", 331 | "and using a linear model with the new features:\n", 332 | "\n", 333 | "$$J(w) = \\|z - X'w\\|^2_2$$\n", 334 | "\n", 335 | "### Learning\n", 336 | "\n", 337 | "To implement polynomial regression, we use scikit-learn's [Pipeline](https://scikit-learn.org/stable/modules/compose.html#pipeline), a tool for building composite estimators. \n", 338 | "\n", 339 | "Note that the polynomial features contain all possible combinations, so the number of features grows quickly especially when using many attributes. Also, you can try using regularized linear regression with polynomial features." 
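,
    "\n",
    "To get a feeling for how quickly the feature count grows: with all 8 attributes and degree 5, `PolynomialFeatures` already produces 1287 features (all monomials up to total degree 5, including the constant term). Below is a quick check of that number, plus a sketch of how regularization could be combined with the polynomial features (example values only):\n",
    "\n",
    "```\n",
    "print(PolynomialFeatures(degree=5).fit_transform(np.zeros((1, 8))).shape[1])  # -> 1287\n",
    "\n",
    "poly_ridge = Pipeline([('poly', PolynomialFeatures(degree=5)),\n",
    "                       ('ridge', Ridge(alpha=1e3))])\n",
    "```\n"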
340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": false, 347 | "jupyter": { 348 | "outputs_hidden": false 349 | } 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "%%time\n", 354 | "degree = 5\n", 355 | "\n", 356 | "poly_model = Pipeline([('poly', PolynomialFeatures(degree=degree)),\n", 357 | " ('linear', LinearRegression(fit_intercept=False))])\n", 358 | "poly_model.fit(X_train, y_train)\n", 359 | "print('coefficients:', poly_model.steps[1][1].coef_)\n", 360 | "print('intercept:', poly_model.steps[1][1].intercept_)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "if X_train.shape[1] == 1:\n", 370 | " plt.figure(figsize=(10, 10))\n", 371 | " plt.scatter(X_train, y_train, s=5)\n", 372 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 373 | " plt.scatter(reg_x, lin_reg.predict(reg_x), s=8, label='linear');\n", 374 | " plt.scatter(reg_x, poly_model.predict(reg_x), s=8, label='polynomial')\n", 375 | " plt.legend(loc='best');" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "### Inference" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": false, 390 | "jupyter": { 391 | "outputs_hidden": false 392 | } 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "%%time\n", 397 | "\n", 398 | "predictions = poly_model.predict(X_test)\n", 399 | "print(\"Mean squared error: %.3f\"\n", 400 | " % mean_squared_error(y_test, predictions))" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "## Model tuning\n", 408 | "\n", 409 | "Try to reduce the mean squared error of the regression. Experiment with several single attributes and with using all attributes.\n", 410 | "\n", 411 | "To further improve the results, it is possible to replace [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html), that is scaling the input data to zero mean and unit variance, with more advanced preprocessing.\n", 412 | "See [Preprocessing data](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-data) for more information." 
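,
    "\n",
    "One possible direction is sketched below -- an example only, not a reference solution. It uses all eight attributes and swaps `StandardScaler` for a rank-based `QuantileTransformer`, which is less sensitive to the outliers visible in some of the attribute scatter plots:\n",
    "\n",
    "```\n",
    "from sklearn.preprocessing import QuantileTransformer\n",
    "\n",
    "qt = QuantileTransformer(output_distribution='normal')\n",
    "X_train_q = qt.fit_transform(X_train_all)\n",
    "X_test_q = qt.transform(X_test_all)\n",
    "\n",
    "rdg_all = Ridge(alpha=1.0).fit(X_train_q, y_train)\n",
    "print('MSE:', mean_squared_error(y_test, rdg_all.predict(X_test_q)))\n",
    "```\n"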
413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [] 421 | } 422 | ], 423 | "metadata": { 424 | "kernelspec": { 425 | "display_name": "Python 3 (ipykernel)", 426 | "language": "python", 427 | "name": "python3" 428 | }, 429 | "language_info": { 430 | "codemirror_mode": { 431 | "name": "ipython", 432 | "version": 3 433 | }, 434 | "file_extension": ".py", 435 | "mimetype": "text/x-python", 436 | "name": "python", 437 | "nbconvert_exporter": "python", 438 | "pygments_lexer": "ipython3", 439 | "version": "3.11.9" 440 | } 441 | }, 442 | "nbformat": 4, 443 | "nbformat_minor": 4 444 | } 445 | -------------------------------------------------------------------------------- /Exercise-05.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits classification with support vector machines \n", 8 | "\n", 9 | "In this notebook, we'll use [support vector machines (SVMs)](http://scikit-learn.org/stable/modules/svm.html#svm-classification) and related algorithms to classify MNIST digits using scikit-learn.\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from pml_utils import get_mnist, show_failures\n", 23 | "\n", 24 | "import sklearn\n", 25 | "from sklearn import svm\n", 26 | "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", 27 | "\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import seaborn as sns\n", 30 | "sns.set()\n", 31 | "\n", 32 | "from packaging.version import Version\n", 33 | "assert(Version(sklearn.__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "Then we load the MNIST data. First time it downloads the data, which can take a while." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": false, 48 | "jupyter": { 49 | "outputs_hidden": false 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 55 | "\n", 56 | "print('MNIST data loaded: train:',len(X_train),'test:',len(X_test))\n", 57 | "print('X_train:', X_train.shape)\n", 58 | "print('y_train:', y_train.shape)\n", 59 | "print('X_test', X_test.shape)\n", 60 | "print('y_test', y_test.shape)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Linear SVM \n", 68 | "\n", 69 | "### Learning\n", 70 | "\n", 71 | "Our first classifier is a linear SVM trained with a subset of training data. Let's use the [`LinearSVC`](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC) class, as it is a specialized in linear SVMs. `C` is the penalty parameter. (The general `SVC` has a similar `kernel=’linear’` option that can also be used. 
The third option is to use `SGDClassifier`.)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": false, 79 | "jupyter": { 80 | "outputs_hidden": false 81 | } 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "%%time\n", 86 | "\n", 87 | "C = 1.0\n", 88 | "clf_lsvm = svm.LinearSVC(C=C)\n", 89 | "print(clf_lsvm.fit(X_train[:10000,:], y_train[:10000]))" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "The training of a Linear SVM is rather fast, so it seems more data could easily be used in the training.\n", 97 | "\n", 98 | "Note also that the default multiclass strategy of `LinearSVM` is one-vs-rest.\n", 99 | "\n", 100 | "### Inference\n", 101 | "\n", 102 | "As the decision boundaries are linear, prediction with linear SVMs is fast:" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": false, 110 | "jupyter": { 111 | "outputs_hidden": false 112 | } 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "pred_lsvm = clf_lsvm.predict(X_test)\n", 117 | "print('Predicted', len(pred_lsvm), 'digits with accuracy:', accuracy_score(y_test, pred_lsvm))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## Kernel SVM\n", 125 | "\n", 126 | "In addition to linear classification, SVMs can be used for non-linear classification by implicitly mapping the input features into high-dimensional feature spaces. This is sometimes called the *kernel trick*, as the implicit mapping is often computationally cheaper than explicitly operating in the high-dimensional space.\n", 127 | "\n", 128 | "### Learning\n", 129 | "\n", 130 | "Let's train a *3rd degree polynomial kernel SVM* with the *one-vs-rest* strategy for multiclass classification. A Gaussian kernel, that is `kernel='rbf'` is another common choice." 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false, 138 | "jupyter": { 139 | "outputs_hidden": false 140 | }, 141 | "scrolled": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "%%time\n", 146 | "\n", 147 | "clf_ksvm = svm.SVC(decision_function_shape='ovr', kernel='poly', degree=3)\n", 148 | "print(clf_ksvm.fit(X_train[:10000,:], y_train[:10000]))" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Inference\n", 156 | "\n", 157 | "Despite the kernel trick, prediction of new samples is noticeably slower than with the linear SVM. The classification accuracy, on the other hand, is improved. 
" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": false, 165 | "jupyter": { 166 | "outputs_hidden": false 167 | } 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "%%time\n", 172 | "\n", 173 | "pred_ksvm = clf_ksvm.predict(X_test)\n", 174 | "print('Predicted', len(pred_ksvm), 'digits with accuracy:', accuracy_score(y_test, pred_ksvm))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "#### Confusion matrix\n", 182 | "\n", 183 | "We can compute the confusion matrix to see which digits get mixed the most:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false, 191 | "jupyter": { 192 | "outputs_hidden": false 193 | } 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "labels=[str(i) for i in range(10)]\n", 198 | "print('Confusion matrix (rows: true classes; columns: predicted classes):'); print()\n", 199 | "cm=confusion_matrix(y_test, pred_ksvm, labels=labels)\n", 200 | "print(cm); print()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "If we plot it as an image, we can see it more visually. The matrix looks quite good as most image are on the diagonal, meaning they were classified correctly." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "plt.matshow(cm, cmap=plt.cm.gray)\n", 217 | "plt.show()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "#### Accuracy, precision and recall\n", 225 | "\n", 226 | "Classification accuracy for each class:" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": false, 234 | "jupyter": { 235 | "outputs_hidden": false 236 | } 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "for i,j in enumerate(cm.diagonal()/cm.sum(axis=1)): print(\"%d: %.4f\" % (i,j))" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "Precision and recall for each class:" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false, 255 | "jupyter": { 256 | "outputs_hidden": false 257 | } 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "print(classification_report(y_test, pred_ksvm, labels=labels))" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "ein.tags": "worksheet-0", 268 | "slideshow": { 269 | "slide_type": "-" 270 | } 271 | }, 272 | "source": [ 273 | "#### Failure analysis\n", 274 | "\n", 275 | "We can also do some failure analysis. Let's check the 10 first wrongly predicted digits." 
276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "autoscroll": false, 283 | "collapsed": false, 284 | "ein.hycell": false, 285 | "ein.tags": "worksheet-0", 286 | "jupyter": { 287 | "outputs_hidden": false 288 | }, 289 | "slideshow": { 290 | "slide_type": "-" 291 | } 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "show_failures(pred_ksvm, y_test, X_test)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "## Model tuning" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Study the scikit-learn documentation of the linear and kernel [SVMs](http://scikit-learn.org/stable/modules/svm.html#svm) and the available SVM classes ([`SVC`](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC), [`NuSVC`](http://scikit-learn.org/stable/modules/generated/sklearn.svm.NuSVC.html#sklearn.svm.NuSVC) and [`LinearSVC`](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC). Experiment with different hyperparameter values.\n", 310 | "\n", 311 | "Report the highest classification accuracy you manage to obtain. Also mark down the parameters you used, so others can try to reproduce your results.\n" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [] 320 | } 321 | ], 322 | "metadata": { 323 | "kernelspec": { 324 | "display_name": "Python 3 (ipykernel)", 325 | "language": "python", 326 | "name": "python3" 327 | }, 328 | "language_info": { 329 | "codemirror_mode": { 330 | "name": "ipython", 331 | "version": 3 332 | }, 333 | "file_extension": ".py", 334 | "mimetype": "text/x-python", 335 | "name": "python", 336 | "nbconvert_exporter": "python", 337 | "pygments_lexer": "ipython3", 338 | "version": "3.11.9" 339 | } 340 | }, 341 | "nbformat": 4, 342 | "nbformat_minor": 4 343 | } 344 | -------------------------------------------------------------------------------- /Exercise-06.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# California housing dataset regression with support vector machines\n", 8 | "\n", 9 | "In this notebook, we'll use linear and non-linear [support vector machines (SVMs)](https://scikit-learn.org/stable/modules/svm.html#regression) to estimate median house values on Californian housing districts.\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import numpy as np\n", 23 | "from sklearn import svm, datasets, __version__\n", 24 | "from sklearn.model_selection import train_test_split\n", 25 | "from sklearn.metrics import mean_squared_error\n", 26 | "from sklearn.preprocessing import StandardScaler\n", 27 | "\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import seaborn as sns\n", 30 | "sns.set()" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Data\n", 38 | "\n", 39 | "Then we load the California housing data. First time we need to download the data, which can take a while." 
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "chd = datasets.fetch_california_housing()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "We'll split the data into a training and a test set.\n", 56 | "\n", 57 | "Let's also select a single attribute to start the analysis with, say *MedInc*." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "test_size = 5000\n", 67 | "single_attribute = 'MedInc'\n", 68 | "\n", 69 | "X_train_all, X_test_all, y_train, y_test = train_test_split(\n", 70 | " chd.data, chd.target, test_size=test_size, shuffle=True)\n", 71 | "\n", 72 | "attribute_index = chd.feature_names.index(single_attribute)\n", 73 | "X_train_single = X_train_all[:, attribute_index].reshape(-1, 1)\n", 74 | "X_test_single = X_test_all[:, attribute_index].reshape(-1, 1)\n", 75 | " \n", 76 | "print()\n", 77 | "print('California housing data: train:',len(X_train_all),'test:',len(X_test_all))\n", 78 | "print()\n", 79 | "print('X_train_all:', X_train_all.shape)\n", 80 | "print('X_train_single:', X_train_single.shape)\n", 81 | "print('y_train:', y_train.shape)\n", 82 | "print()\n", 83 | "print('X_test_all', X_test_all.shape)\n", 84 | "print('X_test_single', X_test_single.shape)\n", 85 | "print('y_test', y_test.shape)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "The training data matrix `X_train_all` is a matrix of size (`n_train`, 8), and `X_train_single` contains only the first attribute (*MedInc* by default). The vector `y_train` contains the target value (median house value) for each housing district in the training set.\n", 93 | "\n", 94 | "Let's start our analysis with the single attribute. Later, you can set `only_single_attribute = False` to use all eight attributes in the regression.\n", 95 | "\n", 96 | "As the final step, let's scale the input data to zero mean and unit variance: " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "only_single_attribute = True\n", 106 | "\n", 107 | "if only_single_attribute:\n", 108 | " X_train = X_train_single\n", 109 | " X_test = X_test_single\n", 110 | "else:\n", 111 | " X_train = X_train_all\n", 112 | " X_test = X_test_all\n", 113 | "\n", 114 | "scaler = StandardScaler().fit(X_train)\n", 115 | "X_train = scaler.transform(X_train)\n", 116 | "X_test = scaler.transform(X_test)\n", 117 | "print('X_train: shape:', X_train.shape, 'mean:', X_train.mean(axis=0), 'std:', X_train.std(axis=0))\n", 118 | "print('X_test: shape:', X_test.shape, 'mean:', X_test.mean(axis=0), 'std:', X_test.std(axis=0))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## Linear SVM\n", 126 | "\n", 127 | "We begin with SVM using a linear kernel.\n", 128 | "\n", 129 | "### Learning\n", 130 | "\n", 131 | "Let's use [`LinearSVR`](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html#sklearn.svm.LinearSVR), as it is a specialized in linear SVMs. `C` is the penalty parameter. 
(The general `SVR` has a similar `kernel='linear'` option that can also be used.)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false, 139 | "jupyter": { 140 | "outputs_hidden": false 141 | } 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "%%time\n", 146 | "\n", 147 | "C = 1.0\n", 148 | "lin_reg = svm.LinearSVR(C=C)\n", 149 | "lin_reg.fit(X_train, y_train)\n", 150 | "print('coefficients:', lin_reg.coef_)\n", 151 | "print('intercept:', lin_reg.intercept_)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "We can visualize the results if we are using only a single attribute:" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "if X_train.shape[1] == 1:\n", 168 | " plt.figure(figsize=(10, 10))\n", 169 | " plt.scatter(X_train, y_train, s=5)\n", 170 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 171 | " plt.scatter(reg_x, lin_reg.predict(reg_x), s=8, label='linear SVR')\n", 172 | " plt.legend(loc='best');" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Inference\n", 180 | "\n", 181 | "We use *mean squared error* as the performance measure for our regression algorithm: " 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": false, 189 | "jupyter": { 190 | "outputs_hidden": false 191 | } 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "%%time\n", 196 | "\n", 197 | "predictions = lin_reg.predict(X_test)\n", 198 | "print(\"Mean squared error: %.3f\"\n", 199 | " % mean_squared_error(y_test, predictions))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Non-linear (or kernel) SVM\n", 207 | "\n", 208 | "In addition to using a linear kernel, SVMs can be used for non-linear regression by implicitly mapping the input features into high-dimensional feature spaces. This is sometimes called the *kernel trick*, as the implicit mapping is often computationally cheaper than explicitly operating in the high-dimensional space.\n", 209 | "\n", 210 | "### Learning\n", 211 | "\n", 212 | "Let's start with a Gaussian kernel or `kernel='rbf'`. \n", 213 | "\n", 214 | "A polynomial kernel, that is `kernel='poly'`, is another common choice. The degree of the polynomial is set using the `degree` parameter.\n", 215 | "\n", 216 | "Note that non-linear SVMs can be relatively slow to train, so it might be a good idea to start with a subset of the training data."
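[Optional sketch, not part of the original notebook] If the full training set feels too slow for the RBF kernel, one hedged option is to fit on a random subset first; the subset size of 5000 below is only an illustrative value.

```python
import numpy as np
from sklearn import svm
from sklearn.metrics import mean_squared_error

# Fit an RBF-kernel SVR on a random subset to keep the runtime manageable
# while exploring kernels and hyperparameters.
n_subset = 5000  # illustrative size
rng = np.random.default_rng(42)
idx = rng.choice(len(X_train), size=min(n_subset, len(X_train)), replace=False)

svm_subset = svm.SVR(kernel='rbf', C=1.0, gamma='auto')
svm_subset.fit(X_train[idx], y_train[idx])
print("Subset-trained SVR, test set MSE: %.3f"
      % mean_squared_error(y_test, svm_subset.predict(X_test)))
```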
217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "%%time\n", 226 | "\n", 227 | "kernel = 'rbf'\n", 228 | "C = 1.0\n", 229 | "svm_reg = svm.SVR(kernel=kernel, C=C, gamma='auto')\n", 230 | "svm_reg.fit(X_train, y_train)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "if X_train.shape[1] == 1:\n", 240 | " plt.figure(figsize=(10, 10))\n", 241 | " plt.scatter(X_train, y_train, s=5)\n", 242 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 243 | " plt.scatter(reg_x, lin_reg.predict(reg_x), s=8, label='linear SVR')\n", 244 | " plt.scatter(reg_x, svm_reg.predict(reg_x), s=8, label='non-linear ({}) SVR'.format(kernel))\n", 245 | " plt.legend(loc='best');" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "### Inference" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "%%time\n", 262 | "\n", 263 | "predictions = svm_reg.predict(X_test)\n", 264 | "print(\"Mean squared error: %.3f\"\n", 265 | " % mean_squared_error(y_test, predictions))" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "## Model tuning\n", 273 | "\n", 274 | "Try to reduce the mean squared error of the regression. Experiment with several single attributes and with using all attributes. See the documentation of [LinearSVR](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html#sklearn.svm.LinearSVR) and [SVR](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR) for further options.\n", 275 | "\n", 276 | "To further improve the results, it is possible to replace [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html), that is scaling the input data to zero mean and unit variance, with more advanced preprocessing.\n", 277 | "See [Preprocessing data](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-data) for more information." 
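[Optional sketch, not part of the original exercise] One possible way to combine preprocessing and hyperparameter search is a scikit-learn `Pipeline` wrapped in `GridSearchCV`. The transformer and the parameter grid below are only examples; note that `X_train` has already been standardized above, so you may prefer to rerun the scaling cell without `StandardScaler` before trying alternatives.

```python
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import GridSearchCV
from sklearn import svm

# Preprocessing + SVR in one pipeline, tuned with a small illustrative grid.
pipe = Pipeline([
    ('transform', QuantileTransformer(output_distribution='normal')),
    ('svr', svm.SVR(kernel='rbf', gamma='auto')),
])
param_grid = {'svr__C': [0.1, 1.0, 10.0]}

# Kernel SVR is slow on the full data, so search on a subset first.
search = GridSearchCV(pipe, param_grid, cv=3,
                      scoring='neg_mean_squared_error', n_jobs=-1)
search.fit(X_train[:5000], y_train[:5000])
print('best parameters:', search.best_params_)
print('best CV MSE: %.3f' % -search.best_score_)
```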
278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [] 286 | } 287 | ], 288 | "metadata": { 289 | "kernelspec": { 290 | "display_name": "Python 3 (ipykernel)", 291 | "language": "python", 292 | "name": "python3" 293 | }, 294 | "language_info": { 295 | "codemirror_mode": { 296 | "name": "ipython", 297 | "version": 3 298 | }, 299 | "file_extension": ".py", 300 | "mimetype": "text/x-python", 301 | "name": "python", 302 | "nbconvert_exporter": "python", 303 | "pygments_lexer": "ipython3", 304 | "version": "3.11.9" 305 | } 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 4 309 | } 310 | -------------------------------------------------------------------------------- /Exercise-07.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits classification with decision trees \n", 8 | "\n", 9 | "In this notebook, we'll use [decision trees](http://scikit-learn.org/stable/modules/tree.html) and [ensembles of trees](http://scikit-learn.org/stable/modules/ensemble.html) to classify MNIST digits using scikit-learn and [XGBoost](https://xgboost.readthedocs.io/en/latest/).\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from pml_utils import get_mnist, show_failures\n", 23 | "\n", 24 | "import numpy as np\n", 25 | "from sklearn import __version__\n", 26 | "from sklearn.tree import DecisionTreeClassifier, export_graphviz\n", 27 | "from sklearn.ensemble import RandomForestClassifier\n", 28 | "from xgboost import XGBClassifier\n", 29 | "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", 30 | "\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "import seaborn as sns\n", 33 | "import graphviz\n", 34 | "sns.set()\n", 35 | "\n", 36 | "from packaging.version import Version\n", 37 | "assert(Version(__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Then we load the MNIST data. First time we need to download the data, which can take a while." 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": false, 52 | "jupyter": { 53 | "outputs_hidden": false 54 | } 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 59 | "\n", 60 | "print('MNIST data loaded: train:',len(X_train),'test:',len(X_test))\n", 61 | "print('X_train:', X_train.shape)\n", 62 | "print('y_train:', y_train.shape)\n", 63 | "print('X_test', X_test.shape)\n", 64 | "print('y_test', y_test.shape)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Decision tree\n", 72 | "\n", 73 | "Decision tree is a model that predicts the value of a target variable by learning simple *if-then-else* decision rules inferred from the data features.\n", 74 | "\n", 75 | "### Learning\n", 76 | "\n", 77 | "Let's start by training a decision tree with default parameter values for classifying MNIST digits." 
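[Optional note, not in the original notebook] After the training cell below has run, you can query how large the learned tree actually became; `get_depth()` and `get_n_leaves()` are available in scikit-learn 0.21 and later.

```python
# Inspect the size of the fitted tree (run after clf_dt has been trained below).
print('tree depth:', clf_dt.get_depth())
print('number of leaves:', clf_dt.get_n_leaves())
```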
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": false, 85 | "jupyter": { 86 | "outputs_hidden": false 87 | } 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "%%time\n", 92 | "\n", 93 | "clf_dt = DecisionTreeClassifier()\n", 94 | "clf_dt.fit(X_train, y_train)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### Inference\n", 102 | "\n", 103 | "Classifying a new sample with a decision tree is fast, as it consists of following a single path in the tree until a leaf node is found." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false, 111 | "jupyter": { 112 | "outputs_hidden": false 113 | } 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "%%time\n", 118 | "\n", 119 | "pred_dt = clf_dt.predict(X_test)\n", 120 | "print('Predicted', len(pred_dt), 'digits with accuracy:', accuracy_score(y_test, pred_dt))" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Visualization\n", 128 | "\n", 129 | "Decision trees are simple to understand and visualize. Large trees can, however, be rather hard to inspect. \n", 130 | "\n", 131 | "The code below draws the trained decision tree classifier. The resulting figure is huge, so it is better to save it as a separate file (`mydt.pdf`) and use a separate PDF viewer instead of drawing the figure into this notebook. \n", 132 | "\n", 133 | "To obtain a small tree better suited for visualization, try adding the option `max_depth=3` to the above `DecisionTreeClassifier()`. " 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": false, 141 | "jupyter": { 142 | "outputs_hidden": false 143 | } 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "export_graphviz(clf_dt, out_file=\"mydt.dot\")\n", 148 | "with open(\"mydt.dot\") as f:\n", 149 | " dot_graph = f.read()\n", 150 | "a=graphviz.Source(dot_graph)\n", 151 | "print('Wrote PDF file:', a.render('mydt', view=False))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Random forest\n", 159 | "\n", 160 | "Random forest is an ensemble (or a group; hence the name *forest*) of decision trees, obtained by introducing randomness into the tree generation. The prediction of the random forest is obtained by *averaging* the predictions of the individual trees.\n", 161 | "\n", 162 | "Random forest is a solid workhorse that almost always produces serviceable results without much tuning.\n", 163 | "\n", 164 | "### Learning\n", 165 | "\n", 166 | "Random forest classifiers are quick to train, quite robust to hyperparameter values, and often work relatively well."
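[Optional sketch, not part of the original exercise] Because every input feature is a pixel, one way to peek inside the trained forest is to reshape its `feature_importances_` back into a 28x28 image; run this after the training cell below has produced `clf_rf`.

```python
import matplotlib.pyplot as plt

# Pixel-wise feature importances of the fitted random forest.
importances = clf_rf.feature_importances_.reshape(28, 28)
plt.matshow(importances, cmap='viridis')
plt.title('Random forest feature importances per pixel')
plt.colorbar()
plt.show()
```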
167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": false, 174 | "jupyter": { 175 | "outputs_hidden": false 176 | } 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "%%time\n", 181 | "\n", 182 | "n_estimators = 10\n", 183 | "clf_rf = RandomForestClassifier(n_estimators=n_estimators)\n", 184 | "clf_rf.fit(X_train, y_train)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Inference" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": false, 199 | "jupyter": { 200 | "outputs_hidden": false 201 | } 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "%%time\n", 206 | "\n", 207 | "pred_rf = clf_rf.predict(X_test)\n", 208 | "print('Predicted', len(pred_rf), 'digits with accuracy:', accuracy_score(y_test, pred_rf))" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "#### Failure analysis\n", 216 | "\n", 217 | "The random forest classifier worked quite well, so let's take a closer look." 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "Here are the first 10 test digits the random forest model classified to a wrong class:" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": false, 232 | "jupyter": { 233 | "outputs_hidden": false 234 | } 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "show_failures(pred_rf, y_test, X_test)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "We can use `show_failures()` to inspect failures in more detail. 
For example:\n", 246 | "\n", 247 | "* show failures in which the true class was \"5\":" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false, 255 | "jupyter": { 256 | "outputs_hidden": false 257 | } 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "show_failures(pred_rf, y_test, X_test, trueclass='5')" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "* show failures in which the prediction was \"0\":" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": false, 276 | "jupyter": { 277 | "outputs_hidden": false 278 | } 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "show_failures(pred_rf, y_test, X_test, predictedclass='0')" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "* show failures in which the true class was \"0\" and the prediction was \"2\":" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": false, 297 | "jupyter": { 298 | "outputs_hidden": false 299 | } 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "show_failures(pred_rf, y_test, X_test, trueclass='0', predictedclass='2')" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "#### Confusion matrix, accuracy, precision, and recall\n", 311 | "\n", 312 | "We can also compute the confusion matrix to see which digits get mixed the most, and look at classification accuracies separately for each class:" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": false, 320 | "jupyter": { 321 | "outputs_hidden": false 322 | } 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "labels=[str(i) for i in range(10)]\n", 327 | "print('Confusion matrix (rows: true classes; columns: predicted classes):'); print()\n", 328 | "cm=confusion_matrix(y_test, pred_rf, labels=labels)\n", 329 | "print(cm); print()\n", 330 | "\n", 331 | "print('Classification accuracy for each class:'); print()\n", 332 | "for i,j in enumerate(cm.diagonal()/cm.sum(axis=1)): print(\"%d: %.4f\" % (i,j))" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "Precision and recall for each class:" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": false, 347 | "jupyter": { 348 | "outputs_hidden": false 349 | } 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "print(classification_report(y_test, pred_rf, labels=labels))" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "## Gradient boosted trees (XGBoost)\n", 361 | "\n", 362 | "Gradient boosted trees (or extreme gradient boosted trees) is another way of constructing ensembles of decision trees, using the *boosting* framework. Let's use a popular separate package, [XGBoost](http://xgboost.readthedocs.io/en/latest/), to train gradient boosted trees to classify MNIST digits. 
\n", 363 | "\n", 364 | "XGBoost has been used to obtain record-breaking results on many machine learning competitions, but have quite a lot of hyperparameters that need to be carefully tuned to get the best performance.\n", 365 | "\n", 366 | "### Learning\n", 367 | "\n", 368 | "Training an XGBoost classifier takes a bit more time, so let's start by using only a subset of the training data. " 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "XGBoost needs to have integer labels, not strings \"0\", \"1\", \"2\" etc that we have." 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "from sklearn.preprocessing import LabelEncoder\n", 385 | "le = LabelEncoder()\n", 386 | "le.fit(y_train)\n", 387 | "le.classes_" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": { 394 | "collapsed": false, 395 | "jupyter": { 396 | "outputs_hidden": false 397 | } 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "%%time\n", 402 | "\n", 403 | "n_data = 10000\n", 404 | "clf_xgb = XGBClassifier()\n", 405 | "clf_xgb.fit(X_train[:n_data,:], le.transform(y_train[:n_data]))" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "### Inference\n", 413 | "\n", 414 | "At least with only a subset of training data and default hyperparameters values, XGBoost does not reach the performance of random forest." 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "collapsed": false, 422 | "jupyter": { 423 | "outputs_hidden": false 424 | } 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "%%time\n", 429 | "\n", 430 | "pred_xgb = clf_xgb.predict(X_test)\n", 431 | "pred_xgb = le.inverse_transform(pred_xgb) # convert back to our string labels\n", 432 | "print('Predicted', len(pred_xgb), 'digits with accuracy:', accuracy_score(y_test, pred_xgb))" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "You can also use `show_failures()` to inspect the failures, and calculate the confusion matrix and other metrics as was done with the random forest above.\n", 440 | "\n", 441 | "## Model tuning" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "Study the documentation of the different decision tree models used in this notebook ([decision trees](http://scikit-learn.org/stable/modules/tree.html), [tree ensembles](http://scikit-learn.org/stable/modules/ensemble.html), [XGBoost](https://xgboost.readthedocs.io/en/latest/)), and experiment with different hyperparameter values. \n", 449 | "\n", 450 | "Report the highest classification accuracy you manage to obtain for each model type. Also mark down the parameters you used, so others can try to reproduce your results. 
" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [] 459 | } 460 | ], 461 | "metadata": { 462 | "kernelspec": { 463 | "display_name": "Python 3 (ipykernel)", 464 | "language": "python", 465 | "name": "python3" 466 | }, 467 | "language_info": { 468 | "codemirror_mode": { 469 | "name": "ipython", 470 | "version": 3 471 | }, 472 | "file_extension": ".py", 473 | "mimetype": "text/x-python", 474 | "name": "python", 475 | "nbconvert_exporter": "python", 476 | "pygments_lexer": "ipython3", 477 | "version": "3.11.9" 478 | } 479 | }, 480 | "nbformat": 4, 481 | "nbformat_minor": 4 482 | } 483 | -------------------------------------------------------------------------------- /Exercise-08.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# California housing dataset regression with decision trees \n", 8 | "\n", 9 | "In this notebook, we'll use [decision trees](http://scikit-learn.org/stable/modules/tree.html) and [ensembles of trees](http://scikit-learn.org/stable/modules/ensemble.html) to estimate median house values on Californian housing districts using scikit-learn and [XGBoost](https://xgboost.readthedocs.io/en/latest/).\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "import numpy as np\n", 23 | "from sklearn import datasets, __version__\n", 24 | "from sklearn.model_selection import train_test_split\n", 25 | "from sklearn.tree import DecisionTreeRegressor\n", 26 | "from sklearn.ensemble import RandomForestRegressor\n", 27 | "from xgboost import XGBRegressor\n", 28 | "from sklearn.metrics import mean_squared_error\n", 29 | "\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "import seaborn as sns\n", 32 | "sns.set()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Data\n", 40 | "\n", 41 | "Then we load the California housing data. First time we need to download the data, which can take a while." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false, 49 | "jupyter": { 50 | "outputs_hidden": false 51 | } 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "chd = datasets.fetch_california_housing()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "We'll split the data into a training and a test set.\n", 63 | "\n", 64 | "Let's also select a single attribute to start the analysis with, say *MedInc*." 
65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": false, 72 | "jupyter": { 73 | "outputs_hidden": false 74 | } 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "test_size = 5000\n", 79 | "single_attribute = 'MedInc'\n", 80 | "\n", 81 | "X_train_all, X_test_all, y_train, y_test = train_test_split(\n", 82 | " chd.data, chd.target, test_size=test_size, shuffle=True)\n", 83 | "\n", 84 | "attribute_index = chd.feature_names.index(single_attribute)\n", 85 | "X_train_single = X_train_all[:, attribute_index].reshape(-1, 1)\n", 86 | "X_test_single = X_test_all[:, attribute_index].reshape(-1, 1)\n", 87 | " \n", 88 | "print()\n", 89 | "print('California housing data: train:',len(X_train_all),'test:',len(X_test_all))\n", 90 | "print()\n", 91 | "print('X_train_all:', X_train_all.shape)\n", 92 | "print('X_train_single:', X_train_single.shape)\n", 93 | "print('y_train:', y_train.shape)\n", 94 | "print()\n", 95 | "print('X_test_all', X_test_all.shape)\n", 96 | "print('X_test_single', X_test_single.shape)\n", 97 | "print('y_test', y_test.shape)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "The training data matrix `X_train_all` is a matrix of size (`n_train`, 8), and `X_train_single` contains only the first attribute (*MedInc* by default). The vector `y_train` contains the target value (median house value) for each housing district in the training set.\n", 105 | "\n", 106 | "Let's start our analysis with the single attribute. Later, you can set `only_single_attribute = False` to use all eight attributes in the regression." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "only_single_attribute = True\n", 116 | "\n", 117 | "if only_single_attribute:\n", 118 | " X_train = X_train_single\n", 119 | " X_test = X_test_single\n", 120 | "else:\n", 121 | " X_train = X_train_all\n", 122 | " X_test = X_test_all\n", 123 | "\n", 124 | "print('X_train:', X_train.shape)\n", 125 | "print('X_test:', X_test.shape)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Decision tree\n", 133 | "\n", 134 | "Decision tree is a model that predicts the value of a target variable by learning simple *if-then-else* decision rules inferred from the data features.\n", 135 | "\n", 136 | "### Learning\n", 137 | "\n", 138 | "The parameter `max_depth` specifies the maximum depth of the tree." 
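[Optional sketch, not part of the original notebook] To get a feeling for how `max_depth` trades off under- and overfitting, you can sweep a few depths and compare test errors; the values below are only examples.

```python
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Compare test MSE for a few illustrative tree depths (None = unlimited).
for depth in [2, 3, 5, 10, None]:
    reg = DecisionTreeRegressor(max_depth=depth)
    reg.fit(X_train, y_train)
    mse = mean_squared_error(y_test, reg.predict(X_test))
    print('max_depth =', depth, '-> test MSE = %.3f' % mse)
```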
139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": false, 146 | "jupyter": { 147 | "outputs_hidden": false 148 | } 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "%%time\n", 153 | "\n", 154 | "max_depth = 3\n", 155 | "dt_reg = DecisionTreeRegressor(max_depth=max_depth)\n", 156 | "dt_reg.fit(X_train, y_train)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "We can visualize the results if we are using only a single attribute:" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "if X_train.shape[1] == 1:\n", 173 | " plt.figure(figsize=(10, 10))\n", 174 | " plt.scatter(X_train, y_train, s=5)\n", 175 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 176 | " plt.plot(reg_x, dt_reg.predict(reg_x), lw=4, c=sns.color_palette()[1],\n", 177 | " label='decision tree')\n", 178 | " plt.legend(loc='best');" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "### Inference\n", 186 | "\n", 187 | "We use *mean squared error* as the performance measure for our regression algorihm: " 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": false, 195 | "jupyter": { 196 | "outputs_hidden": false 197 | } 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "%%time\n", 202 | "\n", 203 | "predictions = dt_reg.predict(X_test)\n", 204 | "print(\"Mean squared error: %.3f\"\n", 205 | " % mean_squared_error(y_test, predictions))" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Random forest\n", 213 | "\n", 214 | "Random forest is an ensemble (or a group; hence the name *forest*) of decision trees, obtained by introducing randomness into the tree generation. The prediction of the random forest is obtained by *averaging* the predictions of the individual trees.\n", 215 | "\n", 216 | "Random forest is a solid workhorse that almost always produces serviceable results without much tuning.\n", 217 | "\n", 218 | "### Learning\n", 219 | "\n", 220 | "Random forest classifiers are quick to train, quite robust to hyperparameter values, and often work relatively well." 
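[Optional note, not part of the original exercise] If you rerun the notebook with `only_single_attribute = False`, the fitted forest can also tell you which of the eight attributes it relies on most; run this after `rf_reg` has been trained in the cell below.

```python
# Attribute importances of the fitted random forest regressor
# (only meaningful when all eight attributes are used).
if X_train.shape[1] > 1:
    ranked = sorted(zip(chd.feature_names, rf_reg.feature_importances_),
                    key=lambda t: -t[1])
    for name, importance in ranked:
        print('%-12s %.3f' % (name, importance))
```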
221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false, 228 | "jupyter": { 229 | "outputs_hidden": false 230 | } 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "%%time\n", 235 | "\n", 236 | "n_estimators = 10\n", 237 | "max_depth = 3\n", 238 | "rf_reg = RandomForestRegressor(n_estimators=n_estimators,\n", 239 | " max_depth=max_depth)\n", 240 | "rf_reg.fit(X_train, y_train)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "if X_train.shape[1] == 1:\n", 250 | " plt.figure(figsize=(10, 10))\n", 251 | " plt.scatter(X_train, y_train, s=5)\n", 252 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 253 | " plt.plot(reg_x, dt_reg.predict(reg_x), lw=4, c=sns.color_palette()[1],\n", 254 | " label='decision tree')\n", 255 | " plt.plot(reg_x, rf_reg.predict(reg_x), lw=4, c=sns.color_palette()[2],\n", 256 | " label='random forest')\n", 257 | " plt.legend(loc='best');" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "### Inference" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "collapsed": false, 272 | "jupyter": { 273 | "outputs_hidden": false 274 | } 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "%%time\n", 279 | "\n", 280 | "predictions = rf_reg.predict(X_test)\n", 281 | "print(\"Mean squared error: %.3f\"\n", 282 | " % mean_squared_error(y_test, predictions))" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "## Gradient boosted trees (XGBoost)\n", 290 | "\n", 291 | "Gradient boosted trees (or extreme gradient boosted trees) is another way of constructing ensembles of decision trees, using the *boosting* framework. Let's use a popular separate package, [XGBoost](http://xgboost.readthedocs.io/en/latest/), to train gradient boosted trees for regression. 
\n", 292 | "\n", 293 | "XGBoost has been recently used to obtain record-breaking results on many machine learning competitions, but have quite a lot of hyperparameters that need to be carefully tuned to get the best performance.\n", 294 | "\n", 295 | "### Learning" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": false, 303 | "jupyter": { 304 | "outputs_hidden": false 305 | } 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "%%time\n", 310 | "\n", 311 | "xgb_reg = XGBRegressor()\n", 312 | "xgb_reg.fit(X_train, y_train)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "if X_train.shape[1] == 1:\n", 322 | " plt.figure(figsize=(10, 10))\n", 323 | " plt.scatter(X_train, y_train, s=5)\n", 324 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 325 | " plt.plot(reg_x, dt_reg.predict(reg_x), lw=4, c=sns.color_palette()[1],\n", 326 | " label='decision tree')\n", 327 | " plt.plot(reg_x, rf_reg.predict(reg_x), lw=4, c=sns.color_palette()[2],\n", 328 | " label='random forest')\n", 329 | " plt.plot(reg_x, xgb_reg.predict(reg_x), lw=4, c=sns.color_palette()[3],\n", 330 | " label='XGBoost')\n", 331 | " plt.legend(loc='best');" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "### Inference" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": { 345 | "collapsed": false, 346 | "jupyter": { 347 | "outputs_hidden": false 348 | } 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "%%time\n", 353 | "\n", 354 | "predictions = xgb_reg.predict(X_test)\n", 355 | "print(\"Mean squared error: %.3f\"\n", 356 | " % mean_squared_error(y_test, predictions))" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## Model tuning" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "Study the documentation of the different decision tree models used in this notebook ([decision trees](http://scikit-learn.org/stable/modules/tree.html), [tree ensembles](http://scikit-learn.org/stable/modules/ensemble.html), [XGBoost](https://xgboost.readthedocs.io/en/latest/)), and experiment with different hyperparameter values. \n", 371 | "\n", 372 | "Report the lowest mean squared error you manage to obtain for each model type. Also mark down the parameters you used, so others can try to reproduce your results. 
" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [] 381 | } 382 | ], 383 | "metadata": { 384 | "kernelspec": { 385 | "display_name": "Python 3 (ipykernel)", 386 | "language": "python", 387 | "name": "python3" 388 | }, 389 | "language_info": { 390 | "codemirror_mode": { 391 | "name": "ipython", 392 | "version": 3 393 | }, 394 | "file_extension": ".py", 395 | "mimetype": "text/x-python", 396 | "name": "python", 397 | "nbconvert_exporter": "python", 398 | "pygments_lexer": "ipython3", 399 | "version": "3.11.9" 400 | } 401 | }, 402 | "nbformat": 4, 403 | "nbformat_minor": 4 404 | } 405 | -------------------------------------------------------------------------------- /Exercise-09.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits classification with MLPs\n", 8 | "\n", 9 | "In this notebook, we'll train a multi-layer perceptron model to classify MNIST digits using [TensorFlow](https://www.tensorflow.org/) (version $\\ge$ 2.0 required) with the [Keras API](https://www.tensorflow.org/guide/keras/overview).\n", 10 | "\n", 11 | "First, the needed imports." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from pml_utils import show_failures\n", 23 | "\n", 24 | "import tensorflow as tf\n", 25 | "from tensorflow.keras.models import Sequential\n", 26 | "from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten\n", 27 | "from tensorflow.keras.utils import plot_model, to_categorical\n", 28 | "\n", 29 | "from IPython.display import SVG, display\n", 30 | "\n", 31 | "import numpy as np\n", 32 | "import matplotlib.pyplot as plt\n", 33 | "import seaborn as sns\n", 34 | "sns.set()\n", 35 | "\n", 36 | "print('Using Tensorflow version: {}, and Keras version: {}.'.format(tf.__version__, tf.keras.__version__))" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Let's check if we have GPU available." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "if tf.test.is_gpu_available():\n", 53 | " from tensorflow.python.client import device_lib\n", 54 | " for d in device_lib.list_local_devices():\n", 55 | " if d.device_type == 'GPU':\n", 56 | " print('GPU', d.physical_device_desc)\n", 57 | "else:\n", 58 | " print('No GPU, using CPU instead.')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## MNIST data set\n", 66 | "\n", 67 | "Next we'll load the MNIST handwritten digits data set using TensorFlow's own tools. First time we may have to download the data, which can take a while.\n", 68 | "\n", 69 | "#### Altenative: Fashion-MNIST\n", 70 | "\n", 71 | "Alternatively, MNIST can be replaced with Fashion-MNIST, which can be used as drop-in replacement for MNIST. 
Fashion-MNIST contains images of 10 fashion categories:\n", 72 | "\n", 73 | "Label|Description|Label|Description\n", 74 | "--- | --- |--- | ---\n", 75 | "0|T-shirt/top|5|Sandal\n", 76 | "1|Trouser|6|Shirt\n", 77 | "2|Pullover|7|Sneaker\n", 78 | "3|Dress|8|Bag\n", 79 | "4|Coat|9|Ankle boot\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from tensorflow.keras.datasets import mnist, fashion_mnist\n", 89 | "\n", 90 | "## MNIST:\n", 91 | "(X_train, y_train), (X_test, y_test) = mnist.load_data()\n", 92 | "## Fashion-MNIST:\n", 93 | "#(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()\n", 94 | "\n", 95 | "nb_classes = 10\n", 96 | "\n", 97 | "X_train = X_train.astype('float32')\n", 98 | "X_test = X_test.astype('float32')\n", 99 | "X_train /= 255.0\n", 100 | "X_test /= 255.0\n", 101 | "\n", 102 | "# one-hot encoding:\n", 103 | "Y_train = to_categorical(y_train, nb_classes)\n", 104 | "Y_test = to_categorical(y_test, nb_classes)\n", 105 | "\n", 106 | "print()\n", 107 | "print('MNIST data loaded: train:',len(X_train),'test:',len(X_test))\n", 108 | "print('X_train:', X_train.shape)\n", 109 | "print('y_train:', y_train.shape)\n", 110 | "print('Y_train:', Y_train.shape)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "The training data (`X_train`) is a 3rd-order tensor of size (60000, 28, 28), i.e. it consists of 60000 images of size 28x28 pixels. `y_train` is a 60000-dimensional vector containing the correct classes (\"0\", \"1\", ..., \"9\") for each training sample, and `Y_train` is a [one-hot](https://en.wikipedia.org/wiki/One-hot) encoding of `y_train`.\n", 118 | "\n", 119 | "Let's take a closer look. Here are the first 10 training digits (or fashion items for Fashion-MNIST):" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "pltsize=1\n", 129 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 130 | "\n", 131 | "for i in range(10):\n", 132 | " plt.subplot(1,10,i+1)\n", 133 | " plt.axis('off')\n", 134 | " plt.imshow(X_train[i,:,:], cmap=\"gray\")\n", 135 | " plt.title('Class: '+str(y_train[i]))\n", 136 | " print('Training sample',i,': class:',y_train[i], ', one-hot encoded:', Y_train[i])" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Multi-layer perceptron (MLP) network\n", 144 | "\n", 145 | "### Activation functions\n", 146 | "\n", 147 | "Let's start by plotting some common activation functions for neural networks. `'relu'` stands for rectified linear unit, $y=\\max(0,x)$, a very simple non-linearity we will be using in our MLP network below." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "x = np.arange(-4,4,.01)\n", 157 | "plt.figure()\n", 158 | "plt.plot(x, np.maximum(x,0), label='relu')\n", 159 | "plt.plot(x, 1/(1+np.exp(-x)), label='sigmoid')\n", 160 | "plt.plot(x, np.tanh(x), label='tanh')\n", 161 | "plt.axis([-4, 4, -1.1, 1.5])\n", 162 | "plt.title('Activation functions')\n", 163 | "plt.legend(loc='best');" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### Initialization\n", 171 | "\n", 172 | "Let's now create an MLP model that has multiple layers, non-linear activation functions, and optionally dropout layers for regularization.\n", 173 | "\n", 174 | "We first initialize the model with `Sequential()`. Then we add a `Dense` layer that has 28*28=784 input nodes (one for each pixel in the input image) and 20 output nodes. The `Dense` layer connects each input to each output with some weight parameter. \n", 175 | "\n", 176 | "Next, the output of the dense layer is passed through a ReLU non-linear activation function.\n", 177 | "\n", 178 | "Commented out is an alternative, more complex, model that you can also try out. It uses more layers and dropout. `Dropout()` randomly sets a fraction of inputs to zero during training, which is one approach to regularization and can sometimes help to prevent overfitting.\n", 179 | "\n", 180 | "The output of the last layer needs to be a softmaxed 10-dimensional vector to match the groundtruth (`Y_train`). This means that it will output 10 values between 0 and 1 which sum to 1, hence, together they can be interpreted as a probability distribution over our 10 classes.\n", 181 | "\n", 182 | "Finally, we select *categorical crossentropy* as the loss function, select [*Adam*](https://keras.io/optimizers/#adam) as the optimizer, add *accuracy* to the list of metrics to be evaluated, and `compile()` the model. Adam is simply an advanced version of stochastic gradient descent; note that there are [several different options](https://keras.io/optimizers/) for the optimizer in Keras that we could use instead of *adam*."
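[Optional note, not part of the original notebook] If you want more control than the string `'adam'` gives, you can pass an optimizer object with an explicit learning rate instead; the value below is just an example to experiment with, and this should be run after the model has been constructed in the next cell.

```python
from tensorflow import keras

# Equivalent compile() call with an explicit optimizer object
# (run after `model` has been built below).
opt = keras.optimizers.Adam(learning_rate=1e-3)  # illustrative learning rate
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])
```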
183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "# Model initialization:\n", 192 | "model = Sequential()\n", 193 | "\n", 194 | "# A simple model:\n", 195 | "model.add(Dense(units=20, input_dim=28*28))\n", 196 | "model.add(Activation('relu'))\n", 197 | "\n", 198 | "# A bit more complex model:\n", 199 | "#model.add(Dense(units=50, input_dim=28*28))\n", 200 | "#model.add(Activation('relu'))\n", 201 | "#model.add(Dropout(0.2))\n", 202 | "\n", 203 | "#model.add(Dense(units=50))\n", 204 | "#model.add(Activation('relu'))\n", 205 | "#model.add(Dropout(0.2))\n", 206 | "\n", 207 | "# The last layer needs to be like this:\n", 208 | "model.add(Dense(units=10, activation='softmax'))\n", 209 | "\n", 210 | "model.compile(loss='categorical_crossentropy', \n", 211 | " optimizer='adam', \n", 212 | " metrics=['accuracy'])\n", 213 | "print(model.summary())" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "The summary shows that there are 15,910 parameters in total in our model.\n", 221 | "\n", 222 | "For example for the first dense layer we have 785x20 = 15,700 parameters as the weight matrix is of size 785x20 (not 784, as there's an additional bias term).\n", 223 | "\n", 224 | "We can also draw a fancier graph of our model." 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "plot_model(model, show_shapes=True)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Learning\n", 241 | "\n", 242 | "Next, we'll train our model. Notice how the interface is similar to scikit-learn: we still call the `fit()` method on our model object.\n", 243 | "\n", 244 | "An *epoch* means one pass through the whole training data, we'll begin by running training for 10 epochs.\n", 245 | "\n", 246 | "The `reshape()` function flattens our 28x28 images into vectors of length 784. (This means we are not using any information about the spatial neighborhood relations of pixels. This setup is known as the *permutation invariant MNIST*.) \n", 247 | "\n", 248 | "You can run code below multiple times and it will continue the training process from where it left off. If you want to start from scratch, re-initialize the model using the code a few cells ago. \n", 249 | "\n", 250 | "We use a batch size of 32, so the actual input will be 32x784 for each batch of 32 images." 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "%%time\n", 260 | "epochs = 10\n", 261 | "\n", 262 | "history = model.fit(X_train.reshape((-1,28*28)), \n", 263 | " Y_train, \n", 264 | " epochs=epochs, \n", 265 | " batch_size=32,\n", 266 | " verbose=2)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "Let's now see how the training progressed. \n", 274 | "\n", 275 | "* *Loss* is a function of the difference of the network output and the target values. We are minimizing the loss function during training so it should decrease over time.\n", 276 | "* *Accuracy* is the classification accuracy for the training data. It gives some indication of the real accuracy of the model but cannot be fully trusted, as it may have overfitted and just memorizes the training data." 
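[Optional variant, not part of the original notebook] To get a less biased picture during training, you can hold out part of the training set with `validation_split`, so Keras also reports validation loss and accuracy after each epoch:

```python
# Variant of the fit() call that holds out 10% of the training data for
# validation; history.history then also contains 'val_loss' and 'val_accuracy'.
history = model.fit(X_train.reshape((-1, 28*28)),
                    Y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.1,
                    verbose=2)
```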
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "plt.figure(figsize=(5,3))\n", 286 | "plt.plot(history.epoch,history.history['loss'])\n", 287 | "plt.title('loss')\n", 288 | "\n", 289 | "plt.figure(figsize=(5,3))\n", 290 | "plt.plot(history.epoch,history.history['accuracy'])\n", 291 | "plt.title('accuracy');" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "### Inference\n", 299 | "\n", 300 | "For a better measure of the quality of the model, let's see the model accuracy for the test data. " 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "%%time\n", 310 | "scores = model.evaluate(X_test.reshape((-1,28*28)), Y_test, verbose=2)\n", 311 | "print(\"%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "We can now take a closer look at the results using the `show_failures()` helper function." 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "Here are the first 10 test digits the MLP classified to a wrong class:" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "predictions = model.predict(X_test.reshape((-1,28*28)))\n", 335 | "\n", 336 | "show_failures(predictions, y_test, X_test)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "We can use `show_failures()` to inspect failures in more detail. For example, here are failures in which the true class was \"6\":" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "show_failures(predictions, y_test, X_test, trueclass=6)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "We can also compute the confusion matrix to see which digits get mixed the most, and look at classification accuracies separately for each class:" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "from sklearn.metrics import confusion_matrix\n", 369 | "\n", 370 | "print('Confusion matrix (rows: true classes; columns: predicted classes):'); print()\n", 371 | "cm=confusion_matrix(y_test, np.argmax(predictions, axis=1), labels=list(range(10)))\n", 372 | "print(cm); print()\n", 373 | "\n", 374 | "print('Classification accuracy for each class:'); print()\n", 375 | "for i,j in enumerate(cm.diagonal()/cm.sum(axis=1)): print(\"%d: %.4f\" % (i,j))" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "## Model tuning\n", 383 | "\n", 384 | "Modify the MLP model. Try to improve the classification accuracy, or experiment with the effects of different parameters. If you are interested in the state-of-the-art performance on permutation invariant MNIST, see e.g. this [recent paper](https://arxiv.org/abs/1507.02672) by Aalto University / The Curious AI Company researchers.\n", 385 | "\n", 386 | "You can also consult the Keras documentation at https://keras.io/. 
For example, the Dense, Activation, and Dropout layers are described at https://keras.io/layers/core/." 387 | ] 388 | } 389 | ], 390 | "metadata": { 391 | "kernelspec": { 392 | "display_name": "Python 3 (ipykernel)", 393 | "language": "python", 394 | "name": "python3" 395 | }, 396 | "language_info": { 397 | "codemirror_mode": { 398 | "name": "ipython", 399 | "version": 3 400 | }, 401 | "file_extension": ".py", 402 | "mimetype": "text/x-python", 403 | "name": "python", 404 | "nbconvert_exporter": "python", 405 | "pygments_lexer": "ipython3", 406 | "version": "3.11.9" 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 4 411 | } 412 | -------------------------------------------------------------------------------- /Exercise-10.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# California housing dataset regression with MLPs\n", 8 | "\n", 9 | "In this notebook, we'll train a multi-layer perceptron model to estimate median house values on Californian housing districts using **TensorFlow** (version $\\ge$ 2.0 required) with the **Keras API**.\n", 10 | "\n", 11 | "First, the needed imports." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from sklearn import datasets\n", 23 | "from sklearn.model_selection import train_test_split\n", 24 | "from sklearn.metrics import mean_squared_error\n", 25 | "from sklearn.preprocessing import StandardScaler\n", 26 | "\n", 27 | "import tensorflow as tf\n", 28 | "from tensorflow.keras.models import Sequential\n", 29 | "from tensorflow.keras.layers import Dense, Activation, Dropout, Input\n", 30 | "from tensorflow.keras.utils import plot_model\n", 31 | "\n", 32 | "from IPython.display import SVG\n", 33 | "\n", 34 | "import numpy as np\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import seaborn as sns\n", 37 | "sns.set()\n", 38 | "\n", 39 | "print('Using Tensorflow version: {}, and Keras version: {}.'.format(tf.__version__, tf.keras.__version__))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Data\n", 47 | "\n", 48 | "Then we load the California housing data. The first time, we need to download the data, which can take a while." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "chd = datasets.fetch_california_housing()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "The data consists of 20640 housing districts, each characterized by 8 attributes: *MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude*. 
There is also a target value (median house value) for each housing district.\n", 65 | " \n", 66 | "Let's plot all attributes against the target value:" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "plt.figure(figsize=(15,10))\n", 76 | "for i in range(8):\n", 77 | " plt.subplot(4,2,i+1)\n", 78 | " plt.scatter(chd.data[:,i], chd.target, s=2, label=chd.feature_names[i])\n", 79 | " plt.legend(loc='best')" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "We'll split the data into a training and a test set.\n", 87 | "\n", 88 | "Let's also select a single attribute to start the analysis with, say *MedInc*." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "test_size = 5000\n", 98 | "single_attribute = 'MedInc'\n", 99 | "\n", 100 | "X_train_all, X_test_all, y_train, y_test = train_test_split(\n", 101 | " chd.data, chd.target, test_size=test_size, shuffle=True)\n", 102 | "\n", 103 | "attribute_index = chd.feature_names.index(single_attribute)\n", 104 | "X_train_single = X_train_all[:, attribute_index].reshape(-1, 1)\n", 105 | "X_test_single = X_test_all[:, attribute_index].reshape(-1, 1)\n", 106 | " \n", 107 | "print()\n", 108 | "print('California housing data: train:',len(X_train_all),'test:',len(X_test_all))\n", 109 | "print()\n", 110 | "print('X_train_all:', X_train_all.shape)\n", 111 | "print('X_train_single:', X_train_single.shape)\n", 112 | "print('y_train:', y_train.shape)\n", 113 | "print()\n", 114 | "print('X_test_all', X_test_all.shape)\n", 115 | "print('X_test_single', X_test_single.shape)\n", 116 | "print('y_test', y_test.shape)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "The training data matrix `X_train_all` is a matrix of size (`n_train`, 8), and `X_train_single` contains only the first attribute (*MedInc* by default). The vector `y_train` contains the target value (median house value) for each housing district in the training set.\n", 124 | "\n", 125 | "Let's start our analysis with the single attribute. Later, you can set `only_single_attribute = False` to use all eight attributes in the regression.\n", 126 | "\n", 127 | "As the final step, let's scale the input data to zero mean and unit variance: " 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "only_single_attribute = True\n", 137 | "\n", 138 | "if only_single_attribute:\n", 139 | " X_train = X_train_single\n", 140 | " X_test = X_test_single\n", 141 | "else:\n", 142 | " X_train = X_train_all\n", 143 | " X_test = X_test_all\n", 144 | "\n", 145 | "scaler = StandardScaler().fit(X_train)\n", 146 | "X_train = scaler.transform(X_train)\n", 147 | "X_test = scaler.transform(X_test)\n", 148 | "print('X_train: shape:', X_train.shape, 'mean:', X_train.mean(axis=0), 'std:', X_train.std(axis=0))\n", 149 | "print('X_test: shape:', X_test.shape, 'mean:', X_test.mean(axis=0), 'std:', X_test.std(axis=0))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## One hidden layer\n", 157 | "\n", 158 | "### Initialization\n", 159 | "\n", 160 | "Let's begin with a simple model that has a single hidden layer. We first initialize the model with `Sequential()`. 
Then we add a `Dense` layer that has `X_train.shape[1]` inputs (one for each attribute in the training data) and 10 units. The `Dense` layer connects each input to each output with some weight parameter. \n", 161 | "Then we have an output layer that has only one unit with a linear activation function.\n", 162 | "\n", 163 | "Finally, we select *mean squared error* as the loss function, select [*stochastic gradient descent*](https://keras.io/optimizers/#sgd) as the optimizer, and `compile()` the model. Note there are [several different options](https://keras.io/optimizers/) for the optimizer in Keras that we could use instead of *sgd*." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "slmodel = Sequential()\n", 173 | "slmodel.add(Dense(units=10, input_dim=X_train.shape[1], activation='relu'))\n", 174 | "slmodel.add(Dense(units=1, activation='linear'))\n", 175 | "\n", 176 | "slmodel.compile(loss='mean_squared_error', \n", 177 | " optimizer='sgd')\n", 178 | "print(slmodel.summary())" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "We can also draw a fancier graph of our model." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "plot_model(slmodel, show_shapes=True)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "### Learning\n", 202 | "\n", 203 | "Now we are ready to train our first model. An *epoch* means one pass through the whole training data. \n", 204 | "\n", 205 | "You can run code below multiple times and it will continue the training process from where it left off. If you want to start from scratch, re-initialize the model using the code a few cells ago. " 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "%%time\n", 215 | "epochs = 10 \n", 216 | "\n", 217 | "slhistory = slmodel.fit(X_train, \n", 218 | " y_train, \n", 219 | " epochs=epochs, \n", 220 | " batch_size=32,\n", 221 | " verbose=2)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Let's now see how the training progressed. *Loss* is a function of the difference of the network output and the target values. We are minimizing the loss function during training so it should decrease over time." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "plt.figure(figsize=(5,3))\n", 238 | "plt.plot(slhistory.epoch,slhistory.history['loss'])\n", 239 | "plt.title('loss');" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "if X_train.shape[1] == 1:\n", 249 | " plt.figure(figsize=(10, 10))\n", 250 | " plt.scatter(X_train, y_train, s=5)\n", 251 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 252 | " plt.scatter(reg_x, slmodel.predict(reg_x), s=8, label='one hidden layer')\n", 253 | " plt.legend(loc='best');" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "### Inference\n", 261 | "\n", 262 | "For a better measure of the quality of the model, let's see the model accuracy for the test data. 
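[Optional sketch, not part of the original notebook] Besides the single mean-squared-error number, a quick visual check is to plot predicted versus true median house values for the test set; run this after `slpred` has been computed in the next cell.

```python
import matplotlib.pyplot as plt

# Predicted vs. true target values on the test set.
plt.figure(figsize=(5, 5))
plt.scatter(y_test, slpred.ravel(), s=5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=1)
plt.xlabel('true median house value')
plt.ylabel('predicted median house value');
```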
" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "%%time\n", 272 | "\n", 273 | "slpred = slmodel.predict(X_test)\n", 274 | "print(\"Mean squared error: %.3f\"\n", 275 | " % mean_squared_error(y_test, slpred))" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "## Multiple hidden layers" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "### Initialization\n", 290 | "\n", 291 | "Let's now create a more complex MLP model that has multiple dense layers and dropout layers. `Dropout()` randomly sets a fraction of inputs to zero during training, which is one approach to regularization and can sometimes help to prevent overfitting.\n", 292 | "\n", 293 | "The last layer needs to have a single unit with linear activation to match the groundtruth (`Y_train`). \n", 294 | "\n", 295 | "Finally, we again `compile()` the model, this time using [*Adam*](https://keras.io/optimizers/#adam) as the optimizer." 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "mlmodel = Sequential()\n", 305 | "\n", 306 | "mlmodel.add(Input([X_train.shape[1]]))\n", 307 | "mlmodel.add(Dense(units=20, activation='relu'))\n", 308 | "mlmodel.add(Dense(units=20, activation='relu'))\n", 309 | "mlmodel.add(Dropout(0.5))\n", 310 | "\n", 311 | "mlmodel.add(Dense(units=1, activation='linear'))\n", 312 | "\n", 313 | "mlmodel.compile(loss='mean_squared_error', \n", 314 | " optimizer='adam')\n", 315 | "print(mlmodel.summary())" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "plot_model(mlmodel, show_shapes=True)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### Learning" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "%%time\n", 341 | "epochs = 10 \n", 342 | "\n", 343 | "mlhistory = mlmodel.fit(X_train, \n", 344 | " y_train, \n", 345 | " epochs=epochs, \n", 346 | " batch_size=32,\n", 347 | " verbose=2)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "plt.figure(figsize=(5,3))\n", 357 | "plt.plot(mlhistory.epoch,mlhistory.history['loss'])\n", 358 | "plt.title('loss');" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "if X_train.shape[1] == 1:\n", 368 | " plt.figure(figsize=(10, 10))\n", 369 | " plt.scatter(X_train, y_train, s=5)\n", 370 | " reg_x = np.arange(np.min(X_train), np.max(X_train), 0.01).reshape(-1, 1)\n", 371 | " plt.scatter(reg_x, slmodel.predict(reg_x), s=8, label='one hidden layer')\n", 372 | " plt.scatter(reg_x, mlmodel.predict(reg_x), s=8, label='multiple hidden layers')\n", 373 | " plt.legend(loc='best');" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "### Inference" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "%%time\n", 390 | "\n", 391 | "mlpred = mlmodel.predict(X_test)\n", 392 | "print(\"Mean squared 
error: %.3f\"\n", 393 | " % mean_squared_error(y_test, mlpred))" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "## Model tuning\n", 401 | "\n", 402 | "Try to reduce the mean squared error of the regression. Modify the network architectures and see if the results improve. See the documentation of [Keras](https://keras.io/) for further options.\n", 403 | "\n", 404 | "To further improve the results, it is possible to replace [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html), that is scaling the input data to zero mean and unit variance, with more advanced preprocessing.\n", 405 | "See [Preprocessing data](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-data) for more information." 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [] 421 | } 422 | ], 423 | "metadata": { 424 | "kernelspec": { 425 | "display_name": "Python 3 (ipykernel)", 426 | "language": "python", 427 | "name": "python3" 428 | }, 429 | "language_info": { 430 | "codemirror_mode": { 431 | "name": "ipython", 432 | "version": 3 433 | }, 434 | "file_extension": ".py", 435 | "mimetype": "text/x-python", 436 | "name": "python", 437 | "nbconvert_exporter": "python", 438 | "pygments_lexer": "ipython3", 439 | "version": "3.11.9" 440 | } 441 | }, 442 | "nbformat": 4, 443 | "nbformat_minor": 4 444 | } 445 | -------------------------------------------------------------------------------- /Exercise-11.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits dimensionality reduction with scikit-learn\n", 8 | "\n", 9 | "In this notebook, we'll use some popular methods to reduce the dimensionality of MNIST digits data before classification.\n", 10 | "\n", 11 | "[Section 1](#1.-Feature-extraction) of the notebook contains examples of feature extraction methods, and [Section 2](#2.-Feature-selection) two methods for feature selection. Any of these methods can then be applied to train a MNIST digits classifier for lower-dimensional data in [Section 3](#3.-Classification-with-dimension-reduced-data).\n", 12 | "\n", 13 | "First, the needed imports." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": false, 21 | "jupyter": { 22 | "outputs_hidden": false 23 | } 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "%matplotlib inline\n", 28 | "\n", 29 | "from pml_utils import get_mnist\n", 30 | "\n", 31 | "import numpy as np\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "from sklearn import __version__\n", 34 | "from sklearn import decomposition, feature_selection\n", 35 | "from skimage.measure import block_reduce\n", 36 | "from skimage.feature import canny\n", 37 | "\n", 38 | "import matplotlib.pyplot as plt\n", 39 | "import seaborn as sns\n", 40 | "sns.set()\n", 41 | "\n", 42 | "from packaging.version import Version\n", 43 | "assert(Version(__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Then we load the MNIST data. 
First time it may download the data, which can take a while." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": false, 58 | "jupyter": { 59 | "outputs_hidden": false 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 65 | "\n", 66 | "print('MNIST data loaded: train:',len(X_train),'test:',len(X_test))\n", 67 | "print('X_train:', X_train.shape)\n", 68 | "print('y_train:', y_train.shape)\n", 69 | "print('X_test', X_test.shape)\n", 70 | "print('y_test', y_test.shape)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## 1. Feature extraction\n", 78 | "\n", 79 | "### 1.1 PCA\n", 80 | "\n", 81 | "[Principal component analysis](http://scikit-learn.org/stable/modules/decomposition.html#pca) (PCA) is a standard method to decompose a high-dimensional dataset in a set of successive orthogonal components that explain a maximum amount of the variance. Here we project the data into `n_components` principal components. The components have the maximal possible variance under the orthogonality constraint.\n", 82 | "\n", 83 | "The option `whiten=True` can be used to whiten the outputs to have unit component-wise variances. Its usefulness depends on the model to be used." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false, 91 | "jupyter": { 92 | "outputs_hidden": false 93 | } 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "%%time\n", 98 | "n_components = 50\n", 99 | "pca = decomposition.PCA(n_components=n_components, whiten=True)\n", 100 | "X_pca = pca.fit_transform(X_train)\n", 101 | "print('X_pca:', X_pca.shape)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "We can inspect the amount of variance explained by the principal components." 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false, 116 | "jupyter": { 117 | "outputs_hidden": false 118 | } 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "plt.figure()\n", 123 | "plt.plot(np.arange(n_components)+1, pca.explained_variance_)\n", 124 | "plt.title('Explained variance by PCA components')\n", 125 | "plt.ylabel('explained variance')\n", 126 | "plt.xlabel('PCA component');" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "### 1.2 Image feature extraction\n", 134 | "\n", 135 | "There are a lot of different feature extraction methods for image data. Common ones include extraction of colors, textures, and shapes from images, or detection of edges, corners, lines, blobs, or templates. Let's try a simple filtering-based method to reduce the dimensionality of the features, and a widely-used edge detector.\n", 136 | "\n", 137 | "The [`measure.block_reduce()`](http://scikit-image.org/docs/dev/api/skimage.measure.html#skimage.measure.block_reduce) function from scikit-image applies a function (for_example `np.mean`, `np.max` or `np.median`) to blocks of the image, resulting in a downsampled image." 
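To see what `block_reduce()` does, here is a tiny toy sketch (not part of the original exercise) on a 4x4 array: averaging over non-overlapping 2x2 blocks halves each dimension.

```python
import numpy as np
from skimage.measure import block_reduce

toy = np.arange(16, dtype=float).reshape(4, 4)
# Average over non-overlapping 2x2 blocks -> a 2x2 array of block means.
reduced = block_reduce(toy, block_size=(2, 2), func=np.mean)
print(toy)
print(reduced)  # [[ 2.5  4.5] [10.5 12.5]]
```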
138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": false, 145 | "jupyter": { 146 | "outputs_hidden": false 147 | } 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "X_train_img = X_train.reshape(-1, 28, 28)\n", 152 | "filter_size = 2\n", 153 | "X_train_img_downsampled = block_reduce(X_train_img, \n", 154 | " block_size=(1, filter_size, filter_size), \n", 155 | " func=np.mean)\n", 156 | "\n", 157 | "print('X_train_img:', X_train_img.shape)\n", 158 | "print('X_train_img_downsampled:', X_train_img_downsampled.shape)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "The [`feature.canny()`](http://scikit-image.org/docs/dev/api/skimage.feature.html#skimage.feature.canny) function applies the [Canny edge detector](https://en.wikipedia.org/wiki/Canny_edge_detector) to extract edges from the image. Processing all images may take a couple of minutes." 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": false, 173 | "jupyter": { 174 | "outputs_hidden": false 175 | } 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "%%time\n", 180 | "\n", 181 | "sigma = 1.0\n", 182 | "X_train_img_canny = np.zeros(X_train_img.shape)\n", 183 | "for i in range(X_train_img.shape[0]):\n", 184 | " X_train_img_canny[i,:,:] = canny(X_train_img[i,:,:], sigma=sigma)\n", 185 | "print('X_train_img_canny:', X_train_img_canny.shape)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "Let's compare the original and filtered digit images:" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false, 200 | "jupyter": { 201 | "outputs_hidden": false 202 | } 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "pltsize=1\n", 207 | "\n", 208 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 209 | "plt.suptitle('Original')\n", 210 | "plt.subplots_adjust(top=0.8)\n", 211 | "for i in range(10):\n", 212 | " plt.subplot(1,10,i+1)\n", 213 | " plt.axis('off')\n", 214 | " plt.imshow(X_train_img[i,:,:], cmap=\"gray\", interpolation='none')\n", 215 | "\n", 216 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 217 | "plt.suptitle('Downsampled with a %dx%d filter' % (filter_size, filter_size))\n", 218 | "plt.subplots_adjust(top=0.8)\n", 219 | "for i in range(10):\n", 220 | " plt.subplot(1,10,i+1)\n", 221 | " plt.axis('off')\n", 222 | " plt.imshow(X_train_img_downsampled[i,:,:], cmap=\"gray\", interpolation='none')\n", 223 | " \n", 224 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 225 | "plt.suptitle('Canny edge detection with sigma=%.2f' % sigma)\n", 226 | "plt.subplots_adjust(top=0.8)\n", 227 | "for i in range(10):\n", 228 | " plt.subplot(1,10,i+1)\n", 229 | " plt.axis('off')\n", 230 | " plt.imshow(X_train_img_canny[i,:,:], cmap=\"gray\", interpolation='none')" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "## 2. Feature selection\n", 238 | "\n", 239 | "### 2.1 Low variance\n", 240 | "\n", 241 | "The MNIST digits have a lot of components (pixels) with little variance. These components are not particularly useful for discriminating between the classes, so they can probably be removed safely. Let's first draw the component-wise variances of MNIST data." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "collapsed": false, 249 | "jupyter": { 250 | "outputs_hidden": false 251 | } 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "variances = np.var(X_train, axis=0)\n", 256 | "plt.figure()\n", 257 | "plt.plot(variances)\n", 258 | "plt.title('Component-wise variance of MNIST digits')\n", 259 | "plt.ylabel('variance')\n", 260 | "plt.xlabel('component');" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "The variances can also be plotted for each pixel in the image plane." 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "collapsed": false, 275 | "jupyter": { 276 | "outputs_hidden": false 277 | } 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "plt.figure()\n", 282 | "sns.heatmap(variances.reshape(28,28), cmap=sns.color_palette(\"Blues\"))\n", 283 | "plt.title('Pixel-wise variance of MNIST digits')\n", 284 | "plt.grid(False)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "Select an appropriate `variance_threshold` based on the *\"Component-wise variance of MNIST digits\"* figure above." 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "collapsed": false, 299 | "jupyter": { 300 | "outputs_hidden": false 301 | } 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "%%time\n", 306 | "\n", 307 | "variance_threshold = 1000\n", 308 | "lv = feature_selection.VarianceThreshold(threshold=variance_threshold)\n", 309 | "X_lv = lv.fit_transform(X_train)\n", 310 | "print('X_lv:', X_lv.shape)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "### 2.2 Univariate feature selection\n", 318 | "\n", 319 | "Another method for feature selection is to select the *k* best features based on univariate statistical tests between the features and the class of each sample. Therefore, this is a supervised method and we need to include `y_train` in `fit_transform()`.\n", 320 | "See [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection) for the set of available statistical tests and other further options." 
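`SelectKBest` uses the ANOVA F-test (`f_classif`) by default. As a sketch of trying another scoring function (this variant is not part of the original exercise), mutual information can be plugged in the same way, at the cost of a longer computation:

```python
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# A sketch: k-best selection scored with mutual information instead of the
# default ANOVA F-test. This can take considerably longer to compute.
ukb_mi = SelectKBest(score_func=mutual_info_classif, k=50)
X_ukb_mi = ukb_mi.fit_transform(X_train, y_train)
print('X_ukb_mi:', X_ukb_mi.shape)
```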
321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "collapsed": false, 328 | "jupyter": { 329 | "outputs_hidden": false 330 | } 331 | }, 332 | "outputs": [], 333 | "source": [ 334 | "%%time\n", 335 | "\n", 336 | "k = 50\n", 337 | "ukb = feature_selection.SelectKBest(k=k)\n", 338 | "X_ukb = ukb.fit_transform(X_train, y_train)\n", 339 | "print('X_ukb:', X_ukb.shape)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "We can check which features (that is, pixels in case) got selected:" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": { 353 | "collapsed": false, 354 | "jupyter": { 355 | "outputs_hidden": false 356 | } 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "support = ukb.get_support()\n", 361 | "plt.figure()\n", 362 | "sns.heatmap(support.reshape(28,28), cmap=sns.color_palette(\"Blues\"))\n", 363 | "#with sns.axes_style(\"white\"):\n", 364 | "# plt.imshow(support.reshape(28,28), interpolation='none')\n", 365 | "plt.title('Support of SelectKBest() with k=%d' % k)\n", 366 | "plt.grid(False)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "## 3. Classification with dimension-reduced data \n", 374 | "\n", 375 | "Let's now train a classifier using lower-dimensional data. Choose any of the above feature extraction or feature selection methods, and reduce the dimensionality of the MNIST data with that method. You can also implement your own dimensionaly reduction method.\n", 376 | "\n", 377 | "Note that you need to transform also the test data into the lower-dimensional space using `transform()`. Here is an example for PCA:" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "collapsed": false, 385 | "jupyter": { 386 | "outputs_hidden": false 387 | } 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "X_test_pca = pca.transform(X_test)\n", 392 | "print('X_test_pca:', X_test_pca.shape)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "Select a classification method from the ones that have been discussed on the previous lectures. For example, nearest neighbor classifiers or decision trees are good choices. Compare the results (accuracy, time) to classification using the original MNIST data.\n" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## 4. Other methods for dimensionality reduction\n", 414 | "\n", 415 | "Study and experiment with additional dimensionality reduction methods based on [decomposing](http://scikit-learn.org/stable/modules/decomposition.html) or [feature selection](http://scikit-learn.org/stable/modules/feature_selection.html). See also [unsupervised dimensionality reduction](http://scikit-learn.org/stable/modules/unsupervised_reduction.html)." 
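One possible starting point, sketched here using the `decomposition` module already imported above, is truncated SVD: it produces a low-dimensional linear projection much like PCA, but without centering the data first.

```python
# A sketch: truncated SVD as an additional decomposition-based method.
svd = decomposition.TruncatedSVD(n_components=50)
X_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)
print('X_svd:', X_svd.shape, 'X_test_svd:', X_test_svd.shape)
```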
416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "Python 3 (ipykernel)", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.11.9" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 4 447 | } 448 | -------------------------------------------------------------------------------- /Exercise-12.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits visualization with scikit-learn\n", 8 | "\n", 9 | "In this notebook, we'll use some popular visualization techniques to visualize MNIST digits. This notebook is based on the scikit-learn embedding examples found [here](http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html).\n", 10 | "\n", 11 | "First, the needed imports." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from time import time\n", 23 | "\n", 24 | "from pml_utils import get_mnist\n", 25 | "\n", 26 | "import numpy as np\n", 27 | "import sklearn\n", 28 | "from sklearn import random_projection, decomposition, manifold, __version__\n", 29 | "\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "\n", 32 | "from packaging.version import Version\n", 33 | "assert(Version(__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "Then we load the MNIST data. First time it downloads the data, which can take a while.\n", 41 | "\n", 42 | "In this notebook, we only use 1024 first samples of the training data. This reduces the time needed to calculate the visualizations and makes the visualizations appear less crowded." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 52 | "\n", 53 | "# Let's inspect only 1024 first training samples in this notebook\n", 54 | "X = X_train[:1024]\n", 55 | "y = y_train[:1024]\n", 56 | "print()\n", 57 | "print('MNIST data loaded:')\n", 58 | "print('X:', X.shape)\n", 59 | "print('y:', y.shape)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "Let's start by inspecting our data. 
For such a small dataset, we can actually draw all the samples at once:" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "n_img_per_row = 32 # 32*32=1024\n", 76 | "img = np.zeros((28 * n_img_per_row, 28 * n_img_per_row))\n", 77 | "\n", 78 | "for i in range(n_img_per_row):\n", 79 | " ix = 28 * i\n", 80 | " for j in range(n_img_per_row): \n", 81 | " iy = 28 * j\n", 82 | " img[ix:ix + 28, iy:iy + 28] = X[i * n_img_per_row + j,:].reshape(28,28)\n", 83 | "img = np.max(img)-img\n", 84 | "\n", 85 | "plt.figure(figsize=(9, 9))\n", 86 | "plt.imshow(img, cmap='gray')\n", 87 | "plt.title('1024 first MNIST digits')\n", 88 | "ax=plt.axis('off')" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "Let's define a helper function to plot the different visualizations:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "def plot_embedding(X, title=None, time=None, show_digits=True):\n", 105 | " x_min, x_max = np.min(X, 0), np.max(X, 0)\n", 106 | " X = (X - x_min) / (x_max - x_min)\n", 107 | "\n", 108 | " plt.figure(figsize=(9,6))\n", 109 | " plt.axis('off')\n", 110 | " if show_digits:\n", 111 | " for i in range(X.shape[0]):\n", 112 | " plt.text(X[i, 0], X[i, 1], str(y[i]),\n", 113 | " color=plt.cm.Set1(int(y[i]) / 10.),\n", 114 | " fontdict={'weight': 'bold', 'size': 9})\n", 115 | " else:\n", 116 | " s = plt.scatter(X[:, 0], X[:, 1],\n", 117 | " color=[plt.cm.Set1(int(yi) / 10.) for yi in y])\n", 118 | "\n", 119 | " if title is not None:\n", 120 | " if t0 is not None:\n", 121 | " plt.title(\"%s (%.2fs)\" % (title, time))\n", 122 | " else:\n", 123 | " plt.title(title)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## 1. Random projection\n", 131 | "\n", 132 | "A simple first visualization is a [random projection](http://scikit-learn.org/stable/modules/random_projection.html#random-projection) of the data into two dimensions." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "t0 = time()\n", 142 | "rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)\n", 143 | "X_projected = rp.fit_transform(X)\n", 144 | "t = time() - t0\n", 145 | "\n", 146 | "plot_embedding(X_projected, \"Random projection\", t)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "The data can also be plotted with points instead of digit labels by setting `show_digits=False`:" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "plot_embedding(X_projected, \"Random projection\", t, show_digits=False)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## 2. PCA\n", 170 | "\n", 171 | "[Principal component analysis](http://scikit-learn.org/stable/modules/decomposition.html#pca) (PCA) is a standard method to decompose a high-dimensional dataset in a set of successive orthogonal components that explain a maximum amount of the variance. Here we project the data into two first principal components. The components have the maximal possible variance under the orthogonality constraint." 
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "t0 = time()\n", 181 | "pca = decomposition.PCA(n_components=2)\n", 182 | "X_pca = pca.fit_transform(X)\n", 183 | "t = time() - t0\n", 184 | "\n", 185 | "plot_embedding(X_pca, \"PCA projection\", t)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## 3. MDS\n", 193 | "\n", 194 | "[Multidimensional scaling](http://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling) (MDS) seeks a low-dimensional representation of the data in which the distances try to respect the distances in the original high-dimensional space. " 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "t0 = time()\n", 204 | "mds = manifold.MDS(n_components=2, max_iter=500)\n", 205 | "X_mds = mds.fit_transform(X)\n", 206 | "t = time() - t0\n", 207 | "\n", 208 | "plot_embedding(X_mds, \"MDS embedding\", t)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## 4. t-SNE\n", 216 | "\n", 217 | "[t-distributed Stochastic Neighbor Embedding](http://scikit-learn.org/stable/modules/manifold.html#t-sne) (t-SNE) is a relatively new and popular tool to visualize high-dimensional data. t-SNE is particularly sensitive to local structure and can often reveal clusters in the data.\n", 218 | "\n", 219 | "t-SNE has an important tuneable parameter called `perplexity`, that can have a large effect on the resulting visualization, depending on the data. Typical values for perplexity are between 5 and 50. " 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "t0 = time()\n", 229 | "perplexity=30\n", 230 | "tsne = manifold.TSNE(n_components=2, perplexity=perplexity)\n", 231 | "X_tsne = tsne.fit_transform(X)\n", 232 | "t = time() - t0\n", 233 | "\n", 234 | "plot_embedding(X_tsne, \"t-SNE embedding with perplexity=%d\" % perplexity, t)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "## 5. Further visualizations\n", 242 | "\n", 243 | "Take a look at the original scikit-learn [embedding examples](http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html) for more visualizations. Try some of these (for example LLE and isomap) on the MNIST data." 
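A minimal sketch for Isomap, assuming the `manifold` module, the 1024-sample subset `X`, and the `plot_embedding()` helper defined earlier in this notebook (the value of `n_neighbors` is just a guess and may need tuning):

```python
from time import time

t0 = time()
isomap = manifold.Isomap(n_components=2, n_neighbors=10)
X_isomap = isomap.fit_transform(X)
t = time() - t0

plot_embedding(X_isomap, "Isomap embedding", t)
```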
244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 3 (ipykernel)", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.11.9" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 4 275 | } 276 | -------------------------------------------------------------------------------- /Exercise-13.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits clustering\n", 8 | "\n", 9 | "In this notebook, we'll use some common clustering algorithms to analyze MNIST digits using scikit-learn.\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from pml_utils import get_mnist, show_clusters\n", 23 | "\n", 24 | "import numpy as np\n", 25 | "from sklearn import __version__\n", 26 | "from sklearn.cluster import KMeans, AgglomerativeClustering\n", 27 | "from sklearn.metrics import adjusted_rand_score\n", 28 | "\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import seaborn as sns\n", 31 | "sns.set()\n", 32 | "\n", 33 | "from packaging.version import Version\n", 34 | "assert(Version(__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Then we load the MNIST data. First time it downloads the data, which can take a while.\n", 42 | "\n", 43 | "To speed up the computations, let's use only 10000 digits in this notebook." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 53 | "\n", 54 | "X = X_train[:10000]\n", 55 | "y = y_train[:10000]\n", 56 | "print()\n", 57 | "print('MNIST data loaded:')\n", 58 | "print('X:', X.shape)\n", 59 | "print('y:', y.shape)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## k-means\n", 67 | "\n", 68 | "[K-means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans) clusters data by trying to separate samples in *k* groups of equal variance using an iterative two-step algorithm. It requires the number of clusters as a parameter." 
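To make the two alternating steps concrete, here is a small NumPy sketch of a single k-means iteration (illustrative only and meant for small toy data; the notebook itself uses scikit-learn's implementation in the next cell):

```python
import numpy as np

def kmeans_iteration(X, centroids):
    """One iteration of the two-step k-means algorithm (illustrative sketch)."""
    # Step 1 (assignment): assign every sample to its nearest centroid.
    distances = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
    labels = np.argmin(distances, axis=1)
    # Step 2 (update): move each centroid to the mean of its assigned samples
    # (empty clusters are not handled in this sketch).
    new_centroids = np.array([X[labels == k].mean(axis=0)
                              for k in range(len(centroids))])
    return labels, new_centroids
```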
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "%%time\n", 78 | "\n", 79 | "n_clusters_kmeans = 10\n", 80 | "\n", 81 | "kmeans = KMeans(n_clusters=n_clusters_kmeans)\n", 82 | "kmeans.fit(X)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "The sizes of the clusters:" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "plt.hist(kmeans.labels_, bins=range(kmeans.n_clusters+1),\n", 99 | " rwidth=0.5)\n", 100 | "plt.xticks(0.5+np.arange(kmeans.n_clusters),\n", 101 | " np.arange(kmeans.n_clusters))\n", 102 | "plt.title('Cluster sizes');" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "The k-means centroids are vectors in the same space as the original data, so we can take a look at them:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "plt.figure(figsize=(kmeans.n_clusters, 1))\n", 119 | "\n", 120 | "for i in range(kmeans.n_clusters):\n", 121 | " plt.subplot(1, kmeans.n_clusters, i+1)\n", 122 | " plt.axis('off')\n", 123 | " plt.imshow(kmeans.cluster_centers_[i,:].reshape(28,28), cmap=\"gray\")\n", 124 | " plt.title(str(i))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Let's also draw some digits from each cluster:" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "show_clusters(kmeans.labels_, kmeans.n_clusters, X)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Evaluation\n", 148 | "\n", 149 | "Since we know the correct labels for MNIST digits, we can evaluate the quality of the clustering. We'll use the [adjusted Rand index](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html) which considers all pairs of samples and counts pairs that are assigned in the same or different clusters in the predicted and true clusterings. The index is between 0.0 and 1.0 with higher values denoting better clusterings." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "print(\"Adjusted Rand index: %.3f\"\n", 159 | " % adjusted_rand_score(y, kmeans.labels_))" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## Hierarchical clustering\n", 167 | "\n", 168 | "Hierarchical clustering is a family of clustering algorithms that build nested clusters by merging or splitting them successively. 
We'll use here [agglomerative clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering), in which all samples start in separate clusters and are then successively merged together.\n", 169 | "\n", 170 | "The `linkage` criteria determines the metric used for the merge strategy:\n", 171 | "* `ward` minimizes the sum of squared differences within all clusters\n", 172 | "* `complete` linkage minimizes the maximum distance between observations of pairs of clusters\n", 173 | "* `average` linkage minimizes the average of the distances between all observations of pairs of clusters\n", 174 | "* `single` linkage minimizes the distance between the closest observations of pairs of clusters" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "%%time\n", 184 | "\n", 185 | "n_clusters_hclust = 10\n", 186 | "linkage_hclust = \"ward\"\n", 187 | "\n", 188 | "hclust = AgglomerativeClustering(n_clusters=n_clusters_hclust,\n", 189 | " linkage=linkage_hclust)\n", 190 | "hclust.fit(X)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "The sizes of the clusters:" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "plt.hist(hclust.labels_, bins=range(hclust.n_clusters+1),\n", 207 | " rwidth=0.5)\n", 208 | "plt.xticks(0.5+np.arange(hclust.n_clusters),\n", 209 | " np.arange(hclust.n_clusters))\n", 210 | "plt.title('Cluster sizes');" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "Some digits from each cluster:" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "show_clusters(hclust.labels_, hclust.n_clusters, X)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "### Evaluation" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "print(\"Adjusted Rand index: %.3f\"\n", 243 | " % adjusted_rand_score(y, hclust.labels_))" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## Experiments\n", 251 | "\n", 252 | "1. Experiment with different numbers of clusters, different linkage criteria, and other parameters for [k-means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans) and [hierarchical clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering).\n", 253 | "\n", 254 | "2. Evaluate different clustering methods using Rand index scores with `n_clusters=10`. What is the best clustering algorithm according to this measure?\n", 255 | "\n", 256 | "3. Try other clustering methods available in scikit-learn. See [Clustering](https://scikit-learn.org/stable/modules/clustering.html#clustering) to get started." 
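As a sketch for item 3 above, one more scikit-learn method that scales well to this data size is mini-batch k-means; it can be evaluated with the same adjusted Rand index (the parameter values here are just guesses):

```python
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import adjusted_rand_score

# A sketch: mini-batch k-means is a faster, approximate variant of k-means.
mbkmeans = MiniBatchKMeans(n_clusters=10, batch_size=1024)
mbkmeans.fit(X)

print("Adjusted Rand index: %.3f"
      % adjusted_rand_score(y, mbkmeans.labels_))
```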
257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [] 265 | } 266 | ], 267 | "metadata": { 268 | "kernelspec": { 269 | "display_name": "Python 3 (ipykernel)", 270 | "language": "python", 271 | "name": "python3" 272 | }, 273 | "language_info": { 274 | "codemirror_mode": { 275 | "name": "ipython", 276 | "version": 3 277 | }, 278 | "file_extension": ".py", 279 | "mimetype": "text/x-python", 280 | "name": "python", 281 | "nbconvert_exporter": "python", 282 | "pygments_lexer": "ipython3", 283 | "version": "3.11.9" 284 | } 285 | }, 286 | "nbformat": 4, 287 | "nbformat_minor": 4 288 | } 289 | -------------------------------------------------------------------------------- /Exercise-14.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits anomaly detection\n", 8 | "\n", 9 | "In this notebook, we'll test some anomaly detection methods to detect outliers within MNIST digits data using scikit-learn.\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from pml_utils import get_mnist, show_anomalies\n", 23 | "\n", 24 | "import numpy as np\n", 25 | "from sklearn import __version__\n", 26 | "from sklearn.ensemble import IsolationForest\n", 27 | "from sklearn.neighbors import LocalOutlierFactor\n", 28 | "\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import seaborn as sns\n", 31 | "sns.set()\n", 32 | "\n", 33 | "from packaging.version import Version\n", 34 | "assert(Version(__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "Then we load the MNIST data. First time it downloads the data, which can take a while.\n", 42 | "\n", 43 | "To speed up the computations, let's use only 10000 digits in this notebook." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 53 | "\n", 54 | "X = X_train[:10000]\n", 55 | "y = y_train[:10000]\n", 56 | "print()\n", 57 | "print('MNIST data loaded:')\n", 58 | "print('X:', X.shape)\n", 59 | "print('y:', y.shape)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "Let us then create some outliers in our data. We \n", 67 | "* invert all pixels of one sample\n", 68 | "* shuffle all pixels of one sample, and\n", 69 | "* add salt-and-pepper noise to 10% of pixels of one sample.\n", 70 | "\n", 71 | "You can also continue creating more outliers in a similar fashion. 
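For instance, a fourth outlier could be made by adding strong Gaussian noise to one more sample, in addition to the three created in the next cell; a sketch (the index 9996 and the noise level are arbitrary choices, not from the original exercise):

```python
import numpy as np

# A sketch: add strong Gaussian noise to one more sample and clip back to the
# valid pixel range (index 9996 and noise level are arbitrary).
X[9996, :] = np.clip(X[9996, :] + np.random.normal(0, 64, X.shape[1]), 0, 255)
```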
" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "X[9999,:]=255-X[9999,:]\n", 81 | "np.random.shuffle(X[9998,:])\n", 82 | "for i in np.random.randint(0, X.shape[1], int(X.shape[1]*0.1)):\n", 83 | " X[9997,i] = 0.0 if np.random.rand()<0.5 else 255.0 " 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "Let's have a look at our outliers:" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "n_outliers = 3\n", 100 | "\n", 101 | "pltsize = 5\n", 102 | "plt.figure(figsize=(n_outliers*pltsize, pltsize))\n", 103 | "\n", 104 | "for i in range(n_outliers):\n", 105 | " plt.subplot(1,10,i+1)\n", 106 | " plt.axis('off')\n", 107 | " plt.imshow(X[9999-i,:].reshape(28,28), cmap=\"gray\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Isolation forest\n", 115 | "\n", 116 | "[Isolation forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html#sklearn.ensemble.IsolationForest) is an outlier detection method based on using random forests. The idea is to isolate data items by random features and splits. Outliers are easier to isolate, so they tend to produce shorter paths on average.\n", 117 | "\n", 118 | "We specify the number of trees as `n_estimators` and the assumed proportion of outliers in the data set as `if_contamination`." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "%%time\n", 128 | "\n", 129 | "n_estimators = 100\n", 130 | "if_contamination = 0.001\n", 131 | "\n", 132 | "if_model = IsolationForest(n_estimators=n_estimators, \n", 133 | " contamination=if_contamination)\n", 134 | "if_pred = if_model.fit(X).predict(X)\n", 135 | "print('Number of anomalies:', np.sum(if_pred==-1))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "We use a function `show_anomalies` to take a look at the found outliers." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "show_anomalies(if_pred, X)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Local outlier factor\n", 159 | "\n", 160 | "[Local outlier factor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor) is another method for outlier detection. It is based on k-nearest neighbors and computes the local density of data points with respect to their neighbors. Outliers have substantially lower local density than inliers.\n", 161 | "\n", 162 | "We specify the number of neighbors considered as `n_neighbors` and the assumed proportion of outliers in the data set as `lof_contamination`." 
163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "%%time\n", 172 | "\n", 173 | "n_neighbors = 20\n", 174 | "lof_contamination = 0.001\n", 175 | "\n", 176 | "lof_model = LocalOutlierFactor(n_neighbors=n_neighbors,\n", 177 | " contamination=lof_contamination)\n", 178 | "lof_pred = lof_model.fit_predict(X)\n", 179 | "print('Number of anomalies:', np.sum(lof_pred==-1))" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "show_anomalies(lof_pred, X)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "## Experiments\n", 196 | "\n", 197 | "Experiment with different parameters for [isolation forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html#sklearn.ensemble.IsolationForest) and [local outlier factor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor). Are the algorithms able to find all the generated outliers?\n", 198 | "\n", 199 | "You can also create more outliers in a similar fashion. " 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3 (ipykernel)", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.11.9" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 4 231 | } 232 | -------------------------------------------------------------------------------- /Extra-01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits classification with naive Bayes \n", 8 | "\n", 9 | "In this notebook, we'll use [naive Bayes classifiers](https://scikit-learn.org/stable/modules/naive_bayes.html) to classify MNIST digits using scikit-learn (version 0.20 or later required).\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from pml_utils import get_mnist, show_failures\n", 23 | "\n", 24 | "import numpy as np\n", 25 | "from sklearn.model_selection import train_test_split\n", 26 | "from sklearn import datasets, __version__\n", 27 | "from sklearn.naive_bayes import GaussianNB, BernoulliNB\n", 28 | "from sklearn.metrics import accuracy_score\n", 29 | "\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "import seaborn as sns\n", 32 | "sns.set()\n", 33 | "\n", 34 | "from packaging.version import Version\n", 35 | "assert(Version(__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "Then we load the MNIST data. First time we need to download the data, which can take a while." 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 52 | "\n", 53 | "print('MNIST data loaded: train:',len(X_train),'test:',len(X_test))\n", 54 | "print('X_train:', X_train.shape)\n", 55 | "print('y_train:', y_train.shape)\n", 56 | "print('X_test', X_test.shape)\n", 57 | "print('y_test', y_test.shape)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "The training data (`X_train`) is a matrix of size (60000, 784), i.e. it consists of 60000 digits expressed as 784 sized vectors (28x28 images flattened to 1D). `y_train` is a 60000-dimensional vector containing the correct classes (\"0\", \"1\", ..., \"9\") for each training digit.\n", 65 | "\n", 66 | "Let's take a closer look. Here are the first 10 training digits:" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": false, 74 | "jupyter": { 75 | "outputs_hidden": false 76 | } 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "pltsize=1\n", 81 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 82 | "\n", 83 | "for i in range(10):\n", 84 | " plt.subplot(1,10,i+1)\n", 85 | " plt.axis('off')\n", 86 | " plt.imshow(X_train[i,:].reshape(28, 28), cmap=\"gray\")\n", 87 | " plt.title('Class: '+y_train[i])" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Naive Bayes classifiers\n", 95 | "\n", 96 | "Naive Bayes classifiers are a family of simple classifiers based on applying Bayes' theorem. The classifiers are called \"naive\" as we make a strong assumption that the features are conditionally independent given the value of the class variable. While this assumption is not usually true, a naive Bayes classifier may in practice work reasonably well. Naive Bayes classifiers are also simple and fast compared to many more sophisticated methods.\n", 97 | "\n", 98 | "The classification rule for naive Bayes is\n", 99 | "\\begin{equation}\n", 100 | "\\hat{y} = \\arg\\max_yP(y)\\prod_{i=1}^nP(x_i|y)\n", 101 | "\\end{equation}\n", 102 | "where $P(y)$ is the prior probability of class $y$ and $P(x_i|y)$ is the class-conditional likelihood of feature $i$.\n", 103 | "\n", 104 | "## Gaussian naive Bayes\n", 105 | "\n", 106 | "In Gaussian naive Bayes, the likelihood of the features is assumed to be Gaussian\n", 107 | "\\begin{equation}\n", 108 | "P(x_i|y) = \\mathcal{N}(x_i\\,|\\,\\mu_{iy},\\sigma_{iy}^2) \n", 109 | "\\end{equation}\n", 110 | "where $\\mu_{iy}$ and $\\sigma_{iy}^2$ are the mean and variance, respectively, of feature $i$ in objects of class $y$." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "mu = 192.\n", 120 | "sigma = 32.\n", 121 | "x = np.arange(255.)\n", 122 | "plt.plot(x, 1/(sigma * np.sqrt(2 * np.pi)) * np.exp( - (x - mu)**2 / (2 * sigma**2)),\n", 123 | " lw=3)\n", 124 | "plt.xticks([0,127,255])\n", 125 | "plt.title('Gaussian distribution with $\\mu={}$ and $\\sigma={}$'.format(mu, sigma));" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "The prior probabilities $P(y)$ are learned from training data by default. 
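To make the classification rule above concrete, here is a tiny NumPy/SciPy sketch of the Gaussian naive Bayes decision for a single sample in a toy two-class, two-feature problem (all numbers are made up for illustration):

```python
import numpy as np
from scipy.stats import norm

priors = np.array([0.6, 0.4])       # P(y) for classes 0 and 1
means  = np.array([[0.0, 1.0],      # mu_{iy} for class 0
                   [2.0, 3.0]])     # mu_{iy} for class 1
stds   = np.array([[1.0, 1.0],
                   [1.0, 1.0]])     # sigma_{iy}

x = np.array([1.5, 2.0])            # one sample with two features

# log P(y) + sum_i log N(x_i | mu_iy, sigma_iy^2), then pick the argmax.
log_post = np.log(priors) + norm.logpdf(x, means, stds).sum(axis=1)
print('Unnormalized log-posteriors:', log_post)
print('Predicted class:', np.argmax(log_post))
```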
\n", 133 | "\n", 134 | "### Learning\n", 135 | "\n", 136 | "Training a naive Bayes classifier is fast:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false, 144 | "jupyter": { 145 | "outputs_hidden": false 146 | } 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "%%time\n", 151 | "\n", 152 | "clf_gnb = GaussianNB()\n", 153 | "clf_gnb.fit(X_train, y_train)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "We can take a look at the mean and variance of features for each class." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "pltsize=1\n", 170 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 171 | "plt.suptitle('Mean of each feature', y=1.3)\n", 172 | "\n", 173 | "for i in range(10):\n", 174 | " plt.subplot(1,10,i+1)\n", 175 | " plt.axis('off')\n", 176 | " plt.imshow(clf_gnb.theta_[i,:].reshape(28, 28), cmap=\"gray\")\n", 177 | " plt.title('Class: '+str(i))\n", 178 | "\n", 179 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 180 | "plt.suptitle('Variance of each feature', y=1.1)\n", 181 | "for i in range(10):\n", 182 | " plt.subplot(1,10,i+1)\n", 183 | " plt.axis('off')\n", 184 | " plt.imshow(clf_gnb.var_[i,:].reshape(28, 28), cmap=\"gray\")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Inference\n", 192 | "\n", 193 | "Evaluating a naive Bayes classifier is also fast:" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "collapsed": false, 201 | "jupyter": { 202 | "outputs_hidden": false 203 | } 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "%%time\n", 208 | "\n", 209 | "predictions = clf_gnb.predict(X_test)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "The accuracy of the classifier:" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "collapsed": false, 224 | "jupyter": { 225 | "outputs_hidden": false 226 | } 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "print('Predicted', len(predictions), 'digits with accuracy:', accuracy_score(y_test, predictions))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "We can also inspect the results in more detail. Let's use the `show_failures()` helper function to show the wrongly classified test digits." 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "show_failures(predictions, y_test, X_test)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "show_failures(predictions, y_test, X_test, trueclass='5')" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "We can observe that the classifier makes rather easy mistakes." 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Bernoulli naive Bayes\n", 270 | "\n", 271 | "Gaussian naive Bayes assumes that the features are normally distributed, which is not a good assumption for the MNIST digits. 
Let's therefore use a second approach and model each feature as a binary variable (\"black\" or \"white\"). A suitable distribution in this case is the Bernoulli\n", 272 | "\\begin{equation}\n", 273 | "P(x_i|y) = \\mathrm{Ber}(x_i\\,|\\,\\theta_{iy}) \n", 274 | "\\end{equation}\n", 275 | "where $\\theta_{iy}$ is the probability that feature $i$ is \"white\" in objects of class $y$." 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "theta = 0.4\n", 285 | "plt.bar([0, 255], [1-theta, theta], width=16.)\n", 286 | "plt.xticks([0,127,255])\n", 287 | "plt.title('Bernoulli distribution with $\\\\theta={}$'.format(theta));" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "### Learning\n", 295 | "\n", 296 | "Bernoulli naive Bayes assumes binary data, so we'll binarize the digits with a threshold in the middle." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "collapsed": false, 304 | "jupyter": { 305 | "outputs_hidden": false 306 | } 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "%%time\n", 311 | "\n", 312 | "clf_bnb = BernoulliNB(binarize=128.)\n", 313 | "clf_bnb.fit(X_train, y_train)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "We can take a look at the probabilities of features for each class." 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "pltsize=1\n", 330 | "plt.figure(figsize=(10*pltsize, pltsize))\n", 331 | "plt.suptitle('Probability of each feature', y=1.3)\n", 332 | "\n", 333 | "for i in range(10):\n", 334 | " plt.subplot(1,10,i+1)\n", 335 | " plt.axis('off')\n", 336 | " plt.imshow(np.exp(clf_bnb.feature_log_prob_[i,:]).reshape(28, 28), cmap=\"gray\")\n", 337 | " plt.title('Class: '+str(i))" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "### Inference" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "collapsed": false, 352 | "jupyter": { 353 | "outputs_hidden": false 354 | } 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "%%time\n", 359 | "\n", 360 | "predictions = clf_bnb.predict(X_test)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": { 367 | "collapsed": false, 368 | "jupyter": { 369 | "outputs_hidden": false 370 | } 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "print('Predicted', len(predictions), 'digits with accuracy:', accuracy_score(y_test, predictions))" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": { 381 | "collapsed": false, 382 | "jupyter": { 383 | "outputs_hidden": false 384 | } 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "show_failures(predictions, y_test, X_test)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "show_failures(predictions, y_test, X_test, trueclass='5')" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [] 406 | } 407 | ], 408 | "metadata": { 409 | "kernelspec": { 410 | "display_name": "Python 3 (ipykernel)", 411 | "language": "python", 412 | 
"name": "python3" 413 | }, 414 | "language_info": { 415 | "codemirror_mode": { 416 | "name": "ipython", 417 | "version": 3 418 | }, 419 | "file_extension": ".py", 420 | "mimetype": "text/x-python", 421 | "name": "python", 422 | "nbconvert_exporter": "python", 423 | "pygments_lexer": "ipython3", 424 | "version": "3.11.9" 425 | } 426 | }, 427 | "nbformat": 4, 428 | "nbformat_minor": 4 429 | } 430 | -------------------------------------------------------------------------------- /Extra-02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "ein.tags": "worksheet-0", 7 | "slideshow": { 8 | "slide_type": "-" 9 | } 10 | }, 11 | "source": [ 12 | "# MNIST handwritten digits classification with parameter grid search for SVM\n", 13 | "\n", 14 | "In this notebook, we'll use [grid search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) and a validation set to find optimal values for our SVM model's hyperparameters.\n", 15 | "\n", 16 | "First, the needed imports. " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "autoscroll": false, 24 | "collapsed": false, 25 | "ein.hycell": false, 26 | "ein.tags": "worksheet-0", 27 | "jupyter": { 28 | "outputs_hidden": false 29 | }, 30 | "slideshow": { 31 | "slide_type": "-" 32 | } 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "%matplotlib inline\n", 37 | "\n", 38 | "from pml_utils import get_mnist\n", 39 | "\n", 40 | "import numpy as np\n", 41 | "from sklearn import svm, datasets, __version__\n", 42 | "from sklearn.linear_model import SGDClassifier\n", 43 | "from sklearn.metrics import accuracy_score\n", 44 | "from sklearn.model_selection import GridSearchCV, PredefinedSplit\n", 45 | "\n", 46 | "import matplotlib.pyplot as plt\n", 47 | "import seaborn as sns\n", 48 | "sns.set()\n", 49 | "\n", 50 | "# Suppress annoying warnings...\n", 51 | "import warnings\n", 52 | "from sklearn.exceptions import ConvergenceWarning\n", 53 | "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n", 54 | "\n", 55 | "from packaging.version import Version\n", 56 | "assert(Version(__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "ein.tags": "worksheet-0", 63 | "slideshow": { 64 | "slide_type": "-" 65 | } 66 | }, 67 | "source": [ 68 | "Then we load the MNIST data. First time it downloads the data, which can take a while." 
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "autoscroll": false, 76 | "collapsed": false, 77 | "ein.hycell": false, 78 | "ein.tags": "worksheet-0", 79 | "jupyter": { 80 | "outputs_hidden": false 81 | }, 82 | "slideshow": { 83 | "slide_type": "-" 84 | } 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 89 | "\n", 90 | "print('MNIST data loaded: train:',len(X_train),'test:',len(X_test))\n", 91 | "print('X_train:', X_train.shape)\n", 92 | "print('y_train:', y_train.shape)\n", 93 | "print('X_test', X_test.shape)\n", 94 | "print('y_test', y_test.shape)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "ein.tags": "worksheet-0", 101 | "slideshow": { 102 | "slide_type": "-" 103 | } 104 | }, 105 | "source": [ 106 | "## Linear SVM\n", 107 | "\n", 108 | "Let's start with the linear SVM trained with a subset of training data. `C` is the penalty parameter that we need to specify. Let's first try with just some guess, e.g., `C=1.0`." 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "autoscroll": false, 116 | "collapsed": false, 117 | "ein.hycell": false, 118 | "ein.tags": "worksheet-0", 119 | "jupyter": { 120 | "outputs_hidden": false 121 | }, 122 | "slideshow": { 123 | "slide_type": "-" 124 | } 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "%%time\n", 129 | "\n", 130 | "clf_lsvm = svm.LinearSVC(C=1.0)\n", 131 | "\n", 132 | "print(clf_lsvm.fit(X_train[:10000,:], y_train[:10000]))\n", 133 | "\n", 134 | "pred_lsvm = clf_lsvm.predict(X_test)\n", 135 | "print('Predicted', len(pred_lsvm), 'digits with accuracy:', accuracy_score(y_test, pred_lsvm))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "ein.tags": "worksheet-0", 142 | "slideshow": { 143 | "slide_type": "-" 144 | } 145 | }, 146 | "source": [ 147 | "Next, let's try grid search, i.e., we try several different values for the parameter `C`. Remember that it's important to *not* use the test set for evaluating hyperparameters. Instead we opt to set aside the last 1000 images as a validation set.\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "autoscroll": false, 155 | "collapsed": false, 156 | "ein.hycell": false, 157 | "ein.tags": "worksheet-0", 158 | "jupyter": { 159 | "outputs_hidden": false 160 | }, 161 | "slideshow": { 162 | "slide_type": "-" 163 | } 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "%%time\n", 168 | "\n", 169 | "# The values for C that we will try out\n", 170 | "param_grid = {'C': [1, 10, 100, 1000]}\n", 171 | "\n", 172 | "# Define the validation set\n", 173 | "valid_split = PredefinedSplit(9000*[-1] + 1000*[0])\n", 174 | "\n", 175 | "clf_lsvm_grid = GridSearchCV(clf_lsvm, param_grid, cv=valid_split, verbose=2)\n", 176 | "print(clf_lsvm_grid.fit(X_train[:10000,:], y_train[:10000]))" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "ein.tags": "worksheet-0", 183 | "slideshow": { 184 | "slide_type": "-" 185 | } 186 | }, 187 | "source": [ 188 | "We can now see what was the best value for C that was selected." 
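Before looking at the selected value, a short side note on the `PredefinedSplit(9000*[-1] + 1000*[0])` call above (this sketch is an editorial addition, not part of the original notebook): entries marked `-1` are always kept in the training part, while entries marked `0` form the single validation fold, so the last 1000 of the 10000 samples act as the validation set.

```
# Minimal illustration of how PredefinedSplit turns the test_fold vector
# into one fixed train/validation split.
from sklearn.model_selection import PredefinedSplit

split = PredefinedSplit(9000 * [-1] + 1000 * [0])
for train_idx, valid_idx in split.split():
    print(len(train_idx), 'training samples')     # 9000
    print(len(valid_idx), 'validation samples')   # 1000, indices 9000..9999
```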
189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "autoscroll": false, 196 | "collapsed": false, 197 | "ein.hycell": false, 198 | "ein.tags": "worksheet-0", 199 | "jupyter": { 200 | "outputs_hidden": false 201 | }, 202 | "slideshow": { 203 | "slide_type": "-" 204 | } 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "print(clf_lsvm_grid.best_params_)\n", 209 | "\n", 210 | "best_C = clf_lsvm_grid.best_params_['C']" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": { 216 | "ein.tags": "worksheet-0", 217 | "slideshow": { 218 | "slide_type": "-" 219 | } 220 | }, 221 | "source": [ 222 | "Let's try predicting with our new model using the optimal hyperparameter value." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "autoscroll": false, 230 | "collapsed": false, 231 | "ein.hycell": false, 232 | "ein.tags": "worksheet-0", 233 | "jupyter": { 234 | "outputs_hidden": false 235 | }, 236 | "slideshow": { 237 | "slide_type": "-" 238 | } 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "clf_lsvm2 = svm.LinearSVC(C=best_C)\n", 243 | "\n", 244 | "print(clf_lsvm2.fit(X_train[:10000,:], y_train[:10000]))\n", 245 | "\n", 246 | "pred_lsvm2 = clf_lsvm2.predict(X_test)\n", 247 | "print('Predicted', len(pred_lsvm2), 'digits with accuracy:', accuracy_score(y_test, pred_lsvm2))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": { 253 | "ein.tags": "worksheet-0", 254 | "slideshow": { 255 | "slide_type": "-" 256 | } 257 | }, 258 | "source": [ 259 | "## Kernel SVM\n", 260 | "\n", 261 | "The Kernel SVM typically has two hyperparameters that need to be set. For example, for a Gaussian (or RBF) kernel we also have `gamma` (Greek $\\gamma$) in addition to `C`. Let's first try with some initial guesses for the values." 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "autoscroll": false, 269 | "collapsed": false, 270 | "ein.hycell": false, 271 | "ein.tags": "worksheet-0", 272 | "jupyter": { 273 | "outputs_hidden": false 274 | }, 275 | "slideshow": { 276 | "slide_type": "-" 277 | } 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "%%time\n", 282 | "\n", 283 | "clf_ksvm = svm.SVC(decision_function_shape='ovr', kernel='rbf', C=1.0, gamma=1e-6)\n", 284 | "print(clf_ksvm.fit(X_train[:10000,:], y_train[:10000]))\n", 285 | "\n", 286 | "pred_ksvm = clf_ksvm.predict(X_test)\n", 287 | "print('Predicted', len(pred_ksvm), 'digits with accuracy:', accuracy_score(y_test, pred_ksvm))" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": { 293 | "ein.tags": "worksheet-0", 294 | "slideshow": { 295 | "slide_type": "-" 296 | } 297 | }, 298 | "source": [ 299 | "Now we can try grid search again, this time with two parameters. We use an even smaller subset of the training set, as it would otherwise be too slow."
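With two hyperparameters the number of candidate combinations multiplies quickly, which is why the subset is kept small. A rough sketch of the bookkeeping (an editorial addition, not part of the original notebook):

```
# The grid below has 3 values for C and 5 for gamma, i.e. 3 * 5 = 15 combinations.
# With a single predefined validation split, GridSearchCV thus fits 15 SVMs,
# plus one final refit on all 3500 samples (refit=True is the default).
from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({'C': [1, 10, 100],
                      'gamma': [1e-8, 5e-8, 1e-7, 5e-7, 1e-6]})
print(len(grid), 'parameter combinations to evaluate')
```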
300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "autoscroll": false, 307 | "collapsed": false, 308 | "ein.hycell": false, 309 | "ein.tags": "worksheet-0", 310 | "jupyter": { 311 | "outputs_hidden": false 312 | }, 313 | "slideshow": { 314 | "slide_type": "-" 315 | } 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "%%time\n", 320 | "\n", 321 | "param_grid = {'C': [1, 10, 100],\n", 322 | " 'gamma': [1e-8, 5e-8, 1e-7, 5e-7, 1e-6]}\n", 323 | "\n", 324 | "train_items = 3000\n", 325 | "valid_items = 500\n", 326 | "tot_items = train_items + valid_items\n", 327 | "\n", 328 | "valid_split = PredefinedSplit(train_items*[-1] + valid_items*[0])\n", 329 | "\n", 330 | "clf_ksvm_grid = GridSearchCV(clf_ksvm, param_grid, cv=valid_split, verbose=2)\n", 331 | "print(clf_ksvm_grid.fit(X_train[:tot_items,:], y_train[:tot_items]))\n" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": { 337 | "ein.tags": "worksheet-0", 338 | "slideshow": { 339 | "slide_type": "-" 340 | } 341 | }, 342 | "source": [ 343 | "Again, let's see what parameters were selected." 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "autoscroll": false, 351 | "collapsed": false, 352 | "ein.hycell": false, 353 | "ein.tags": "worksheet-0", 354 | "jupyter": { 355 | "outputs_hidden": false 356 | }, 357 | "slideshow": { 358 | "slide_type": "-" 359 | } 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "print(clf_ksvm_grid.best_params_)\n", 364 | "\n", 365 | "best_C = clf_ksvm_grid.best_params_['C']\n", 366 | "best_gamma = clf_ksvm_grid.best_params_['gamma']" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "ein.tags": "worksheet-0", 373 | "slideshow": { 374 | "slide_type": "-" 375 | } 376 | }, 377 | "source": [ 378 | "As we did the grid search on a small subset of the training set it probably makes sense to retrain the model with the selected parameters using a bigger part of the training data." 
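Before retraining, it can also be informative to inspect the validation score of every grid point rather than only the best one. A small sketch (an editorial addition; it assumes pandas is available in the course environment):

```
# cv_results_ holds one row per parameter combination, including the mean
# validation score and the rank of each combination.
import pandas as pd  # assumption: pandas is installed alongside scikit-learn

results = pd.DataFrame(clf_ksvm_grid.cv_results_)
print(results[['param_C', 'param_gamma', 'mean_test_score', 'rank_test_score']]
      .sort_values('rank_test_score').to_string(index=False))
```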
379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "autoscroll": false, 386 | "collapsed": false, 387 | "ein.hycell": false, 388 | "ein.tags": "worksheet-0", 389 | "jupyter": { 390 | "outputs_hidden": false 391 | }, 392 | "slideshow": { 393 | "slide_type": "-" 394 | } 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "clf_ksvm2 = svm.SVC(decision_function_shape='ovr', kernel='rbf', C=best_C, gamma=best_gamma)\n", 399 | "print(clf_ksvm2.fit(X_train[:10000,:], y_train[:10000]))\n", 400 | "\n", 401 | "pred_ksvm2 = clf_ksvm2.predict(X_test)\n", 402 | "print('Predicted', len(pred_ksvm2), 'digits with accuracy:', accuracy_score(y_test, pred_ksvm2))" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": { 409 | "autoscroll": false, 410 | "collapsed": false, 411 | "ein.hycell": false, 412 | "ein.tags": "worksheet-0", 413 | "jupyter": { 414 | "outputs_hidden": false 415 | }, 416 | "slideshow": { 417 | "slide_type": "-" 418 | } 419 | }, 420 | "outputs": [], 421 | "source": [] 422 | } 423 | ], 424 | "metadata": { 425 | "kernelspec": { 426 | "display_name": "Python 3 (ipykernel)", 427 | "language": "python", 428 | "name": "python3" 429 | }, 430 | "language_info": { 431 | "codemirror_mode": { 432 | "name": "ipython", 433 | "version": 3 434 | }, 435 | "file_extension": ".py", 436 | "mimetype": "text/x-python", 437 | "name": "python", 438 | "nbconvert_exporter": "python", 439 | "pygments_lexer": "ipython3", 440 | "version": "3.11.9" 441 | }, 442 | "name": "sklearn-mnist-grid.ipynb" 443 | }, 444 | "nbformat": 4, 445 | "nbformat_minor": 4 446 | } 447 | -------------------------------------------------------------------------------- /Extra-03.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MNIST handwritten digits classification with an ensemble of classifiers \n", 8 | "\n", 9 | "In this notebook, we'll use a [classifier ensemble](https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier) to classify MNIST digits using scikit-learn (version 0.20 or later required).\n", 10 | "\n", 11 | "First, the needed imports. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from pml_utils import get_mnist, show_failures\n", 23 | "\n", 24 | "import numpy as np\n", 25 | "from sklearn.model_selection import train_test_split\n", 26 | "from sklearn import __version__\n", 27 | "from sklearn.linear_model import SGDClassifier\n", 28 | "from sklearn.tree import DecisionTreeClassifier\n", 29 | "from sklearn.naive_bayes import BernoulliNB\n", 30 | "from sklearn.ensemble import VotingClassifier\n", 31 | "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", 32 | "\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "import seaborn as sns\n", 35 | "sns.set()\n", 36 | "\n", 37 | "from packaging.version import Version\n", 38 | "assert(Version(__version__) >= Version(\"0.20\")), \"Version >= 0.20 of sklearn is required.\"" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Then we load the MNIST data. The first time, the data will be downloaded, which can take a while."
46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "X_train, y_train, X_test, y_test = get_mnist('MNIST')\n", 55 | "\n", 56 | "print('MNIST data loaded: train:',len(X_train),'test:',len(X_test))\n", 57 | "print('X_train:', X_train.shape)\n", 58 | "print('y_train:', y_train.shape)\n", 59 | "print('X_test', X_test.shape)\n", 60 | "print('y_test', y_test.shape)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "The training data (`X_train`) is a matrix of size (60000, 784), i.e. it consists of 60000 digits expressed as 784 sized vectors (28x28 images flattened to 1D). `y_train` is a 60000-dimensional vector containing the correct classes (\"0\", \"1\", ..., \"9\") for each training digit." 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Individual classifiers\n", 75 | "\n", 76 | "Let's first define and train a set of different classifiers." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### SGDClassifier" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "%%time\n", 93 | "\n", 94 | "clf_sgd = SGDClassifier()\n", 95 | "print(clf_sgd.fit(X_train, y_train))\n", 96 | "pred_sgd = clf_sgd.predict(X_test)\n", 97 | "print('Predicted', len(pred_sgd), 'digits with accuracy:', accuracy_score(y_test, pred_sgd))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### Decision tree" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "%%time\n", 114 | "\n", 115 | "clf_dt = DecisionTreeClassifier()\n", 116 | "print(clf_dt.fit(X_train, y_train))\n", 117 | "pred_dt = clf_dt.predict(X_test)\n", 118 | "print('Predicted', len(pred_dt), 'digits with accuracy:', accuracy_score(y_test, pred_dt))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Bernoulli naive Bayes" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false, 133 | "jupyter": { 134 | "outputs_hidden": false 135 | } 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "%%time\n", 140 | "\n", 141 | "clf_bnb = BernoulliNB(binarize=128.)\n", 142 | "print(clf_bnb.fit(X_train, y_train))\n", 143 | "pred_bnb = clf_bnb.predict(X_test)\n", 144 | "print('Predicted', len(pred_bnb), 'digits with accuracy:', accuracy_score(y_test, pred_bnb))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## Ensemble classifier\n", 152 | "\n", 153 | "The goal of ensemble methods is to combine the predictions of several base classifiers to improve generalizability and robustness.\n", 154 | "\n", 155 | "### Learning\n", 156 | "\n", 157 | "We use [`VotingClassifier`](https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier) to combine the results of the individual classifiers.\n", 158 | "The default mode is to use majority (`\"hard\"`) voting, where each classifier gets a vote and the final prediction is the class that gets the majority of the votes.\n", 159 | "Another option is to use the average of the predicted probabilities (`\"soft\"` voting), which however requires that all used individual classifiers are able to 
predict class probabilities. " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "%%time\n", 169 | "\n", 170 | "clf_vote = VotingClassifier(estimators=[('sgd', clf_sgd),\n", 171 | " ('dt', clf_dt),\n", 172 | " ('bnb', clf_bnb)],\n", 173 | " voting='hard')\n", 174 | "clf_vote.fit(X_train, y_train)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "### Inference\n", 182 | "\n", 183 | "The classification accuracy of the ensemble classifier:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false, 191 | "jupyter": { 192 | "outputs_hidden": false 193 | } 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "pred_vote = clf_vote.predict(X_test)\n", 198 | "print('Predicted', len(pred_vote), 'digits with accuracy:', accuracy_score(y_test, pred_vote))" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "#### Confusion matrix\n", 206 | "\n", 207 | "We can compute the confusion matrix to see which digits get mixed the most:" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false, 215 | "jupyter": { 216 | "outputs_hidden": false 217 | } 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "labels=[str(i) for i in range(10)]\n", 222 | "print('Confusion matrix (rows: true classes; columns: predicted classes):'); print()\n", 223 | "cm=confusion_matrix(y_test, pred_vote, labels=labels)\n", 224 | "print(cm); print()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "#### Accuracy, precision and recall\n", 232 | "\n", 233 | "Classification accuracy for each class:" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false, 241 | "jupyter": { 242 | "outputs_hidden": false 243 | } 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "for i,j in enumerate(cm.diagonal()/cm.sum(axis=1)): print(\"%d: %.4f\" % (i,j))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "Precision and recall for each class:" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "collapsed": false, 262 | "jupyter": { 263 | "outputs_hidden": false 264 | } 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "print(classification_report(y_test, pred_vote, labels=labels))" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": { 274 | "ein.tags": "worksheet-0", 275 | "slideshow": { 276 | "slide_type": "-" 277 | } 278 | }, 279 | "source": [ 280 | "#### Failure analysis\n", 281 | "\n", 282 | "We can also do some failure analysis. Let's check the 10 first wrongly predicted digits." 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "show_failures(pred_vote, y_test, X_test)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Model tuning\n", 299 | "\n", 300 | "Try adding various classifiers covered on this course to the ensemble and experiment with different setups. \n", 301 | "\n", 302 | "Report the highest classification accuracy you manage to obtain. 
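One setup worth trying is soft voting, sketched below (an editorial addition, not part of the original notebook). Soft voting averages predicted class probabilities, so every estimator must implement `predict_proba`; the decision tree and Bernoulli naive Bayes classifiers above do, while the default hinge-loss `SGDClassifier` does not and is therefore left out of this sketch.

```
# Soft-voting ensemble using only the probability-producing classifiers above.
clf_vote_soft = VotingClassifier(estimators=[('dt', clf_dt),
                                             ('bnb', clf_bnb)],
                                 voting='soft')
clf_vote_soft.fit(X_train, y_train)
pred_vote_soft = clf_vote_soft.predict(X_test)
print('Predicted', len(pred_vote_soft), 'digits with accuracy:',
      accuracy_score(y_test, pred_vote_soft))
```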
Also mark down the parameters you used, so others can try to reproduce your results. \n" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [] 318 | } 319 | ], 320 | "metadata": { 321 | "kernelspec": { 322 | "display_name": "Python 3 (ipykernel)", 323 | "language": "python", 324 | "name": "python3" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 3 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython3", 336 | "version": "3.11.9" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 4 341 | } 342 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 CSC Training 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # intro-to-ml [![Gitter chat](https://badges.gitter.im/csc_training/intro-to-ml.svg)](https://gitter.im/csc_training/intro-to-ml) 2 | Exercises for [CSC](https://www.csc.fi/)'s Practical Machine Learning course 3 | 4 | See [SETUP.md](SETUP.md) for instructions on how to set up Notebooks. 
5 | 6 | ## Exercises 7 | 8 | * Exercise 01: [Introduction to Notebooks, Python](https://github.com/csc-training/python-introduction/blob/gh-pages/notebooks/examples/1%20-%20Introduction.ipynb), and [numpy](https://github.com/csc-training/python-introduction/blob/gh-pages/notebooks/examples/7%20-%20NumPy.ipynb) 9 | * Exercise 02: [Linear classifiers](Exercise-02.ipynb) 10 | * Exercise 03: [Nearest neighbor classifiers](Exercise-03.ipynb) 11 | * Exercise 04: [Linear and polynomial regression](Exercise-04.ipynb) 12 | * Exercise 05: [Classification with SVMs](Exercise-05.ipynb) 13 | * Exercise 06: [Regression with SVMs](Exercise-06.ipynb) 14 | * Exercise 07: [Classification with decision trees](Exercise-07.ipynb) 15 | * Exercise 08: [Regression with decision trees](Exercise-08.ipynb) 16 | * Exercise 09: [Classification with neural networks](Exercise-09.ipynb) 17 | * Exercise 10: [Regression with neural networks](Exercise-10.ipynb) 18 | * Exercise 11: [Dimensionality reduction](Exercise-11.ipynb) 19 | * Exercise 12: [Data visualization](Exercise-12.ipynb) 20 | * Exercise 13: [Clustering](Exercise-13.ipynb) 21 | * Exercise 14: [Anomaly detection](Exercise-14.ipynb) 22 | 23 | ## Extra 24 | 25 | * Extra 01: [Classification with naive Bayes](Extra-01.ipynb) 26 | * Extra 02: [Parameter grid search for SVM classification](Extra-02.ipynb) 27 | * Extra 03: [Ensemble of classifiers](Extra-03.ipynb) 28 | -------------------------------------------------------------------------------- /SETUP.md: -------------------------------------------------------------------------------- 1 | 2 | # Setup 3 | 4 | We will use Jupyter Notebooks for all exercises. There are several ways to set up a Jupyter environment for running the exercises: 5 | 6 | ## Option 1. CSC’s Notebooks 7 | 8 | *The default option.* CSC’s Notebooks (https://notebooks.csc.fi) provides easy-to-use environments for working with data and programming. You can access everything via your web browser and CSC cloud environment computes on the background. 9 | 10 | * Point your browser to https://notebooks.csc.fi 11 | * Login using Haka or a CSC account (or using Alternate login and a separate username and password) 12 | * Find *Course Practical Machine Learning 2019* and click “Launch new” 13 | * Wait until the “Open in browser” link appears, then click on it 14 | * The JupyterLab notebook dashboard should appear 15 | * If you are not familiar with Jupyter, take a moment to get to know the interface 16 | * open a new notebook using the Launcher (click on *Notebook: Python 3*) 17 | * write some Python code to a Jupyter *cell* 18 | * execute the cell with *shift-enter* 19 | * Exercise 1: Navigate to `python-introduction/notebooks/examples` and go through the notebooks *1 - Introduction.ipynb* and *7 - NumPy.ipynb*. 20 | * Other exercises: Navigate to `intro-to-ml` where all the exercise notebooks are located 21 | 22 | ## Option 2. Running Jupyter on your laptop 23 | 24 | If you have a laptop that has Jupyter, Scikit-learn, and all the other necessary Python packages installed, it is possible to use it. Clone the Git repositories used on this course 25 | 26 | git clone https://github.com/csc-training/python-introduction 27 | git clone https://github.com/csc-training/intro-to-ml 28 | 29 | and launch Jupyter. 
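If you are unsure whether your environment has recent enough packages, you can check the versions directly from Python (a convenience sketch added here; it is not part of the original instructions):

```
# Quick environment check for the packages used throughout the exercises.
import sklearn, numpy, matplotlib, seaborn
print('scikit-learn:', sklearn.__version__)   # the notebooks assert version >= 0.20
print('numpy:', numpy.__version__)
print('matplotlib:', matplotlib.__version__)
print('seaborn:', seaborn.__version__)
```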
30 | -------------------------------------------------------------------------------- /pml_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Adapted from https://github.com/tensorflow/models/blob/master/official/mnist/dataset.py 4 | 5 | import gzip 6 | import os 7 | import shutil 8 | import tempfile 9 | import struct 10 | 11 | import numpy as np 12 | import urllib.request 13 | 14 | import matplotlib.pyplot as plt 15 | import seaborn as sns 16 | sns.set() 17 | 18 | def show_failures(predictions, y_test, X_test, trueclass=None, 19 | predictedclass=None, maxtoshow=10): 20 | import matplotlib.pyplot as plt 21 | 22 | if len(predictions.shape) > 1: 23 | predictions = np.argmax(predictions, axis=1) 24 | errors = predictions != y_test 25 | print('Showing max', maxtoshow, 'first failures. The predicted class is ' 26 | 'shown first and the correct class in parenthesis.') 27 | ii = 0 28 | plt.figure(figsize=(maxtoshow, 1)) 29 | for i in range(X_test.shape[0]): 30 | if ii >= maxtoshow: 31 | break 32 | if errors[i]: 33 | if trueclass is not None and y_test[i] != trueclass: 34 | continue 35 | if predictedclass is not None and predictions[i] != predictedclass: 36 | continue 37 | plt.subplot(1, maxtoshow, ii+1) 38 | plt.axis('off') 39 | plt.imshow(X_test[i, :].reshape(28, 28), cmap="gray") 40 | plt.title("%s (%s)" % (predictions[i], y_test[i])) 41 | ii = ii + 1 42 | 43 | def show_clusters(labels, n_clust, X, n_img_per_row = 32): 44 | img = np.zeros((28 * n_clust, 28 * n_img_per_row)) 45 | 46 | for i in range(n_clust): 47 | ix = 28 * i 48 | X_cluster = X[labels==i,:] 49 | try: 50 | for j in range(n_img_per_row): 51 | iy = 28 * j 52 | img[ix:ix + 28, iy:iy + 28] = X_cluster[j,:].reshape(28,28) 53 | except IndexError: 54 | pass 55 | 56 | plt.figure(figsize=(12, 12)) 57 | plt.imshow(img, cmap='gray') 58 | plt.title('Some MNIST digits from each cluster') 59 | plt.xticks([]) 60 | plt.yticks([]) 61 | plt.ylabel('clusters'); 62 | 63 | def show_anomalies(predictions, X, n_img_per_row = 32): 64 | img = np.zeros((28 * 2, 28 * n_img_per_row)) 65 | anolabels = [-1, 1] 66 | 67 | for i in range(2): 68 | ix = 28 * i 69 | X_ano = X[predictions==anolabels[i], :] 70 | try: 71 | for j in range(n_img_per_row): 72 | iy = 28 * j 73 | img[ix:ix + 28, iy:iy + 28] = X_ano[j,:].reshape(28,28) 74 | except IndexError: 75 | pass 76 | 77 | plt.figure(figsize=(12, 12)) 78 | plt.imshow(img, cmap='gray') 79 | plt.title('Examples of anomalies (upper row) and normal data (lower row)') 80 | plt.xticks([]) 81 | plt.yticks([]); 82 | 83 | def download_mnist(directory, filename): 84 | """Download (and unzip) a file from the MNIST dataset if not already done.""" 85 | 86 | filepath = os.path.join(directory, filename) 87 | if os.path.isfile(filepath): 88 | print('Not downloading, file already exists:', filepath) 89 | return filepath 90 | if not os.path.isdir(directory): 91 | os.mkdir(directory) 92 | # original: http://yann.lecun.com/exdb/mnist/ 93 | # CVDF mirror: https://storage.googleapis.com/cvdf-datasets/mnist/ 94 | # CSC mirror 95 | url_base = 'https://object.pouta.csc.fi/swift/v1/AUTH_dac/mldata/' 96 | url = url_base + filename + '.gz' 97 | _, zipped_filepath = tempfile.mkstemp(suffix='.gz') 98 | print('Downloading %s to %s' % (url, zipped_filepath)) 99 | urllib.request.urlretrieve(url, zipped_filepath) 100 | with gzip.open(zipped_filepath, 'rb') as f_in, open(filepath, 'wb') as f_out: 101 | shutil.copyfileobj(f_in, f_out) 102 | os.remove(zipped_filepath) 103 | 
return filepath 104 | 105 | 106 | def read_mnist_idx(filename): 107 | """Read MNIST file.""" 108 | 109 | with open(filename, 'rb') as f: 110 | zero, data_type, dims = struct.unpack('>HBB', f.read(4)) 111 | shape = tuple(struct.unpack('>I', f.read(4))[0] for d in range(dims)) 112 | return np.frombuffer(f.read(), dtype=np.uint8).reshape(shape) 113 | 114 | 115 | def get_mnist_dataset(directory, images_file, labels_file): 116 | """Download and parse MNIST dataset.""" 117 | 118 | images_file = download_mnist(directory, images_file) 119 | labels_file = download_mnist(directory, labels_file) 120 | 121 | images = read_mnist_idx(images_file) 122 | labels = read_mnist_idx(labels_file) 123 | 124 | return (images, labels) 125 | 126 | 127 | def get_mnist(directory, labels_as_strings=True, flatten=True): 128 | X_train, y_train = get_mnist_dataset(directory, 'train-images-idx3-ubyte', 129 | 'train-labels-idx1-ubyte') 130 | X_test, y_test = get_mnist_dataset(directory, 't10k-images-idx3-ubyte', 131 | 't10k-labels-idx1-ubyte') 132 | if labels_as_strings: 133 | y_train = y_train.astype(str) 134 | y_test = y_test.astype(str) 135 | 136 | if flatten: 137 | X_train = X_train.astype(np.float64).reshape(-1, 28*28) 138 | X_test = X_test.astype(np.float64).reshape(-1, 28*28) 139 | 140 | return (X_train, y_train, X_test, y_test) 141 | 142 | 143 | if __name__ == '__main__': 144 | X_train, y_train, X_test, y_test = get_mnist('MNIST') 145 | print() 146 | print('MNIST data loaded:') 147 | print('X_train:', X_train.shape, X_train.dtype) 148 | print('y_train:', y_train.shape, y_train.dtype) 149 | print('X_test:', X_test.shape, X_test.dtype) 150 | print('y_test:', y_test.shape, y_test.dtype) 151 | 152 | print() 153 | print(X_train[:3, :]) 154 | print(y_train[:3]) 155 | --------------------------------------------------------------------------------