├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── course ├── 0_Check_Environment.ipynb ├── 1 First Deep Learning Model.ipynb ├── 2 Data.ipynb ├── 3 Machine Learning.ipynb ├── 4 Deep Learning Intro.ipynb ├── 5 Gradient Descent.ipynb ├── 6 Convolutional Neural Networks.ipynb ├── 8 Recurrent Neural Networks.ipynb └── 9 Improving performance.ipynb ├── data ├── HR_comma_sep.csv ├── banknotes.csv ├── banknotes.png ├── cansim-0800020-eng-6674700030567901031.csv ├── diabetes.csv ├── generator │ └── class 0 │ │ └── squirrel.jpeg ├── housing-data.csv ├── international-airline-passengers.csv ├── iris.csv ├── iss.jpg ├── sms.wav ├── titanic-train.csv ├── us_retail_sales.csv ├── user_visit_duration.csv ├── weight-height.csv └── wines.csv ├── environment.yml ├── solutions ├── 2 Data exploration Exercises Solution.ipynb ├── 3 Machine Learning Exercises Solution.ipynb ├── 4 Deep Learning Intro Exercises Solution.ipynb ├── 5 Gradient Descent Exercises Solution.ipynb ├── 6 Convolutional Neural Networks Exercises Solution.ipynb ├── 8 Recurrent Neural Networks Exercises Solutions.ipynb └── 9 Improving performance Exercises Solutions.ipynb └── tests └── test_nb.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .floydexpt 3 | .floydignore 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other info into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # IPython Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial 2 | language: python 3 | python: 4 | - "3.7" 5 | install: 6 | - sudo apt-get update 7 | # Download and install Miniconda. (This runs on every build; a conditional 8 | # check could skip the download when the cached version is the same.)
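  # For example, a conditional variant could look like this (a sketch only,
  # not active in this config):
  # - if [ ! -f miniconda.sh ]; then wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; fi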
9 | - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 10 | - bash miniconda.sh -b -p $HOME/miniconda 11 | - export PATH="$HOME/miniconda/bin:$PATH" 12 | - hash -r 13 | - conda config --set always_yes yes --set changeps1 no 14 | - conda update -q conda 15 | # Useful for debugging any issues with conda 16 | - conda info -a 17 | 18 | - conda env create -q -n test-environment python=$TRAVIS_PYTHON_VERSION -f environment.yml 19 | - source activate test-environment 20 | 21 | script: 22 | - travis_wait 30 py.test -v 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | COPYRIGHT 2 | 3 | All contributions by Francesco Mosconi: 4 | Copyright (c) 2017, Francesco Mosconi. 5 | All rights reserved. 6 | 7 | All contributions by Catalit LLC: 8 | Copyright (c) 2017, Catalit LLC. 9 | All rights reserved. 10 | 11 | All other contributions: 12 | Copyright (c) 2015, the respective contributors. 13 | All rights reserved. 14 | 15 | Each contributor holds copyright over their respective contributions. 16 | The project versioning (Git) records all such contribution source information. 17 | MIT License 18 | 19 | Copyright (c) 2017 20 | 21 | Permission is hereby granted, free of charge, to any person obtaining a copy 22 | of this software and associated documentation files (the "Software"), to deal 23 | in the Software without restriction, including without limitation the rights 24 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 25 | copies of the Software, and to permit persons to whom the Software is 26 | furnished to do so, subject to the following conditions: 27 | 28 | The above copyright notice and this permission notice shall be included in all 29 | copies or substantial portions of the Software. 30 | 31 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 32 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 33 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 34 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 35 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 36 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 37 | SOFTWARE. 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## [Check our Zero To Deep Learning 5 day bootcamp. New dates are out!](https://www.zerotodeeplearning.com/?utm_source=github.com&utm_medium=affiliate&utm_campaign=https%3A%2F%2Fgithub.com%2FDataweekends%2Fzero_to_deep_learning_video&utm_content=README.md) 2 | 3 | ------ 4 | 5 | # Zero to Deep Learning® Video Course 6 | 7 | Welcome to the Zero to Deep Learning® Video Course repository. 8 | 9 | ## Get started guide 10 | 11 | #### Clone this repository on your local computer 12 | 13 | ``` 14 | git clone https://github.com/Dataweekends/zero_to_deep_learning_video.git 15 | ``` 16 | 17 | #### Download and Install Anaconda Python 3.7 18 | 19 | https://www.anaconda.com/distribution/ 20 | 21 | #### Change to course folder 22 | 23 | ``` 24 | cd zero_to_deep_learning_video 25 | ``` 26 | 27 | #### Create the course environment 28 | 29 | ``` 30 | conda env create 31 | ``` 32 | 33 | wait for the environment to create. 
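This reads the `environment.yml` file at the top of the repository and builds an environment named `ztdl` (the name comes from the `name:` field inside `environment.yml`). Should conda not pick the file up automatically, the explicit equivalent is:

```
conda env create -f environment.yml
```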
34 | 35 | #### Activate the environment (Mac/Linux) 36 | ``` 37 | conda activate ztdl 38 | ``` 39 | 40 | #### Activate the environment (Windows) 41 | ``` 42 | conda activate ztdl 43 | ``` 44 | 45 | Check that your prompt changed to 46 | 47 | ``` 48 | (ztdl) $ 49 | ``` 50 | 51 | #### Launch Jupyter Notebook 52 | 53 | ``` 54 | jupyter notebook 55 | ``` 56 | 57 | #### Open your browser to 58 | 59 | ``` 60 | http://localhost:8888 61 | ``` 62 | 63 | #### Run the Check environment Notebook 64 | 65 | Go to the course folder, open the notebook `0_Check_Environment.ipynb` and run it. If you see the message: 66 | 67 | Houston we are go! 68 | 69 | You are good to go! Enjoy! 70 | 71 | 72 | #### Troubleshooting installation 73 | If for some reason you don't see `Houston we are go!`, the simplest solution is to delete the environment and start from scratch. 74 | 75 | To remove the environment: 76 | 77 | - close the browser and go back to your terminal 78 | - stop jupyter notebook (CTRL-C) 79 | - deactivate the environment (Mac/Linux): 80 | 81 | ``` 82 | conda deactivate 83 | ``` 84 | 85 | - deactivate the environment (Windows 10): 86 | 87 | ``` 88 | conda deactivate 89 | ``` 90 | 91 | - delete the environment: 92 | 93 | ``` 94 | conda remove -y -n ztdl --all 95 | ``` 96 | 97 | - restart from environment creation and make sure that each step completes to the end. 98 | 99 | #### Updating Conda 100 | 101 | One thing you can also try is to update your conda executable. This may help if you already had Anaconda installed on your system. 102 | 103 | ``` 104 | conda update conda 105 | ``` 106 | 107 | These instructions have been tested on: 108 | 109 | - macOS Catalina 10.15.7 110 | - Ubuntu 18.04 111 | - Windows 10 112 | 113 | ## Running the course on Google Colaboratory with free GPU support 114 | 115 | Google offers a free platform to run Jupyter notebooks called Google Colaboratory. You need a Gmail or Google Apps email address to use it. 116 | 117 | Follow these steps: 118 | 119 | 1. Open your browser and go to https://colab.research.google.com/ 120 | 2. Choose the **GITHUB** tab and paste the repository address: `https://github.com/Dataweekends/zero_to_deep_learning_video` in the search bar. 121 | 3. Click on the notebook you would like to run 122 | 4. Enable GPU support in the `Edit -> Notebook Settings` menu 123 | 5. Enjoy running the notebook with GPU support! 124 | 6. If the notebook loads data from the repo you will have to download the data too. Follow these steps to do that: 125 | 1. Create a code cell at the top of the notebook 126 | 2. Clone the repository in Colab: 127 | ``` 128 | !git clone https://github.com/Dataweekends/zero_to_deep_learning_video.git 129 | ``` 130 | 3. Replace the `../data` path with `zero_to_deep_learning_video/data` in the cell that loads the data. 131 | 7. Enjoy! -------------------------------------------------------------------------------- /course/0_Check_Environment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Check Environment\n", 8 | "This notebook checks that you have correctly created the environment and that all packages needed are installed."
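For a quick programmatic version of the same check — a minimal sketch that simply looks for the environment name in the interpreter path — you can run:

```python
import sys
# the active interpreter should live under an envs/ztdl/ directory
assert 'ztdl' in sys.executable, sys.executable
```

The cells below walk through the same verification step by step.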
9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Environment\n", 16 | "\n", 17 | "The next command should return a line like (Mac/Linux):\n", 18 | "\n", 19 | " //anaconda/envs/ztdl/bin/python\n", 20 | "\n", 21 | "or like (Windows 10):\n", 22 | "\n", 23 | " C:\\\\\\\\Anaconda3\\\\envs\\\\ztdl\\\\python.exe\n", 24 | "\n", 25 | "In particular you should make sure that you are using the python executable from within the course environment.\n", 26 | "\n", 27 | "If that's not the case do this:\n", 28 | "\n", 29 | "1. close this notebook\n", 30 | "2. go to the terminal and stop jupyer notebook\n", 31 | "3. make sure that you have activated the environment, you should see a prompt like:\n", 32 | "\n", 33 | " (ztdl) $\n", 34 | "4. (optional) if you don't see that prompt activate the environment:\n", 35 | " - mac/linux:\n", 36 | " \n", 37 | " conda activate ztdl\n", 38 | "\n", 39 | " - windows:\n", 40 | "\n", 41 | " activate ztdl\n", 42 | "5. restart jupyter notebook" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import os\n", 52 | "import sys\n", 53 | "sys.executable" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Python 3.7\n", 61 | "\n", 62 | "The next line should say that you're using Python 3.7.x from Anaconda. At the time of publication it looks like this (Mac/Linux):\n", 63 | "\n", 64 | " Python 3.7.3 (default, Mar 27 2019, 22:11:17)\n", 65 | " [GCC 7.3.0] :: Anaconda, Inc. on linux\n", 66 | " Type \"help\", \"copyright\", \"credits\" or \"license\" for more information.\n", 67 | "\n", 68 | "or like this (Windows 10):\n", 69 | "\n", 70 | " Python 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)] :: Anaconda, Inc. on win32\n", 71 | " Type \"help\", \"copyright\", \"credits\" or \"license\" for more information.\n", 72 | "\n", 73 | "but date and exact version of GCC may change in the future.\n", 74 | "\n", 75 | "If you see a different version of python, go back to the previous step and make sure you created and activated the environment correctly." 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import sys\n", 85 | "sys.version" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Jupyter\n", 93 | "\n", 94 | "Check that Jupyter is running from within the environment. The next line should look like (Mac/Linux):\n", 95 | "\n", 96 | " //anaconda/envs/ztdl/lib/python3.6/site-packages/jupyter.py'\n", 97 | "\n", 98 | "or like this (Windows 10):\n", 99 | "\n", 100 | " C:\\\\Users\\\\\\\\Anaconda3\\\\envs\\\\ztdl\\\\lib\\\\site-packages\\\\jupyter.py" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "import jupyter\n", 110 | "jupyter.__file__" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Other packages\n", 118 | "\n", 119 | "Here we will check that all the packages are installed and have the correct versions. 
If everything is ok you should see:\n", 120 | " \n", 123 | " Houston we are go!\n", 124 | "\n", 125 | "If there's any issue here please make sure you have checked the previous steps and if it's all good please send us a question in the Q&A forum." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "import pip\n", 135 | "import numpy\n", 136 | "import jupyter\n", 137 | "import matplotlib\n", 138 | "import sklearn\n", 139 | "import scipy\n", 140 | "import pandas\n", 141 | "import PIL\n", 142 | "import seaborn\n", 143 | "import tensorflow\n", 144 | "\n", 145 | "\n", 146 | "def check_version(pkg, version):\n", 147 | " actual = pkg.__version__.split('.')\n", 148 | " if len(actual) == 3:\n", 149 | " actual_major = '.'.join(actual[:2])\n", 150 | " elif len(actual) == 2:\n", 151 | " actual_major = '.'.join(actual)\n", 152 | " else:\n", 153 | " raise NotImplementedError(pkg.__name__ +\n", 154 | " \" actual version: \" +\n", 155 | " pkg.__version__)\n", 156 | " try:\n", 157 | " assert(actual_major == version)\n", 158 | " except Exception as ex:\n", 159 | " print(\"{} {}\\t=> {}\".format(pkg.__name__,\n", 160 | " version,\n", 161 | " pkg.__version__))\n", 162 | " raise ex\n", 163 | "\n", 164 | "check_version(pip, '21.0')\n", 165 | "check_version(numpy, '1.19')\n", 166 | "check_version(matplotlib, '3.3')\n", 167 | "check_version(sklearn, '0.24')\n", 168 | "check_version(scipy, '1.6')\n", 169 | "check_version(pandas, '1.2')\n", 170 | "check_version(PIL, '8.2')\n", 171 | "check_version(seaborn, '0.11')\n", 172 | "check_version(tensorflow, '2.5')\n", 173 | "\n", 174 | "print(\"Houston we are go!\")" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "Python 3", 188 | "language": "python", 189 | "name": "python3" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.7.10" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 2 206 | } 207 | -------------------------------------------------------------------------------- /course/1 First Deep Learning Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# First Deep Learning Model" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "%matplotlib inline\n", 18 | "import matplotlib.pyplot as plt" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from sklearn.datasets import make_circles" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "X, y = make_circles(n_samples=1000,\n", 37 | " noise=0.1,\n", 38 | " factor=0.2,\n", 39 | " random_state=0)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 |
"source": [ 48 | "X" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "X.shape" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "plt.figure(figsize=(5, 5))\n", 67 | "plt.plot(X[y==0, 0], X[y==0, 1], 'ob', alpha=0.5)\n", 68 | "plt.plot(X[y==1, 0], X[y==1, 1], 'xr', alpha=0.5)\n", 69 | "plt.xlim(-1.5, 1.5)\n", 70 | "plt.ylim(-1.5, 1.5)\n", 71 | "plt.legend(['0', '1'])\n", 72 | "plt.title(\"Blue circles and Red crosses\")" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "from tensorflow.keras.models import Sequential\n", 82 | "from tensorflow.keras.layers import Dense\n", 83 | "from tensorflow.keras.optimizers import SGD" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "model = Sequential()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "model.add(Dense(4, input_shape=(2,), activation='tanh'))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "model.add(Dense(1, activation='sigmoid'))" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "model.compile(SGD(learning_rate=0.5), 'binary_crossentropy', metrics=['accuracy'])" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "model.fit(X, y, epochs=20)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "hticks = np.linspace(-1.5, 1.5, 101)\n", 138 | "vticks = np.linspace(-1.5, 1.5, 101)\n", 139 | "aa, bb = np.meshgrid(hticks, vticks)\n", 140 | "ab = np.c_[aa.ravel(), bb.ravel()]\n", 141 | "c = model.predict(ab)\n", 142 | "cc = c.reshape(aa.shape)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "plt.figure(figsize=(5, 5))\n", 152 | "plt.contourf(aa, bb, cc, cmap='bwr', alpha=0.2)\n", 153 | "plt.plot(X[y==0, 0], X[y==0, 1], 'ob', alpha=0.5)\n", 154 | "plt.plot(X[y==1, 0], X[y==1, 1], 'xr', alpha=0.5)\n", 155 | "plt.xlim(-1.5, 1.5)\n", 156 | "plt.ylim(-1.5, 1.5)\n", 157 | "plt.legend(['0', '1'])\n", 158 | "plt.title(\"Blue circles and Red crosses\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.7.10" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /course/2 
Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Exploration with Pandas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import pandas as pd\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "df = pd.read_csv('../data/titanic-train.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "type(df)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "df.head()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "df.info()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "df.describe()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### Indexing" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "df.iloc[3]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "df.loc[0:4,'Ticket']" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "df['Ticket'].head()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "df[['Embarked', 'Ticket']].head()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "### Selections" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "df[df['Age'] > 70]" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "df['Age'] > 70" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "df.query(\"Age > 70\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "df[(df['Age'] == 11) & (df['SibSp'] == 5)]" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "df[(df.Age == 11) | (df.SibSp == 5)]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "df.query('(Age == 11) | (SibSp == 5)')" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "### Unique Values" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | 
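"# the distinct ports of embarkation in the Titanic data: 'S', 'C', 'Q' (NaN = missing)\n",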
"df['Embarked'].unique()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Sorting" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "df.sort_values('Age', ascending = False).head()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Aggregations" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "df['Survived'].value_counts()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "df['Pclass'].value_counts()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "df.groupby(['Pclass', 'Survived'])['PassengerId'].count()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "df['Age'].min()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "df['Age'].max()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "df['Age'].mean()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "df['Age'].median()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "mean_age_by_survived = df.groupby('Survived')['Age'].mean()\n", 280 | "mean_age_by_survived" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "std_age_by_survived = df.groupby('Survived')['Age'].std()\n", 290 | "std_age_by_survived" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "### Merge" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "df1 = mean_age_by_survived.round(0).reset_index()\n", 307 | "df2 = std_age_by_survived.round(0).reset_index()" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "df1" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "df2" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "df3 = pd.merge(df1, df2, on='Survived')" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "df3" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "df3.columns = ['Survived', 'Average Age', 'Age Standard Deviation']" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": 
{}, 359 | "outputs": [], 360 | "source": [ 361 | "df3" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "### Pivot Tables" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "df.pivot_table(index='Pclass',\n", 378 | " columns='Survived',\n", 379 | " values='PassengerId',\n", 380 | " aggfunc='count')" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "### Correlations" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "df['IsFemale'] = df['Sex'] == 'female'" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "correlated_with_survived = df.corr()['Survived'].sort_values()\n", 406 | "correlated_with_survived" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "correlated_with_survived.iloc[:-1].plot(kind='bar',\n", 416 | " title='Titanic Passengers: correlation with survival');" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "# Visual Data Exploration with Matplotlib" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "data1 = np.random.normal(0, 0.1, 1000)\n", 433 | "data2 = np.random.normal(1, 0.4, 1000) + np.linspace(0, 1, 1000)\n", 434 | "data3 = 2 + np.random.random(1000) * np.linspace(1, 5, 1000)\n", 435 | "data4 = np.random.normal(3, 0.2, 1000) + 0.3 * np.sin(np.linspace(0, 20, 1000))" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "data = np.vstack([data1, data2, data3, data4]).transpose()" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "df = pd.DataFrame(data, columns=['data1', 'data2', 'data3', 'data4'])\n", 454 | "df.head()" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "### Line Plot" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "df.plot(title='Line plot');" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "plt.plot(df)\n", 480 | "plt.title('Line plot')\n", 481 | "plt.legend(['data1', 'data2', 'data3', 'data4']);" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": {}, 487 | "source": [ 488 | "### Scatter Plot" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "df.plot(style='.');" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "_ = df.plot(kind='scatter', x='data1', y='data2',\n", 507 | " xlim=(-1.5, 1.5), ylim=(0, 3))" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "### Histograms" 515 | ] 516 | }, 517 
| { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "df.plot(kind='hist',\n", 524 | " bins=50,\n", 525 | " title='Histogram',\n", 526 | " alpha=0.6);" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "### Cumulative distribution" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "df.plot(kind='hist',\n", 543 | " bins=100,\n", 544 | " title='Cumulative distributions',\n", 545 | " density=True,\n", 546 | " cumulative=True,\n", 547 | " alpha=0.4);" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "### Box Plot" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "df.plot(kind='box',\n", 564 | " title='Boxplot');" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "### Subplots" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "fig, ax = plt.subplots(2, 2, figsize=(5, 5))\n", 581 | "\n", 582 | "df.plot(ax=ax[0][0],\n", 583 | " title='Line plot')\n", 584 | "\n", 585 | "df.plot(ax=ax[0][1],\n", 586 | " style='o',\n", 587 | " title='Scatter plot')\n", 588 | "\n", 589 | "df.plot(ax=ax[1][0],\n", 590 | " kind='hist',\n", 591 | " bins=50,\n", 592 | " title='Histogram')\n", 593 | "\n", 594 | "df.plot(ax=ax[1][1],\n", 595 | " kind='box',\n", 596 | " title='Boxplot')\n", 597 | "\n", 598 | "plt.tight_layout()" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "### Pie charts" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": {}, 612 | "outputs": [], 613 | "source": [ 614 | "gt01 = df['data1'] > 0.1\n", 615 | "piecounts = gt01.value_counts()\n", 616 | "piecounts" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "piecounts.plot(kind='pie',\n", 626 | " figsize=(5, 5),\n", 627 | " explode=[0, 0.15],\n", 628 | " labels=['<= 0.1', '> 0.1'],\n", 629 | " autopct='%1.1f%%',\n", 630 | " shadow=True,\n", 631 | " startangle=90,\n", 632 | " fontsize=16);" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "### Hexbin plot" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "data = np.vstack([np.random.normal((0, 0), 2, size=(1000, 2)),\n", 649 | " np.random.normal((9, 9), 3, size=(2000, 2))])\n", 650 | "df = pd.DataFrame(data, columns=['x', 'y'])" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [ 659 | "df.head()" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "df.plot();" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "df.plot(kind='kde');" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": {}, 684 | 
"outputs": [], 685 | "source": [ 686 | "df.plot(kind='hexbin', x='x', y='y', bins=100, cmap='rainbow');" 687 | ] 688 | }, 689 | { 690 | "cell_type": "markdown", 691 | "metadata": {}, 692 | "source": [ 693 | "# Unstructured data" 694 | ] 695 | }, 696 | { 697 | "cell_type": "markdown", 698 | "metadata": {}, 699 | "source": [ 700 | "### Images" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "from PIL import Image" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [ 718 | "img = Image.open('../data/iss.jpg')\n", 719 | "img" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": null, 725 | "metadata": {}, 726 | "outputs": [], 727 | "source": [ 728 | "type(img)" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [ 737 | "imgarray = np.asarray(img)" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": null, 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [ 746 | "type(imgarray)" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": null, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "imgarray.shape" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": null, 761 | "metadata": {}, 762 | "outputs": [], 763 | "source": [ 764 | "imgarray.ravel().shape" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "435 * 640 * 3" 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "metadata": {}, 779 | "source": [ 780 | "### Sound" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": null, 786 | "metadata": {}, 787 | "outputs": [], 788 | "source": [ 789 | "from scipy.io import wavfile" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": null, 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "rate, snd = wavfile.read(filename='../data/sms.wav')" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "from IPython.display import Audio" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "metadata": {}, 814 | "outputs": [], 815 | "source": [ 816 | "Audio(data=snd, rate=rate)" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": null, 822 | "metadata": {}, 823 | "outputs": [], 824 | "source": [ 825 | "len(snd)" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": null, 831 | "metadata": {}, 832 | "outputs": [], 833 | "source": [ 834 | "snd" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": null, 840 | "metadata": {}, 841 | "outputs": [], 842 | "source": [ 843 | "plt.plot(snd)" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": null, 849 | "metadata": {}, 850 | "outputs": [], 851 | "source": [ 852 | "_ = plt.specgram(snd, NFFT=1024, Fs=44100)\n", 853 | "plt.ylabel('Frequency (Hz)')\n", 854 | "plt.xlabel('Time (s)')" 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "metadata": {}, 860 | "source": [ 861 | "# Data Exploration Exercises" 862 | ] 863 | }, 864 | { 865 | "cell_type": "markdown", 866 | 
"metadata": {}, 867 | "source": [ 868 | "## Exercise 1\n", 869 | "- load the dataset: `../data/international-airline-passengers.csv`\n", 870 | "- inspect it using the `.info()` and `.head()` commands\n", 871 | "- use the function [`pd.to_datetime()`](http://pandas.pydata.org/pandas-docs/version/0.20/generated/pandas.to_datetime.html) to change the column type of 'Month' to a datatime type\n", 872 | "- set the index of df to be a datetime index using the column 'Month' and the `df.set_index()` method\n", 873 | "- choose the appropriate plot and display the data\n", 874 | "- choose appropriate scale\n", 875 | "- label the axes" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": null, 881 | "metadata": {}, 882 | "outputs": [], 883 | "source": [] 884 | }, 885 | { 886 | "cell_type": "markdown", 887 | "metadata": {}, 888 | "source": [ 889 | "## Exercise 2\n", 890 | "- load the dataset: `../data/weight-height.csv`\n", 891 | "- inspect it\n", 892 | "- plot it using a scatter plot with Weight as a function of Height\n", 893 | "- plot the male and female populations with 2 different colors on a new scatter plot\n", 894 | "- remember to label the axes" 895 | ] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "execution_count": null, 900 | "metadata": {}, 901 | "outputs": [], 902 | "source": [] 903 | }, 904 | { 905 | "cell_type": "markdown", 906 | "metadata": {}, 907 | "source": [ 908 | "## Exercise 3\n", 909 | "- plot the histogram of the heights for males and for females on the same plot\n", 910 | "- use alpha to control transparency in the plot comand\n", 911 | "- plot a vertical line at the mean of each population using `plt.axvline()`" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": null, 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [] 920 | }, 921 | { 922 | "cell_type": "markdown", 923 | "metadata": {}, 924 | "source": [ 925 | "## Exercise 4\n", 926 | "- plot the weights of the males and females using a box plot\n", 927 | "- which one is easier to read?\n", 928 | "- (remember to put in titles, axes and legends)" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": null, 934 | "metadata": {}, 935 | "outputs": [], 936 | "source": [] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "## Exercise 5\n", 943 | "- load the dataset: `../data/titanic-train.csv`\n", 944 | "- learn about scattermatrix here: http://pandas.pydata.org/pandas-docs/stable/visualization.html\n", 945 | "- display the data using a scattermatrix" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": {}, 952 | "outputs": [], 953 | "source": [] 954 | } 955 | ], 956 | "metadata": { 957 | "kernelspec": { 958 | "display_name": "Python 3", 959 | "language": "python", 960 | "name": "python3" 961 | }, 962 | "language_info": { 963 | "codemirror_mode": { 964 | "name": "ipython", 965 | "version": 3 966 | }, 967 | "file_extension": ".py", 968 | "mimetype": "text/x-python", 969 | "name": "python", 970 | "nbconvert_exporter": "python", 971 | "pygments_lexer": "ipython3", 972 | "version": "3.7.10" 973 | } 974 | }, 975 | "nbformat": 4, 976 | "nbformat_minor": 2 977 | } 978 | -------------------------------------------------------------------------------- /course/4 Deep Learning Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | 
"# Deep Learning Intro" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import pandas as pd\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Shallow and Deep Networks" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "from sklearn.datasets import make_moons\n", 36 | "\n", 37 | "X, y = make_moons(n_samples=1000, noise=0.1, random_state=0)\n", 38 | "plt.plot(X[y==0, 0], X[y==0, 1], 'ob', alpha=0.5)\n", 39 | "plt.plot(X[y==1, 0], X[y==1, 1], 'xr', alpha=0.5)\n", 40 | "plt.legend(['0', '1'])" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "X.shape" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "from sklearn.model_selection import train_test_split" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "X_train, X_test, y_train, y_test = train_test_split(X, y,\n", 68 | " test_size=0.3,\n", 69 | " random_state=42)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from tensorflow.keras.models import Sequential\n", 79 | "from tensorflow.keras.layers import Dense\n", 80 | "from tensorflow.keras.optimizers import SGD, Adam" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### Shallow Model" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "model = Sequential()\n", 97 | "model.add(Dense(1, input_shape=(2,), activation='sigmoid'))\n", 98 | "model.compile(Adam(learning_rate=0.05), 'binary_crossentropy', metrics=['accuracy'])" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "model.fit(X_train, y_train, epochs=200, verbose=0)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "results = model.evaluate(X_test, y_test)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "results" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "print(\"The Accuracy score on the Train set is:\\t{:0.3f}\".format(results[1]))" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "def plot_decision_boundary(model, X, y):\n", 144 | " amin, bmin = X.min(axis=0) - 0.1\n", 145 | " amax, bmax = X.max(axis=0) + 0.1\n", 146 | " hticks = np.linspace(amin, amax, 101)\n", 147 | " vticks = np.linspace(bmin, bmax, 101)\n", 148 | " \n", 149 | " aa, bb = np.meshgrid(hticks, vticks)\n", 150 | " ab = np.c_[aa.ravel(), bb.ravel()]\n", 151 | " \n", 152 | " c = model.predict(ab)\n", 153 | " cc = c.reshape(aa.shape)\n", 154 | "\n", 
155 | " plt.figure(figsize=(12, 8))\n", 156 | " plt.contourf(aa, bb, cc, cmap='bwr', alpha=0.2)\n", 157 | " plt.plot(X[y==0, 0], X[y==0, 1], 'ob', alpha=0.5)\n", 158 | " plt.plot(X[y==1, 0], X[y==1, 1], 'xr', alpha=0.5)\n", 159 | " plt.legend(['0', '1'])\n", 160 | " \n", 161 | "plot_decision_boundary(model, X, y)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### Deep model" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "model = Sequential()\n", 178 | "model.add(Dense(4, input_shape=(2,), activation='tanh'))\n", 179 | "model.add(Dense(2, activation='tanh'))\n", 180 | "model.add(Dense(1, activation='sigmoid'))\n", 181 | "model.compile(Adam(learning_rate=0.05), 'binary_crossentropy', metrics=['accuracy'])" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "model.fit(X_train, y_train, epochs=100, verbose=0)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "model.evaluate(X_test, y_test)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "from sklearn.metrics import accuracy_score, confusion_matrix" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "y_train_pred = model.predict_classes(X_train)\n", 218 | "y_test_pred = model.predict_classes(X_test)\n", 219 | "\n", 220 | "print(\"The Accuracy score on the Train set is:\\t{:0.3f}\".format(accuracy_score(y_train, y_train_pred)))\n", 221 | "print(\"The Accuracy score on the Test set is:\\t{:0.3f}\".format(accuracy_score(y_test, y_test_pred)))" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "plot_decision_boundary(model, X, y)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "## Multiclass classification\n", 238 | "\n", 239 | "### The Iris dataset" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "df = pd.read_csv('../data/iris.csv')" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "import seaborn as sns\n", 258 | "sns.pairplot(df, hue=\"species\")" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "df.head()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "X = df.drop('species', axis=1)\n", 277 | "X.head()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "target_names = df['species'].unique()\n", 287 | "target_names" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "target_dict = {n:i for i, n in enumerate(target_names)}\n", 297 | "target_dict" 
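, "  # -> e.g. {'setosa': 0, 'versicolor': 1, 'virginica': 2} for the iris data"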
298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "y = df['species'].map(target_dict)\n", 307 | "y.head()" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "from tensorflow.keras.utils import to_categorical" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "y_cat = to_categorical(y)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "y_cat[:10]" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "X_train, X_test, y_train, y_test = train_test_split(X.values, y_cat,\n", 344 | " test_size=0.2)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "model = Sequential()\n", 354 | "model.add(Dense(3, input_shape=(4,), activation='softmax'))\n", 355 | "model.compile(Adam(learning_rate=0.1),\n", 356 | " loss='categorical_crossentropy',\n", 357 | " metrics=['accuracy'])" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "model.fit(X_train, y_train, epochs=20, validation_split=0.1)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "y_pred = model.predict(X_test)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "y_pred[:5]" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "y_test_class = np.argmax(y_test, axis=1)\n", 394 | "y_pred_class = np.argmax(y_pred, axis=1)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "from sklearn.metrics import classification_report" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "print(classification_report(y_test_class, y_pred_class))" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "confusion_matrix(y_test_class, y_pred_class)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "## Exercise 1" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "source": [ 437 | "The [Pima Indians dataset](https://archive.ics.uci.edu/ml/datasets/diabetes) is a very famous dataset distributed by UCI and originally collected from the National Institute of Diabetes and Digestive and Kidney Diseases. It contains data from clinical exams for women age 21 and above of Pima Indian origin. 
The objective is to predict based on diagnostic measurements whether a patient has diabetes.\n", 438 | "\n", 439 | "It has the following features:\n", 440 | "\n", 441 | "- Pregnancies: Number of times pregnant\n", 442 | "- Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test\n", 443 | "- BloodPressure: Diastolic blood pressure (mm Hg)\n", 444 | "- SkinThickness: Triceps skin fold thickness (mm)\n", 445 | "- Insulin: 2-Hour serum insulin (mu U/ml)\n", 446 | "- BMI: Body mass index (weight in kg/(height in m)^2)\n", 447 | "- DiabetesPedigreeFunction: Diabetes pedigree function\n", 448 | "- Age: Age (years)\n", 449 | "\n", 450 | "The last column is the outcome, and it is a binary variable.\n", 451 | "\n", 452 | "In this first exercise we will explore it through the following steps:\n", 453 | "\n", 454 | "1. Load the ../data/diabetes.csv dataset, use pandas to explore the range of each feature\n", 455 | "- For each feature draw a histogram. Bonus points if you draw all the histograms in the same figure.\n", 456 | "- Explore correlations of features with the outcome column. You can do this in several ways, for example using the `sns.pairplot` we used above or drawing a heatmap of the correlations.\n", 457 | "- Do features need standardization? If so what standardization technique will you use? MinMax? Standard?\n", 458 | "- Prepare your final `X` and `y` variables to be used by a ML model. Make sure you define your target variable well. Will you need dummy columns?" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "## Exercise 2" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": { 478 | "collapsed": true 479 | }, 480 | "source": [ 481 | "Build a fully connected NN model that predicts diabetes. Follow these steps:\n", 482 | "\n", 483 | "1. Split your data in a train/test with a test size of 20% and a `random_state = 22`\n", 484 | "- define a sequential model with at least one inner layer. You will have to make choices for the following things:\n", 485 | " - what is the size of the input?\n", 486 | " - how many nodes will you use in each layer?\n", 487 | " - what is the size of the output?\n", 488 | " - what activation functions will you use in the inner layers?\n", 489 | " - what activation function will you use at output?\n", 490 | " - what loss function will you use?\n", 491 | " - what optimizer will you use?\n", 492 | "- fit your model on the training set, using a validation_split of 0.1\n", 493 | "- test your trained model on the test data from the train/test split\n", 494 | "- check the accuracy score, the confusion matrix and the classification report" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "## Exercise 3\n", 509 | "Compare your work with the results presented in [this notebook](https://www.kaggle.com/futurist/d/uciml/pima-indians-diabetes-database/pima-data-visualisation-and-machine-learning). Are your Neural Network results better or worse than the results obtained by traditional Machine Learning techniques?\n", 510 | "\n", 511 | "- Try training a Support Vector Machine or a Random Forest model on the exact same train/test split. 
Is the performance better or worse?\n", 512 | "- Try restricting your features to only 4 features like in the suggested notebook. How does model performance change?" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "## Exercise 4\n", 527 | "\n", 528 | "[Tensorflow playground](http://playground.tensorflow.org/) is a web-based neural network demo. It is really useful to develop an intuition about what happens when you change architecture, activation function or other parameters. Try playing with it for a few minutes. You don't need to understand the meaning of every knob and button in the page, just get a sense for what happens if you change something. In the next chapter we'll explore these things in more detail.\n" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [] 537 | } 538 | ], 539 | "metadata": { 540 | "kernelspec": { 541 | "display_name": "Python 3", 542 | "language": "python", 543 | "name": "python3" 544 | }, 545 | "language_info": { 546 | "codemirror_mode": { 547 | "name": "ipython", 548 | "version": 3 549 | }, 550 | "file_extension": ".py", 551 | "mimetype": "text/x-python", 552 | "name": "python", 553 | "nbconvert_exporter": "python", 554 | "pygments_lexer": "ipython3", 555 | "version": "3.7.10" 556 | } 557 | }, 558 | "nbformat": 4, 559 | "nbformat_minor": 2 560 | } 561 | -------------------------------------------------------------------------------- /course/8 Recurrent Neural Networks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Recurrent Neural Networks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "%matplotlib inline\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Time series forecasting" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "df = pd.read_csv('../data/cansim-0800020-eng-6674700030567901031.csv',\n", 36 | " skiprows=6, skipfooter=9,\n", 37 | " engine='python')\n", 38 | "df.head()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from pandas.tseries.offsets import MonthEnd" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "df['Adjustments'] = pd.to_datetime(df['Adjustments']) + MonthEnd(1)\n", 57 | "df = df.set_index('Adjustments')\n", 58 | "df.head()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "df.plot()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "split_date = pd.Timestamp('01-01-2011')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "train = 
df.loc[:split_date, ['Unadjusted']]\n", 86 | "test = df.loc[split_date:, ['Unadjusted']]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "ax = train.plot()\n", 96 | "test.plot(ax=ax)\n", 97 | "plt.legend(['train', 'test'])" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from sklearn.preprocessing import MinMaxScaler\n", 107 | "\n", 108 | "sc = MinMaxScaler()\n", 109 | "\n", 110 | "train_sc = sc.fit_transform(train)\n", 111 | "test_sc = sc.transform(test)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "train_sc[:4]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "X_train = train_sc[:-1]\n", 130 | "y_train = train_sc[1:]\n", 131 | "\n", 132 | "X_test = test_sc[:-1]\n", 133 | "y_test = test_sc[1:]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Fully connected predictor" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "from tensorflow.keras.models import Sequential\n", 150 | "from tensorflow.keras.layers import Dense\n", 151 | "import tensorflow.keras.backend as K\n", 152 | "from tensorflow.keras.callbacks import EarlyStopping" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "K.clear_session()\n", 162 | "\n", 163 | "model = Sequential()\n", 164 | "model.add(Dense(12, input_dim=1, activation='relu'))\n", 165 | "model.add(Dense(1))\n", 166 | "model.compile(loss='mean_squared_error', optimizer='adam')\n", 167 | "model.summary()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "early_stop = EarlyStopping(monitor='loss', patience=1, verbose=1)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "model.fit(X_train, y_train, epochs=200,\n", 186 | " batch_size=2, verbose=1,\n", 187 | " callbacks=[early_stop])" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "y_pred = model.predict(X_test)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "plt.plot(y_test)\n", 206 | "plt.plot(y_pred)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "### Recurrent predictor" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "from tensorflow.keras.layers import LSTM" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "X_train.shape" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "#3D tensor with shape (batch_size, 
timesteps, input_dim)\n", 241 | "X_train[:, None].shape" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "X_train_t = X_train[:, None]\n", 251 | "X_test_t = X_test[:, None]" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "K.clear_session()\n", 261 | "model = Sequential()\n", 262 | "\n", 263 | "model.add(LSTM(6, input_shape=(1, 1)))\n", 264 | "\n", 265 | "model.add(Dense(1))\n", 266 | "\n", 267 | "model.compile(loss='mean_squared_error', optimizer='adam')" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "model.fit(X_train_t, y_train,\n", 277 | " epochs=100, batch_size=1, verbose=1,\n", 278 | " callbacks=[early_stop])" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "y_pred = model.predict(X_test_t)\n", 288 | "plt.plot(y_test)\n", 289 | "plt.plot(y_pred)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "## Windows" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "train_sc.shape" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "train_sc_df = pd.DataFrame(train_sc, columns=['Scaled'], index=train.index)\n", 315 | "test_sc_df = pd.DataFrame(test_sc, columns=['Scaled'], index=test.index)\n", 316 | "train_sc_df.head()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "for s in range(1, 13):\n", 326 | " train_sc_df['shift_{}'.format(s)] = train_sc_df['Scaled'].shift(s)\n", 327 | " test_sc_df['shift_{}'.format(s)] = test_sc_df['Scaled'].shift(s)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "train_sc_df.head(13)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "X_train = train_sc_df.dropna().drop('Scaled', axis=1)\n", 346 | "y_train = train_sc_df.dropna()[['Scaled']]\n", 347 | "\n", 348 | "X_test = test_sc_df.dropna().drop('Scaled', axis=1)\n", 349 | "y_test = test_sc_df.dropna()[['Scaled']]" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "X_train.head()" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "X_train.shape" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "X_train = X_train.values\n", 377 | "X_test= X_test.values\n", 378 | "\n", 379 | "y_train = y_train.values\n", 380 | "y_test = y_test.values" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "### Fully Connected on Windows" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 
394 | "outputs": [], 395 | "source": [ 396 | "K.clear_session()\n", 397 | "\n", 398 | "model = Sequential()\n", 399 | "model.add(Dense(12, input_dim=12, activation='relu'))\n", 400 | "model.add(Dense(1))\n", 401 | "model.compile(loss='mean_squared_error', optimizer='adam')\n", 402 | "model.summary()" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "model.fit(X_train, y_train, epochs=200,\n", 412 | " batch_size=1, verbose=1, callbacks=[early_stop])" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "y_pred = model.predict(X_test)\n", 422 | "plt.plot(y_test)\n", 423 | "plt.plot(y_pred)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "### LSTM on Windows" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "X_train_t = X_train.reshape(X_train.shape[0], 1, 12)\n", 440 | "X_test_t = X_test.reshape(X_test.shape[0], 1, 12)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "X_train_t.shape" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "K.clear_session()\n", 459 | "model = Sequential()\n", 460 | "\n", 461 | "model.add(LSTM(6, input_shape=(1, 12)))\n", 462 | "\n", 463 | "model.add(Dense(1))\n", 464 | "\n", 465 | "model.compile(loss='mean_squared_error', optimizer='adam')" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "model.summary()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "model.fit(X_train_t, y_train, epochs=100,\n", 484 | " batch_size=1, verbose=1, callbacks=[early_stop])" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "y_pred = model.predict(X_test_t)\n", 494 | "plt.plot(y_test)\n", 495 | "plt.plot(y_pred)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "## Exercise 1\n", 503 | "\n", 504 | "In the model above we reshaped the input shape to: `(num_samples, 1, 12)`, i.e. we treated a window of 12 months as a vector of 12 coordinates that we simultaneously passed to all the LSTM nodes. An alternative way to look at the problem is to reshape the input to `(num_samples, 12, 1)`. This means we consider each input window as a sequence of 12 values that we will pass in sequence to the LSTM. In principle this looks like a more accurate description of our situation. But does it yield better predictions? Let's check it.\n", 505 | "\n", 506 | "- Reshape `X_train` and `X_test` so that they represent a set of univariate sequences\n", 507 | "- retrain the same LSTM(6) model, you'll have to adapt the `input_shape`\n", 508 | "- check the performance of this new model, is it better at predicting the test data?" 
509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": { 521 | "collapsed": true 522 | }, 523 | "source": [ 524 | "## Exercise 2\n", 525 | "\n", 526 | "RNN models can be applied to images too. In general we can apply them to any data where there's a connection between nearby units. Let's see how we can easily build a model that works with images.\n", 527 | "\n", 528 | "- Load the MNIST data; by now you should be able to do it blindfolded :)\n", 529 | "- reshape it so that an image looks like a long sequence of pixels\n", 530 | "- create a recurrent model and train it on the training data\n", 531 | "- how does it perform compared to a fully connected network? How does it compare to Convolutional Neural Networks?\n", 532 | "\n", 533 | "(feel free to run this exercise on a cloud GPU if it's too slow on your laptop; a sketch of one approach follows below)"
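, "\n", "_A minimal sketch of one tractable variant that reads each 28x28 image as a sequence of 28 rows of 28 pixels (the literal pixel-by-pixel reading would use `input_shape=(784, 1)` and trains much more slowly; the layer size and number of epochs are illustrative):_\n", "\n", "```python\n", "from tensorflow.keras.datasets import mnist\n", "\n", "(X_train, y_train), (X_test, y_test) = mnist.load_data()\n", "# scale pixels to [0, 1]; each image is already (28, 28) = (timesteps, features)\n", "X_train = X_train.astype('float32') / 255.0\n", "X_test = X_test.astype('float32') / 255.0\n", "\n", "K.clear_session()\n", "model = Sequential()\n", "model.add(LSTM(32, input_shape=(28, 28)))\n", "model.add(Dense(10, activation='softmax'))\n", "model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',\n", "              metrics=['accuracy'])\n", "model.fit(X_train, y_train, epochs=2, batch_size=128, validation_split=0.1)\n", "model.evaluate(X_test, y_test)\n", "```" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": null, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [] 542 | } 543 | ], 544 | "metadata": { 545 | "kernelspec": { 546 | "display_name": "Python 3", 547 | "language": "python", 548 | "name": "python3" 549 | }, 550 | "language_info": { 551 | "codemirror_mode": { 552 | "name": "ipython", 553 | "version": 3 554 | }, 555 | "file_extension": ".py", 556 | "mimetype": "text/x-python", 557 | "name": "python", 558 | "nbconvert_exporter": "python", 559 | "pygments_lexer": "ipython3", 560 | "version": "3.7.10" 561 | } 562 | }, 563 | "nbformat": 4, 564 | "nbformat_minor": 2 565 | } 566 | -------------------------------------------------------------------------------- /data/banknotes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dataweekends/zero_to_deep_learning_video/efff3403504906c1e38c23f4d35fd3cb98af1950/data/banknotes.png -------------------------------------------------------------------------------- /data/cansim-0800020-eng-6674700030567901031.csv: -------------------------------------------------------------------------------- 1 | "Table 080-0020 Retail trade, sales by the North American Industry Classification System (NAICS), monthly (dollars x 1,000)(2,3,4,5,6)" 2 | Survey or program details: 3 | Retail Trade Survey (Monthly) - 2406 4 | Monthly Retail Trade Survey (Department Store Organizations) - 2408 5 | Geography,Canada,Canada 6 | North American Industry Classification System (NAICS),Retail trade [44-45] ,Retail trade [44-45] 7 | Adjustments,Unadjusted,Seasonally adjusted 8 | Jan-1991,12588862,15026890 9 | Feb-1991,12154321,15304585 10 | Mar-1991,14337072,15413591 11 | Apr-1991,15108570,15293409 12 | May-1991,17225734,15676083 13 | Jun-1991,16342833,15507931 14 | Jul-1991,15996243,15556313 15 | Aug-1991,16064910,15430645 16 | Sep-1991,15015317,15427313 17 | Oct-1991,15606864,15410250 18 | Nov-1991,16237366,15662790 19 | Dec-1991,18381340,15349625 20 | Jan-1992,13084963,15477875 21 | Feb-1992,12773972,15513022 22 | Mar-1992,14198775,15527933 23 | Apr-1992,15558390,15708556 24 | May-1992,16776396,15642000 25 | Jun-1992,16716231,15823989 26 | Jul-1992,16637483,15869453 27 | Aug-1992,15842075,15844631 28 | Sep-1992,15812400,15983239 29 | Oct-1992,16562268,16125835 30 | Nov-1992,16015869,16049478 31 | Dec-1992,19682921,16095727 32 | Jan-1993,13672727,16408864 33 | Feb-1993,12900733,16239039 34 | Mar-1993,15211859,16314960 35 | Apr-1993,16642246,16577426 36 | 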
May-1993,17442405,16472045 37 | Jun-1993,17444074,16351907 38 | Jul-1993,17610326,16712914 39 | Aug-1993,16645660,16703413 40 | Sep-1993,16790330,16755338 41 | Oct-1993,16921755,16819382 42 | Nov-1993,17124609,16958202 43 | Dec-1993,20928208,17021436 44 | Jan-1994,14005058,17076164 45 | Feb-1994,13799079,17393150 46 | Mar-1994,16865149,17890903 47 | Apr-1994,17494589,17507688 48 | May-1994,18739509,17775079 49 | Jun-1994,19323481,17882069 50 | Jul-1994,18297834,17785800 51 | Aug-1994,18101290,17881976 52 | Sep-1994,18161417,17952647 53 | Oct-1994,17998875,18193703 54 | Nov-1994,18516766,18264676 55 | Dec-1994,22688647,18387840 56 | Jan-1995,14927996,18337565 57 | Feb-1995,14520623,18259470 58 | Mar-1995,17457477,18225708 59 | Apr-1995,17774107,18217661 60 | May-1995,19740889,18333051 61 | Jun-1995,20319460,18503481 62 | Jul-1995,18747299,18407254 63 | Aug-1995,19280525,18720783 64 | Sep-1995,18860566,18628735 65 | Oct-1995,18177152,18412692 66 | Nov-1995,18962903,18506305 67 | Dec-1995,22308880,18525162 68 | Jan-1996,15379086,18531426 69 | Feb-1996,15521981,18657652 70 | Mar-1996,17613469,18774049 71 | Apr-1996,18421405,18739023 72 | May-1996,20624568,18758009 73 | Jun-1996,20099348,18977805 74 | Jul-1996,19423284,18914063 75 | Aug-1996,19889359,19071178 76 | Sep-1996,18589571,19019991 77 | Oct-1996,19686383,19488074 78 | Nov-1996,20293165,19820074 79 | Dec-1996,22897980,19688254 80 | Jan-1997,16882321,19857365 81 | Feb-1997,16033605,20141489 82 | Mar-1997,18225453,20056949 83 | Apr-1997,20432272,20215340 84 | May-1997,22594727,20386953 85 | Jun-1997,21577744,20466883 86 | Jul-1997,21570145,20681230 87 | Aug-1997,21065784,20605349 88 | Sep-1997,20532806,20646564 89 | Oct-1997,21491163,21037021 90 | Nov-1997,20904746,20973561 91 | Dec-1997,25507180,21749250 92 | Jan-1998,17736224,20776214 93 | Feb-1998,16797018,21153779 94 | Mar-1998,19408883,21041225 95 | Apr-1998,21501677,21504619 96 | May-1998,23312947,21504262 97 | Jun-1998,22654803,21247311 98 | Jul-1998,22594775,21385620 99 | Aug-1998,21512734,21335980 100 | Sep-1998,21645562,21660645 101 | Oct-1998,21994089,21565457 102 | Nov-1998,21461344,21714061 103 | Dec-1998,25874332,21605209 104 | Jan-1999,18438151,22074043 105 | Feb-1999,17658952,22286260 106 | Mar-1999,21082603,22402680 107 | Apr-1999,22587382,22389229 108 | May-1999,23892100,22300484 109 | Jun-1999,24036828,22450487 110 | Jul-1999,23994614,22614164 111 | Aug-1999,22926469,22806183 112 | Sep-1999,22984278,22817165 113 | Oct-1999,22813633,22967565 114 | Nov-1999,22972959,23036527 115 | Dec-1999,28143999,23387176 116 | Jan-2000,19324692,23434451 117 | Feb-2000,19140440,23378327 118 | Mar-2000,22918829,23813646 119 | Apr-2000,22914155,23537859 120 | May-2000,25659687,23644020 121 | Jun-2000,25945400,23841011 122 | Jul-2000,24821347,24204193 123 | Aug-2000,25102965,24266358 124 | Sep-2000,24710257,24495699 125 | Oct-2000,23687124,24330740 126 | Nov-2000,24556357,24373656 127 | Dec-2000,29057176,24518477 128 | Jan-2001,20607642,24640517 129 | Feb-2001,19444855,24477976 130 | Mar-2001,23652255,24583988 131 | Apr-2001,24370700,24944482 132 | May-2001,27585889,25143416 133 | Jun-2001,27243919,25190078 134 | Jul-2001,25507932,24813831 135 | Aug-2001,26322941,25017925 136 | Sep-2001,24263969,24703734 137 | Oct-2001,24917747,25149772 138 | Nov-2001,26048646,25657476 139 | Dec-2001,30481412,26124705 140 | Jan-2002,22361219,26496729 141 | Feb-2002,20787209,26105990 142 | Mar-2002,24642692,26072633 143 | Apr-2002,26405170,26657565 144 | May-2002,29087583,26165998 145 | 
Jun-2002,28363263,26768924 146 | Jul-2002,27912328,26620588 147 | Aug-2002,28202300,26767279 148 | Sep-2002,26054411,26620259 149 | Oct-2002,27131743,27061844 150 | Nov-2002,27276942,27015215 151 | Dec-2002,31300554,27172388 152 | Jan-2003,23301871,27137545 153 | Feb-2003,21980804,27622538 154 | Mar-2003,25468203,27275170 155 | Apr-2003,27059495,27176306 156 | May-2003,30417563,27484615 157 | Jun-2003,28912102,27569750 158 | Jul-2003,29492832,27707861 159 | Aug-2003,29102135,28020755 160 | Sep-2003,27467571,27841344 161 | Oct-2003,28223631,27825024 162 | Nov-2003,27391422,27777531 163 | Dec-2003,32325789,27704978 164 | Jan-2004,23778728,27935993 165 | Feb-2004,23008594,28719948 166 | Mar-2004,26967793,28689514 167 | Apr-2004,28592026,28254086 168 | May-2004,30479247,28554094 169 | Jun-2004,30711705,28550528 170 | Jul-2004,30898334,28616168 171 | Aug-2004,29535183,28836665 172 | Sep-2004,29245397,29243662 173 | Oct-2004,29445711,29561177 174 | Nov-2004,29232659,29901036 175 | Dec-2004,34561103,29593609 176 | Jan-2005,24498615,29888781 177 | Feb-2005,24028226,30460620 178 | Mar-2005,28600602,30006264 179 | Apr-2005,30600811,29940271 180 | May-2005,31948565,29935878 181 | Jun-2005,32967426,30590992 182 | Jul-2005,32620077,30800241 183 | Aug-2005,32025283,30647393 184 | Sep-2005,30914826,30600008 185 | Oct-2005,30241532,30887481 186 | Nov-2005,30828069,31012789 187 | Dec-2005,36726743,31230056 188 | Jan-2006,25993203,31747679 189 | Feb-2006,25128165,31744450 190 | Mar-2006,30760061,31871717 191 | Apr-2006,32106585,32354405 192 | May-2006,34894460,32048264 193 | Jun-2006,35049112,32242363 194 | Jul-2006,34341547,33040218 195 | Aug-2006,35045180,33007575 196 | Sep-2006,33056559,32492838 197 | Oct-2006,31830349,32595465 198 | Nov-2006,32663281,32814138 199 | Dec-2006,38605976,33515367 200 | Jan-2007,27777968,33221023 201 | Feb-2007,26548520,33466188 202 | Mar-2007,32818504,33910296 203 | Apr-2007,33621240,34385868 204 | May-2007,38434319,34789277 205 | Jun-2007,37555708,34436278 206 | Jul-2007,35635889,34430726 207 | Aug-2007,36978090,34725483 208 | Sep-2007,34057842,34341226 209 | Oct-2007,34070363,34305870 210 | Nov-2007,35091406,34958995 211 | Dec-2007,40006665,35625285 212 | Jan-2008,30525699,35933156 213 | Feb-2008,29418898,35526909 214 | Mar-2008,32925876,35530142 215 | Apr-2008,36272111,35914167 216 | May-2008,39778972,36006888 217 | Jun-2008,37842321,36400129 218 | Jul-2008,38632038,36403867 219 | Aug-2008,37775417,36137537 220 | Sep-2008,36138751,36390521 221 | Oct-2008,36158245,35747571 222 | Nov-2008,34230901,34661690 223 | Dec-2008,38256712,33303365 224 | Jan-2009,29192654,33747849 225 | Feb-2009,26804723,33869426 226 | Mar-2009,31356949,33894022 227 | Apr-2009,33942769,33930673 228 | May-2009,37316515,34345160 229 | Jun-2009,36865690,34735113 230 | Jul-2009,37191480,34755444 231 | Aug-2009,36049418,35039739 232 | Sep-2009,35537357,35239195 233 | Oct-2009,36133694,35330869 234 | Nov-2009,34354756,35250896 235 | Dec-2009,40969765,35577384 236 | Jan-2010,30668321,36080033 237 | Feb-2010,28632551,36051781 238 | Mar-2010,34967182,37001843 239 | Apr-2010,36469949,36148495 240 | May-2010,38424455,36041318 241 | Jun-2010,38973462,36350588 242 | Jul-2010,38932294,36295314 243 | Aug-2010,37395330,36515170 244 | Sep-2010,36923390,36632898 245 | Oct-2010,37014326,36879707 246 | Nov-2010,37408825,37568029 247 | Dec-2010,43147947,37392857 248 | Jan-2011,31191594,37392259 249 | Feb-2011,29797949,37437926 250 | Mar-2011,36099866,37617167 251 | Apr-2011,38035760,37755408 252 | 
May-2011,40046516,37723958 253 | Jun-2011,40839556,38228307 254 | Jul-2011,39832282,37925826 255 | Aug-2011,39541248,37976798 256 | Sep-2011,38877263,38181654 257 | Oct-2011,38203872,38623692 258 | Nov-2011,39174736,38779553 259 | Dec-2011,45089701,39087795 260 | Jan-2012,32361808,39102435 261 | Feb-2012,32087072,38968001 262 | Mar-2012,37933733,39201228 263 | Apr-2012,37775805,38920526 264 | May-2012,42584571,38841267 265 | Jun-2012,41789242,38773515 266 | Jul-2012,40130908,38854126 267 | Aug-2012,41321526,38854279 268 | Sep-2012,39069513,39058649 269 | Oct-2012,39487597,39277317 270 | Nov-2012,40095933,39224805 271 | Dec-2012,43489091,39050651 272 | Jan-2013,33574671,39523536 273 | Feb-2013,31636843,39710038 274 | Mar-2013,37561378,39811962 275 | Apr-2013,39401295,39655045 276 | May-2013,44577490,40295930 277 | Jun-2013,42169145,39992542 278 | Jul-2013,42417829,40388278 279 | Aug-2013,43237460,40660890 280 | Sep-2013,40170270,40631164 281 | Oct-2013,41560987,40813306 282 | Nov-2013,41893714,40798569 283 | Dec-2013,44796794,40716614 284 | Jan-2014,34980327,40976155 285 | Feb-2014,32905708,41256280 286 | Mar-2014,38460091,41242344 287 | Apr-2014,41809373,41852467 288 | May-2014,46379543,41906455 289 | Jun-2014,44178750,42457216 290 | Jul-2014,45285331,42562972 291 | Aug-2014,44359733,42456214 292 | Sep-2014,43017529,42685882 293 | Oct-2014,43775478,42690228 294 | Nov-2014,42968326,42603501 295 | Dec-2014,46887481,42317955 296 | Jan-2015,34820395,40971992 297 | Feb-2015,33174923,41801906 298 | Mar-2015,39444291,42420253 299 | Apr-2015,42297319,42331926 300 | May-2015,46670930,42721761 301 | Jun-2015,45584849,42989280 302 | Jul-2015,46295664,43154020 303 | Aug-2015,44793347,43309509 304 | Sep-2015,43999627,43303889 305 | Oct-2015,44507776,43378904 306 | Nov-2015,43696305,43921767 307 | Dec-2015,48097829,43078048 308 | Jan-2016,36415115,43977584 309 | Feb-2016,35649450,44205540 310 | Mar-2016,41403762,43839996 311 | Apr-2016,44881587,44181416 312 | May-2016,47337082,44176591 313 | Jun-2016,47399117,44162244 314 | Jul-2016,46321314,44110862 315 | Aug-2016,46201453,44216280 316 | Sep-2016,45528702,44534797 317 | Oct-2016,44770113,45061618 318 | Nov-2016,46285062,45141762 319 | Dec-2016,50016137,44943929 320 | Jan-2017,37628452,45952103 321 | Footnotes: 322 | 2,The total for retail trade excludes North American Industry Classification System (NAICS) 454. 323 | 3,"This CANSIM table replaces archived table 80-0014, 80-0015 and 80-0017." 324 | 4,"Quality indicator: Code A=Excellent. Code B=Very good. Code C=Good. Code D=Acceptable. Code E=Poor, use with caution. Code F=Unreliable (data not published)." 325 | 5,"Data for Northwest Territories includes Nunavut, from 1991-01 to 1998-12." 326 | 6,"In April 2013, data from 2004 onwards will be based on the 2012 North American Industry Classification System (NAICS). Data prior to 2004 will continue to be based on the 2007 North American Industry Classification System (NAICS)." 327 | Source: 328 | "Statistics Canada. 
Table 080-0020 - Retail trade, sales by the North American Industry Classification System (NAICS), monthly (dollars)" 329 | "(accessed: April 19, 2017)" 330 | -------------------------------------------------------------------------------- /data/generator/class 0/squirrel.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dataweekends/zero_to_deep_learning_video/efff3403504906c1e38c23f4d35fd3cb98af1950/data/generator/class 0/squirrel.jpeg -------------------------------------------------------------------------------- /data/housing-data.csv: -------------------------------------------------------------------------------- 1 | sqft,bdrms,age,price 2 | 2104,3,70,399900 3 | 1600,3,28,329900 4 | 2400,3,44,369000 5 | 1416,2,49,232000 6 | 3000,4,75,539900 7 | 1985,4,61,299900 8 | 1534,3,12,314900 9 | 1427,3,57,198999 10 | 1380,3,14,212000 11 | 1494,3,15,242500 12 | 1940,4,7,239999 13 | 2000,3,27,347000 14 | 1890,3,45,329999 15 | 4478,5,49,699900 16 | 1268,3,58,259900 17 | 2300,4,77,449900 18 | 1320,2,62,299900 19 | 1236,3,78,199900 20 | 2609,4,5,499998 21 | 3031,4,21,599000 22 | 1767,3,44,252900 23 | 1888,2,79,255000 24 | 1604,3,13,242900 25 | 1962,4,53,259900 26 | 3890,3,36,573900 27 | 1100,3,60,249900 28 | 1458,3,29,464500 29 | 2526,3,13,469000 30 | 2200,3,28,475000 31 | 2637,3,25,299900 32 | 1839,2,40,349900 33 | 1000,1,5,169900 34 | 2040,4,75,314900 35 | 3137,3,67,579900 36 | 1811,4,24,285900 37 | 1437,3,50,249900 38 | 1239,3,22,229900 39 | 2132,4,28,345000 40 | 4215,4,66,549000 41 | 2162,4,43,287000 42 | 1664,2,40,368500 43 | 2238,3,37,329900 44 | 2567,4,57,314000 45 | 1200,3,76,299000 46 | 852,2,70,179900 47 | 1852,4,64,299900 48 | 1203,3,11,239500 49 | -------------------------------------------------------------------------------- /data/international-airline-passengers.csv: -------------------------------------------------------------------------------- 1 | "Month","Thousand Passengers" 2 | "1949-01",112 3 | "1949-02",118 4 | "1949-03",132 5 | "1949-04",129 6 | "1949-05",121 7 | "1949-06",135 8 | "1949-07",148 9 | "1949-08",148 10 | "1949-09",136 11 | "1949-10",119 12 | "1949-11",104 13 | "1949-12",118 14 | "1950-01",115 15 | "1950-02",126 16 | "1950-03",141 17 | "1950-04",135 18 | "1950-05",125 19 | "1950-06",149 20 | "1950-07",170 21 | "1950-08",170 22 | "1950-09",158 23 | "1950-10",133 24 | "1950-11",114 25 | "1950-12",140 26 | "1951-01",145 27 | "1951-02",150 28 | "1951-03",178 29 | "1951-04",163 30 | "1951-05",172 31 | "1951-06",178 32 | "1951-07",199 33 | "1951-08",199 34 | "1951-09",184 35 | "1951-10",162 36 | "1951-11",146 37 | "1951-12",166 38 | "1952-01",171 39 | "1952-02",180 40 | "1952-03",193 41 | "1952-04",181 42 | "1952-05",183 43 | "1952-06",218 44 | "1952-07",230 45 | "1952-08",242 46 | "1952-09",209 47 | "1952-10",191 48 | "1952-11",172 49 | "1952-12",194 50 | "1953-01",196 51 | "1953-02",196 52 | "1953-03",236 53 | "1953-04",235 54 | "1953-05",229 55 | "1953-06",243 56 | "1953-07",264 57 | "1953-08",272 58 | "1953-09",237 59 | "1953-10",211 60 | "1953-11",180 61 | "1953-12",201 62 | "1954-01",204 63 | "1954-02",188 64 | "1954-03",235 65 | "1954-04",227 66 | "1954-05",234 67 | "1954-06",264 68 | "1954-07",302 69 | "1954-08",293 70 | "1954-09",259 71 | "1954-10",229 72 | "1954-11",203 73 | "1954-12",229 74 | "1955-01",242 75 | "1955-02",233 76 | "1955-03",267 77 | "1955-04",269 78 | "1955-05",270 79 | "1955-06",315 80 | "1955-07",364 81 | "1955-08",347 82 | "1955-09",312 83 | "1955-10",274 84 | 
"1955-11",237 85 | "1955-12",278 86 | "1956-01",284 87 | "1956-02",277 88 | "1956-03",317 89 | "1956-04",313 90 | "1956-05",318 91 | "1956-06",374 92 | "1956-07",413 93 | "1956-08",405 94 | "1956-09",355 95 | "1956-10",306 96 | "1956-11",271 97 | "1956-12",306 98 | "1957-01",315 99 | "1957-02",301 100 | "1957-03",356 101 | "1957-04",348 102 | "1957-05",355 103 | "1957-06",422 104 | "1957-07",465 105 | "1957-08",467 106 | "1957-09",404 107 | "1957-10",347 108 | "1957-11",305 109 | "1957-12",336 110 | "1958-01",340 111 | "1958-02",318 112 | "1958-03",362 113 | "1958-04",348 114 | "1958-05",363 115 | "1958-06",435 116 | "1958-07",491 117 | "1958-08",505 118 | "1958-09",404 119 | "1958-10",359 120 | "1958-11",310 121 | "1958-12",337 122 | "1959-01",360 123 | "1959-02",342 124 | "1959-03",406 125 | "1959-04",396 126 | "1959-05",420 127 | "1959-06",472 128 | "1959-07",548 129 | "1959-08",559 130 | "1959-09",463 131 | "1959-10",407 132 | "1959-11",362 133 | "1959-12",405 134 | "1960-01",417 135 | "1960-02",391 136 | "1960-03",419 137 | "1960-04",461 138 | "1960-05",472 139 | "1960-06",535 140 | "1960-07",622 141 | "1960-08",606 142 | "1960-09",508 143 | "1960-10",461 144 | "1960-11",390 145 | "1960-12",432 -------------------------------------------------------------------------------- /data/iris.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3.0,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5.0,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5.0,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3.0,1.4,0.1,setosa 15 | 4.3,3.0,1.1,0.1,setosa 16 | 5.8,4.0,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1.0,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5.0,3.0,1.6,0.2,setosa 28 | 5.0,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.2,setosa 37 | 5.0,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.6,1.4,0.1,setosa 40 | 4.4,3.0,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5.0,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5.0,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3.0,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5.0,3.3,1.4,0.2,setosa 52 | 7.0,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4.0,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1.0,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5.0,2.0,3.5,1.0,versicolor 63 | 5.9,3.0,4.2,1.5,versicolor 64 | 6.0,2.2,4.0,1.0,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3.0,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1.0,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4.0,1.3,versicolor 
74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3.0,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3.0,5.0,1.7,versicolor 80 | 6.0,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1.0,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1.0,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6.0,2.7,5.1,1.6,versicolor 86 | 5.4,3.0,4.5,1.5,versicolor 87 | 6.0,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3.0,4.1,1.3,versicolor 91 | 5.5,2.5,4.0,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3.0,4.6,1.4,versicolor 94 | 5.8,2.6,4.0,1.2,versicolor 95 | 5.0,2.3,3.3,1.0,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3.0,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3.0,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6.0,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3.0,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3.0,5.8,2.2,virginica 107 | 7.6,3.0,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2.0,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3.0,5.5,2.1,virginica 115 | 5.7,2.5,5.0,2.0,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3.0,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6.0,2.2,5.0,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2.0,virginica 124 | 7.7,2.8,6.7,2.0,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6.0,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3.0,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3.0,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2.0,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3.0,6.1,2.3,virginica 138 | 6.3,3.4,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6.0,3.0,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3.0,5.2,2.3,virginica 148 | 6.3,2.5,5.0,1.9,virginica 149 | 6.5,3.0,5.2,2.0,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3.0,5.1,1.8,virginica 152 | -------------------------------------------------------------------------------- /data/iss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dataweekends/zero_to_deep_learning_video/efff3403504906c1e38c23f4d35fd3cb98af1950/data/iss.jpg -------------------------------------------------------------------------------- /data/sms.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dataweekends/zero_to_deep_learning_video/efff3403504906c1e38c23f4d35fd3cb98af1950/data/sms.wav -------------------------------------------------------------------------------- /data/us_retail_sales.csv: -------------------------------------------------------------------------------- 1 | YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC 1992,164083,164260,163747,164759,165617,166098,167305,167797,169407,170681,171025,172995 1993,175078,173770,172328,176766,178445,178201,180759,180692,181800,182910,184746,186339 
1994,185128,188077,191588,191632,190940,193196,193763,196157,197754,199579,199723,200670 1995,201583,198383,200230,201048,202993,205507,204959,206529,206978,206157,208661,210434 1996,208731,212011,213855,214644,216304,216059,216374,216355,219240,221039,220989,221898 1997,223524,225409,226136,224588,222906,226048,228738,229317,230284,229822,230486,231197 1998,231605,231664,233043,235976,237055,238958,237423,236412,238542,242531,244307,246577 1999,246891,249510,250657,252418,254738,255472,257441,260253,261352,261825,264883,269876 2000,268091,272020,275214,271004,271418,273440,272638,272943,277523,276973,275923,275736 2001,278916,278799,276468,280804,281540,280399,279522,281423,276095,294613,286960,283708 2002,283577,285061,284263,288820,284994,287401,290427,292582,288434,289634,291475,293819 2003,295294,291178,296347,295643,296395,299662,302788,307745,305916,304824,308551,307362 2004,309225,311427,316935,313531,318962,314971,318532,318945,324607,326680,327837,331877 2005,329187,332847,332517,337608,334551,344034,346715,342950,343711,344390,347047,347441 2006,357626,354614,356357,357320,356423,357763,359217,360497,358468,358435,359256,364300 2007,363462,364090,367714,366065,370887,367933,369472,370982,373138,375048,378371,375349 2008,375303,371661,371961,373877,375790,376584,374774,372091,366167,352900,339942,332065 2009,336977,335765,330219,331314,334290,339532,340286,346705,338356,341590,344911,346258 2010,346349,346972,354625,357086,354430,353830,354536,356560,359440,363855,367352,369299 2011,372041,375003,378472,380284,380398,383081,382966,383541,387313,390302,391478,391695 2012,395280,400181,401830,400095,399463,395723,397516,401937,405707,405598,407327,409468 2013,412207,416783,412514,413391,414207,415989,417962,417101,417682,419686,421056,422891 2014,419195,424657,429707,433082,434161,435587,435429,438598,437932,439829,441445,437996 2015,435881,433579,440463,440355,444400,444747,447112,447738,447641,446489,448724,450831 2016,446757,448895,447757,453397,454135,457409,457849,457722,462284,465321,466028,470616 2017,473104,471865,470844,,,,,,,,, -------------------------------------------------------------------------------- /data/user_visit_duration.csv: -------------------------------------------------------------------------------- 1 | Time (min),Buy 2 | 2.0,0 3 | 0.6833333333333333,0 4 | 3.216666666666667,1 5 | 0.9,0 6 | 1.5333333333333334,1 7 | 2.8833333333333333,1 8 | 0.8,0 9 | 1.4666666666666666,0 10 | 1.1166666666666667,0 11 | 0.6,0 12 | 1.35,1 13 | 3.183333333333333,1 14 | 2.7666666666666666,0 15 | 2.183333333333333,1 16 | 1.5,1 17 | 1.3333333333333333,1 18 | 1.5333333333333334,0 19 | 0.7833333333333333,0 20 | 2.9833333333333334,1 21 | 4.15,1 22 | 0.85,0 23 | 2.033333333333333,1 24 | 1.6,0 25 | 2.6166666666666667,1 26 | 2.683333333333333,1 27 | 1.95,0 28 | 0.4666666666666667,1 29 | 2.716666666666667,1 30 | 2.333333333333333,1 31 | 3.4166666666666665,1 32 | 0.26666666666666666,0 33 | 1.3833333333333333,1 34 | 0.5166666666666667,0 35 | 2.7,1 36 | 2.05,0 37 | 2.95,1 38 | 1.2333333333333334,0 39 | 3.6166666666666663,1 40 | 1.4333333333333331,1 41 | 2.066666666666667,0 42 | 2.066666666666667,1 43 | 1.5,0 44 | 2.433333333333333,0 45 | 2.95,1 46 | 2.216666666666667,1 47 | 0.9166666666666666,0 48 | 2.1,1 49 | 3.75,1 50 | 1.0,0 51 | 0.0,0 52 | 2.65,1 53 | 1.55,0 54 | 1.0666666666666669,0 55 | 2.0166666666666666,1 56 | 0.0,0 57 | 0.0,0 58 | 0.6666666666666666,0 59 | 2.5166666666666666,1 60 | 1.0666666666666669,0 61 | 1.25,0 62 | 2.95,1 63 | 0.0,0 64 | 1.9666666666666663,0 65 | 2.2,1 66 | 
2.9,1 67 | 3.85,1 68 | 2.3833333333333333,1 69 | 2.083333333333333,1 70 | 3.183333333333333,1 71 | 3.8666666666666663,1 72 | 2.183333333333333,0 73 | 2.833333333333333,1 74 | 2.7333333333333334,1 75 | 1.3833333333333333,0 76 | 1.1666666666666667,0 77 | 0.38333333333333336,0 78 | 1.1666666666666667,0 79 | 1.5166666666666666,0 80 | 3.216666666666667,1 81 | 1.1333333333333333,0 82 | 0.7,0 83 | 0.8166666666666667,0 84 | 3.883333333333333,1 85 | 2.216666666666667,1 86 | 0.75,0 87 | 2.566666666666667,0 88 | 0.0,0 89 | 0.0,0 90 | 1.7666666666666666,1 91 | 1.6833333333333331,1 92 | 0.21666666666666667,0 93 | 0.0,0 94 | 2.8833333333333333,1 95 | 2.466666666666667,1 96 | 1.2666666666666666,0 97 | 3.75,1 98 | 3.883333333333333,1 99 | 1.5666666666666669,0 100 | 1.6666666666666667,0 101 | 2.15,1 102 | -------------------------------------------------------------------------------- /data/wines.csv: -------------------------------------------------------------------------------- 1 | Class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280-OD315_of_diluted_wines,Proline 2 | 1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065 3 | 1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050 4 | 1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185 5 | 1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480 6 | 1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735 7 | 1,14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450 8 | 1,14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290 9 | 1,14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295 10 | 1,14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045 11 | 1,13.86,1.35,2.27,16,98,2.98,3.15,.22,1.85,7.22,1.01,3.55,1045 12 | 1,14.1,2.16,2.3,18,105,2.95,3.32,.22,2.38,5.75,1.25,3.17,1510 13 | 1,14.12,1.48,2.32,16.8,95,2.2,2.43,.26,1.57,5,1.17,2.82,1280 14 | 1,13.75,1.73,2.41,16,89,2.6,2.76,.29,1.81,5.6,1.15,2.9,1320 15 | 1,14.75,1.73,2.39,11.4,91,3.1,3.69,.43,2.81,5.4,1.25,2.73,1150 16 | 1,14.38,1.87,2.38,12,102,3.3,3.64,.29,2.96,7.5,1.2,3,1547 17 | 1,13.63,1.81,2.7,17.2,112,2.85,2.91,.3,1.46,7.3,1.28,2.88,1310 18 | 1,14.3,1.92,2.72,20,120,2.8,3.14,.33,1.97,6.2,1.07,2.65,1280 19 | 1,13.83,1.57,2.62,20,115,2.95,3.4,.4,1.72,6.6,1.13,2.57,1130 20 | 1,14.19,1.59,2.48,16.5,108,3.3,3.93,.32,1.86,8.7,1.23,2.82,1680 21 | 1,13.64,3.1,2.56,15.2,116,2.7,3.03,.17,1.66,5.1,.96,3.36,845 22 | 1,14.06,1.63,2.28,16,126,3,3.17,.24,2.1,5.65,1.09,3.71,780 23 | 1,12.93,3.8,2.65,18.6,102,2.41,2.41,.25,1.98,4.5,1.03,3.52,770 24 | 1,13.71,1.86,2.36,16.6,101,2.61,2.88,.27,1.69,3.8,1.11,4,1035 25 | 1,12.85,1.6,2.52,17.8,95,2.48,2.37,.26,1.46,3.93,1.09,3.63,1015 26 | 1,13.5,1.81,2.61,20,96,2.53,2.61,.28,1.66,3.52,1.12,3.82,845 27 | 1,13.05,2.05,3.22,25,124,2.63,2.68,.47,1.92,3.58,1.13,3.2,830 28 | 1,13.39,1.77,2.62,16.1,93,2.85,2.94,.34,1.45,4.8,.92,3.22,1195 29 | 1,13.3,1.72,2.14,17,94,2.4,2.19,.27,1.35,3.95,1.02,2.77,1285 30 | 1,13.87,1.9,2.8,19.4,107,2.95,2.97,.37,1.76,4.5,1.25,3.4,915 31 | 1,14.02,1.68,2.21,16,96,2.65,2.33,.26,1.98,4.7,1.04,3.59,1035 32 | 1,13.73,1.5,2.7,22.5,101,3,3.25,.29,2.38,5.7,1.19,2.71,1285 33 | 1,13.58,1.66,2.36,19.1,106,2.86,3.19,.22,1.95,6.9,1.09,2.88,1515 34 | 1,13.68,1.83,2.36,17.2,104,2.42,2.69,.42,1.97,3.84,1.23,2.87,990 35 | 1,13.76,1.53,2.7,19.5,132,2.95,2.74,.5,1.35,5.4,1.25,3,1235 36 | 1,13.51,1.8,2.65,19,110,2.35,2.53,.29,1.54,4.2,1.1,2.87,1095 37 | 
1,13.48,1.81,2.41,20.5,100,2.7,2.98,.26,1.86,5.1,1.04,3.47,920 38 | 1,13.28,1.64,2.84,15.5,110,2.6,2.68,.34,1.36,4.6,1.09,2.78,880 39 | 1,13.05,1.65,2.55,18,98,2.45,2.43,.29,1.44,4.25,1.12,2.51,1105 40 | 1,13.07,1.5,2.1,15.5,98,2.4,2.64,.28,1.37,3.7,1.18,2.69,1020 41 | 1,14.22,3.99,2.51,13.2,128,3,3.04,.2,2.08,5.1,.89,3.53,760 42 | 1,13.56,1.71,2.31,16.2,117,3.15,3.29,.34,2.34,6.13,.95,3.38,795 43 | 1,13.41,3.84,2.12,18.8,90,2.45,2.68,.27,1.48,4.28,.91,3,1035 44 | 1,13.88,1.89,2.59,15,101,3.25,3.56,.17,1.7,5.43,.88,3.56,1095 45 | 1,13.24,3.98,2.29,17.5,103,2.64,2.63,.32,1.66,4.36,.82,3,680 46 | 1,13.05,1.77,2.1,17,107,3,3,.28,2.03,5.04,.88,3.35,885 47 | 1,14.21,4.04,2.44,18.9,111,2.85,2.65,.3,1.25,5.24,.87,3.33,1080 48 | 1,14.38,3.59,2.28,16,102,3.25,3.17,.27,2.19,4.9,1.04,3.44,1065 49 | 1,13.9,1.68,2.12,16,101,3.1,3.39,.21,2.14,6.1,.91,3.33,985 50 | 1,14.1,2.02,2.4,18.8,103,2.75,2.92,.32,2.38,6.2,1.07,2.75,1060 51 | 1,13.94,1.73,2.27,17.4,108,2.88,3.54,.32,2.08,8.90,1.12,3.1,1260 52 | 1,13.05,1.73,2.04,12.4,92,2.72,3.27,.17,2.91,7.2,1.12,2.91,1150 53 | 1,13.83,1.65,2.6,17.2,94,2.45,2.99,.22,2.29,5.6,1.24,3.37,1265 54 | 1,13.82,1.75,2.42,14,111,3.88,3.74,.32,1.87,7.05,1.01,3.26,1190 55 | 1,13.77,1.9,2.68,17.1,115,3,2.79,.39,1.68,6.3,1.13,2.93,1375 56 | 1,13.74,1.67,2.25,16.4,118,2.6,2.9,.21,1.62,5.85,.92,3.2,1060 57 | 1,13.56,1.73,2.46,20.5,116,2.96,2.78,.2,2.45,6.25,.98,3.03,1120 58 | 1,14.22,1.7,2.3,16.3,118,3.2,3,.26,2.03,6.38,.94,3.31,970 59 | 1,13.29,1.97,2.68,16.8,102,3,3.23,.31,1.66,6,1.07,2.84,1270 60 | 1,13.72,1.43,2.5,16.7,108,3.4,3.67,.19,2.04,6.8,.89,2.87,1285 61 | 2,12.37,.94,1.36,10.6,88,1.98,.57,.28,.42,1.95,1.05,1.82,520 62 | 2,12.33,1.1,2.28,16,101,2.05,1.09,.63,.41,3.27,1.25,1.67,680 63 | 2,12.64,1.36,2.02,16.8,100,2.02,1.41,.53,.62,5.75,.98,1.59,450 64 | 2,13.67,1.25,1.92,18,94,2.1,1.79,.32,.73,3.8,1.23,2.46,630 65 | 2,12.37,1.13,2.16,19,87,3.5,3.1,.19,1.87,4.45,1.22,2.87,420 66 | 2,12.17,1.45,2.53,19,104,1.89,1.75,.45,1.03,2.95,1.45,2.23,355 67 | 2,12.37,1.21,2.56,18.1,98,2.42,2.65,.37,2.08,4.6,1.19,2.3,678 68 | 2,13.11,1.01,1.7,15,78,2.98,3.18,.26,2.28,5.3,1.12,3.18,502 69 | 2,12.37,1.17,1.92,19.6,78,2.11,2,.27,1.04,4.68,1.12,3.48,510 70 | 2,13.34,.94,2.36,17,110,2.53,1.3,.55,.42,3.17,1.02,1.93,750 71 | 2,12.21,1.19,1.75,16.8,151,1.85,1.28,.14,2.5,2.85,1.28,3.07,718 72 | 2,12.29,1.61,2.21,20.4,103,1.1,1.02,.37,1.46,3.05,.906,1.82,870 73 | 2,13.86,1.51,2.67,25,86,2.95,2.86,.21,1.87,3.38,1.36,3.16,410 74 | 2,13.49,1.66,2.24,24,87,1.88,1.84,.27,1.03,3.74,.98,2.78,472 75 | 2,12.99,1.67,2.6,30,139,3.3,2.89,.21,1.96,3.35,1.31,3.5,985 76 | 2,11.96,1.09,2.3,21,101,3.38,2.14,.13,1.65,3.21,.99,3.13,886 77 | 2,11.66,1.88,1.92,16,97,1.61,1.57,.34,1.15,3.8,1.23,2.14,428 78 | 2,13.03,.9,1.71,16,86,1.95,2.03,.24,1.46,4.6,1.19,2.48,392 79 | 2,11.84,2.89,2.23,18,112,1.72,1.32,.43,.95,2.65,.96,2.52,500 80 | 2,12.33,.99,1.95,14.8,136,1.9,1.85,.35,2.76,3.4,1.06,2.31,750 81 | 2,12.7,3.87,2.4,23,101,2.83,2.55,.43,1.95,2.57,1.19,3.13,463 82 | 2,12,.92,2,19,86,2.42,2.26,.3,1.43,2.5,1.38,3.12,278 83 | 2,12.72,1.81,2.2,18.8,86,2.2,2.53,.26,1.77,3.9,1.16,3.14,714 84 | 2,12.08,1.13,2.51,24,78,2,1.58,.4,1.4,2.2,1.31,2.72,630 85 | 2,13.05,3.86,2.32,22.5,85,1.65,1.59,.61,1.62,4.8,.84,2.01,515 86 | 2,11.84,.89,2.58,18,94,2.2,2.21,.22,2.35,3.05,.79,3.08,520 87 | 2,12.67,.98,2.24,18,99,2.2,1.94,.3,1.46,2.62,1.23,3.16,450 88 | 2,12.16,1.61,2.31,22.8,90,1.78,1.69,.43,1.56,2.45,1.33,2.26,495 89 | 2,11.65,1.67,2.62,26,88,1.92,1.61,.4,1.34,2.6,1.36,3.21,562 90 | 
2,11.64,2.06,2.46,21.6,84,1.95,1.69,.48,1.35,2.8,1,2.75,680 91 | 2,12.08,1.33,2.3,23.6,70,2.2,1.59,.42,1.38,1.74,1.07,3.21,625 92 | 2,12.08,1.83,2.32,18.5,81,1.6,1.5,.52,1.64,2.4,1.08,2.27,480 93 | 2,12,1.51,2.42,22,86,1.45,1.25,.5,1.63,3.6,1.05,2.65,450 94 | 2,12.69,1.53,2.26,20.7,80,1.38,1.46,.58,1.62,3.05,.96,2.06,495 95 | 2,12.29,2.83,2.22,18,88,2.45,2.25,.25,1.99,2.15,1.15,3.3,290 96 | 2,11.62,1.99,2.28,18,98,3.02,2.26,.17,1.35,3.25,1.16,2.96,345 97 | 2,12.47,1.52,2.2,19,162,2.5,2.27,.32,3.28,2.6,1.16,2.63,937 98 | 2,11.81,2.12,2.74,21.5,134,1.6,.99,.14,1.56,2.5,.95,2.26,625 99 | 2,12.29,1.41,1.98,16,85,2.55,2.5,.29,1.77,2.9,1.23,2.74,428 100 | 2,12.37,1.07,2.1,18.5,88,3.52,3.75,.24,1.95,4.5,1.04,2.77,660 101 | 2,12.29,3.17,2.21,18,88,2.85,2.99,.45,2.81,2.3,1.42,2.83,406 102 | 2,12.08,2.08,1.7,17.5,97,2.23,2.17,.26,1.4,3.3,1.27,2.96,710 103 | 2,12.6,1.34,1.9,18.5,88,1.45,1.36,.29,1.35,2.45,1.04,2.77,562 104 | 2,12.34,2.45,2.46,21,98,2.56,2.11,.34,1.31,2.8,.8,3.38,438 105 | 2,11.82,1.72,1.88,19.5,86,2.5,1.64,.37,1.42,2.06,.94,2.44,415 106 | 2,12.51,1.73,1.98,20.5,85,2.2,1.92,.32,1.48,2.94,1.04,3.57,672 107 | 2,12.42,2.55,2.27,22,90,1.68,1.84,.66,1.42,2.7,.86,3.3,315 108 | 2,12.25,1.73,2.12,19,80,1.65,2.03,.37,1.63,3.4,1,3.17,510 109 | 2,12.72,1.75,2.28,22.5,84,1.38,1.76,.48,1.63,3.3,.88,2.42,488 110 | 2,12.22,1.29,1.94,19,92,2.36,2.04,.39,2.08,2.7,.86,3.02,312 111 | 2,11.61,1.35,2.7,20,94,2.74,2.92,.29,2.49,2.65,.96,3.26,680 112 | 2,11.46,3.74,1.82,19.5,107,3.18,2.58,.24,3.58,2.9,.75,2.81,562 113 | 2,12.52,2.43,2.17,21,88,2.55,2.27,.26,1.22,2,.9,2.78,325 114 | 2,11.76,2.68,2.92,20,103,1.75,2.03,.6,1.05,3.8,1.23,2.5,607 115 | 2,11.41,.74,2.5,21,88,2.48,2.01,.42,1.44,3.08,1.1,2.31,434 116 | 2,12.08,1.39,2.5,22.5,84,2.56,2.29,.43,1.04,2.9,.93,3.19,385 117 | 2,11.03,1.51,2.2,21.5,85,2.46,2.17,.52,2.01,1.9,1.71,2.87,407 118 | 2,11.82,1.47,1.99,20.8,86,1.98,1.6,.3,1.53,1.95,.95,3.33,495 119 | 2,12.42,1.61,2.19,22.5,108,2,2.09,.34,1.61,2.06,1.06,2.96,345 120 | 2,12.77,3.43,1.98,16,80,1.63,1.25,.43,.83,3.4,.7,2.12,372 121 | 2,12,3.43,2,19,87,2,1.64,.37,1.87,1.28,.93,3.05,564 122 | 2,11.45,2.4,2.42,20,96,2.9,2.79,.32,1.83,3.25,.8,3.39,625 123 | 2,11.56,2.05,3.23,28.5,119,3.18,5.08,.47,1.87,6,.93,3.69,465 124 | 2,12.42,4.43,2.73,26.5,102,2.2,2.13,.43,1.71,2.08,.92,3.12,365 125 | 2,13.05,5.8,2.13,21.5,86,2.62,2.65,.3,2.01,2.6,.73,3.1,380 126 | 2,11.87,4.31,2.39,21,82,2.86,3.03,.21,2.91,2.8,.75,3.64,380 127 | 2,12.07,2.16,2.17,21,85,2.6,2.65,.37,1.35,2.76,.86,3.28,378 128 | 2,12.43,1.53,2.29,21.5,86,2.74,3.15,.39,1.77,3.94,.69,2.84,352 129 | 2,11.79,2.13,2.78,28.5,92,2.13,2.24,.58,1.76,3,.97,2.44,466 130 | 2,12.37,1.63,2.3,24.5,88,2.22,2.45,.4,1.9,2.12,.89,2.78,342 131 | 2,12.04,4.3,2.38,22,80,2.1,1.75,.42,1.35,2.6,.79,2.57,580 132 | 3,12.86,1.35,2.32,18,122,1.51,1.25,.21,.94,4.1,.76,1.29,630 133 | 3,12.88,2.99,2.4,20,104,1.3,1.22,.24,.83,5.4,.74,1.42,530 134 | 3,12.81,2.31,2.4,24,98,1.15,1.09,.27,.83,5.7,.66,1.36,560 135 | 3,12.7,3.55,2.36,21.5,106,1.7,1.2,.17,.84,5,.78,1.29,600 136 | 3,12.51,1.24,2.25,17.5,85,2,.58,.6,1.25,5.45,.75,1.51,650 137 | 3,12.6,2.46,2.2,18.5,94,1.62,.66,.63,.94,7.1,.73,1.58,695 138 | 3,12.25,4.72,2.54,21,89,1.38,.47,.53,.8,3.85,.75,1.27,720 139 | 3,12.53,5.51,2.64,25,96,1.79,.6,.63,1.1,5,.82,1.69,515 140 | 3,13.49,3.59,2.19,19.5,88,1.62,.48,.58,.88,5.7,.81,1.82,580 141 | 3,12.84,2.96,2.61,24,101,2.32,.6,.53,.81,4.92,.89,2.15,590 142 | 3,12.93,2.81,2.7,21,96,1.54,.5,.53,.75,4.6,.77,2.31,600 143 | 3,13.36,2.56,2.35,20,89,1.4,.5,.37,.64,5.6,.7,2.47,780 144 | 
3,13.52,3.17,2.72,23.5,97,1.55,.52,.5,.55,4.35,.89,2.06,520 145 | 3,13.62,4.95,2.35,20,92,2,.8,.47,1.02,4.4,.91,2.05,550 146 | 3,12.25,3.88,2.2,18.5,112,1.38,.78,.29,1.14,8.21,.65,2,855 147 | 3,13.16,3.57,2.15,21,102,1.5,.55,.43,1.3,4,.6,1.68,830 148 | 3,13.88,5.04,2.23,20,80,.98,.34,.4,.68,4.9,.58,1.33,415 149 | 3,12.87,4.61,2.48,21.5,86,1.7,.65,.47,.86,7.65,.54,1.86,625 150 | 3,13.32,3.24,2.38,21.5,92,1.93,.76,.45,1.25,8.42,.55,1.62,650 151 | 3,13.08,3.9,2.36,21.5,113,1.41,1.39,.34,1.14,9.40,.57,1.33,550 152 | 3,13.5,3.12,2.62,24,123,1.4,1.57,.22,1.25,8.60,.59,1.3,500 153 | 3,12.79,2.67,2.48,22,112,1.48,1.36,.24,1.26,10.8,.48,1.47,480 154 | 3,13.11,1.9,2.75,25.5,116,2.2,1.28,.26,1.56,7.1,.61,1.33,425 155 | 3,13.23,3.3,2.28,18.5,98,1.8,.83,.61,1.87,10.52,.56,1.51,675 156 | 3,12.58,1.29,2.1,20,103,1.48,.58,.53,1.4,7.6,.58,1.55,640 157 | 3,13.17,5.19,2.32,22,93,1.74,.63,.61,1.55,7.9,.6,1.48,725 158 | 3,13.84,4.12,2.38,19.5,89,1.8,.83,.48,1.56,9.01,.57,1.64,480 159 | 3,12.45,3.03,2.64,27,97,1.9,.58,.63,1.14,7.5,.67,1.73,880 160 | 3,14.34,1.68,2.7,25,98,2.8,1.31,.53,2.7,13,.57,1.96,660 161 | 3,13.48,1.67,2.64,22.5,89,2.6,1.1,.52,2.29,11.75,.57,1.78,620 162 | 3,12.36,3.83,2.38,21,88,2.3,.92,.5,1.04,7.65,.56,1.58,520 163 | 3,13.69,3.26,2.54,20,107,1.83,.56,.5,.8,5.88,.96,1.82,680 164 | 3,12.85,3.27,2.58,22,106,1.65,.6,.6,.96,5.58,.87,2.11,570 165 | 3,12.96,3.45,2.35,18.5,106,1.39,.7,.4,.94,5.28,.68,1.75,675 166 | 3,13.78,2.76,2.3,22,90,1.35,.68,.41,1.03,9.58,.7,1.68,615 167 | 3,13.73,4.36,2.26,22.5,88,1.28,.47,.52,1.15,6.62,.78,1.75,520 168 | 3,13.45,3.7,2.6,23,111,1.7,.92,.43,1.46,10.68,.85,1.56,695 169 | 3,12.82,3.37,2.3,19.5,88,1.48,.66,.4,.97,10.26,.72,1.75,685 170 | 3,13.58,2.58,2.69,24.5,105,1.55,.84,.39,1.54,8.66,.74,1.8,750 171 | 3,13.4,4.6,2.86,25,112,1.98,.96,.27,1.11,8.5,.67,1.92,630 172 | 3,12.2,3.03,2.32,19,96,1.25,.49,.4,.73,5.5,.66,1.83,510 173 | 3,12.77,2.39,2.28,19.5,86,1.39,.51,.48,.64,9.899999,.57,1.63,470 174 | 3,14.16,2.51,2.48,20,91,1.68,.7,.44,1.24,9.7,.62,1.71,660 175 | 3,13.71,5.65,2.45,20.5,95,1.68,.61,.52,1.06,7.7,.64,1.74,740 176 | 3,13.4,3.91,2.48,23,102,1.8,.75,.43,1.41,7.3,.7,1.56,750 177 | 3,13.27,4.28,2.26,20,120,1.59,.69,.43,1.35,10.2,.59,1.56,835 178 | 3,13.17,2.59,2.37,20,120,1.65,.68,.53,1.46,9.3,.6,1.62,840 179 | 3,14.13,4.1,2.74,24.5,96,2.05,.76,.56,1.35,9.2,.61,1.6,560 -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: ztdl 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.7.* 6 | - bz2file==0.98 7 | - cython==0.29.* 8 | - pip==21.0.* 9 | - numpy==1.19.* 10 | - jupyter==1.0.* 11 | - matplotlib==3.3.* 12 | - setuptools==52.0.* 13 | - scikit-learn==0.24.* 14 | - scipy==1.6.* 15 | - pandas==1.2.* 16 | - pillow==8.2.* 17 | - seaborn==0.11.* 18 | - pytest==6.2.* 19 | - twisted==21.2.* 20 | - pip: 21 | - tensorflow==2.5.* 22 | -------------------------------------------------------------------------------- /solutions/2 Data exploration Exercises Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "\n", 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "\n", 14 | "import pandas as pd" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | 
"## Exercise 1\n", 22 | "- load the dataset: `../data/international-airline-passengers.csv`\n", 23 | "- inspect it using the `.info()` and `.head()` commands\n", 24 | "- use the function `pd.to_datetime()` to change the column type of 'Month' to a datatime type\n", 25 | "- set the index of df to be a datetime index using the column 'Month' and the `df.set_index()` method\n", 26 | "- choose the appropriate plot and display the data\n", 27 | "- choose appropriate scale\n", 28 | "- label the axes" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# - load the dataset: ../data/international-airline-passengers.csv\n", 38 | "df = pd.read_csv('../data/international-airline-passengers.csv')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# - inspect it using the .info() and .head() commands\n", 48 | "df.info()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "df.head()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# - use the function to_datetime() to change the column type of 'Month' to a datatime type\n", 67 | "# - set the index of df to be a datetime index using the column 'Month' and tthe set_index() method\n", 68 | "\n", 69 | "df['Month'] = pd.to_datetime(df['Month'])\n", 70 | "df = df.set_index('Month')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "df.head()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# - choose the appropriate plot and display the data\n", 89 | "# - choose appropriate scale\n", 90 | "# - label the axes\n", 91 | "\n", 92 | "df.plot();" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Exercise 2\n", 100 | "- load the dataset: `../data/weight-height.csv`\n", 101 | "- inspect it\n", 102 | "- plot it using a scatter plot with Weight as a function of Height\n", 103 | "- plot the male and female populations with 2 different colors on a new scatter plot\n", 104 | "- remember to label the axes" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# - load the dataset: ../data/weight-height.csv\n", 114 | "# - inspect it\n", 115 | "df = pd.read_csv('../data/weight-height.csv')\n", 116 | "df.head()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "df.info()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "df.describe()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "scrolled": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "df['Gender'].value_counts()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# - plot it using a scatter plot with Weight as a function of Height\n", 155 | "_ = 
df.plot(kind='scatter', x='Height', y='Weight');" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# - plot the male and female populations with 2 different colors on a new scatter plot\n", 165 | "# - remember to label the axes\n", 166 | "\n", 167 | "# this can be done in several ways, showing 2 here:\n", 168 | "males = df[df['Gender'] == 'Male']\n", 169 | "females = df.query('Gender == \"Female\"')\n", 170 | "fig, ax = plt.subplots()\n", 171 | "\n", 172 | "males.plot(kind='scatter', x='Height', y='Weight',\n", 173 | " ax=ax, color='blue', alpha=0.3,\n", 174 | " title='Male & Female Populations')\n", 175 | "\n", 176 | "females.plot(kind='scatter', x='Height', y='Weight',\n", 177 | " ax=ax, color='red', alpha=0.3);" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "df['Gendercolor'] = df['Gender'].map({'Male': 'blue', 'Female': 'red'})\n", 187 | "df.head()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "df.plot(kind='scatter', \n", 197 | " x='Height',\n", 198 | " y='Weight',\n", 199 | " c=df['Gendercolor'],\n", 200 | " alpha=0.3,\n", 201 | " title='Male & Female Populations');" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "fig, ax = plt.subplots()\n", 211 | "ax.plot(males['Height'], males['Weight'], 'ob', \n", 212 | " females['Height'], females['Weight'], 'or', alpha=0.3)\n", 213 | "plt.xlabel('Height')\n", 214 | "plt.ylabel('Weight')\n", 215 | "plt.title('Male & Female Populations');" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": { 221 | "collapsed": true 222 | }, 223 | "source": [ 224 | "## Exercise 3\n", 225 | "- plot the histogram of the heights for males and for females on the same plot\n", 226 | "- use alpha to control transparency in the plot command\n", 227 | "- plot a vertical line at the mean of each population using `plt.axvline()`" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "males['Height'].plot(kind='hist',\n", 237 | " bins=50,\n", 238 | " range=(50, 80),\n", 239 | " alpha=0.3,\n", 240 | " color='blue')\n", 241 | "\n", 242 | "females['Height'].plot(kind='hist',\n", 243 | " bins=50,\n", 244 | " range=(50, 80),\n", 245 | " alpha=0.3,\n", 246 | " color='red')\n", 247 | "\n", 248 | "plt.title('Height distribution')\n", 249 | "plt.legend([\"Males\", \"Females\"])\n", 250 | "plt.xlabel(\"Height (in)\")\n", 251 | "\n", 252 | "\n", 253 | "plt.axvline(males['Height'].mean(), color='blue', linewidth=2)\n", 254 | "plt.axvline(females['Height'].mean(), color='red', linewidth=2);" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "males['Height'].plot(kind='hist',\n", 264 | " bins=200,\n", 265 | " range=(50, 80),\n", 266 | " alpha=0.3,\n", 267 | " color='blue',\n", 268 | " cumulative=True,\n", 269 | " density=True)\n", 270 | "\n", 271 | "females['Height'].plot(kind='hist',\n", 272 | " bins=200,\n", 273 | " range=(50, 80),\n", 274 | " alpha=0.3,\n", 275 | " color='red',\n", 276 | " cumulative=True,\n", 277 | " density=True)\n", 278 | "\n", 279 | 
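"# horizontal guides at cumulative fractions 0.2, 0.5 and 0.8: where they cross each curve marks that population's 20th, 50th and 80th percentiles\n",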
"plt.title('Height distribution')\n", 280 | "plt.legend([\"Males\", \"Females\"])\n", 281 | "plt.xlabel(\"Heigth (in)\")\n", 282 | "\n", 283 | "plt.axhline(0.8)\n", 284 | "plt.axhline(0.5)\n", 285 | "plt.axhline(0.2);" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "## Exercise 4\n", 293 | "- plot the weights of the males and females using a box plot\n", 294 | "- which one is easier to read?\n", 295 | "- (remember to put in titles, axes and legends)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "dfpvt = df.pivot(columns = 'Gender', values = 'Weight')" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "dfpvt.head()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "dfpvt.info()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "dfpvt.plot(kind='box')\n", 332 | "plt.title('Weight Box Plot')\n", 333 | "plt.ylabel(\"Weight (lbs)\");" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## Exercise 5\n", 341 | "- load the dataset: `../data/titanic-train.csv`\n", 342 | "- learn about scattermatrix here: http://pandas.pydata.org/pandas-docs/stable/visualization.html\n", 343 | "- display the data using a scattermatrix" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "df = pd.read_csv('../data/titanic-train.csv')\n", 353 | "df.head()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "from pandas.plotting import scatter_matrix" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "_ = scatter_matrix(df.drop('PassengerId', axis=1), figsize=(10, 10))" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [] 380 | } 381 | ], 382 | "metadata": { 383 | "anaconda-cloud": {}, 384 | "kernelspec": { 385 | "display_name": "Python 3", 386 | "language": "python", 387 | "name": "python3" 388 | }, 389 | "language_info": { 390 | "codemirror_mode": { 391 | "name": "ipython", 392 | "version": 3 393 | }, 394 | "file_extension": ".py", 395 | "mimetype": "text/x-python", 396 | "name": "python", 397 | "nbconvert_exporter": "python", 398 | "pygments_lexer": "ipython3", 399 | "version": "3.7.10" 400 | } 401 | }, 402 | "nbformat": 4, 403 | "nbformat_minor": 1 404 | } 405 | -------------------------------------------------------------------------------- /solutions/3 Machine Learning Exercises Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Machine Learning Exercises Solution" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import pandas 
as pd\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Exercise 1\n", 27 | "\n", 28 | "You've just been hired at a real estate investment firm and they would like you to build a model for pricing houses. You are given a dataset that contains data for house prices and a few features like number of bedrooms, size in square feet and age of the house. Let's see if you can build a model that is able to predict the price. In this exercise we extend what we have learned about linear regression to a dataset with more than one feature. Here are the steps to complete it:\n", 29 | "\n", 30 | "1. Load the dataset ../data/housing-data.csv\n", 31 | "- plot the histograms for each feature\n", 32 | "- create 2 variables called X and y: X shall be a matrix with 3 columns (sqft,bdrms,age) and y shall be a vector with 1 column (price)\n", 33 | "- create a linear regression model in Keras with the appropriate number of inputs and output\n", 34 | "- split the data into train and test with a 20% test size\n", 35 | "- train the model on the training set and check its accuracy on training and test set\n", 36 | "- how's your model doing? Is the loss growing smaller?\n", 37 | "- try to improve your model with these experiments:\n", 38 | " - normalize the input features with one of the rescaling techniques mentioned above\n", 39 | " - use a different value for the learning rate of your model\n", 40 | " - use a different optimizer\n", 41 | "- once you're satisfied with training, check the R2score on the test set" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Load the dataset ../data/housing-data.csv\n", 51 | "df = pd.read_csv('../data/housing-data.csv')\n", 52 | "df.head()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "df.columns" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# plot the histograms for each feature\n", 71 | "plt.figure(figsize=(15, 5))\n", 72 | "for i, feature in enumerate(df.columns):\n", 73 | " plt.subplot(1, 4, i+1)\n", 74 | " df[feature].plot(kind='hist', title=feature)\n", 75 | " plt.xlabel(feature)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# create 2 variables called X and y:\n", 85 | "# X shall be a matrix with 3 columns (sqft,bdrms,age)\n", 86 | "# and y shall be a vector with 1 column (price)\n", 87 | "X = df[['sqft', 'bdrms', 'age']].values\n", 88 | "y = df['price'].values" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "X" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "y" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "from tensorflow.keras.models import Sequential\n", 116 | "from tensorflow.keras.layers import Dense\n", 117 | "from tensorflow.keras.optimizers import Adam" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "# create a 
linear regression model in Keras\n", 127 | "# with the appropriate number of inputs and output\n", 128 | "model = Sequential()\n", 129 | "model.add(Dense(1, input_shape=(3,)))\n", 130 | "model.compile(Adam(learning_rate=0.8), 'mean_squared_error')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "from sklearn.model_selection import train_test_split" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# split the data into train and test with a 20% test size\n", 149 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "len(X_train)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "len(X)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# train the model on the training set and check its accuracy on training and test set\n", 177 | "# how's your model doing? Is the loss growing smaller?\n", 178 | "model.fit(X_train, y_train, epochs=10)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "df.describe()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "from sklearn.metrics import r2_score" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# check the R2score on training and test set (probably very bad)\n", 206 | "\n", 207 | "y_train_pred = model.predict(X_train)\n", 208 | "y_test_pred = model.predict(X_test)\n", 209 | "\n", 210 | "print(\"The R2 score on the Train set is:\\t{:0.3f}\".format(r2_score(y_train, y_train_pred)))\n", 211 | "print(\"The R2 score on the Test set is:\\t{:0.3f}\".format(r2_score(y_test, y_test_pred)))" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# try to improve your model with these experiments:\n", 221 | "# - normalize the input features with one of the rescaling techniques mentioned above\n", 222 | "# - use a different value for the learning rate of your model\n", 223 | "# - use a different optimizer\n", 224 | "df['sqft1000'] = df['sqft']/1000.0\n", 225 | "df['age10'] = df['age']/10.0\n", 226 | "df['price100k'] = df['price']/1e5" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "X = df[['sqft1000', 'bdrms', 'age10']].values\n", 236 | "y = df['price100k'].values" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "model = Sequential()\n", 255 | "model.add(Dense(1, input_dim=3))\n", 
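"# with sqft, age and price rescaled to O(1) values (sqft1000, age10, price100k),\n", "# a moderate learning rate like 0.1 should converge much more smoothly\n",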
256 | "model.compile(Adam(learning_rate=0.1), 'mean_squared_error')\n", 257 | "model.fit(X_train, y_train, epochs=20)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "# once you're satisfied with training, check the R2score on the test set\n", 267 | "\n", 268 | "y_train_pred = model.predict(X_train)\n", 269 | "y_test_pred = model.predict(X_test)\n", 270 | "\n", 271 | "print(\"The R2 score on the Train set is:\\t{:0.3f}\".format(r2_score(y_train, y_train_pred)))\n", 272 | "print(\"The R2 score on the Test set is:\\t{:0.3f}\".format(r2_score(y_test, y_test_pred)))" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "model.fit(X_train, y_train, epochs=40, verbose=0)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# once you're satisfied with training, check the R2score on the test set\n", 291 | "\n", 292 | "y_train_pred = model.predict(X_train)\n", 293 | "y_test_pred = model.predict(X_test)\n", 294 | "\n", 295 | "print(\"The R2 score on the Train set is:\\t{:0.3f}\".format(r2_score(y_train, y_train_pred)))\n", 296 | "print(\"The R2 score on the Test set is:\\t{:0.3f}\".format(r2_score(y_test, y_test_pred)))" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## Exercise 2\n", 304 | "\n", 305 | "Your boss was extremely happy with your work on the housing price prediction model and decided to entrust you with a more challenging task. They've seen a lot of people leave the company recently and they would like to understand why that's happening. They have collected historical data on employees and they would like you to build a model that is able to predict which employee will leave next. The would like a model that is better than random guessing. They also prefer false negatives than false positives, in this first phase. Fields in the dataset include:\n", 306 | "\n", 307 | "- Employee satisfaction level\n", 308 | "- Last evaluation\n", 309 | "- Number of projects\n", 310 | "- Average monthly hours\n", 311 | "- Time spent at the company\n", 312 | "- Whether they have had a work accident\n", 313 | "- Whether they have had a promotion in the last 5 years\n", 314 | "- Department\n", 315 | "- Salary\n", 316 | "- Whether the employee has left\n", 317 | "\n", 318 | "Your goal is to predict the binary outcome variable `left` using the rest of the data. Since the outcome is binary, this is a classification problem. Here are some things you may want to try out:\n", 319 | "\n", 320 | "1. load the dataset at ../data/HR_comma_sep.csv, inspect it with `.head()`, `.info()` and `.describe()`.\n", 321 | "- Establish a benchmark: what would be your accuracy score if you predicted everyone stay?\n", 322 | "- Check if any feature needs rescaling. You may plot a histogram of the feature to decide which rescaling method is more appropriate.\n", 323 | "- convert the categorical features into binary dummy columns. 
You will then have to combine them with the numerical features using `pd.concat`.\n", 324 | "- do the usual train/test split with a 20% test size\n", 325 | "- play around with learning rate and optimizer\n", 326 | "- check the confusion matrix, precision and recall\n", 327 | "- check if you still get the same results if you use a 5-Fold cross validation on all the data\n", 328 | "- Is the model good enough for your boss?\n", 329 | "\n", 330 | "As you will see in this exercise, a logistic regression model is not good enough to help your boss. In the next chapter we will learn how to go beyond linear models.\n", 331 | "\n", 332 | "This dataset comes from https://www.kaggle.com/ludobenistant/hr-analytics/ and is released under [CC BY-SA 4.0 License](https://creativecommons.org/licenses/by-sa/4.0/)." 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "# load the dataset at ../data/HR_comma_sep.csv, inspect it with `.head()`, `.info()` and `.describe()`.\n", 342 | "\n", 343 | "df = pd.read_csv('../data/HR_comma_sep.csv')" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "df.head()" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "df.info()" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "df.describe()" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "# Establish a benchmark: what would be your accuracy score if you predicted that everyone stays?\n", 380 | "\n", 381 | "df.left.value_counts() / len(df)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "Predicting 0 all the time would yield an accuracy of 76%." 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "# Check if any feature needs rescaling.\n", 398 | "# You may plot a histogram of the feature to decide which rescaling method is more appropriate.\n", 399 | "df['average_montly_hours'].plot(kind='hist');" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "df['average_montly_hours_100'] = df['average_montly_hours']/100.0" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "df['average_montly_hours_100'].plot(kind='hist');" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "df['time_spend_company'].plot(kind='hist');" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "# convert the categorical features into binary dummy columns.\n", 436 | "# You will then have to combine them with\n", 437 | "# the numerical features using `pd.concat`.\n", 438 | "df_dummies = pd.get_dummies(df[['sales', 'salary']])" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | 
"outputs": [], 446 | "source": [ 447 | "df_dummies.head()\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "df.columns" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "X = pd.concat([df[['satisfaction_level', 'last_evaluation', 'number_project',\n", 466 | " 'time_spend_company', 'Work_accident',\n", 467 | " 'promotion_last_5years', 'average_montly_hours_100']],\n", 468 | " df_dummies], axis=1).values\n", 469 | "y = df['left'].values" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "X.shape" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "# do the usual train/test split with a 20% test size\n", 488 | "\n", 489 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "# play around with learning rate and optimizer\n", 499 | "\n", 500 | "model = Sequential()\n", 501 | "model.add(Dense(1, input_dim=20, activation='sigmoid'))\n", 502 | "model.compile(Adam(learning_rate=0.5), 'binary_crossentropy', metrics=['accuracy'])" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "model.summary()" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "model.fit(X_train, y_train, epochs=10)" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "y_test_pred = model.predict_classes(X_test)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [ 538 | "from sklearn.metrics import confusion_matrix, classification_report" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "def pretty_confusion_matrix(y_true, y_pred, labels=[\"False\", \"True\"]):\n", 548 | " cm = confusion_matrix(y_true, y_pred)\n", 549 | " pred_labels = ['Predicted '+ l for l in labels]\n", 550 | " df = pd.DataFrame(cm, index=labels, columns=pred_labels)\n", 551 | " return df" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "# check the confusion matrix, precision and recall\n", 561 | "\n", 562 | "pretty_confusion_matrix(y_test, y_test_pred, labels=['Stay', 'Leave'])" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "print(classification_report(y_test, y_test_pred))" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "from tensorflow.keras.wrappers.scikit_learn import KerasClassifier" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": 
[], 588 | "source": [ 589 | "# check if you still get the same results if you use a 5-Fold cross validation on all the data\n", 590 | "\n", 591 | "def build_logistic_regression_model():\n", 592 | " model = Sequential()\n", 593 | " model.add(Dense(1, input_dim=20, activation='sigmoid'))\n", 594 | " model.compile(Adam(learning_rate=0.5), 'binary_crossentropy', metrics=['accuracy'])\n", 595 | " return model\n", 596 | "\n", 597 | "model = KerasClassifier(build_fn=build_logistic_regression_model,\n", 598 | " epochs=10, verbose=0)" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "from sklearn.model_selection import KFold, cross_val_score" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "cv = KFold(5, shuffle=True)\n", 617 | "scores = cross_val_score(model, X, y, cv=cv)\n", 618 | "\n", 619 | "print(\"The cross validation accuracy is {:0.4f} ± {:0.4f}\".format(scores.mean(), scores.std()))" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "scores" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "# Is the model good enough for your boss?" 638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "metadata": {}, 643 | "source": [ 644 | "No, the model is not good enough for my boss, since it performs no better than the benchmark." 645 | ] 646 | } 647 | ], 648 | "metadata": { 649 | "kernelspec": { 650 | "display_name": "Python 3", 651 | "language": "python", 652 | "name": "python3" 653 | }, 654 | "language_info": { 655 | "codemirror_mode": { 656 | "name": "ipython", 657 | "version": 3 658 | }, 659 | "file_extension": ".py", 660 | "mimetype": "text/x-python", 661 | "name": "python", 662 | "nbconvert_exporter": "python", 663 | "pygments_lexer": "ipython3", 664 | "version": "3.7.10" 665 | } 666 | }, 667 | "nbformat": 4, 668 | "nbformat_minor": 2 669 | } 670 | -------------------------------------------------------------------------------- /solutions/4 Deep Learning Intro Exercises Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Deep Learning Intro" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "%matplotlib inline\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import pandas as pd\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Exercise 1" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "source": [ 35 | "The [Pima Indians dataset](https://archive.ics.uci.edu/ml/datasets/diabetes) is a very famous dataset distributed by UCI and originally collected from the National Institute of Diabetes and Digestive and Kidney Diseases. It contains data from clinical exams for women aged 21 and above of Pima Indian origin. 
The objective is to predict, based on diagnostic measurements, whether a patient has diabetes.\n", 36 | "\n", 37 | "It has the following features:\n", 38 | "\n", 39 | "- Pregnancies: Number of times pregnant\n", 40 | "- Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test\n", 41 | "- BloodPressure: Diastolic blood pressure (mm Hg)\n", 42 | "- SkinThickness: Triceps skin fold thickness (mm)\n", 43 | "- Insulin: 2-Hour serum insulin (mu U/ml)\n", 44 | "- BMI: Body mass index (weight in kg/(height in m)^2)\n", 45 | "- DiabetesPedigreeFunction: Diabetes pedigree function\n", 46 | "- Age: Age (years)\n", 47 | "\n", 48 | "The last column is the outcome, and it is a binary variable.\n", 49 | "\n", 50 | "In this first exercise we will explore it through the following steps:\n", 51 | "\n", 52 | "1. Load the ../data/diabetes.csv dataset, use pandas to explore the range of each feature\n", 53 | "- For each feature draw a histogram. Bonus points if you draw all the histograms in the same figure.\n", 54 | "- Explore correlations of features with the outcome column. You can do this in several ways, for example using the `sns.pairplot` we used above or drawing a heatmap of the correlations.\n", 55 | "- Do features need standardization? If so, what standardization technique will you use? MinMax? Standard?\n", 56 | "- Prepare your final `X` and `y` variables to be used by a ML model. Make sure you define your target variable well. Will you need dummy columns?" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "df = pd.read_csv('../data/diabetes.csv')\n", 66 | "df.head()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "_ = df.hist(figsize=(12, 10))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import seaborn as sns" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "sns.pairplot(df, hue='Outcome');" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "sns.heatmap(df.corr(), annot=True)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "df.info()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "df.describe()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "from sklearn.preprocessing import StandardScaler" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "from tensorflow.keras.utils import to_categorical" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "sc = StandardScaler()\n", 148 | "X = sc.fit_transform(df.drop('Outcome', axis=1))\n", 149 | "y = df['Outcome'].values\n", 150 | "y_cat = to_categorical(y)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": 
[], 158 | "source": [ 159 | "X.shape" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "y_cat.shape" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "## Exercise 2" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "source": [ 184 | "Build a fully connected NN model that predicts diabetes. Follow these steps:\n", 185 | "\n", 186 | "1. Split your data in a train/test with a test size of 20% and a `random_state = 22`\n", 187 | "- define a sequential model with at least one inner layer. You will have to make choices for the following things:\n", 188 | " - what is the size of the input?\n", 189 | " - how many nodes will you use in each layer?\n", 190 | " - what is the size of the output?\n", 191 | " - what activation functions will you use in the inner layers?\n", 192 | " - what activation function will you use at output?\n", 193 | " - what loss function will you use?\n", 194 | " - what optimizer will you use?\n", 195 | "- fit your model on the training set, using a validation_split of 0.1\n", 196 | "- test your trained model on the test data from the train/test split\n", 197 | "- check the accuracy score, the confusion matrix and the classification report" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "X.shape" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "from sklearn.model_selection import train_test_split" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "X_train, X_test, y_train, y_test = train_test_split(X, y_cat,\n", 225 | " random_state=22,\n", 226 | " test_size=0.2)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "from tensorflow.keras.models import Sequential\n", 236 | "from tensorflow.keras.layers import Dense\n", 237 | "from tensorflow.keras.optimizers import Adam" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "model = Sequential()\n", 247 | "model.add(Dense(32, input_shape=(8,), activation='relu'))\n", 248 | "model.add(Dense(32, activation='relu'))\n", 249 | "model.add(Dense(2, activation='softmax'))\n", 250 | "model.compile(Adam(learning_rate=0.05),\n", 251 | " loss='categorical_crossentropy',\n", 252 | " metrics=['accuracy'])" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "model.summary()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "32*8 + 32" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "model.fit(X_train, y_train, epochs=20, verbose=2, validation_split=0.1)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "y_pred = model.predict(X_test)" 289 | 
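, "  # softmax probabilities for each class, shape (n_samples, 2)"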
] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "y_test_class = np.argmax(y_test, axis=1)\n", 298 | "y_pred_class = np.argmax(y_pred, axis=1)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "from sklearn.metrics import accuracy_score\n", 308 | "from sklearn.metrics import classification_report\n", 309 | "from sklearn.metrics import confusion_matrix" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "pd.Series(y_test_class).value_counts() / len(y_test_class)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "accuracy_score(y_test_class, y_pred_class)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "print(classification_report(y_test_class, y_pred_class))" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "confusion_matrix(y_test_class, y_pred_class)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "## Exercise 3\n", 353 | "Compare your work with the results presented in [this notebook](https://www.kaggle.com/sheshu/pima-data-visualisation-and-machine-learning). Are your Neural Network results better or worse than the results obtained by traditional Machine Learning techniques?\n", 354 | "\n", 355 | "- Try training a Support Vector Machine or a Random Forest model on the exact same train/test split. Is the performance better or worse?\n", 356 | "- Try restricting your features to only 4 features like in the suggested notebook. How does model performance change?" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "from sklearn.ensemble import RandomForestClassifier\n", 366 | "from sklearn.svm import SVC\n", 367 | "from sklearn.naive_bayes import GaussianNB\n", 368 | "\n", 369 | "for mod in [RandomForestClassifier(), SVC(), GaussianNB()]:\n", 370 | " mod.fit(X_train, y_train[:, 1])\n", 371 | " y_pred = mod.predict(X_test)\n", 372 | " print(\"=\"*80)\n", 373 | " print(mod)\n", 374 | " print(\"-\"*80)\n", 375 | " print(\"Accuracy score: {:0.3}\".format(accuracy_score(y_test_class,\n", 376 | " y_pred)))\n", 377 | " print(\"Confusion Matrix:\")\n", 378 | " print(confusion_matrix(y_test_class, y_pred))\n", 379 | " print()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "## Exercise 4\n", 387 | "\n", 388 | "[TensorFlow Playground](http://playground.tensorflow.org/) is a web-based neural network demo. It is really useful to develop an intuition about what happens when you change architecture, activation function or other parameters. Try playing with it for a few minutes. You don't need to understand the meaning of every knob and button in the page, just get a sense for what happens if you change something. 
In the next chapter we'll explore these things in more detail.\n" 389 | ] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "Python 3", 395 | "language": "python", 396 | "name": "python3" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.7.10" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 2 413 | } 414 | -------------------------------------------------------------------------------- /solutions/5 Gradient Descent Exercises Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gradient Descent" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "%matplotlib inline\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Exercise 1\n", 27 | "\n", 28 | "You've just been hired at a wine company and they would like you to help them build a model that predicts the quality of their wine based on several measurements. They give you a dataset with wine measurements.\n", 29 | "\n", 30 | "- Load the ../data/wines.csv into Pandas\n", 31 | "- Use the column called \"Class\" as target\n", 32 | "- Check how many classes there are in the target, and if necessary use dummy columns for a multi-class classification\n", 33 | "- Use all the other columns as features, check their range and distribution (using seaborn pairplot)\n", 34 | "- Rescale all the features using either MinMaxScaler or StandardScaler\n", 35 | "- Build a deep model with at least 1 hidden layer to classify the data\n", 36 | "- Choose the cost function: what will you use? Mean Squared Error? Binary Cross-Entropy? Categorical Cross-Entropy?\n", 37 | "- Choose an optimizer\n", 38 | "- Choose a value for the learning rate, you may want to try with several values\n", 39 | "- Choose a batch size\n", 40 | "- Train your model on all the data using a `validation_split=0.2`. 
Can you converge to 100% validation accuracy?\n", 41 | "- What's the minimum number of epochs to converge?\n", 42 | "- Repeat the training several times to verify how stable your results are" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df = pd.read_csv('../data/wines.csv')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df.head()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "y = df['Class']" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "y.value_counts()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "y_cat = pd.get_dummies(y)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "y_cat.head()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "X = df.drop('Class', axis=1)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "X.shape" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "import seaborn as sns" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "sns.pairplot(df, hue='Class')" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "from sklearn.preprocessing import StandardScaler" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "sc = StandardScaler()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "Xsc = sc.fit_transform(X)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "from tensorflow.keras.models import Sequential\n", 169 | "from tensorflow.keras.layers import Dense\n", 170 | "from tensorflow.keras.optimizers import SGD, Adam, Adadelta, RMSprop\n", 171 | "import tensorflow.keras.backend as K" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "K.clear_session()\n", 181 | "model = Sequential()\n", 182 | "model.add(Dense(5, input_shape=(13,),\n", 183 | " kernel_initializer='he_normal',\n", 184 | " activation='relu'))\n", 185 | "model.add(Dense(3, activation='softmax'))\n", 186 | "\n", 187 | "model.compile(RMSprop(learning_rate=0.1),\n", 188 | " 'categorical_crossentropy',\n", 189 | " metrics=['accuracy'])\n", 190 | "\n", 191 | "model.fit(Xsc, y_cat.values,\n", 192 | " batch_size=8,\n", 193 | " epochs=10,\n", 194 | " verbose=1,\n", 195 | " validation_split=0.2)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 
201 | "source": [ 202 | "### Exercise 2\n", 203 | "\n", 204 | "Since this dataset has 13 features we can only visualize pairs of features like we did in the Paired plot. We could however exploit the fact that a neural network is a function to extract 2 high level features to represent our data.\n", 205 | "\n", 206 | "- Build a deep fully connected network with the following structure:\n", 207 | " - Layer 1: 8 nodes\n", 208 | " - Layer 2: 5 nodes\n", 209 | " - Layer 3: 2 nodes\n", 210 | " - Output : 3 nodes\n", 211 | "- Choose activation functions, inizializations, optimizer and learning rate so that it converges to 100% accuracy within 20 epochs (not easy)\n", 212 | "- Remember to train the model on the scaled data\n", 213 | "- Define a Feature Function like we did above between the input of the 1st layer and the output of the 3rd layer\n", 214 | "- Calculate the features and plot them on a 2-dimensional scatter plot\n", 215 | "- Can we distinguish the 3 classes well?\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "K.clear_session()\n", 225 | "model = Sequential()\n", 226 | "model.add(Dense(8, input_shape=(13,),\n", 227 | " kernel_initializer='he_normal', activation='tanh'))\n", 228 | "model.add(Dense(5, kernel_initializer='he_normal', activation='tanh'))\n", 229 | "model.add(Dense(2, kernel_initializer='he_normal', activation='tanh'))\n", 230 | "model.add(Dense(3, activation='softmax'))\n", 231 | "\n", 232 | "model.compile(RMSprop(learning_rate=0.05),\n", 233 | " 'categorical_crossentropy',\n", 234 | " metrics=['accuracy'])\n", 235 | "\n", 236 | "model.fit(Xsc, y_cat.values,\n", 237 | " batch_size=16,\n", 238 | " epochs=20,\n", 239 | " verbose=1)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "model.summary()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "inp = model.layers[0].input\n", 258 | "out = model.layers[2].output" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "features_function = K.function([inp], [out])" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "features = features_function([Xsc])[0]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "features.shape" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "plt.scatter(features[:, 0], features[:, 1], c=y)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "### Exercise 3\n", 302 | "\n", 303 | "Keras functional API. So far we've always used the Sequential model API in Keras. However, Keras also offers a Functional API, which is much more powerful. You can find its [documentation here](https://keras.io/getting-started/functional-api-guide/). 
Let's see how we can leverage it.\n", 304 | "\n", 305 | "- define an input layer called `inputs`\n", 306 | "- define two hidden layers as before, one with 8 nodes, one with 5 nodes\n", 307 | "- define a `second_to_last` layer with 2 nodes\n", 308 | "- define an output layer with 3 nodes\n", 309 | "- create a model that connects the input and output\n", 310 | "- train it and make sure that it converges\n", 311 | "- define a function between inputs and second_to_last layer\n", 312 | "- recalculate the features and plot them" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "from tensorflow.keras.layers import Input\n", 322 | "from tensorflow.keras.models import Model" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "K.clear_session()\n", 332 | "\n", 333 | "inputs = Input(shape=(13,))\n", 334 | "x = Dense(8, kernel_initializer='he_normal', activation='tanh')(inputs)\n", 335 | "x = Dense(5, kernel_initializer='he_normal', activation='tanh')(x)\n", 336 | "second_to_last = Dense(2, kernel_initializer='he_normal',\n", 337 | " activation='tanh')(x)\n", 338 | "outputs = Dense(3, activation='softmax')(second_to_last)\n", 339 | "\n", 340 | "model = Model(inputs=inputs, outputs=outputs)\n", 341 | "\n", 342 | "model.compile(RMSprop(learning_rate=0.05),\n", 343 | " 'categorical_crossentropy',\n", 344 | " metrics=['accuracy'])\n", 345 | "\n", 346 | "model.fit(Xsc, y_cat.values, batch_size=16, epochs=20, verbose=1)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "features_function = K.function([inputs], [second_to_last])" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "features = features_function([Xsc])[0]" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "plt.scatter(features[:, 0], features[:, 1], c=y)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "## Exercise 4 \n", 381 | "\n", 382 | "Keras offers the possibility to call a function at each epoch. These are Callbacks, and their [documentation is here](https://keras.io/callbacks/). Callbacks allow us to add some neat functionality. 
In this exercise we'll explore a few of them.\n", 383 | "\n", 384 | "- Split the data into train and test sets with a test_size = 0.3 and random_state=42\n", 385 | "- Reset and recompile your model\n", 386 | "- train the model on the train data using `validation_data=(X_test, y_test)`\n", 387 | "- Use the `EarlyStopping` callback to stop your training if the `val_loss` doesn't improve\n", 388 | "- Use the `ModelCheckpoint` callback to save the best model to disk while training progresses\n", 389 | "- Use the `TensorBoard` callback to output your training information to a `/tmp/` subdirectory\n", 390 | "- Watch the next video for an overview of TensorBoard" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "checkpointer = ModelCheckpoint(filepath=\"/tmp/udemy/weights.hdf5\",\n", 409 | " verbose=1, save_best_only=True)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "earlystopper = EarlyStopping(monitor='val_loss', min_delta=0,\n", 419 | " patience=1, verbose=1, mode='auto')" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "tensorboard = TensorBoard(log_dir='/tmp/udemy/tensorboard/')" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "from sklearn.model_selection import train_test_split" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "X_train, X_test, y_train, y_test = train_test_split(Xsc, y_cat.values,\n", 447 | " test_size=0.3,\n", 448 | " random_state=42)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "K.clear_session()\n", 458 | "\n", 459 | "inputs = Input(shape=(13,))\n", 460 | "\n", 461 | "x = Dense(8, kernel_initializer='he_normal', activation='tanh')(inputs)\n", 462 | "x = Dense(5, kernel_initializer='he_normal', activation='tanh')(x)\n", 463 | "second_to_last = Dense(2, kernel_initializer='he_normal',\n", 464 | " activation='tanh')(x)\n", 465 | "outputs = Dense(3, activation='softmax')(second_to_last)\n", 466 | "\n", 467 | "model = Model(inputs=inputs, outputs=outputs)\n", 468 | "\n", 469 | "model.compile(RMSprop(learning_rate=0.05), 'categorical_crossentropy',\n", 470 | " metrics=['accuracy'])\n", 471 | "\n", 472 | "model.fit(X_train, y_train, batch_size=32,\n", 473 | " epochs=20, verbose=2,\n", 474 | " validation_data=(X_test, y_test),\n", 475 | " callbacks=[checkpointer, earlystopper, tensorboard])" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "Run TensorBoard with the command:\n", 483 | "\n", 484 | "    tensorboard --logdir /tmp/udemy/tensorboard/\n", 485 | "    \n", 486 | "and open your browser at http://localhost:6006" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [] 495 | } 496 | ], 497 | 
"metadata": { 498 | "kernelspec": { 499 | "display_name": "Python 3", 500 | "language": "python", 501 | "name": "python3" 502 | }, 503 | "language_info": { 504 | "codemirror_mode": { 505 | "name": "ipython", 506 | "version": 3 507 | }, 508 | "file_extension": ".py", 509 | "mimetype": "text/x-python", 510 | "name": "python", 511 | "nbconvert_exporter": "python", 512 | "pygments_lexer": "ipython3", 513 | "version": "3.7.10" 514 | } 515 | }, 516 | "nbformat": 4, 517 | "nbformat_minor": 2 518 | } 519 | -------------------------------------------------------------------------------- /solutions/6 Convolutional Neural Networks Exercises Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Convolutional Neural Networks Exercises Solution" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "%matplotlib inline\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from tensorflow.keras.utils import to_categorical" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from tensorflow.keras.models import Sequential\n", 38 | "from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten\n", 39 | "import tensorflow.keras.backend as K" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "source": [ 48 | "### Exercise 1\n", 49 | "\n", 50 | "You've been hired by a shipping company to overhaul the way they route mail, parcels and packages. They want to build an image recognition system capable of recognizing the digits in the zipcode on a package, so that it can be automatically routed to the correct location.\n", 51 | "You are tasked to build the digit recognition system. Luckily, you can rely on the MNIST dataset for the intial training of your model!\n", 52 | "\n", 53 | "Build a deep convolutional neural network with at least two convolutional and two pooling layers before the fully connected layer.\n", 54 | "\n", 55 | "- Start from the network we have just built\n", 56 | "- Insert a `Conv2D` layer after the first `MaxPool2D`, give it 64 filters.\n", 57 | "- Insert a `MaxPool2D` after that one\n", 58 | "- Insert an `Activation` layer\n", 59 | "- retrain the model\n", 60 | "- does performance improve?\n", 61 | "- how many parameters does this new model have? More or less than the previous model? Why?\n", 62 | "- how long did this second model take to train? Longer or shorter than the previous model? Why?\n", 63 | "- did it perform better or worse than the previous model?" 
64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from tensorflow.keras.datasets import mnist" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "(X_train, y_train), (X_test, y_test) = mnist.load_data('/tmp/mnist.npz')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "X_train.shape" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "X_train = X_train.astype('float32') / 255.0\n", 100 | "X_test = X_test.astype('float32') / 255.0\n", 101 | "\n", 102 | "X_train = X_train.reshape(-1, 28, 28, 1)\n", 103 | "X_test = X_test.reshape(-1, 28, 28, 1)\n", 104 | "\n", 105 | "y_train_cat = to_categorical(y_train, 10)\n", 106 | "y_test_cat = to_categorical(y_test, 10)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "K.clear_session()\n", 116 | "\n", 117 | "model = Sequential()\n", 118 | "\n", 119 | "model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)))\n", 120 | "model.add(MaxPool2D(pool_size=(2, 2)))\n", 121 | "\n", 122 | "model.add(Conv2D(64, (3, 3), activation='relu'))\n", 123 | "model.add(MaxPool2D(pool_size=(2, 2)))\n", 124 | "\n", 125 | "model.add(Flatten())\n", 126 | "\n", 127 | "model.add(Dense(128, activation='relu'))\n", 128 | "\n", 129 | "model.add(Dense(10, activation='softmax'))\n", 130 | "\n", 131 | "model.compile(loss='categorical_crossentropy',\n", 132 | " optimizer='rmsprop',\n", 133 | " metrics=['accuracy'])" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "model.summary()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "model.fit(X_train, y_train_cat, batch_size=128,\n", 152 | " epochs=2, verbose=1, validation_split=0.3)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "model.evaluate(X_test, y_test_cat)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### Exercise 2\n", 169 | "\n", 170 | "Pleased with your performance on the digit recognition task, your boss decides to challenge you with a harder task. Their online branch allows people to upload images to a website that generates and prints a postcard that is shipped to its destination. Your boss would like to know what images people are loading on the site in order to provide targeted advertising on the same page, so he asks you to build an image recognition system capable of recognizing a few objects. Luckily for you, there's a dataset ready-made with a collection of labeled images. 
This is the [CIFAR-10 dataset](http://www.cs.toronto.edu/~kriz/cifar.html), a very famous dataset that contains images for 10 different categories:\n", 171 | "\n", 172 | "- airplane\n", 173 | "- automobile\n", 174 | "- bird\n", 175 | "- cat\n", 176 | "- deer\n", 177 | "- dog\n", 178 | "- frog\n", 179 | "- horse\n", 180 | "- ship\n", 181 | "- truck\n", 182 | "\n", 183 | "In this exercise we will reach the limit of what you can achieve on your laptop and get ready for the next session on cloud GPUs.\n", 184 | "\n", 185 | "Here's what you have to do:\n", 186 | "- load the cifar10 dataset using `keras.datasets.cifar10.load_data()`\n", 187 | "- display a few images, see how hard/easy it is for you to recognize an object with such low resolution\n", 188 | "- check the shape of X_train, does it need reshape?\n", 189 | "- check the scale of X_train, does it need rescaling?\n", 190 | "- check the shape of y_train, does it need reshape?\n", 191 | "- build a model with the following architecture, and choose the parameters and activation functions for each of the layers:\n", 192 | " - conv2d\n", 193 | " - conv2d\n", 194 | " - maxpool\n", 195 | " - conv2d\n", 196 | " - conv2d\n", 197 | " - maxpool\n", 198 | " - flatten\n", 199 | " - dense\n", 200 | " - output\n", 201 | "- compile the model and check the number of parameters\n", 202 | "- attempt to train the model with the optimizer of your choice. How fast does training proceed?\n", 203 | "- If training is too slow (as expected) stop the execution and move to the next session!" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "from tensorflow.keras.datasets import cifar10" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "(X_train, y_train), (X_test, y_test) = cifar10.load_data()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "X_train.shape" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "plt.imshow(X_train[1])" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "X_train = X_train.astype('float32') / 255.0\n", 249 | "X_test = X_test.astype('float32') / 255.0" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "y_train.shape" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "y_train_cat = to_categorical(y_train, 10)\n", 268 | "y_test_cat = to_categorical(y_test, 10)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "y_train_cat.shape" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "model = Sequential()\n", 287 | "model.add(Conv2D(32, (3, 3),\n", 288 | " padding='same',\n", 
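" # 'same' padding keeps the 32x32 spatial size through this convolution\n",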
289 | " input_shape=(32, 32, 3),\n", 290 | " activation='relu'))\n", 291 | "model.add(Conv2D(32, (3, 3), activation='relu'))\n", 292 | "model.add(MaxPool2D(pool_size=(2, 2)))\n", 293 | "\n", 294 | "model.add(Conv2D(64, (3, 3), padding='same', activation='relu'))\n", 295 | "model.add(Conv2D(64, (3, 3), activation='relu'))\n", 296 | "model.add(MaxPool2D(pool_size=(2, 2)))\n", 297 | "\n", 298 | "model.add(Flatten())\n", 299 | "model.add(Dense(512, activation='relu'))\n", 300 | "model.add(Dense(10, activation='softmax'))" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "model.compile(loss='categorical_crossentropy',\n", 310 | " optimizer='rmsprop',\n", 311 | " metrics=['accuracy'])" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "model.summary()" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "model.fit(X_train, y_train_cat,\n", 330 | " batch_size=32,\n", 331 | " epochs=2,\n", 332 | " validation_data=(X_test, y_test_cat),\n", 333 | " shuffle=True)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [] 342 | } 343 | ], 344 | "metadata": { 345 | "kernelspec": { 346 | "display_name": "Python 3", 347 | "language": "python", 348 | "name": "python3" 349 | }, 350 | "language_info": { 351 | "codemirror_mode": { 352 | "name": "ipython", 353 | "version": 3 354 | }, 355 | "file_extension": ".py", 356 | "mimetype": "text/x-python", 357 | "name": "python", 358 | "nbconvert_exporter": "python", 359 | "pygments_lexer": "ipython3", 360 | "version": "3.7.10" 361 | } 362 | }, 363 | "nbformat": 4, 364 | "nbformat_minor": 2 365 | } 366 | -------------------------------------------------------------------------------- /solutions/8 Recurrent Neural Networks Exercises Solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Recurrent Neural Networks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "%matplotlib inline\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Time series forecasting" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "from pandas.tseries.offsets import MonthEnd\n", 36 | "\n", 37 | "df = pd.read_csv('../data/cansim-0800020-eng-6674700030567901031.csv',\n", 38 | " skiprows=6, skipfooter=9,\n", 39 | " engine='python')\n", 40 | "\n", 41 | "df['Adjustments'] = pd.to_datetime(df['Adjustments']) + MonthEnd(1)\n", 42 | "df = df.set_index('Adjustments')\n", 43 | "df.head()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "split_date = pd.Timestamp('01-01-2011')" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "train = df.loc[:split_date, ['Unadjusted']]\n", 62 | 
"test = df.loc[split_date:, ['Unadjusted']]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "from sklearn.preprocessing import MinMaxScaler\n", 72 | "\n", 73 | "sc = MinMaxScaler()\n", 74 | "\n", 75 | "train_sc = sc.fit_transform(train)\n", 76 | "test_sc = sc.transform(test)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "train_sc_df = pd.DataFrame(train_sc, columns=['Scaled'], index=train.index)\n", 86 | "test_sc_df = pd.DataFrame(test_sc, columns=['Scaled'], index=test.index)\n", 87 | "\n", 88 | "for s in range(1, 13):\n", 89 | " train_sc_df['shift_{}'.format(s)] = train_sc_df['Scaled'].shift(s)\n", 90 | " test_sc_df['shift_{}'.format(s)] = test_sc_df['Scaled'].shift(s)\n", 91 | "\n", 92 | "X_train = train_sc_df.dropna().drop('Scaled', axis=1)\n", 93 | "y_train = train_sc_df.dropna()[['Scaled']]\n", 94 | "\n", 95 | "X_test = test_sc_df.dropna().drop('Scaled', axis=1)\n", 96 | "y_test = test_sc_df.dropna()[['Scaled']]\n", 97 | "\n", 98 | "X_train = X_train.values\n", 99 | "X_test= X_test.values\n", 100 | "\n", 101 | "y_train = y_train.values\n", 102 | "y_test = y_test.values" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "X_train.shape" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## Exercise 1\n", 119 | "\n", 120 | "In the model above we reshaped the input shape to: `(num_samples, 1, 12)`, i.e. we treated a window of 12 months as a vector of 12 coordinates that we simultaneously passed to all the LSTM nodes. An alternative way to look at the problem is to reshape the input to `(num_samples, 12, 1)`. This means we consider each input window as a sequence of 12 values that we will pass in sequence to the LSTM. In principle this looks like a more accurate description of our situation. But does it yield better predictions? Let's check it.\n", 121 | "\n", 122 | "- Reshape `X_train` and `X_test` so that they represent a set of univariate sequences\n", 123 | "- retrain the same LSTM(6) model, you'll have to adapt the `input_shape`\n", 124 | "- check the performance of this new model, is it better at predicting the test data?" 
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "X_train_t = X_train.reshape(X_train.shape[0], 12, 1)\n", 134 | "X_test_t = X_test.reshape(X_test.shape[0], 12, 1)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "X_train_t.shape" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from tensorflow.keras.models import Sequential\n", 153 | "from tensorflow.keras.layers import LSTM, Dense\n", 154 | "import tensorflow.keras.backend as K\n", 155 | "from tensorflow.keras.callbacks import EarlyStopping" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "K.clear_session()\n", 165 | "model = Sequential()\n", 166 | "\n", 167 | "model.add(LSTM(6, input_shape=(12, 1)))\n", 168 | "\n", 169 | "model.add(Dense(1))\n", 170 | "\n", 171 | "model.compile(loss='mean_squared_error', optimizer='adam')" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "model.summary()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "early_stop = EarlyStopping(monitor='loss', patience=1, verbose=1)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "model.fit(X_train_t, y_train, epochs=600,\n", 199 | " batch_size=32, verbose=0, callbacks=[early_stop])" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "y_pred = model.predict(X_test_t)\n", 209 | "plt.plot(y_test)\n", 210 | "plt.plot(y_pred)" 211 | ] 212 | },
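{ "cell_type": "markdown", "metadata": {}, "source": [ "*Added sketch (not in the original solution):* to answer \"is it better at predicting the test data?\" with a number rather than a plot alone, one can compute the mean squared error on the (scaled) test set and compare it with the value obtained from the `(num_samples, 1, 12)` model." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# possible check: test MSE on the scaled values (same units as the training loss)\n", "test_mse = ((y_test - y_pred) ** 2).mean()\n", "print('Test MSE: {:.5f}'.format(test_mse))" ] },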
213 | { 214 | "cell_type": "markdown", 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "source": [ 219 | "## Exercise 2\n", 220 | "\n", 221 | "RNN models can be applied to images too. In general we can apply them to any data where there's a connection between nearby units. Let's see how we can easily build a model that works with images.\n", 222 | "\n", 223 | "- Load the MNIST data; by now you should be able to do it blindfolded :)\n", 224 | "- reshape it so that an image looks like a long sequence of pixels\n", 225 | "- create a recurrent model and train it on the training data\n", 226 | "- how does it perform compared to a fully connected network? How does it compare to Convolutional Neural Networks?\n", 227 | "\n", 228 | "(feel free to run this exercise on a cloud GPU if it's too slow on your laptop)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from tensorflow.keras.datasets import mnist\n", 238 | "from tensorflow.keras.utils import to_categorical" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "(X_train, y_train), (X_test, y_test) = mnist.load_data()\n", 248 | "X_train = X_train.astype('float32') / 255.0\n", 249 | "X_test = X_test.astype('float32') / 255.0\n", 250 | "y_train_cat = to_categorical(y_train, 10)\n", 251 | "y_test_cat = to_categorical(y_test, 10)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "X_train = X_train.reshape(X_train.shape[0], -1, 1)\n", 261 | "X_test = X_test.reshape(X_test.shape[0], -1, 1)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "print(X_train.shape)\n", 271 | "print(X_test.shape)\n", 272 | "print(y_train_cat.shape)\n", 273 | "print(y_test_cat.shape)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "# define the model\n", 283 | "K.clear_session()\n", 284 | "model = Sequential()\n", 285 | "model.add(LSTM(32, input_shape=X_train.shape[1:]))\n", 286 | "model.add(Dense(10, activation='softmax'))\n", 287 | "\n", 288 | "# compile the model\n", 289 | "model.compile(loss='categorical_crossentropy',\n", 290 | " optimizer='rmsprop',\n", 291 | " metrics=['accuracy'])\n", 292 | "\n", 293 | "model.fit(X_train, y_train_cat,\n", 294 | " batch_size=32,\n", 295 | " epochs=100,\n", 296 | " validation_split=0.3,\n", 297 | " shuffle=True,\n", 298 | " verbose=2,\n", 299 | " )\n", 300 | "\n", 301 | "model.evaluate(X_test, y_test_cat)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [] 310 | } 311 | ], 312 | "metadata": { 313 | "kernelspec": { 314 | "display_name": "Python 3", 315 | "language": "python", 316 | "name": "python3" 317 | }, 318 | "language_info": { 319 | "codemirror_mode": { 320 | "name": "ipython", 321 | "version": 3 322 | }, 323 | "file_extension": ".py", 324 | "mimetype": "text/x-python", 325 | "name": "python", 326 | "nbconvert_exporter": "python", 327 | "pygments_lexer": "ipython3", 328 | "version": "3.7.10" 329 | } 330 | }, 331 | "nbformat": 4, 332 | "nbformat_minor": 2 333 | } 334 | -------------------------------------------------------------------------------- /solutions/9 Improving performance Exercises Solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 9 Improving performance Exercises Solutions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "%matplotlib inline\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## 
Exercise 1\n", 27 | "\n", 28 | "- Reload the IMDB data keeping only the first 20000 most common words\n", 29 | "- pad the reviews to a shorter length (eg. 70 or 80), this time make sure you keep the first part of the review if it's longer than the maximum length\n", 30 | "- re run the model (remember to set max_features correctly)\n", 31 | "- does it train faster this time?\n", 32 | "- do you get a better performance?" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from tensorflow.keras.datasets import imdb\n", 42 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 43 | "from tensorflow.keras.models import Sequential\n", 44 | "from tensorflow.keras.layers import Embedding, LSTM, Dense" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "max_features = 20000\n", 54 | "skip_top = 200" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "(X_train, y_train), (X_test, y_test) = imdb.load_data('/tmp/imdb.npz',\n", 64 | " num_words=max_features,\n", 65 | " start_char=1,\n", 66 | " oov_char=2,\n", 67 | " index_from=3)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "X_train.shape" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "maxlen = 80" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "X_train_pad = pad_sequences(X_train, maxlen=maxlen, truncating='post')\n", 95 | "X_test_pad = pad_sequences(X_test, maxlen=maxlen, truncating='post')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "model = Sequential()\n", 105 | "model.add(Embedding(max_features, 128))\n", 106 | "model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))\n", 107 | "model.add(Dense(1, activation='sigmoid'))\n", 108 | "\n", 109 | "model.compile(loss='binary_crossentropy',\n", 110 | " optimizer='adam',\n", 111 | " metrics=['accuracy'])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "X_train[0]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "model.fit(X_train_pad, y_train,\n", 130 | " batch_size=32,\n", 131 | " epochs=2,\n", 132 | " validation_split=0.3)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "score, acc = model.evaluate(X_test_pad, y_test)\n", 142 | "print('Test score:', score)\n", 143 | "print('Test accuracy:', acc)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Exercise 2\n", 151 | "\n", 152 | "- Reload the digits data as above\n", 153 | "- define a function repeated_training_reg_dropout that adds regularization and dropout to a fully connected network\n", 154 | "- compare the performance with/witouth dropout and regularization like we did for batch normalization\n", 155 | "- do you get a better 
performance?" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "from sklearn.datasets import load_digits\n", 165 | "from tensorflow.keras.utils import to_categorical\n", 166 | "from sklearn.model_selection import train_test_split\n", 167 | "from tensorflow.keras.layers import Dropout\n", 168 | "import tensorflow.keras.backend as K" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "digits = load_digits()\n", 178 | "X, y = digits.data, digits.target\n", 179 | "y_cat = to_categorical(y)\n", 180 | "\n", 181 | "X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.3)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "def repeated_training_reg_dropout(X_train,\n", 191 | " y_train,\n", 192 | " X_test,\n", 193 | " y_test,\n", 194 | " units=512,\n", 195 | " activation='sigmoid',\n", 196 | " optimizer='sgd',\n", 197 | " do_dropout=False,\n", 198 | " rate=0.3,\n", 199 | " kernel_regularizer='l2',\n", 200 | " epochs=10,\n", 201 | " repeats=3):\n", 202 | " histories = []\n", 203 | " \n", 204 | " for repeat in range(repeats):\n", 205 | " K.clear_session()\n", 206 | "\n", 207 | " model = Sequential()\n", 208 | " \n", 209 | " # first fully connected layer\n", 210 | " model.add(Dense(units,\n", 211 | " input_shape=X_train.shape[1:],\n", 212 | " kernel_initializer='normal',\n", 213 | " kernel_regularizer=kernel_regularizer,\n", 214 | " activation=activation))\n", 215 | " if do_dropout:\n", 216 | " model.add(Dropout(rate))\n", 217 | "\n", 218 | " # second fully connected layer\n", 219 | " model.add(Dense(units,\n", 220 | " kernel_initializer='normal',\n", 221 | " kernel_regularizer=kernel_regularizer,\n", 222 | " activation=activation))\n", 223 | " if do_dropout:\n", 224 | " model.add(Dropout(rate))\n", 225 | "\n", 226 | " # third fully connected layer\n", 227 | " model.add(Dense(units,\n", 228 | " kernel_initializer='normal',\n", 229 | " kernel_regularizer=kernel_regularizer,\n", 230 | " activation=activation))\n", 231 | " if do_dropout:\n", 232 | " model.add(Dropout(rate))\n", 233 | "\n", 234 | " # output layer\n", 235 | " model.add(Dense(10, activation='softmax'))\n", 236 | " \n", 237 | " model.compile(optimizer,\n", 238 | " 'categorical_crossentropy',\n", 239 | " metrics=['accuracy'])\n", 240 | "\n", 241 | " h = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, verbose=0)\n", 242 | " histories.append([h.history['accuracy'], h.history['val_accuracy']])\n", 243 | " print(repeat, end=' ')\n", 244 | "\n", 245 | " histories = np.array(histories)\n", 246 | " \n", 247 | " # calculate mean and standard deviation across repeats:\n", 248 | " mean_acc = histories.mean(axis=0)\n", 249 | " std_acc = histories.std(axis=0)\n", 250 | " print()\n", 251 | " \n", 252 | " return mean_acc[0], std_acc[0], mean_acc[1], std_acc[1]" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "mean_acc, std_acc, mean_acc_val, std_acc_val = repeated_training_reg_dropout(X_train,\n", 262 | " y_train,\n", 263 | " X_test,\n", 264 | " y_test,\n", 265 | " do_dropout=False)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | 
"source": [ 274 | "mean_acc_do, std_acc_do, mean_acc_val_do, std_acc_val_do = repeated_training_reg_dropout(X_train,\n", 275 | " y_train,\n", 276 | " X_test,\n", 277 | " y_test,\n", 278 | " do_dropout=True)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "def plot_mean_std(m, s):\n", 288 | " plt.plot(m)\n", 289 | " plt.fill_between(range(len(m)), m-s, m+s, alpha=0.1)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "plot_mean_std(mean_acc, std_acc)\n", 299 | "plot_mean_std(mean_acc_val, std_acc_val)\n", 300 | "plot_mean_std(mean_acc_do, std_acc_do)\n", 301 | "plot_mean_std(mean_acc_val_do, std_acc_val_do)\n", 302 | "plt.ylim(0, 1.01)\n", 303 | "plt.title(\"Dropout and Regularization Accuracy\")\n", 304 | "plt.xlabel('Epochs')\n", 305 | "plt.ylabel('Accuracy')\n", 306 | "plt.legend(['Train', 'Test', 'Train with Dropout and Regularization', 'Test with Dropout and Regularization'], loc='best')" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "## Exercise 3\n", 314 | "\n", 315 | "This is a very long and complex exercise, that should give you an idea of a real world scenario. Feel free to look at the solution if you feel lost. Also, feel free to run this with a GPU, in which case you don't need to download the data.\n", 316 | "\n", 317 | "If you are running this locally, download and unpack the male/female pictures from [here](https://www.dropbox.com/s/nov493om2jmh2gp/male_female.tgz?dl=0). These images and labels were obtained from [Crowdflower](https://www.crowdflower.com/data-for-everyone/).\n", 318 | "\n", 319 | "Your goal is to build an image classifier that will recognize the gender of a person from pictures.\n", 320 | "\n", 321 | "- Have a look at the directory structure and inspect a couple of pictures\n", 322 | "- Design a model that will take a color image of size 64x64 as input and return a binary output (female=0/male=1)\n", 323 | "- Feel free to introduce any regularization technique in your model (Dropout, Batch Normalization, Weight Regularization)\n", 324 | "- Compile your model with an optimizer of your choice\n", 325 | "- Using `ImageDataGenerator`, define a train generator that will augment your images with some geometric transformations. Feel free to choose the parameters that make sense to you.\n", 326 | "- Define also a test generator, whose only purpose is to rescale the pixels by 1./255\n", 327 | "- use the function `flow_from_directory` to generate batches from the train and test folders. Make sure you set the `target_size` to 64x64.\n", 328 | "- Use the `model.fit_generator` function to fit the model on the batches generated from the ImageDataGenerator. Since you are streaming and augmenting the data in real time you will have to decide how many batches make an epoch and how many epochs you want to run\n", 329 | "- Train your model (you should get to at least 85% accuracy)\n", 330 | "- Once you are satisfied with your training, check a few of the misclassified pictures. Are those sensible errors?" 
331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# If you are running this locally\n", 340 | "# uncomment the next 4 lines to download, extract and set the data path:\n", 341 | "# !wget 'https://www.dropbox.com/s/nov493om2jmh2gp/male_female.tgz?dl=1' -O ../data/male_female.tgz\n", 342 | "# data_path = '../data/male_female'\n", 343 | "# !mkdir -p {data_path}\n", 344 | "# !tar -xzvf ../data/male_female.tgz --directory {data_path}" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "from tensorflow.keras.layers import Conv2D\n", 354 | "from tensorflow.keras.layers import MaxPooling2D\n", 355 | "from tensorflow.keras.layers import Flatten\n", 356 | "from tensorflow.keras.layers import BatchNormalization\n", 357 | "from itertools import islice\n", 358 | "from tensorflow.keras.preprocessing.image import ImageDataGenerator" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "K.clear_session()\n", 368 | "\n", 369 | "model = Sequential()\n", 370 | "model.add(Conv2D(32, (3, 3), input_shape = (64, 64, 3), activation = 'relu'))\n", 371 | "model.add(MaxPooling2D(pool_size = (2, 2)))\n", 372 | "model.add(BatchNormalization())\n", 373 | "\n", 374 | "model.add(Conv2D(64, (3, 3), activation = 'relu'))\n", 375 | "model.add(MaxPooling2D(pool_size = (2, 2)))\n", 376 | "model.add(BatchNormalization())\n", 377 | "\n", 378 | "model.add(Conv2D(64, (3, 3), activation = 'relu'))\n", 379 | "model.add(MaxPooling2D(pool_size = (2, 2)))\n", 380 | "model.add(BatchNormalization())\n", 381 | "\n", 382 | "model.add(Flatten())\n", 383 | "\n", 384 | "model.add(Dense(128, activation = 'relu'))\n", 385 | "model.add(Dense(1, activation = 'sigmoid'))" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "model.compile(optimizer = 'adam',\n", 395 | " loss = 'binary_crossentropy',\n", 396 | " metrics = ['accuracy'])\n", 397 | "\n", 398 | "model.summary()" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "train_gen = ImageDataGenerator(rescale = 1./255,\n", 408 | " width_shift_range=0.1,\n", 409 | " height_shift_range=0.1,\n", 410 | " rotation_range = 10,\n", 411 | " shear_range = 0.2,\n", 412 | " zoom_range = 0.2,\n", 413 | " horizontal_flip = True)\n", 414 | "\n", 415 | "test_gen = ImageDataGenerator(rescale = 1./255)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "train = train_gen.flow_from_directory(data_path + '/train',\n", 425 | " target_size = (64, 64),\n", 426 | " batch_size = 16,\n", 427 | " class_mode = 'binary')\n", 428 | "\n", 429 | "test = test_gen.flow_from_directory(data_path + '/test',\n", 430 | " target_size = (64, 64),\n", 431 | " batch_size = 16,\n", 432 | " class_mode = 'binary')" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "model.fit(train,\n", 442 | " steps_per_epoch = 800,\n", 443 | " epochs = 200,\n", 444 | " validation_data = test,\n", 445 | " validation_steps = 200)" 446 | ] 447 | }, 448 | { 449 | "cell_type": 
"code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "X_test = []\n", 455 | "y_test = []\n", 456 | "for ts in islice(test, 50):\n", 457 | " X_test.append(ts[0])\n", 458 | " y_test.append(ts[1])\n", 459 | "\n", 460 | "X_test = np.concatenate(X_test)\n", 461 | "y_test = np.concatenate(y_test)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "y_pred = model.predict_classes(X_test).ravel()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "np.argwhere(y_test != y_pred).ravel()" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "plt.imshow(X_test[14])" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [] 497 | } 498 | ], 499 | "metadata": { 500 | "kernelspec": { 501 | "display_name": "Python 3", 502 | "language": "python", 503 | "name": "python3" 504 | }, 505 | "language_info": { 506 | "codemirror_mode": { 507 | "name": "ipython", 508 | "version": 3 509 | }, 510 | "file_extension": ".py", 511 | "mimetype": "text/x-python", 512 | "name": "python", 513 | "nbconvert_exporter": "python", 514 | "pygments_lexer": "ipython3", 515 | "version": "3.7.10" 516 | } 517 | }, 518 | "nbformat": 4, 519 | "nbformat_minor": 2 520 | } 521 | -------------------------------------------------------------------------------- /tests/test_nb.py: -------------------------------------------------------------------------------- 1 | # tests that too long to execute on Travis are temporarily commented out 2 | # TODO: find a way to fix this 3 | 4 | import subprocess 5 | import tempfile 6 | 7 | 8 | def _exec_notebook(path): 9 | with tempfile.NamedTemporaryFile(suffix=".ipynb") as fout: 10 | args = ["jupyter", "nbconvert", "--to", "notebook", "--execute", 11 | "--ExecutePreprocessor.timeout=1000", 12 | "--output", fout.name, path] 13 | subprocess.check_call(args) 14 | 15 | 16 | def test_0(): 17 | _exec_notebook('course/0_Check_Environment.ipynb') 18 | 19 | 20 | def test_1(): 21 | _exec_notebook('course/1 First Deep Learning Model.ipynb') 22 | 23 | 24 | def test_2(): 25 | _exec_notebook('course/2 Data.ipynb') 26 | 27 | 28 | def test_3(): 29 | _exec_notebook('course/3 Machine Learning.ipynb') 30 | 31 | 32 | def test_4(): 33 | _exec_notebook('course/4 Deep Learning Intro.ipynb') 34 | 35 | 36 | def test_5(): 37 | _exec_notebook('course/5 Gradient Descent.ipynb') 38 | 39 | 40 | def test_6(): 41 | _exec_notebook('course/6 Convolutional Neural Networks.ipynb') 42 | 43 | 44 | def test_8(): 45 | _exec_notebook('course/8 Recurrent Neural Networks.ipynb') 46 | 47 | 48 | def test_9(): 49 | _exec_notebook('course/9 Improving performance.ipynb') 50 | 51 | 52 | def test_2_sol(): 53 | _exec_notebook('solutions/2 Data exploration Exercises Solution.ipynb') 54 | 55 | 56 | def test_3_sol(): 57 | _exec_notebook('solutions/3 Machine Learning Exercises Solution.ipynb') 58 | 59 | 60 | def test_4_sol(): 61 | _exec_notebook('solutions/4 Deep Learning Intro Exercises Solution.ipynb') 62 | 63 | 64 | def test_5_sol(): 65 | _exec_notebook('solutions/5 Gradient Descent Exercises Solution.ipynb') 66 | 67 | 68 | def test_6_sol(): 69 | _exec_notebook('solutions/6 Convolutional Neural Networks Exercises Solution.ipynb') 70 
491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [] 497 | } 498 | ], 499 | "metadata": { 500 | "kernelspec": { 501 | "display_name": "Python 3", 502 | "language": "python", 503 | "name": "python3" 504 | }, 505 | "language_info": { 506 | "codemirror_mode": { 507 | "name": "ipython", 508 | "version": 3 509 | }, 510 | "file_extension": ".py", 511 | "mimetype": "text/x-python", 512 | "name": "python", 513 | "nbconvert_exporter": "python", 514 | "pygments_lexer": "ipython3", 515 | "version": "3.7.10" 516 | } 517 | }, 518 | "nbformat": 4, 519 | "nbformat_minor": 2 520 | } 521 | -------------------------------------------------------------------------------- /tests/test_nb.py: -------------------------------------------------------------------------------- 1 | # tests that take too long to execute on Travis are temporarily commented out 2 | # TODO: find a way to fix this 3 | 4 | import subprocess 5 | import tempfile 6 | 7 | 8 | def _exec_notebook(path): 9 | with tempfile.NamedTemporaryFile(suffix=".ipynb") as fout: 10 | args = ["jupyter", "nbconvert", "--to", "notebook", "--execute", 11 | "--ExecutePreprocessor.timeout=1000", 12 | "--output", fout.name, path] 13 | subprocess.check_call(args) 14 | 15 | 16 | def test_0(): 17 | _exec_notebook('course/0_Check_Environment.ipynb') 18 | 19 | 20 | def test_1(): 21 | _exec_notebook('course/1 First Deep Learning Model.ipynb') 22 | 23 | 24 | def test_2(): 25 | _exec_notebook('course/2 Data.ipynb') 26 | 27 | 28 | def test_3(): 29 | _exec_notebook('course/3 Machine Learning.ipynb') 30 | 31 | 32 | def test_4(): 33 | _exec_notebook('course/4 Deep Learning Intro.ipynb') 34 | 35 | 36 | def test_5(): 37 | _exec_notebook('course/5 Gradient Descent.ipynb') 38 | 39 | 40 | def test_6(): 41 | _exec_notebook('course/6 Convolutional Neural Networks.ipynb') 42 | 43 | 44 | def test_8(): 45 | _exec_notebook('course/8 Recurrent Neural Networks.ipynb') 46 | 47 | 48 | def test_9(): 49 | _exec_notebook('course/9 Improving performance.ipynb') 50 | 51 | 52 | def test_2_sol(): 53 | _exec_notebook('solutions/2 Data exploration Exercises Solution.ipynb') 54 | 55 | 56 | def test_3_sol(): 57 | _exec_notebook('solutions/3 Machine Learning Exercises Solution.ipynb') 58 | 59 | 60 | def test_4_sol(): 61 | _exec_notebook('solutions/4 Deep Learning Intro Exercises Solution.ipynb') 62 | 63 | 64 | def test_5_sol(): 65 | _exec_notebook('solutions/5 Gradient Descent Exercises Solution.ipynb') 66 | 67 | 68 | def test_6_sol(): 69 | _exec_notebook('solutions/6 Convolutional Neural Networks Exercises Solution.ipynb') 70 | 71 | 72 | def test_8_sol(): 73 | _exec_notebook('solutions/8 Recurrent Neural Networks Exercises Solutions.ipynb') 74 | 75 | 76 | def test_9_sol(): 77 | _exec_notebook('solutions/9 Improving performance Exercises Solutions.ipynb') 78 | --------------------------------------------------------------------------------