├── .gitignore ├── LICENSE.md ├── README.md ├── assets ├── banner.png └── video_play.png ├── lab1 ├── PT_Part1_Intro.ipynb ├── PT_Part2_Music_Generation.ipynb ├── README.md ├── TF_Part1_Intro.ipynb ├── TF_Part2_Music_Generation.ipynb ├── img │ ├── add-graph.png │ ├── computation-graph-2.png │ ├── computation-graph.png │ ├── lab1ngram.png │ ├── lstm_inference.png │ ├── lstm_unrolled-01-01.png │ ├── lstm_unrolled-01.png │ ├── lstm_unrolled.png │ └── music_waveform.png └── solutions │ ├── PT_Part1_Intro_Solution.ipynb │ ├── PT_Part2_Music_Generation_Solution.ipynb │ ├── TF_Part1_Intro_Solution.ipynb │ └── TF_Part2_Music_Generation_Solution.ipynb ├── lab2 ├── PT_Part1_MNIST.ipynb ├── PT_Part2_Debiasing.ipynb ├── TF_Part1_MNIST.ipynb ├── TF_Part2_Debiasing.ipynb ├── img │ ├── DB-VAE.png │ ├── SS-VAE.png │ ├── convnet_fig.png │ ├── mnist_2layers_arch.png │ └── mnist_model.png └── solutions │ ├── PT_Part1_MNIST_Solution.ipynb │ ├── PT_Part2_Debiasing_Solution.ipynb │ ├── TF_Part1_MNIST_Solution.ipynb │ └── TF_Part2_Debiasing_Solution.ipynb ├── lab3 ├── LLM_Finetuning.ipynb ├── README.md ├── img │ └── yoda_wallpaper.jpg └── solutions │ └── LLM_Finetuning_Solution.ipynb ├── mitdeeplearning ├── __init__.py ├── bin │ └── abc2wav ├── data │ ├── faces │ │ ├── DF │ │ │ ├── 10.png │ │ │ ├── 19.png │ │ │ ├── 6.png │ │ │ ├── 7.png │ │ │ └── 9.png │ │ ├── DM │ │ │ ├── 20.png │ │ │ ├── 3.png │ │ │ ├── 5.png │ │ │ ├── 8.png │ │ │ └── 9.png │ │ ├── LF │ │ │ ├── 1.png │ │ │ ├── 11.png │ │ │ ├── 2.png │ │ │ ├── 4.png │ │ │ └── 8.png │ │ └── LM │ │ │ ├── 1.png │ │ │ ├── 11.png │ │ │ ├── 5.png │ │ │ ├── 8.png │ │ │ └── 9.png │ ├── irish.abc │ └── text_styles │ │ ├── leprechaun.txt │ │ └── yoda.txt ├── lab1.py ├── lab2.py ├── lab3.py ├── lab3_old.py └── util.py ├── setup.cfg ├── setup.py ├── test.py └── xtra_labs ├── llm_finetune ├── NOT_FINAL ├── benchmark.csv ├── draft.py ├── spider.png └── utils.py ├── rl_pong ├── RL.ipynb ├── img │ ├── COMING SOON │ └── vista_overview.png └── solutions │ └── RL_Solution.ipynb ├── rl_selfdriving ├── RL.ipynb ├── img │ ├── COMING SOON │ └── vista_overview.png └── solutions │ └── RL_Solution.ipynb └── uncertainty ├── Part1_IntroductionCapsa.ipynb └── Part2_BiasAndUncertainty.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .ipynb_checkpoints 3 | 4 | 5 | *.pyc 6 | *.h5 7 | lab2/logs/* 8 | 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 [MIT Introduction to Deep Learning](http://introtodeeplearning.com/) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![banner](assets/banner.png)](http://introtodeeplearning.com)
 2 | 
 3 | This repository contains all of the code and software labs for [MIT Introduction to Deep Learning](http://introtodeeplearning.com)! All lecture slides and videos are available on the program website.
 4 | 
 5 | # Instructions
 6 | MIT Introduction to Deep Learning software labs are designed to be completed at your own pace. At the end of each lab, there are instructions on how to submit your materials as part of the lab competitions, including what information must be submitted and in what format.
 7 | 
 8 | ## Opening the labs in Google Colaboratory
 9 | 
10 | The 2025 Introduction to Deep Learning labs run in Google's Colaboratory, a Jupyter notebook environment that runs entirely in the cloud, so you don't need to download anything. To run these labs, you must have a Google account.
11 | 
12 | On this GitHub repo, navigate to the lab folder you want to run (`lab1`, `lab2`, `lab3`) and open the appropriate Python notebook (\*.ipynb). Click the "Run in Colab" link at the top of the lab. That's it!
13 | 
14 | ## Running the labs
15 | To run the labs, open the Jupyter notebook in Colab. Navigate to the "Runtime" tab --> "Change runtime type". In the pop-up window, under "Runtime type" select "Python 3", and under "Hardware accelerator" select "GPU". Go through the notebooks and fill in the `#TODO` cells to get the code running!
16 | 
17 | 
18 | ### MIT Deep Learning package
19 | You might notice that inside the labs we install the `mitdeeplearning` Python package from the Python Package Index (PyPI):
20 | 
21 | `pip install mitdeeplearning`
22 | 
23 | This package contains convenience functions that we use throughout the course, and it can be imported like any other Python package.
24 | 
25 | `>>> import mitdeeplearning as mdl`
26 | 
27 | We do this for you in each of the labs, but the package is also open source under the same license, so you can also use it outside the class.
28 | 
29 | ## Lecture Videos
30 | 
31 | [![Lecture videos](assets/video_play.png)](https://www.youtube.com/watch?v=njKP3FqW3Sk&list=PLtBw6njQRU-rwp5__7C0oIVt26ZgjG9NI&index=1)
32 | 
33 | All lecture videos are available publicly online and linked above! Use and/or modification of lecture slides outside of MIT Introduction to Deep Learning must reference:
34 | 
35 | > © MIT Introduction to Deep Learning
36 | >
37 | > http://introtodeeplearning.com
38 | 
39 | ## License
40 | All code in this repository is copyright 2025 [MIT Introduction to Deep Learning](http://introtodeeplearning.com). All Rights Reserved.
41 | 
42 | Licensed under the MIT License. You may not use this file except in compliance with the License.
Use and/or modification of this code outside of MIT Introduction to Deep Learning must reference: 43 | 44 | > © MIT Introduction to Deep Learning 45 | > 46 | > http://introtodeeplearning.com 47 | -------------------------------------------------------------------------------- /assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/assets/banner.png -------------------------------------------------------------------------------- /assets/video_play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/assets/video_play.png -------------------------------------------------------------------------------- /lab1/PT_Part1_Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "WBk0ZDWY-ff8" 7 | }, 8 | "source": [ 9 | "\n", 10 | " \n", 13 | " \n", 15 | " \n", 17 | "
\n", 11 | " \n", 12 | " Visit MIT Deep Learning\n", 14 | " Run in Google Colab\n", 16 | " View Source on GitHub
\n", 18 | "\n", 19 | "# Copyright Information\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "id": "3eI6DUic-6jo" 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "# Copyright 2025 MIT Introduction to Deep Learning. All Rights Reserved.\n", 31 | "#\n", 32 | "# Licensed under the MIT License. You may not use this file except in compliance\n", 33 | "# with the License. Use and/or modification of this code outside of MIT Introduction\n", 34 | "# to Deep Learning must reference:\n", 35 | "#\n", 36 | "# © MIT Introduction to Deep Learning\n", 37 | "# http://introtodeeplearning.com\n", 38 | "#" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "57knM8jrYZ2t" 45 | }, 46 | "source": [ 47 | "# Lab 1: Intro to PyTorch and Music Generation with RNNs\n", 48 | "\n", 49 | "In this lab, you'll get exposure to using PyTorch and learn how it can be used for deep learning. Go through the code and run each cell. Along the way, you'll encounter several ***TODO*** blocks -- follow the instructions to fill them out before running those cells and continuing.\n", 50 | "\n", 51 | "\n", 52 | "# Part 1: Intro to PyTorch\n", 53 | "\n", 54 | "## 0.1 Install PyTorch\n", 55 | "\n", 56 | "[PyTorch](https://pytorch.org/) is a popular deep learning library known for its flexibility and ease of use. Here we'll learn how computations are represented and how to define a simple neural network in PyTorch. For all the labs in Introduction to Deep Learning 2025, there will be a PyTorch version available.\n", 57 | "\n", 58 | "Let's install PyTorch and a couple of dependencies." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "id": "LkaimNJfYZ2w" 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "import torch\n", 70 | "import torch.nn as nn\n", 71 | "\n", 72 | "# Download and import the MIT Introduction to Deep Learning package\n", 73 | "!pip install mitdeeplearning --quiet\n", 74 | "import mitdeeplearning as mdl\n", 75 | "\n", 76 | "import numpy as np\n", 77 | "import matplotlib.pyplot as plt" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "id": "2QNMcdP4m3Vs" 84 | }, 85 | "source": [ 86 | "## 1.1 What is PyTorch?\n", 87 | "\n", 88 | "PyTorch is a machine learning library, like TensorFlow. At its core, PyTorch provides an interface for creating and manipulating [tensors](https://pytorch.org/docs/stable/tensors.html), which are data structures that you can think of as multi-dimensional arrays. Tensors are represented as n-dimensional arrays of base datatypes such as a string or integer -- they provide a way to generalize vectors and matrices to higher dimensions. PyTorch provides the ability to perform computation on these tensors, define neural networks, and train them efficiently.\n", 89 | "\n", 90 | "The [```shape```](https://pytorch.org/docs/stable/generated/torch.Tensor.shape.html#torch.Tensor.shape) of a PyTorch tensor defines its number of dimensions and the size of each dimension. 
The `ndim` or [```dim```](https://pytorch.org/docs/stable/generated/torch.Tensor.dim.html#torch.Tensor.dim) of a PyTorch tensor provides the number of dimensions (n-dimensions) -- this is equivalent to the tensor's rank (as is used in TensorFlow), and you can also think of this as the tensor's order or degree.\n", 91 | "\n", 92 | "Let’s start by creating some tensors and inspecting their properties:\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "id": "tFxztZQInlAB" 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "integer = torch.tensor(1234)\n", 104 | "decimal = torch.tensor(3.14159265359)\n", 105 | "\n", 106 | "print(f\"`integer` is a {integer.ndim}-d Tensor: {integer}\")\n", 107 | "print(f\"`decimal` is a {decimal.ndim}-d Tensor: {decimal}\")\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "id": "-dljcPUcoJZ6" 114 | }, 115 | "source": [ 116 | "Vectors and lists can be used to create 1-d tensors:" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "id": "oaHXABe8oPcO" 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "fibonacci = torch.tensor([1, 1, 2, 3, 5, 8])\n", 128 | "count_to_100 = torch.tensor(range(100))\n", 129 | "\n", 130 | "print(f\"`fibonacci` is a {fibonacci.ndim}-d Tensor with shape: {fibonacci.shape}\")\n", 131 | "print(f\"`count_to_100` is a {count_to_100.ndim}-d Tensor with shape: {count_to_100.shape}\")\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": { 137 | "id": "gvffwkvtodLP" 138 | }, 139 | "source": [ 140 | "Next, let’s create 2-d (i.e., matrices) and higher-rank tensors. In image processing and computer vision, we will use 4-d Tensors with dimensions corresponding to batch size, number of color channels, image height, and image width." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "id": "tFeBBe1IouS3" 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "### Defining higher-order Tensors ###\n", 152 | "\n", 153 | "'''TODO: Define a 2-d Tensor'''\n", 154 | "matrix = # TODO\n", 155 | "\n", 156 | "assert isinstance(matrix, torch.Tensor), \"matrix must be a torch Tensor object\"\n", 157 | "assert matrix.ndim == 2\n", 158 | "\n", 159 | "'''TODO: Define a 4-d Tensor.'''\n", 160 | "# Use torch.zeros to initialize a 4-d Tensor of zeros with size 10 x 3 x 256 x 256.\n", 161 | "# You can think of this as 10 images where each image is RGB 256 x 256.\n", 162 | "images = # TODO\n", 163 | "\n", 164 | "assert isinstance(images, torch.Tensor), \"images must be a torch Tensor object\"\n", 165 | "assert images.ndim == 4, \"images must have 4 dimensions\"\n", 166 | "assert images.shape == (10, 3, 256, 256), \"images is incorrect shape\"\n", 167 | "print(f\"images is a {images.ndim}-d Tensor with shape: {images.shape}\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": { 173 | "id": "wkaCDOGapMyl" 174 | }, 175 | "source": [ 176 | "As you have seen, the `shape` of a tensor provides the number of elements in each tensor dimension. The `shape` is quite useful, and we'll use it often. 
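As a quick aside -- and as a minimal sketch outside the lab's TODOs (the tensor values here are illustrative) -- `shape` and `ndim` can be inspected directly, and a tensor can be reshaped without copying its elements:

```python
import torch

x = torch.zeros(2, 3, 4)      # a 3-d tensor holding 24 zeros
print(x.shape)                # torch.Size([2, 3, 4])
print(x.ndim)                 # 3

flat = x.reshape(6, 4)        # the same 24 elements, viewed as a 2-d tensor
print(flat.shape)             # torch.Size([6, 4])
```
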
You can also use slicing to access subtensors within a higher-rank tensor:" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "id": "FhaufyObuLEG" 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "row_vector = matrix[1]\n", 188 | "column_vector = matrix[:, 1]\n", 189 | "scalar = matrix[0, 1]\n", 190 | "\n", 191 | "print(f\"`row_vector`: {row_vector}\")\n", 192 | "print(f\"`column_vector`: {column_vector}\")\n", 193 | "print(f\"`scalar`: {scalar}\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "id": "iD3VO-LZYZ2z" 200 | }, 201 | "source": [ 202 | "## 1.2 Computations on Tensors\n", 203 | "\n", 204 | "A convenient way to think about and visualize computations in a machine learning framework like PyTorch is in terms of graphs. We can define this graph in terms of tensors, which hold data, and the mathematical operations that act on these tensors in some order. Let's look at a simple example, and define this computation using PyTorch:\n", 205 | "\n", 206 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2025/lab1/img/add-graph.png)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "id": "X_YJrZsxYZ2z" 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "# Create the nodes in the graph and initialize values\n", 218 | "a = torch.tensor(15)\n", 219 | "b = torch.tensor(61)\n", 220 | "\n", 221 | "# Add them!\n", 222 | "c1 = torch.add(a, b)\n", 223 | "c2 = a + b # PyTorch overrides the \"+\" operation so that it is able to act on Tensors\n", 224 | "print(f\"c1: {c1}\")\n", 225 | "print(f\"c2: {c2}\")\n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "id": "Mbfv_QOiYZ23" 232 | }, 233 | "source": [ 234 | "Notice how we've created a computation graph consisting of PyTorch operations, and how the output is a tensor with value 76 -- we've just created a computation graph consisting of operations, and it's executed them and given us back the result.\n", 235 | "\n", 236 | "Now let's consider a slightly more complicated example:\n", 237 | "\n", 238 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2025/lab1/img/computation-graph.png)\n", 239 | "\n", 240 | "Here, we take two inputs, `a, b`, and compute an output `e`. 
Each node in the graph represents an operation that takes some input, does some computation, and passes its output to another node.\n", 241 | "\n", 242 | "Let's define a simple function in PyTorch to construct this computation function:" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "id": "PJnfzpWyYZ23", 250 | "scrolled": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "### Defining Tensor computations ###\n", 255 | "\n", 256 | "# Construct a simple computation function\n", 257 | "def func(a, b):\n", 258 | " '''TODO: Define the operation for c, d, e.'''\n", 259 | " c = # TODO\n", 260 | " d = # TODO\n", 261 | " e = # TODO\n", 262 | " return e\n" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": { 268 | "id": "AwrRfDMS2-oy" 269 | }, 270 | "source": [ 271 | "Now, we can call this function to execute the computation graph given some inputs `a,b`:" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "id": "pnwsf8w2uF7p" 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "# Consider example values for a,b\n", 283 | "a, b = 1.5, 2.5\n", 284 | "# Execute the computation\n", 285 | "e_out = func(a, b)\n", 286 | "print(f\"e_out: {e_out}\")" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "id": "6HqgUIUhYZ29" 293 | }, 294 | "source": [ 295 | "Notice how our output is a tensor with value defined by the output of the computation, and that the output has no shape as it is a single scalar value." 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "id": "1h4o9Bb0YZ29" 302 | }, 303 | "source": [ 304 | "## 1.3 Neural networks in PyTorch\n", 305 | "We can also define neural networks in PyTorch. PyTorch uses [``torch.nn.Module``](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), which serves as a base class for all neural network modules in PyTorch and thus provides a framework for building and training neural networks.\n", 306 | "\n", 307 | "Let's consider the example of a simple perceptron defined by just one dense (aka fully-connected or linear) layer: $ y = \\sigma(Wx + b) $, where $W$ represents a matrix of weights, $b$ is a bias, $x$ is the input, $\\sigma$ is the sigmoid activation function, and $y$ is the output.\n", 308 | "\n", 309 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2025/lab1/img/computation-graph-2.png)\n", 310 | "\n", 311 | "We will use `torch.nn.Module` to define layers -- the building blocks of neural networks. Layers implement common neural networks operations. In PyTorch, when we implement a layer, we subclass `nn.Module` and define the parameters of the layer as attributes of our new class. We also define and override a function [``forward``](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.forward), which will define the forward pass computation that is performed at every step. All classes subclassing `nn.Module` should override the `forward` function.\n", 312 | "\n", 313 | "Let's write a dense layer class to implement a perceptron defined above." 
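Before the class itself, here is a standalone numeric check of the formula $ y = \sigma(Wx + b) $ with small hand-set values -- a sketch for intuition only (the weights below are made up for illustration; the layer class that follows initializes its own randomly):

```python
import torch

W = torch.tensor([[1.0, -1.0, 2.0],
                  [0.5,  0.0, 1.0]])   # shape (2, 3): maps 2 inputs to 3 outputs
b = torch.tensor([0.1, 0.2, -0.3])     # one bias per output node
x = torch.tensor([[1.0, 2.0]])         # a single example with 2 features

y = torch.sigmoid(torch.matmul(x, W) + b)
print(y.shape)   # torch.Size([1, 3]) -- one output vector with 3 values
```
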
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {
320 | "id": "HutbJk-1kHPh"
321 | },
322 | "outputs": [],
323 | "source": [
324 | "### Defining a dense layer ###\n",
325 | "\n",
326 | "# num_inputs: number of input nodes\n",
327 | "# num_outputs: number of output nodes\n",
328 | "# x: input to the layer\n",
329 | "\n",
330 | "class OurDenseLayer(torch.nn.Module):\n",
331 | " def __init__(self, num_inputs, num_outputs):\n",
332 | " super(OurDenseLayer, self).__init__()\n",
333 | " # Define and initialize parameters: a weight matrix W and bias b\n",
334 | " # Note that the parameter initialization is random!\n",
335 | " self.W = torch.nn.Parameter(torch.randn(num_inputs, num_outputs))\n",
336 | " self.bias = torch.nn.Parameter(torch.randn(num_outputs))\n",
337 | "\n",
338 | " def forward(self, x):\n",
339 | " '''TODO: define the operation for z (hint: use torch.matmul).'''\n",
340 | " z = # TODO\n",
341 | "\n",
342 | " '''TODO: define the operation for out (hint: use torch.sigmoid).'''\n",
343 | " y = # TODO\n",
344 | " return y\n"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {
350 | "id": "GqeEbn959hV_"
351 | },
352 | "source": [
353 | "Now, let's test the output of our layer."
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {
360 | "id": "2yxjCPa69hV_"
361 | },
362 | "outputs": [],
363 | "source": [
364 | "# Define a layer and test the output!\n",
365 | "num_inputs = 2\n",
366 | "num_outputs = 3\n",
367 | "layer = OurDenseLayer(num_inputs, num_outputs)\n",
368 | "x_input = torch.tensor([[1, 2.]])\n",
369 | "y = layer(x_input)\n",
370 | "\n",
371 | "print(f\"input shape: {x_input.shape}\")\n",
372 | "print(f\"output shape: {y.shape}\")\n",
373 | "print(f\"output result: {y}\")"
374 | ]
375 | },
376 | {
377 | "cell_type": "markdown",
378 | "metadata": {
379 | "id": "Jt1FgM7qYZ3D"
380 | },
381 | "source": [
382 | "Conveniently, PyTorch has defined a number of ```nn.Modules``` (or Layers) that are commonly used in neural networks, for example a [```nn.Linear```](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) or [`nn.Sigmoid`](https://pytorch.org/docs/stable/generated/torch.nn.Sigmoid.html) module.\n",
383 | "\n",
384 | "Now, instead of using a single ```Module``` to define our simple neural network, we'll use the [`nn.Sequential`](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html) module from PyTorch and a single [`nn.Linear`](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) layer to define our network. With the `Sequential` API, you can readily create neural networks by stacking together layers like building blocks."
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "metadata": {
391 | "id": "7WXTpmoL6TDz"
392 | },
393 | "outputs": [],
394 | "source": [
395 | "### Defining a neural network using the PyTorch Sequential API ###\n",
396 | "\n",
397 | "# define the number of inputs and outputs\n",
398 | "n_input_nodes = 2\n",
399 | "n_output_nodes = 3\n",
400 | "\n",
401 | "# Define the model\n",
402 | "'''TODO: Use the Sequential API to define a neural network with a\n",
403 | " single linear (dense!) layer, followed by a non-linearity to compute z'''\n",
404 | "model = nn.Sequential( ''' TODO ''' )\n"
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "metadata": {
410 | "id": "HDGcwYfUyR-U"
411 | },
412 | "source": [
413 | "We've defined our model using the Sequential API. Now, we can test it out using an example input:"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": null,
419 | "metadata": {
420 | "id": "zKhp6XqCFFa0"
421 | },
422 | "outputs": [],
423 | "source": [
424 | "# Test the model with example input\n",
425 | "x_input = torch.tensor([[1, 2.]])\n",
426 | "model_output = model(x_input)\n",
427 | "print(f\"input shape: {x_input.shape}\")\n",
428 | "print(f\"output shape: {model_output.shape}\")\n",
429 | "print(f\"output result: {model_output}\")"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "metadata": {
435 | "id": "596NvsOOtr9F"
436 | },
437 | "source": [
438 | "With PyTorch, we can create more flexible models by subclassing [`nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html). The `nn.Module` class allows us to group layers together flexibly to define new architectures.\n",
439 | "\n",
440 | "As we saw earlier with `OurDenseLayer`, we can subclass `nn.Module` to create a class for our model, and then define the forward pass through the network using the `forward` function. Subclassing affords the flexibility to define custom layers, custom training loops, custom activation functions, and custom models. Let's define the same neural network model as above (i.e., a Linear layer with an activation function after it), now using subclassing and PyTorch's built-in linear layer, `nn.Linear`."
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": null,
446 | "metadata": {
447 | "id": "K4aCflPVyViD"
448 | },
449 | "outputs": [],
450 | "source": [
451 | "### Defining a model using subclassing ###\n",
452 | "\n",
453 | "class LinearWithSigmoidActivation(nn.Module):\n",
454 | " def __init__(self, num_inputs, num_outputs):\n",
455 | " super(LinearWithSigmoidActivation, self).__init__()\n",
456 | " '''TODO: define a model with a single Linear layer and sigmoid activation.'''\n",
457 | " self.linear = '''TODO: linear layer'''\n",
458 | " self.activation = '''TODO: sigmoid activation'''\n",
459 | "\n",
460 | " def forward(self, inputs):\n",
461 | " linear_output = self.linear(inputs)\n",
462 | " output = self.activation(linear_output)\n",
463 | " return output\n"
464 | ]
465 | },
466 | {
467 | "cell_type": "markdown",
468 | "metadata": {
469 | "id": "goKCQ9dEGzRn"
470 | },
471 | "source": [
472 | "Let's test out our new model, using an example input, setting `n_input_nodes=2` and `n_output_nodes=3` as before."
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": null,
478 | "metadata": {
479 | "id": "V-eNhSyRG6hl"
480 | },
481 | "outputs": [],
482 | "source": [
483 | "n_input_nodes = 2\n",
484 | "n_output_nodes = 3\n",
485 | "model = LinearWithSigmoidActivation(n_input_nodes, n_output_nodes)\n",
486 | "x_input = torch.tensor([[1, 2.]])\n",
487 | "y = model(x_input)\n",
488 | "print(f\"input shape: {x_input.shape}\")\n",
489 | "print(f\"output shape: {y.shape}\")\n",
490 | "print(f\"output result: {y}\")"
491 | ]
492 | },
493 | {
494 | "cell_type": "markdown",
495 | "metadata": {
496 | "id": "HTIFMJLAzsyE"
497 | },
498 | "source": [
499 | "Importantly, `nn.Module` affords us a lot of flexibility to define custom models. 
For example, we can use boolean arguments in the `forward` function to specify different network behaviors, for example different behaviors during training and inference. Let's suppose under some instances we want our network to simply output the input, without any perturbation. We define a boolean argument `isidentity` to control this behavior:" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": { 506 | "id": "P7jzGX5D1xT5" 507 | }, 508 | "outputs": [], 509 | "source": [ 510 | "### Custom behavior with subclassing nn.Module ###\n", 511 | "\n", 512 | "class LinearButSometimesIdentity(nn.Module):\n", 513 | " def __init__(self, num_inputs, num_outputs):\n", 514 | " super(LinearButSometimesIdentity, self).__init__()\n", 515 | " self.linear = nn.Linear(num_inputs, num_outputs)\n", 516 | "\n", 517 | " '''TODO: Implement the behavior where the network outputs the input, unchanged,\n", 518 | " under control of the isidentity argument.'''\n", 519 | " def forward(self, inputs, isidentity=False):\n", 520 | " ''' TODO '''\n" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": { 526 | "id": "Ku4rcCGx5T3y" 527 | }, 528 | "source": [ 529 | "Let's test this behavior:" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": { 536 | "id": "NzC0mgbk5dp2" 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "# Test the IdentityModel\n", 541 | "model = LinearButSometimesIdentity(num_inputs=2, num_outputs=3)\n", 542 | "x_input = torch.tensor([[1, 2.]])\n", 543 | "\n", 544 | "'''TODO: pass the input into the model and call with and without the input identity option.'''\n", 545 | "out_with_linear = # TODO\n", 546 | "\n", 547 | "out_with_identity = # TODO\n", 548 | "\n", 549 | "print(f\"input: {x_input}\")\n", 550 | "print(\"Network linear output: {}; network identity output: {}\".format(out_with_linear, out_with_identity))" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": { 556 | "id": "7V1dEqdk6VI5" 557 | }, 558 | "source": [ 559 | "Now that we have learned how to define layers and models in PyTorch using both the Sequential API and subclassing `nn.Module`, we're ready to turn our attention to how to actually implement network training with backpropagation." 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": { 565 | "id": "dQwDhKn8kbO2" 566 | }, 567 | "source": [ 568 | "## 1.4 Automatic Differentiation in PyTorch\n", 569 | "\n", 570 | "In PyTorch, [`torch.autograd`](https://pytorch.org/docs/stable/autograd.html) is used for [automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation), which is critical for training deep learning models with [backpropagation](https://en.wikipedia.org/wiki/Backpropagation).\n", 571 | "\n", 572 | "We will use the PyTorch [`.backward()`](https://pytorch.org/docs/stable/generated/torch.Tensor.backward.html) method to trace operations for computing gradients. On a tensor, the [`requires_grad`](https://pytorch.org/docs/stable/generated/torch.Tensor.requires_grad_.html) attribute controls whether autograd should record operations on that tensor. 
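To make that concrete, here is a minimal sketch (with illustrative values) of how `requires_grad` gates what autograd records:

```python
import torch

w = torch.tensor(2.0, requires_grad=True)   # tracked by autograd
u = torch.tensor(5.0)                       # requires_grad defaults to False

y = w * u
y.backward()
print(w.grad)   # tensor(5.) -- dy/dw = u
print(u.grad)   # None -- nothing was recorded for u
```
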
When a forward pass is made through the network, PyTorch builds a computational graph dynamically; then, to compute the gradient, the `backward()` method is called to perform backpropagation.\n",
573 | "\n",
574 | "Let's compute the gradient of $ y = x^2 $:"
575 | ]
576 | },
577 | {
578 | "cell_type": "code",
579 | "execution_count": null,
580 | "metadata": {
581 | "id": "tdkqk8pw5yJM"
582 | },
583 | "outputs": [],
584 | "source": [
585 | "### Gradient computation ###\n",
586 | "\n",
587 | "# y = x^2\n",
588 | "# Example: x = 3.0\n",
589 | "x = torch.tensor(3.0, requires_grad=True)\n",
590 | "y = x ** 2\n",
591 | "y.backward() # Compute the gradient\n",
592 | "\n",
593 | "dy_dx = x.grad\n",
594 | "print(\"dy_dx of y=x^2 at x=3.0 is: \", dy_dx)\n",
595 | "assert dy_dx == 6.0\n"
596 | ]
597 | },
598 | {
599 | "cell_type": "markdown",
600 | "metadata": {
601 | "id": "JhU5metS5xF3"
602 | },
603 | "source": [
604 | "In training neural networks, we use differentiation and stochastic gradient descent (SGD) to optimize a loss function. Now that we have a sense of how PyTorch's autograd can be used to compute and access derivatives, we will look at an example where we use automatic differentiation and SGD to find the minimum of $ L=(x-x_f)^2 $. Here $x_f$ is a variable for a desired value we are trying to optimize for; $L$ represents a loss that we are trying to minimize. While we can clearly solve this problem analytically ($ x_{min}=x_f $), considering how we can compute this using PyTorch's autograd sets us up nicely for future labs where we use gradient descent to optimize entire neural network losses."
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": null,
610 | "metadata": {
611 | "attributes": {
612 | "classes": [
613 | "py"
614 | ],
615 | "id": ""
616 | },
617 | "id": "7g1yWiSXqEf-"
618 | },
619 | "outputs": [],
620 | "source": [
621 | "### Function minimization with autograd and gradient descent ###\n",
622 | "\n",
623 | "# Initialize a random value for our initial x\n",
624 | "x = torch.randn(1)\n",
625 | "print(f\"Initializing x={x.item()}\")\n",
626 | "\n",
627 | "learning_rate = 1e-2 # learning rate for the gradient descent update\n",
628 | "history = []\n",
629 | "x_f = 4 # Target value\n",
630 | "\n",
631 | "\n",
632 | "# We will run gradient descent for a number of iterations. At each iteration, we compute the loss,\n",
633 | "# compute the derivative of the loss with respect to x, and perform the update.\n",
634 | "for i in range(500):\n",
635 | " x = torch.tensor([x], requires_grad=True) # re-wrap x as a fresh leaf tensor so its gradient is tracked this iteration\n",
636 | "\n",
637 | " # TODO: Compute the loss as the square of the difference between x and x_f\n",
638 | " loss = # TODO\n",
639 | "\n",
640 | " # Backpropagate through the loss to compute gradients\n",
641 | " loss.backward()\n",
642 | "\n",
643 | " # Update x with gradient descent\n",
644 | " x = x.item() - learning_rate * x.grad\n",
645 | "\n",
646 | " history.append(x.item())\n",
647 | "\n",
648 | "# Plot the evolution of x as we optimize toward x_f!\n",
649 | "plt.plot(history)\n",
650 | "plt.plot([0, 500], [x_f, x_f])\n",
651 | "plt.legend(('Predicted', 'True'))\n",
652 | "plt.xlabel('Iteration')\n",
653 | "plt.ylabel('x value')\n",
654 | "plt.show()\n"
655 | ]
656 | },
657 | {
658 | "cell_type": "markdown",
659 | "metadata": {
660 | "id": "pC7czCwk3ceH"
661 | },
662 | "source": [
663 | "Now, we have covered the fundamental concepts of PyTorch -- tensors, operations, neural networks, and automatic differentiation. 
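As one final sketch, the same minimization can be written more idiomatically with PyTorch's built-in `torch.optim.SGD` optimizer, which handles the update and gradient bookkeeping that the loop above does by hand (a reference sketch, not part of the lab's TODOs):

```python
import torch

x = torch.randn(1, requires_grad=True)   # the parameter to optimize
optimizer = torch.optim.SGD([x], lr=1e-2)
x_f = 4                                  # target value, as above

for i in range(500):
    optimizer.zero_grad()        # clear gradients from the previous step
    loss = (x - x_f) ** 2        # L = (x - x_f)^2
    loss.backward()              # compute dL/dx
    optimizer.step()             # x <- x - lr * x.grad

print(x.item())  # converges to ~4.0
```

Using an optimizer also avoids re-wrapping `x` by hand at each iteration, since `step()` updates the tracked parameter in place.
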
With these fundamentals in hand, we're ready to move on to building and training real neural networks in the rest of the lab!\n"
664 | ]
665 | }
666 | ],
667 | "metadata": {
668 | "accelerator": "GPU",
669 | "colab": {
670 | "collapsed_sections": [
671 | "WBk0ZDWY-ff8"
672 | ],
673 | "name": "PT_Part1_Intro.ipynb",
674 | "provenance": []
675 | },
676 | "kernelspec": {
677 | "display_name": "Python 3",
678 | "language": "python",
679 | "name": "python3"
680 | },
681 | "language_info": {
682 | "codemirror_mode": {
683 | "name": "ipython",
684 | "version": 3
685 | },
686 | "file_extension": ".py",
687 | "mimetype": "text/x-python",
688 | "name": "python",
689 | "nbconvert_exporter": "python",
690 | "pygments_lexer": "ipython3",
691 | "version": "3.10.6"
692 | },
693 | "vscode": {
694 | "interpreter": {
695 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
696 | }
697 | }
698 | },
699 | "nbformat": 4,
700 | "nbformat_minor": 0
701 | }
702 | 
--------------------------------------------------------------------------------
/lab1/README.md:
--------------------------------------------------------------------------------
 1 | # MIT 6.S191 Lab 1: Intro to Deep Learning in Python and Music Generation with RNNs
 2 | 
 3 | ![alt text](https://github.com/MITDeepLearning/introtodeeplearning/raw/master/lab1/img/music_waveform.png)
 4 | ## Part 1: Intro to Deep Learning in Python -- TensorFlow and PyTorch
 5 | TensorFlow ("TF") and PyTorch ("PT") are software libraries used in machine learning. Here we'll learn how computations are represented and how to define simple neural networks in TensorFlow and PyTorch. The TensorFlow labs will be prefixed by `TF`; PyTorch labs will be prefixed by `PT`.
 6 | 
 7 | TensorFlow uses a high-level API called [Keras](https://www.tensorflow.org/guide/keras) that provides a powerful, intuitive framework for building and training deep learning models. In the TensorFlow Intro (`TF_Part1_Intro`) you will learn the basics of computations in TensorFlow, the Keras API, and TensorFlow 2.0's imperative execution style.
 8 | 
 9 | [PyTorch](https://pytorch.org/) is a popular deep learning library known for its flexibility, ease of use, and dynamic execution. In the PyTorch Intro (`PT_Part1_Intro`) you will learn the basics of computations in PyTorch and how to define neural networks using either the Sequential API or `torch.nn.Module`.
10 | 
11 | ## Part 2: Music Generation with RNNs
12 | In the second portion of the lab, we will build a Recurrent Neural Network (RNN) for music generation. We will use a "character RNN" to predict the next character of sheet music in ABC notation. Finally, we will sample from this model to generate a brand new music file that has never been heard before!
13 | 
14 | 
--------------------------------------------------------------------------------
/lab1/TF_Part1_Intro.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 | "cells": [
 3 | {
 4 | "cell_type": "markdown",
 5 | "metadata": {
 6 | "id": "WBk0ZDWY-ff8"
 7 | },
 8 | "source": [
 9 | "\n", 10 | " \n", 13 | " \n", 15 | " \n", 17 | "
\n", 11 | " \n", 12 | " Visit MIT Deep Learning\n", 14 | " Run in Google Colab\n", 16 | " View Source on GitHub
\n", 18 | "\n", 19 | "# Copyright Information\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "id": "3eI6DUic-6jo" 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "# Copyright 2025 MIT Introduction to Deep Learning. All Rights Reserved.\n", 31 | "#\n", 32 | "# Licensed under the MIT License. You may not use this file except in compliance\n", 33 | "# with the License. Use and/or modification of this code outside of MIT Introduction\n", 34 | "# to Deep Learning must reference:\n", 35 | "#\n", 36 | "# © MIT Introduction to Deep Learning\n", 37 | "# http://introtodeeplearning.com\n", 38 | "#" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "57knM8jrYZ2t" 45 | }, 46 | "source": [ 47 | "# Lab 1: Intro to TensorFlow and Music Generation with RNNs\n", 48 | "\n", 49 | "In this lab, you'll get exposure to using TensorFlow and learn how it can be used for solving deep learning tasks. Go through the code and run each cell. Along the way, you'll encounter several ***TODO*** blocks -- follow the instructions to fill them out before running those cells and continuing.\n", 50 | "\n", 51 | "\n", 52 | "# Part 1: Intro to TensorFlow\n", 53 | "\n", 54 | "## 0.1 Install TensorFlow\n", 55 | "\n", 56 | "TensorFlow is a software library extensively used in machine learning. Here we'll learn how computations are represented and how to define a simple neural network in TensorFlow. For all the TensorFlow labs in Introduction to Deep Learning 2025, we'll be using TensorFlow 2, which affords great flexibility and the ability to imperatively execute operations, just like in Python. You'll notice that TensorFlow 2 is quite similar to Python in its syntax and imperative execution. Let's install TensorFlow and a couple of dependencies.\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "id": "LkaimNJfYZ2w" 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "import tensorflow as tf\n", 68 | "\n", 69 | "# Download and import the MIT Introduction to Deep Learning package\n", 70 | "!pip install mitdeeplearning --quiet\n", 71 | "import mitdeeplearning as mdl\n", 72 | "\n", 73 | "import numpy as np\n", 74 | "import matplotlib.pyplot as plt" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": { 80 | "id": "2QNMcdP4m3Vs" 81 | }, 82 | "source": [ 83 | "## 1.1 Why is TensorFlow called TensorFlow?\n", 84 | "\n", 85 | "TensorFlow is called 'TensorFlow' because it handles the flow (node/mathematical operation) of Tensors, which are data structures that you can think of as multi-dimensional arrays. Tensors are represented as n-dimensional arrays of base dataypes such as a string or integer -- they provide a way to generalize vectors and matrices to higher dimensions.\n", 86 | "\n", 87 | "The ```shape``` of a Tensor defines its number of dimensions and the size of each dimension. 
The ```rank``` of a Tensor provides the number of dimensions (n-dimensions) -- you can also think of this as the Tensor's order or degree.\n", 88 | "\n", 89 | "Let's first look at 0-d Tensors, of which a scalar is an example:" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "id": "tFxztZQInlAB" 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "sport = tf.constant(\"Tennis\", tf.string)\n", 101 | "number = tf.constant(1.41421356237, tf.float64)\n", 102 | "\n", 103 | "print(\"`sport` is a {}-d Tensor\".format(tf.rank(sport).numpy()))\n", 104 | "print(\"`number` is a {}-d Tensor\".format(tf.rank(number).numpy()))" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": { 110 | "id": "-dljcPUcoJZ6" 111 | }, 112 | "source": [ 113 | "Vectors and lists can be used to create 1-d Tensors:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "id": "oaHXABe8oPcO" 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "sports = tf.constant([\"Tennis\", \"Basketball\"], tf.string)\n", 125 | "numbers = tf.constant([3.141592, 1.414213, 2.71821], tf.float64)\n", 126 | "\n", 127 | "print(\"`sports` is a {}-d Tensor with shape: {}\".format(tf.rank(sports).numpy(), tf.shape(sports)))\n", 128 | "print(\"`numbers` is a {}-d Tensor with shape: {}\".format(tf.rank(numbers).numpy(), tf.shape(numbers)))" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": { 134 | "id": "gvffwkvtodLP" 135 | }, 136 | "source": [ 137 | "Next we consider creating 2-d (i.e., matrices) and higher-rank Tensors. For examples, in future labs involving image processing and computer vision, we will use 4-d Tensors. Here the dimensions correspond to the number of example images in our batch, image height, image width, and the number of color channels." 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "id": "tFeBBe1IouS3" 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "### Defining higher-order Tensors ###\n", 149 | "\n", 150 | "'''TODO: Define a 2-d Tensor'''\n", 151 | "matrix = # TODO\n", 152 | "\n", 153 | "assert isinstance(matrix, tf.Tensor), \"matrix must be a tf Tensor object\"\n", 154 | "assert tf.rank(matrix).numpy() == 2" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "id": "Zv1fTn_Ya_cz" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "'''TODO: Define a 4-d Tensor.'''\n", 166 | "# Use tf.zeros to initialize a 4-d Tensor of zeros with size 10 x 256 x 256 x 3.\n", 167 | "# You can think of this as 10 images where each image is RGB 256 x 256.\n", 168 | "images = # TODO\n", 169 | "\n", 170 | "assert isinstance(images, tf.Tensor), \"matrix must be a tf Tensor object\"\n", 171 | "assert tf.rank(images).numpy() == 4, \"matrix must be of rank 4\"\n", 172 | "assert tf.shape(images).numpy().tolist() == [10, 256, 256, 3], \"matrix is incorrect shape\"" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "wkaCDOGapMyl" 179 | }, 180 | "source": [ 181 | "As you have seen, the ```shape``` of a Tensor provides the number of elements in each Tensor dimension. The ```shape``` is quite useful, and we'll use it often. 
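As a quick illustrative sketch outside the lab's TODOs (the values here are arbitrary), `tf.rank`, `tf.shape`, and `tf.reshape` let you inspect and rearrange a Tensor without copying its elements:

```python
import tensorflow as tf

x = tf.zeros([2, 3, 4])            # a rank-3 Tensor holding 24 zeros
print(tf.rank(x).numpy())          # 3
print(tf.shape(x).numpy())         # [2 3 4]

flat = tf.reshape(x, [6, 4])       # the same 24 elements as a rank-2 Tensor
print(flat.shape)                  # (6, 4)
```
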
You can also use slicing to access subtensors within a higher-rank Tensor:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "id": "FhaufyObuLEG" 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "row_vector = matrix[1]\n", 193 | "column_vector = matrix[:,1]\n", 194 | "scalar = matrix[0, 1]\n", 195 | "\n", 196 | "print(\"`row_vector`: {}\".format(row_vector.numpy()))\n", 197 | "print(\"`column_vector`: {}\".format(column_vector.numpy()))\n", 198 | "print(\"`scalar`: {}\".format(scalar.numpy()))" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "id": "iD3VO-LZYZ2z" 205 | }, 206 | "source": [ 207 | "## 1.2 Computations on Tensors\n", 208 | "\n", 209 | "A convenient way to think about and visualize computations in TensorFlow is in terms of graphs. We can define this graph in terms of Tensors, which hold data, and the mathematical operations that act on these Tensors in some order. Let's look at a simple example, and define this computation using TensorFlow:\n", 210 | "\n", 211 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2025/lab1/img/add-graph.png)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "id": "X_YJrZsxYZ2z" 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "# Create the nodes in the graph, and initialize values\n", 223 | "a = tf.constant(15)\n", 224 | "b = tf.constant(61)\n", 225 | "\n", 226 | "# Add them!\n", 227 | "c1 = tf.add(a,b)\n", 228 | "c2 = a + b # TensorFlow overrides the \"+\" operation so that it is able to act on Tensors\n", 229 | "print(c1)\n", 230 | "print(c2)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": { 236 | "id": "Mbfv_QOiYZ23" 237 | }, 238 | "source": [ 239 | "Notice how we've created a computation graph consisting of TensorFlow operations, and how the output is a Tensor with value 76 -- we've just created a computation graph consisting of operations, and it's executed them and given us back the result.\n", 240 | "\n", 241 | "Now let's consider a slightly more complicated example:\n", 242 | "\n", 243 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2025/lab1/img/computation-graph.png)\n", 244 | "\n", 245 | "Here, we take two inputs, `a, b`, and compute an output `e`. 
Each node in the graph represents an operation that takes some input, does some computation, and passes its output to another node.\n", 246 | "\n", 247 | "Let's define a simple function in TensorFlow to construct this computation function:" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "id": "PJnfzpWyYZ23", 255 | "scrolled": true 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "### Defining Tensor computations ###\n", 260 | "\n", 261 | "# Construct a simple computation function\n", 262 | "def func(a,b):\n", 263 | " '''TODO: Define the operation for c, d, e (use tf.add, tf.subtract, tf.multiply).'''\n", 264 | " c = # TODO\n", 265 | " d = # TODO\n", 266 | " e = # TODO\n", 267 | " return e" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": { 273 | "id": "AwrRfDMS2-oy" 274 | }, 275 | "source": [ 276 | "Now, we can call this function to execute the computation graph given some inputs `a,b`:" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "id": "pnwsf8w2uF7p" 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "# Consider example values for a,b\n", 288 | "a, b = 1.5, 2.5\n", 289 | "# Execute the computation\n", 290 | "e_out = func(a,b)\n", 291 | "print(e_out)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "id": "6HqgUIUhYZ29" 298 | }, 299 | "source": [ 300 | "Notice how our output is a Tensor with value defined by the output of the computation, and that the output has no shape as it is a single scalar value." 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "id": "1h4o9Bb0YZ29" 307 | }, 308 | "source": [ 309 | "## 1.3 Neural networks in TensorFlow\n", 310 | "We can also define neural networks in TensorFlow. TensorFlow uses a high-level API called [Keras](https://www.tensorflow.org/guide/keras) that provides a powerful, intuitive framework for building and training deep learning models.\n", 311 | "\n", 312 | "Let's first consider the example of a simple perceptron defined by just one dense layer: $ y = \\sigma(Wx + b)$, where $W$ represents a matrix of weights, $b$ is a bias, $x$ is the input, $\\sigma$ is the sigmoid activation function, and $y$ is the output. We can also visualize this operation using a graph:\n", 313 | "\n", 314 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2025/lab1/img/computation-graph-2.png)\n", 315 | "\n", 316 | "Tensors can flow through abstract types called [```Layers```](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) -- the building blocks of neural networks. ```Layers``` implement common neural networks operations, and are used to update weights, compute losses, and define inter-layer connectivity. We will first define a ```Layer``` to implement the simple perceptron defined above." 
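For intuition before writing the `Layer`, here is a standalone numeric check of $ y = \sigma(Wx + b) $ with small hand-set values (the numbers are made up for this sketch; the `Layer` below initializes its own parameters randomly):

```python
import tensorflow as tf

W = tf.constant([[1.0, -1.0, 2.0],
                 [0.5,  0.0, 1.0]])    # shape (2, 3): maps 2 inputs to 3 outputs
b = tf.constant([[0.1, 0.2, -0.3]])    # shape (1, 3): one bias per output node
x = tf.constant([[1.0, 2.0]])          # a single example with 2 features

y = tf.sigmoid(tf.matmul(x, W) + b)
print(y.numpy())   # shape (1, 3)
```
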
317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "id": "HutbJk-1kHPh" 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "### Defining a network Layer ###\n", 328 | "\n", 329 | "# n_output_nodes: number of output nodes\n", 330 | "# input_shape: shape of the input\n", 331 | "# x: input to the layer\n", 332 | "\n", 333 | "class OurDenseLayer(tf.keras.layers.Layer):\n", 334 | " def __init__(self, n_output_nodes):\n", 335 | " super(OurDenseLayer, self).__init__()\n", 336 | " self.n_output_nodes = n_output_nodes\n", 337 | "\n", 338 | " def build(self, input_shape):\n", 339 | " d = int(input_shape[-1])\n", 340 | " # Define and initialize parameters: a weight matrix W and bias b\n", 341 | " # Note that parameter initialization is random!\n", 342 | " self.W = self.add_weight(\"weight\", shape=[d, self.n_output_nodes]) # note the dimensionality\n", 343 | " self.b = self.add_weight(\"bias\", shape=[1, self.n_output_nodes]) # note the dimensionality\n", 344 | "\n", 345 | " def call(self, x):\n", 346 | " '''TODO: define the operation for z (hint: use tf.matmul)'''\n", 347 | " z = # TODO\n", 348 | "\n", 349 | " '''TODO: define the operation for out (hint: use tf.sigmoid)'''\n", 350 | " y = # TODO\n", 351 | " return y\n", 352 | "\n", 353 | "# Since layer parameters are initialized randomly, we will set a random seed for reproducibility\n", 354 | "tf.keras.utils.set_random_seed(1)\n", 355 | "layer = OurDenseLayer(3)\n", 356 | "layer.build((1,2))\n", 357 | "x_input = tf.constant([[1,2.]], shape=(1,2))\n", 358 | "y = layer.call(x_input)\n", 359 | "\n", 360 | "# test the output!\n", 361 | "print(y.numpy())\n", 362 | "mdl.lab1.test_custom_dense_layer_output(y)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "id": "Jt1FgM7qYZ3D" 369 | }, 370 | "source": [ 371 | "Conveniently, TensorFlow has defined a number of ```Layers``` that are commonly used in neural networks, for example a [```Dense```](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense?version=stable). Now, instead of using a single ```Layer``` to define our simple neural network, we'll use the [`Sequential`](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Sequential) model from Keras and a single [`Dense` ](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/layers/Dense) layer to define our network. With the `Sequential` API, you can readily create neural networks by stacking together layers like building blocks." 
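For a sense of what such a stack looks like, here is an illustrative sketch only -- the layer sizes are arbitrary and unrelated to the TODO below:

```python
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

demo = Sequential([
    Dense(8, activation="relu"),      # hidden layer with 8 units
    Dense(1, activation="sigmoid"),   # single sigmoid output
])
demo.build(input_shape=(None, 4))     # 4 input features per example
demo.summary()                        # prints the layer stack and parameter counts
```
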
372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": { 378 | "id": "7WXTpmoL6TDz" 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "### Defining a neural network using the Sequential API ###\n", 383 | "\n", 384 | "# Import relevant packages\n", 385 | "from tensorflow.keras import Sequential\n", 386 | "from tensorflow.keras.layers import Dense\n", 387 | "\n", 388 | "# Define the number of outputs\n", 389 | "n_output_nodes = 3\n", 390 | "\n", 391 | "# First define the model\n", 392 | "model = Sequential()\n", 393 | "\n", 394 | "'''TODO: Define a dense (fully connected) layer to compute z'''\n", 395 | "# Remember: dense layers are defined by the parameters W and b!\n", 396 | "# You can read more about the initialization of W and b in the TF documentation :)\n", 397 | "# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense?version=stable\n", 398 | "dense_layer = # TODO\n", 399 | "\n", 400 | "# Add the dense layer to the model\n", 401 | "model.add(dense_layer)\n" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": { 407 | "id": "HDGcwYfUyR-U" 408 | }, 409 | "source": [ 410 | "That's it! We've defined our model using the Sequential API. Now, we can test it out using an example input:" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "id": "sg23OczByRDb" 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "# Test model with example input\n", 422 | "x_input = tf.constant([[1,2.]], shape=(1,2))\n", 423 | "\n", 424 | "'''TODO: feed input into the model and predict the output!'''\n", 425 | "model_output = # TODO\n", 426 | "print(model_output)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "id": "596NvsOOtr9F" 433 | }, 434 | "source": [ 435 | "In addition to defining models using the `Sequential` API, we can also define neural networks by directly subclassing the [`Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model?version=stable) class, which groups layers together to enable model training and inference. The `Model` class captures what we refer to as a \"model\" or as a \"network\". Using Subclassing, we can create a class for our model, and then define the forward pass through the network using the `call` function. Subclassing affords the flexibility to define custom layers, custom training loops, custom activation functions, and custom models. Let's define the same neural network as above now using Subclassing rather than the `Sequential` model." 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": { 442 | "id": "K4aCflPVyViD" 443 | }, 444 | "outputs": [], 445 | "source": [ 446 | "### Defining a model using subclassing ###\n", 447 | "\n", 448 | "from tensorflow.keras import Model\n", 449 | "from tensorflow.keras.layers import Dense\n", 450 | "\n", 451 | "class SubclassModel(tf.keras.Model):\n", 452 | "\n", 453 | " # In __init__, we define the Model's layers\n", 454 | " def __init__(self, n_output_nodes):\n", 455 | " super(SubclassModel, self).__init__()\n", 456 | " '''TODO: Our model consists of a single Dense layer. 
Define this layer.'''\n", 457 | " self.dense_layer = '''TODO: Dense Layer'''\n", 458 | "\n", 459 | " # In the call function, we define the Model's forward pass.\n", 460 | " def call(self, inputs):\n", 461 | " return self.dense_layer(inputs)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": { 467 | "id": "U0-lwHDk4irB" 468 | }, 469 | "source": [ 470 | "Just like the model we built using the `Sequential` API, let's test out our `SubclassModel` using an example input.\n", 471 | "\n" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": { 478 | "id": "LhB34RA-4gXb" 479 | }, 480 | "outputs": [], 481 | "source": [ 482 | "n_output_nodes = 3\n", 483 | "model = SubclassModel(n_output_nodes)\n", 484 | "\n", 485 | "x_input = tf.constant([[1,2.]], shape=(1,2))\n", 486 | "\n", 487 | "print(model.call(x_input))" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": { 493 | "id": "HTIFMJLAzsyE" 494 | }, 495 | "source": [ 496 | "Importantly, Subclassing affords us a lot of flexibility to define custom models. For example, we can use boolean arguments in the `call` function to specify different network behaviors, for example different behaviors during training and inference. Let's suppose under some instances we want our network to simply output the input, without any perturbation. We define a boolean argument `isidentity` to control this behavior:" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": { 503 | "id": "P7jzGX5D1xT5" 504 | }, 505 | "outputs": [], 506 | "source": [ 507 | "### Defining a model using subclassing and specifying custom behavior ###\n", 508 | "\n", 509 | "from tensorflow.keras import Model\n", 510 | "from tensorflow.keras.layers import Dense\n", 511 | "\n", 512 | "class IdentityModel(tf.keras.Model):\n", 513 | "\n", 514 | " # As before, in __init__ we define the Model's layers\n", 515 | " # Since our desired behavior involves the forward pass, this part is unchanged\n", 516 | " def __init__(self, n_output_nodes):\n", 517 | " super(IdentityModel, self).__init__()\n", 518 | " self.dense_layer = tf.keras.layers.Dense(n_output_nodes, activation='sigmoid')\n", 519 | "\n", 520 | " '''TODO: Implement the behavior where the network outputs the input, unchanged, under control of the isidentity argument.'''\n", 521 | " def call(self, inputs, isidentity=False):\n", 522 | " ### TODO" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": { 528 | "id": "Ku4rcCGx5T3y" 529 | }, 530 | "source": [ 531 | "Let's test this behavior:" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": { 538 | "id": "NzC0mgbk5dp2" 539 | }, 540 | "outputs": [], 541 | "source": [ 542 | "n_output_nodes = 3\n", 543 | "model = IdentityModel(n_output_nodes)\n", 544 | "\n", 545 | "x_input = tf.constant([[1,2.]], shape=(1,2))\n", 546 | "'''TODO: pass the input into the model and call with and without the input identity option.'''\n", 547 | "out_activate = # TODO\n", 548 | "out_identity = # TODO\n", 549 | "\n", 550 | "print(\"Network output with activation: {}; network identity output: {}\".format(out_activate.numpy(), out_identity.numpy()))" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": { 556 | "id": "7V1dEqdk6VI5" 557 | }, 558 | "source": [ 559 | "Now that we have learned how to define `Layers` as well as neural networks in TensorFlow using both the `Sequential` and Subclassing APIs, we're ready 
to turn our attention to how to actually implement network training with backpropagation." 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": { 565 | "id": "dQwDhKn8kbO2" 566 | }, 567 | "source": [ 568 | "## 1.4 Automatic differentiation in TensorFlow\n", 569 | "\n", 570 | "[Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)\n", 571 | "is one of the most important parts of TensorFlow and is the backbone of training with\n", 572 | "[backpropagation](https://en.wikipedia.org/wiki/Backpropagation). We will use the TensorFlow GradientTape [`tf.GradientTape`](https://www.tensorflow.org/api_docs/python/tf/GradientTape?version=stable) to trace operations for computing gradients later.\n", 573 | "\n", 574 | "When a forward pass is made through the network, all forward-pass operations get recorded to a \"tape\"; then, to compute the gradient, the tape is played backwards. By default, the tape is discarded after it is played backwards; this means that a particular `tf.GradientTape` can only\n", 575 | "compute one gradient, and subsequent calls throw a runtime error. However, we can compute multiple gradients over the same computation by creating a ```persistent``` gradient tape.\n", 576 | "\n", 577 | "First, we will look at how we can compute gradients using GradientTape and access them for computation. We define the simple function $ y = x^2$ and compute the gradient:" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": null, 583 | "metadata": { 584 | "id": "tdkqk8pw5yJM" 585 | }, 586 | "outputs": [], 587 | "source": [ 588 | "### Gradient computation with GradientTape ###\n", 589 | "\n", 590 | "# y = x^2\n", 591 | "# Example: x = 3.0\n", 592 | "x = tf.Variable(3.0)\n", 593 | "\n", 594 | "# Initiate the gradient tape\n", 595 | "with tf.GradientTape() as tape:\n", 596 | " # Define the function\n", 597 | " y = x * x\n", 598 | "# Access the gradient -- derivative of y with respect to x\n", 599 | "dy_dx = tape.gradient(y, x)\n", 600 | "\n", 601 | "assert dy_dx.numpy() == 6.0" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": { 607 | "id": "JhU5metS5xF3" 608 | }, 609 | "source": [ 610 | "In training neural networks, we use differentiation and stochastic gradient descent (SGD) to optimize a loss function. Now that we have a sense of how `GradientTape` can be used to compute and access derivatives, we will look at an example where we use automatic differentiation and SGD to find the minimum of $L=(x-x_f)^2$. Here $x_f$ is a variable for a desired value we are trying to optimize for; $L$ represents a loss that we are trying to minimize. While we can clearly solve this problem analytically ($x_{min}=x_f$), considering how we can compute this using `GradientTape` sets us up nicely for future labs where we use gradient descent to optimize entire neural network losses." 
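, "\n",
 "As a quick aside -- a minimal sketch of the `persistent` tape mentioned above (illustrative only, not one of this lab's TODOs), which allows more than one `tape.gradient` call over the same recorded computation:\n",
 "\n",
 "```python\n",
 "x = tf.Variable(3.0)\n",
 "with tf.GradientTape(persistent=True) as tape:\n",
 "    y = x * x   # y = x^2\n",
 "    z = y * y   # z = x^4\n",
 "print(tape.gradient(y, x))  # dy/dx = 2x   -> 6.0 at x = 3\n",
 "print(tape.gradient(z, x))  # dz/dx = 4x^3 -> 108.0 at x = 3\n",
 "del tape  # release the tape's resources once we are done with it\n",
 "```"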
611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": { 617 | "attributes": { 618 | "classes": [ 619 | "py" 620 | ], 621 | "id": "" 622 | }, 623 | "id": "7g1yWiSXqEf-" 624 | }, 625 | "outputs": [], 626 | "source": [ 627 | "### Function minimization with automatic differentiation and SGD ###\n", 628 | "\n", 629 | "# Initialize a random value for our initial x\n", 630 | "x = tf.Variable([tf.random.normal([1])])\n", 631 | "print(\"Initializing x={}\".format(x.numpy()))\n", 632 | "\n", 633 | "learning_rate = 1e-2 # learning rate for SGD\n", 634 | "history = []\n", 635 | "# Define the target value\n", 636 | "x_f = 4\n", 637 | "\n", 638 | "# We will run SGD for a number of iterations. At each iteration, we compute the loss,\n", 639 | "# compute the derivative of the loss with respect to x, and perform the SGD update.\n", 640 | "for i in range(500):\n", 641 | " with tf.GradientTape() as tape:\n", 642 | " '''TODO: define the loss as described above'''\n", 643 | " loss = # TODO\n", 644 | "\n", 645 | " # loss minimization using gradient tape\n", 646 | " grad = tape.gradient(loss, x) # compute the derivative of the loss with respect to x\n", 647 | " new_x = x - learning_rate*grad # sgd update\n", 648 | " x.assign(new_x) # update the value of x\n", 649 | " history.append(x.numpy()[0])\n", 650 | "\n", 651 | "# Plot the evolution of x as we optimize towards x_f!\n", 652 | "plt.plot(history)\n", 653 | "plt.plot([0, 500],[x_f,x_f])\n", 654 | "plt.legend(('Predicted', 'True'))\n", 655 | "plt.xlabel('Iteration')\n", 656 | "plt.ylabel('x value')" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": { 662 | "id": "pC7czCwk3ceH" 663 | }, 664 | "source": [ 665 | "`GradientTape` provides an extremely flexible framework for automatic differentiation. 
In order to backpropagate errors through a neural network, we track forward passes on the Tape, use this information to determine the gradients, and then use these gradients for optimization using SGD.\n"
666 | ]
667 | }
668 | ],
669 | "metadata": {
670 | "accelerator": "GPU",
671 | "colab": {
672 | "collapsed_sections": [
673 | "WBk0ZDWY-ff8"
674 | ],
675 | "name": "TF_Part1_Intro.ipynb",
676 | "provenance": []
677 | },
678 | "kernelspec": {
679 | "display_name": "Python 3",
680 | "language": "python",
681 | "name": "python3"
682 | },
683 | "language_info": {
684 | "name": "python",
685 | "version": "3.9.6"
686 | },
687 | "vscode": {
688 | "interpreter": {
689 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
690 | }
691 | }
692 | },
693 | "nbformat": 4,
694 | "nbformat_minor": 0
695 | }
696 |
-------------------------------------------------------------------------------- /lab1/img/add-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab1/img/add-graph.png -------------------------------------------------------------------------------- /lab1/img/computation-graph-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab1/img/computation-graph-2.png -------------------------------------------------------------------------------- /lab1/img/computation-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab1/img/computation-graph.png -------------------------------------------------------------------------------- /lab1/img/lab1ngram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab1/img/lab1ngram.png -------------------------------------------------------------------------------- /lab1/img/lstm_inference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab1/img/lstm_inference.png -------------------------------------------------------------------------------- /lab1/img/lstm_unrolled-01-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab1/img/lstm_unrolled-01-01.png -------------------------------------------------------------------------------- /lab1/img/lstm_unrolled-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab1/img/lstm_unrolled-01.png -------------------------------------------------------------------------------- /lab1/img/lstm_unrolled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab1/img/lstm_unrolled.png
-------------------------------------------------------------------------------- /lab1/img/music_waveform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab1/img/music_waveform.png -------------------------------------------------------------------------------- /lab1/solutions/TF_Part1_Intro_Solution.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "WBk0ZDWY-ff8"
7 | },
8 | "source": [
9 | "<table align=\"center\">\n",
10 | "  <td align=\"center\"><a target=\"_blank\" href=\"http://introtodeeplearning.com\">\n",
11 | "      Visit MIT Deep Learning</a></td>\n",
12 | "  <td align=\"center\"><a target=\"_blank\" href=\"https://colab.research.google.com/github/MITDeepLearning/introtodeeplearning/blob/master/lab1/solutions/TF_Part1_Intro_Solution.ipynb\">\n",
13 | "      Run in Google Colab</a></td>\n",
14 | "  <td align=\"center\"><a target=\"_blank\" href=\"https://github.com/MITDeepLearning/introtodeeplearning/blob/master/lab1/solutions/TF_Part1_Intro_Solution.ipynb\">\n",
15 | "      View Source on GitHub</a></td>\n",
16 | "</table>\n",
17 | "\n",
18 | "\n",
19 | "# Copyright Information\n"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {
26 | "id": "3eI6DUic-6jo"
27 | },
28 | "outputs": [],
29 | "source": [
30 | "# Copyright 2025 MIT Introduction to Deep Learning. All Rights Reserved.\n",
31 | "#\n",
32 | "# Licensed under the MIT License. You may not use this file except in compliance\n",
33 | "# with the License. Use and/or modification of this code outside of MIT Introduction\n",
34 | "# to Deep Learning must reference:\n",
35 | "#\n",
36 | "# © MIT Introduction to Deep Learning\n",
37 | "# http://introtodeeplearning.com\n",
38 | "#"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "57knM8jrYZ2t"
45 | },
46 | "source": [
47 | "# Lab 1: Intro to TensorFlow and Music Generation with RNNs\n",
48 | "\n",
49 | "In this lab, you'll get exposure to using TensorFlow and learn how it can be used for solving deep learning tasks. Go through the code and run each cell. Along the way, you'll encounter several ***TODO*** blocks -- follow the instructions to fill them out before running those cells and continuing.\n",
50 | "\n",
51 | "\n",
52 | "# Part 1: Intro to TensorFlow\n",
53 | "\n",
54 | "## 0.1 Install TensorFlow\n",
55 | "\n",
56 | "TensorFlow is a software library extensively used in machine learning. Here we'll learn how computations are represented and how to define a simple neural network in TensorFlow. For all the TensorFlow labs in Introduction to Deep Learning 2025, we'll be using TensorFlow 2, which affords great flexibility and the ability to imperatively execute operations, just like in Python. You'll notice that TensorFlow 2 is quite similar to Python in its syntax and imperative execution. Let's install TensorFlow and a couple of dependencies.\n"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "id": "LkaimNJfYZ2w"
64 | },
65 | "outputs": [],
66 | "source": [
67 | "import tensorflow as tf\n",
68 | "\n",
69 | "# Download and import the MIT Introduction to Deep Learning package\n",
70 | "!pip install mitdeeplearning --quiet\n",
71 | "import mitdeeplearning as mdl\n",
72 | "\n",
73 | "import numpy as np\n",
74 | "import matplotlib.pyplot as plt"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {
80 | "id": "2QNMcdP4m3Vs"
81 | },
82 | "source": [
83 | "## 1.1 Why is TensorFlow called TensorFlow?\n",
84 | "\n",
85 | "TensorFlow is called 'TensorFlow' because it handles the flow of Tensors through nodes (mathematical operations). Tensors are data structures that you can think of as multi-dimensional arrays: they are represented as n-dimensional arrays of base datatypes such as a string or integer -- they provide a way to generalize vectors and matrices to higher dimensions.\n",
86 | "\n",
87 | "The ```shape``` of a Tensor defines its number of dimensions and the size of each dimension. 
The ```rank``` of a Tensor provides the number of dimensions (n-dimensions) -- you can also think of this as the Tensor's order or degree.\n",
88 | "\n",
89 | "Let's first look at 0-d Tensors, of which a scalar is an example:"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "id": "tFxztZQInlAB"
97 | },
98 | "outputs": [],
99 | "source": [
100 | "sport = tf.constant(\"Tennis\", tf.string)\n",
101 | "number = tf.constant(1.41421356237, tf.float64)\n",
102 | "\n",
103 | "print(\"`sport` is a {}-d Tensor\".format(tf.rank(sport).numpy()))\n",
104 | "print(\"`number` is a {}-d Tensor\".format(tf.rank(number).numpy()))"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {
110 | "id": "-dljcPUcoJZ6"
111 | },
112 | "source": [
113 | "Vectors and lists can be used to create 1-d Tensors:"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {
120 | "id": "oaHXABe8oPcO"
121 | },
122 | "outputs": [],
123 | "source": [
124 | "sports = tf.constant([\"Tennis\", \"Basketball\"], tf.string)\n",
125 | "numbers = tf.constant([3.141592, 1.414213, 2.71821], tf.float64)\n",
126 | "\n",
127 | "print(\"`sports` is a {}-d Tensor with shape: {}\".format(tf.rank(sports).numpy(), tf.shape(sports)))\n",
128 | "print(\"`numbers` is a {}-d Tensor with shape: {}\".format(tf.rank(numbers).numpy(), tf.shape(numbers)))"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {
134 | "id": "gvffwkvtodLP"
135 | },
136 | "source": [
137 | "Next we consider creating 2-d (i.e., matrices) and higher-rank Tensors. For example, in future labs involving image processing and computer vision, we will use 4-d Tensors. Here the dimensions correspond to the number of example images in our batch, image height, image width, and the number of color channels."
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "id": "tFeBBe1IouS3"
145 | },
146 | "outputs": [],
147 | "source": [
148 | "### Defining higher-order Tensors ###\n",
149 | "\n",
150 | "'''TODO: Define a 2-d Tensor'''\n",
151 | "matrix = tf.constant([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]]) # TODO\n",
152 | "# matrix = # TODO\n",
153 | "\n",
154 | "assert isinstance(matrix, tf.Tensor), \"matrix must be a tf Tensor object\"\n",
155 | "assert tf.rank(matrix).numpy() == 2"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "id": "Zv1fTn_Ya_cz"
163 | },
164 | "outputs": [],
165 | "source": [
166 | "'''TODO: Define a 4-d Tensor.'''\n",
167 | "# Use tf.zeros to initialize a 4-d Tensor of zeros with size 10 x 256 x 256 x 3.\n",
168 | "# You can think of this as 10 images where each image is RGB 256 x 256.\n",
169 | "images = tf.zeros([10, 256, 256, 3]) # TODO\n",
170 | "# images = # TODO\n",
171 | "\n",
172 | "assert isinstance(images, tf.Tensor), \"images must be a tf Tensor object\"\n",
173 | "assert tf.rank(images).numpy() == 4, \"images must be of rank 4\"\n",
174 | "assert tf.shape(images).numpy().tolist() == [10, 256, 256, 3], \"images has incorrect shape\""
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {
180 | "id": "wkaCDOGapMyl"
181 | },
182 | "source": [
183 | "As you have seen, the ```shape``` of a Tensor provides the number of elements in each Tensor dimension. The ```shape``` is quite useful, and we'll use it often. 
You can also use slicing to access subtensors within a higher-rank Tensor:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "id": "FhaufyObuLEG" 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "row_vector = matrix[1]\n", 195 | "column_vector = matrix[:,1]\n", 196 | "scalar = matrix[0, 1]\n", 197 | "\n", 198 | "print(\"`row_vector`: {}\".format(row_vector.numpy()))\n", 199 | "print(\"`column_vector`: {}\".format(column_vector.numpy()))\n", 200 | "print(\"`scalar`: {}\".format(scalar.numpy()))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "id": "iD3VO-LZYZ2z" 207 | }, 208 | "source": [ 209 | "## 1.2 Computations on Tensors\n", 210 | "\n", 211 | "A convenient way to think about and visualize computations in TensorFlow is in terms of graphs. We can define this graph in terms of Tensors, which hold data, and the mathematical operations that act on these Tensors in some order. Let's look at a simple example, and define this computation using TensorFlow:\n", 212 | "\n", 213 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2025/lab1/img/add-graph.png)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "id": "X_YJrZsxYZ2z" 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "# Create the nodes in the graph, and initialize values\n", 225 | "a = tf.constant(15)\n", 226 | "b = tf.constant(61)\n", 227 | "\n", 228 | "# Add them!\n", 229 | "c1 = tf.add(a,b)\n", 230 | "c2 = a + b # TensorFlow overrides the \"+\" operation so that it is able to act on Tensors\n", 231 | "print(c1)\n", 232 | "print(c2)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "id": "Mbfv_QOiYZ23" 239 | }, 240 | "source": [ 241 | "Notice how we've created a computation graph consisting of TensorFlow operations, and how the output is a Tensor with value 76 -- we've just created a computation graph consisting of operations, and it's executed them and given us back the result.\n", 242 | "\n", 243 | "Now let's consider a slightly more complicated example:\n", 244 | "\n", 245 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2025/lab1/img/computation-graph.png)\n", 246 | "\n", 247 | "Here, we take two inputs, `a, b`, and compute an output `e`. 
Each node in the graph represents an operation that takes some input, does some computation, and passes its output to another node.\n",
248 | "\n",
249 | "Let's define a simple function in TensorFlow to construct this computation graph:"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {
256 | "id": "PJnfzpWyYZ23",
257 | "scrolled": true
258 | },
259 | "outputs": [],
260 | "source": [
261 | "### Defining Tensor computations ###\n",
262 | "\n",
263 | "# Construct a simple computation function\n",
264 | "def func(a,b):\n",
265 | " '''TODO: Define the operation for c, d, e (use tf.add, tf.subtract, tf.multiply).'''\n",
266 | " c = tf.add(a, b)\n",
267 | " # c = # TODO\n",
268 | " d = tf.subtract(b, 1)\n",
269 | " # d = # TODO\n",
270 | " e = tf.multiply(c, d)\n",
271 | " # e = # TODO\n",
272 | " return e"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {
278 | "id": "AwrRfDMS2-oy"
279 | },
280 | "source": [
281 | "Now, we can call this function to execute the computation graph given some inputs `a,b`:"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "metadata": {
288 | "id": "pnwsf8w2uF7p"
289 | },
290 | "outputs": [],
291 | "source": [
292 | "# Consider example values for a,b\n",
293 | "a, b = 1.5, 2.5\n",
294 | "# Execute the computation\n",
295 | "e_out = func(a,b)\n",
296 | "print(e_out)"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {
302 | "id": "6HqgUIUhYZ29"
303 | },
304 | "source": [
305 | "Notice how our output is a Tensor with value defined by the output of the computation, and that the output has no shape as it is a single scalar value."
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {
311 | "id": "1h4o9Bb0YZ29"
312 | },
313 | "source": [
314 | "## 1.3 Neural networks in TensorFlow\n",
315 | "We can also define neural networks in TensorFlow. TensorFlow uses a high-level API called [Keras](https://www.tensorflow.org/guide/keras) that provides a powerful, intuitive framework for building and training deep learning models.\n",
316 | "\n",
317 | "Let's first consider the example of a simple perceptron defined by just one dense layer: $ y = \sigma(Wx + b)$, where $W$ represents a matrix of weights, $b$ is a bias, $x$ is the input, $\sigma$ is the sigmoid activation function, and $y$ is the output. We can also visualize this operation using a graph:\n",
318 | "\n",
319 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2025/lab1/img/computation-graph-2.png)\n",
320 | "\n",
321 | "Tensors can flow through abstract types called [```Layers```](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) -- the building blocks of neural networks. ```Layers``` implement common neural network operations, and are used to update weights, compute losses, and define inter-layer connectivity. We will first define a ```Layer``` to implement the simple perceptron defined above."
322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": { 328 | "id": "HutbJk-1kHPh" 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "### Defining a network Layer ###\n", 333 | "\n", 334 | "# n_output_nodes: number of output nodes\n", 335 | "# input_shape: shape of the input\n", 336 | "# x: input to the layer\n", 337 | "\n", 338 | "class OurDenseLayer(tf.keras.layers.Layer):\n", 339 | " def __init__(self, n_output_nodes):\n", 340 | " super(OurDenseLayer, self).__init__()\n", 341 | " self.n_output_nodes = n_output_nodes\n", 342 | "\n", 343 | " def build(self, input_shape):\n", 344 | " d = int(input_shape[-1])\n", 345 | " # Define and initialize parameters: a weight matrix W and bias b\n", 346 | " # Note that parameter initialization is random!\n", 347 | " self.W = self.add_weight(\"weight\", shape=[d, self.n_output_nodes]) # note the dimensionality\n", 348 | " self.b = self.add_weight(\"bias\", shape=[1, self.n_output_nodes]) # note the dimensionality\n", 349 | "\n", 350 | " def call(self, x):\n", 351 | " '''TODO: define the operation for z (hint: use tf.matmul)'''\n", 352 | " z = tf.matmul(x, self.W) + self.b # TODO\n", 353 | " # z = # TODO\n", 354 | "\n", 355 | " '''TODO: define the operation for out (hint: use tf.sigmoid)'''\n", 356 | " y = tf.sigmoid(z) # TODO\n", 357 | " # y = # TODO\n", 358 | " return y\n", 359 | "\n", 360 | "# Since layer parameters are initialized randomly, we will set a random seed for reproducibility\n", 361 | "tf.keras.utils.set_random_seed(1)\n", 362 | "layer = OurDenseLayer(3)\n", 363 | "layer.build((1,2))\n", 364 | "x_input = tf.constant([[1,2.]], shape=(1,2))\n", 365 | "y = layer.call(x_input)\n", 366 | "\n", 367 | "# test the output!\n", 368 | "print(y.numpy())\n", 369 | "mdl.lab1.test_custom_dense_layer_output(y)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": { 375 | "id": "Jt1FgM7qYZ3D" 376 | }, 377 | "source": [ 378 | "Conveniently, TensorFlow has defined a number of ```Layers``` that are commonly used in neural networks, for example a [```Dense```](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense?version=stable). Now, instead of using a single ```Layer``` to define our simple neural network, we'll use the [`Sequential`](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Sequential) model from Keras and a single [`Dense` ](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/layers/Dense) layer to define our network. With the `Sequential` API, you can readily create neural networks by stacking together layers like building blocks." 
379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "id": "7WXTpmoL6TDz" 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "### Defining a neural network using the Sequential API ###\n", 390 | "\n", 391 | "# Import relevant packages\n", 392 | "from tensorflow.keras import Sequential\n", 393 | "from tensorflow.keras.layers import Dense\n", 394 | "\n", 395 | "# Define the number of outputs\n", 396 | "n_output_nodes = 3\n", 397 | "\n", 398 | "# First define the model\n", 399 | "model = Sequential()\n", 400 | "\n", 401 | "'''TODO: Define a dense (fully connected) layer to compute z'''\n", 402 | "# Remember: dense layers are defined by the parameters W and b!\n", 403 | "# You can read more about the initialization of W and b in the TF documentation :)\n", 404 | "# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense?version=stable\n", 405 | "dense_layer = Dense(n_output_nodes, activation='sigmoid') # TODO\n", 406 | "# dense_layer = # TODO\n", 407 | "\n", 408 | "# Add the dense layer to the model\n", 409 | "model.add(dense_layer)\n" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": { 415 | "id": "HDGcwYfUyR-U" 416 | }, 417 | "source": [ 418 | "That's it! We've defined our model using the Sequential API. Now, we can test it out using an example input:" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "id": "sg23OczByRDb" 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "# Test model with example input\n", 430 | "x_input = tf.constant([[1,2.]], shape=(1,2))\n", 431 | "\n", 432 | "'''TODO: feed input into the model and predict the output!'''\n", 433 | "model_output = model(x_input).numpy()\n", 434 | "# model_output = # TODO\n", 435 | "print(model_output)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": { 441 | "id": "596NvsOOtr9F" 442 | }, 443 | "source": [ 444 | "In addition to defining models using the `Sequential` API, we can also define neural networks by directly subclassing the [`Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model?version=stable) class, which groups layers together to enable model training and inference. The `Model` class captures what we refer to as a \"model\" or as a \"network\". Using Subclassing, we can create a class for our model, and then define the forward pass through the network using the `call` function. Subclassing affords the flexibility to define custom layers, custom training loops, custom activation functions, and custom models. Let's define the same neural network as above now using Subclassing rather than the `Sequential` model." 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "id": "K4aCflPVyViD" 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "### Defining a model using subclassing ###\n", 456 | "\n", 457 | "from tensorflow.keras import Model\n", 458 | "from tensorflow.keras.layers import Dense\n", 459 | "\n", 460 | "class SubclassModel(tf.keras.Model):\n", 461 | "\n", 462 | " # In __init__, we define the Model's layers\n", 463 | " def __init__(self, n_output_nodes):\n", 464 | " super(SubclassModel, self).__init__()\n", 465 | " '''TODO: Our model consists of a single Dense layer. 
Define this layer.'''\n", 466 | " self.dense_layer = Dense(n_output_nodes, activation='sigmoid') # TODO\n", 467 | " # self.dense_layer = '''TODO: Dense Layer'''\n", 468 | "\n", 469 | " # In the call function, we define the Model's forward pass.\n", 470 | " def call(self, inputs):\n", 471 | " return self.dense_layer(inputs)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": { 477 | "id": "U0-lwHDk4irB" 478 | }, 479 | "source": [ 480 | "Just like the model we built using the `Sequential` API, let's test out our `SubclassModel` using an example input.\n", 481 | "\n" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "id": "LhB34RA-4gXb" 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "n_output_nodes = 3\n", 493 | "model = SubclassModel(n_output_nodes)\n", 494 | "\n", 495 | "x_input = tf.constant([[1,2.]], shape=(1,2))\n", 496 | "\n", 497 | "print(model.call(x_input))" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "id": "HTIFMJLAzsyE" 504 | }, 505 | "source": [ 506 | "Importantly, Subclassing affords us a lot of flexibility to define custom models. For example, we can use boolean arguments in the `call` function to specify different network behaviors, for example different behaviors during training and inference. Let's suppose under some instances we want our network to simply output the input, without any perturbation. We define a boolean argument `isidentity` to control this behavior:" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": { 513 | "id": "P7jzGX5D1xT5" 514 | }, 515 | "outputs": [], 516 | "source": [ 517 | "### Defining a model using subclassing and specifying custom behavior ###\n", 518 | "\n", 519 | "from tensorflow.keras import Model\n", 520 | "from tensorflow.keras.layers import Dense\n", 521 | "\n", 522 | "class IdentityModel(tf.keras.Model):\n", 523 | "\n", 524 | " # As before, in __init__ we define the Model's layers\n", 525 | " # Since our desired behavior involves the forward pass, this part is unchanged\n", 526 | " def __init__(self, n_output_nodes):\n", 527 | " super(IdentityModel, self).__init__()\n", 528 | " self.dense_layer = tf.keras.layers.Dense(n_output_nodes, activation='sigmoid')\n", 529 | "\n", 530 | " '''TODO: Implement the behavior where the network outputs the input, unchanged, under control of the isidentity argument.'''\n", 531 | " def call(self, inputs, isidentity=False):\n", 532 | " x = self.dense_layer(inputs)\n", 533 | " if isidentity: # TODO\n", 534 | " return inputs # TODO\n", 535 | " return x\n", 536 | "\n", 537 | " # def call(self, inputs, isidentity=False):\n", 538 | " # TODO" 539 | ] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": { 544 | "id": "Ku4rcCGx5T3y" 545 | }, 546 | "source": [ 547 | "Let's test this behavior:" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": { 554 | "id": "NzC0mgbk5dp2" 555 | }, 556 | "outputs": [], 557 | "source": [ 558 | "n_output_nodes = 3\n", 559 | "model = IdentityModel(n_output_nodes)\n", 560 | "\n", 561 | "x_input = tf.constant([[1,2.]], shape=(1,2))\n", 562 | "'''TODO: pass the input into the model and call with and without the input identity option.'''\n", 563 | "out_activate = model.call(x_input) # TODO\n", 564 | "# out_activate = # TODO\n", 565 | "out_identity = model.call(x_input, isidentity=True) # TODO\n", 566 | "# out_identity = # TODO\n", 567 | "\n", 568 | 
"print(\"Network output with activation: {}; network identity output: {}\".format(out_activate.numpy(), out_identity.numpy()))" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": { 574 | "id": "7V1dEqdk6VI5" 575 | }, 576 | "source": [ 577 | "Now that we have learned how to define `Layers` as well as neural networks in TensorFlow using both the `Sequential` and Subclassing APIs, we're ready to turn our attention to how to actually implement network training with backpropagation." 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": { 583 | "id": "dQwDhKn8kbO2" 584 | }, 585 | "source": [ 586 | "## 1.4 Automatic differentiation in TensorFlow\n", 587 | "\n", 588 | "[Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)\n", 589 | "is one of the most important parts of TensorFlow and is the backbone of training with\n", 590 | "[backpropagation](https://en.wikipedia.org/wiki/Backpropagation). We will use the TensorFlow GradientTape [`tf.GradientTape`](https://www.tensorflow.org/api_docs/python/tf/GradientTape?version=stable) to trace operations for computing gradients later.\n", 591 | "\n", 592 | "When a forward pass is made through the network, all forward-pass operations get recorded to a \"tape\"; then, to compute the gradient, the tape is played backwards. By default, the tape is discarded after it is played backwards; this means that a particular `tf.GradientTape` can only\n", 593 | "compute one gradient, and subsequent calls throw a runtime error. However, we can compute multiple gradients over the same computation by creating a ```persistent``` gradient tape.\n", 594 | "\n", 595 | "First, we will look at how we can compute gradients using GradientTape and access them for computation. We define the simple function $ y = x^2$ and compute the gradient:" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": { 602 | "id": "tdkqk8pw5yJM" 603 | }, 604 | "outputs": [], 605 | "source": [ 606 | "### Gradient computation with GradientTape ###\n", 607 | "\n", 608 | "# y = x^2\n", 609 | "# Example: x = 3.0\n", 610 | "x = tf.Variable(3.0)\n", 611 | "\n", 612 | "# Initiate the gradient tape\n", 613 | "with tf.GradientTape() as tape:\n", 614 | " # Define the function\n", 615 | " y = x * x\n", 616 | "# Access the gradient -- derivative of y with respect to x\n", 617 | "dy_dx = tape.gradient(y, x)\n", 618 | "\n", 619 | "assert dy_dx.numpy() == 6.0" 620 | ] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "metadata": { 625 | "id": "JhU5metS5xF3" 626 | }, 627 | "source": [ 628 | "In training neural networks, we use differentiation and stochastic gradient descent (SGD) to optimize a loss function. Now that we have a sense of how `GradientTape` can be used to compute and access derivatives, we will look at an example where we use automatic differentiation and SGD to find the minimum of $L=(x-x_f)^2$. Here $x_f$ is a variable for a desired value we are trying to optimize for; $L$ represents a loss that we are trying to minimize. While we can clearly solve this problem analytically ($x_{min}=x_f$), considering how we can compute this using `GradientTape` sets us up nicely for future labs where we use gradient descent to optimize entire neural network losses." 
629 | ]
630 | },
631 | {
632 | "cell_type": "code",
633 | "execution_count": null,
634 | "metadata": {
635 | "attributes": {
636 | "classes": [
637 | "py"
638 | ],
639 | "id": ""
640 | },
641 | "id": "7g1yWiSXqEf-"
642 | },
643 | "outputs": [],
644 | "source": [
645 | "### Function minimization with automatic differentiation and SGD ###\n",
646 | "\n",
647 | "# Initialize a random value for our initial x\n",
648 | "x = tf.Variable([tf.random.normal([1])])\n",
649 | "print(\"Initializing x={}\".format(x.numpy()))\n",
650 | "\n",
651 | "learning_rate = 1e-2 # learning rate for SGD\n",
652 | "history = []\n",
653 | "# Define the target value\n",
654 | "x_f = 4\n",
655 | "\n",
656 | "# We will run SGD for a number of iterations. At each iteration, we compute the loss,\n",
657 | "# compute the derivative of the loss with respect to x, and perform the SGD update.\n",
658 | "for i in range(500):\n",
659 | " with tf.GradientTape() as tape:\n",
660 | " '''TODO: define the loss as described above'''\n",
661 | " loss = (x - x_f)**2 # \"forward pass\": record the current loss on the tape\n",
662 | " # loss = # TODO\n",
663 | "\n",
664 | " # loss minimization using gradient tape\n",
665 | " grad = tape.gradient(loss, x) # compute the derivative of the loss with respect to x\n",
666 | " new_x = x - learning_rate*grad # sgd update\n",
667 | " x.assign(new_x) # update the value of x\n",
668 | " history.append(x.numpy()[0])\n",
669 | "\n",
670 | "# Plot the evolution of x as we optimize towards x_f!\n",
671 | "plt.plot(history)\n",
672 | "plt.plot([0, 500],[x_f,x_f])\n",
673 | "plt.legend(('Predicted', 'True'))\n",
674 | "plt.xlabel('Iteration')\n",
675 | "plt.ylabel('x value')"
676 | ]
677 | },
678 | {
679 | "cell_type": "markdown",
680 | "metadata": {
681 | "id": "pC7czCwk3ceH"
682 | },
683 | "source": [
684 | "`GradientTape` provides an extremely flexible framework for automatic differentiation. In order to backpropagate errors through a neural network, we track forward passes on the Tape, use this information to determine the gradients, and then use these gradients for optimization using SGD.\n"
685 | ]
686 | }
687 | ],
688 | "metadata": {
689 | "accelerator": "GPU",
690 | "colab": {
691 | "collapsed_sections": [
692 | "WBk0ZDWY-ff8"
693 | ],
694 | "name": "TF_Part1_Intro_Solution.ipynb",
695 | "provenance": []
696 | },
697 | "kernelspec": {
698 | "display_name": "Python 3",
699 | "language": "python",
700 | "name": "python3"
701 | },
702 | "language_info": {
703 | "name": "python",
704 | "version": "3.9.6"
705 | },
706 | "vscode": {
707 | "interpreter": {
708 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
709 | }
710 | }
711 | },
712 | "nbformat": 4,
713 | "nbformat_minor": 0
714 | }
715 |
-------------------------------------------------------------------------------- /lab2/TF_Part1_MNIST.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "Xmf_JRJa_N8C"
7 | },
8 | "source": [
9 | "<table align=\"center\">\n",
10 | "  <td align=\"center\"><a target=\"_blank\" href=\"http://introtodeeplearning.com\">\n",
11 | "      Visit MIT Deep Learning</a></td>\n",
12 | "  <td align=\"center\"><a target=\"_blank\" href=\"https://colab.research.google.com/github/MITDeepLearning/introtodeeplearning/blob/master/lab2/TF_Part1_MNIST.ipynb\">\n",
13 | "      Run in Google Colab</a></td>\n",
14 | "  <td align=\"center\"><a target=\"_blank\" href=\"https://github.com/MITDeepLearning/introtodeeplearning/blob/master/lab2/TF_Part1_MNIST.ipynb\">\n",
15 | "      View Source on GitHub</a></td>\n",
16 | "</table>\n",
17 | "\n",
18 | "\n",
19 | "# Copyright Information"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {
26 | "id": "gKA_J7bdP33T"
27 | },
28 | "outputs": [],
29 | "source": [
30 | "# Copyright 2025 MIT Introduction to Deep Learning. All Rights Reserved.\n",
31 | "#\n",
32 | "# Licensed under the MIT License. You may not use this file except in compliance\n",
33 | "# with the License. Use and/or modification of this code outside of MIT Introduction\n",
34 | "# to Deep Learning must reference:\n",
35 | "#\n",
36 | "# © MIT Introduction to Deep Learning\n",
37 | "# http://introtodeeplearning.com\n",
38 | "#"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "id": "Cm1XpLftPi4A"
45 | },
46 | "source": [
47 | "# Laboratory 2: Computer Vision\n",
48 | "\n",
49 | "# Part 1: MNIST Digit Classification\n",
50 | "\n",
51 | "In the first portion of this lab, we will build and train a convolutional neural network (CNN) for classification of handwritten digits from the famous [MNIST](http://yann.lecun.com/exdb/mnist/) dataset. The MNIST dataset consists of 60,000 training images and 10,000 test images. Our classes are the digits 0-9.\n",
52 | "\n",
53 | "First, let's download the course repository, install dependencies, and import the relevant packages we'll need for this lab."
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {
60 | "id": "RsGqx_ai_N8F"
61 | },
62 | "outputs": [],
63 | "source": [
64 | "# Import TensorFlow 2.0\n",
65 | "# !pip install tensorflow\n",
66 | "import tensorflow as tf\n",
67 | "\n",
68 | "# MIT introduction to deep learning package\n",
69 | "!pip install mitdeeplearning --quiet\n",
70 | "import mitdeeplearning as mdl\n",
71 | "\n",
72 | "# other packages\n",
73 | "import matplotlib.pyplot as plt\n",
74 | "import numpy as np\n",
75 | "import random\n",
76 | "from tqdm import tqdm"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {
82 | "id": "nCpHDxX1bzyZ"
83 | },
84 | "source": [
85 | "We'll also install Comet. If you followed the instructions from Lab 1, you should have your Comet account set up. Enter your API key below."
86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "id": "GSR_PAqjbzyZ" 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "!pip install comet_ml > /dev/null 2>&1\n", 97 | "import comet_ml\n", 98 | "# TODO: ENTER YOUR API KEY HERE!!\n", 99 | "COMET_API_KEY = \"\"\n", 100 | "\n", 101 | "# Check that we are using a GPU, if not switch runtimes\n", 102 | "# using Runtime > Change Runtime Type > GPU\n", 103 | "assert len(tf.config.list_physical_devices('GPU')) > 0\n", 104 | "assert COMET_API_KEY != \"\", \"Please insert your Comet API Key\"" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "source": [ 110 | "# start a first comet experiment for the first part of the lab\n", 111 | "comet_ml.init(project_name=\"6S191_lab2_part1_NN\")\n", 112 | "comet_model_1 = comet_ml.Experiment()" 113 | ], 114 | "metadata": { 115 | "id": "wGPDtVxvTtPk" 116 | }, 117 | "execution_count": null, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | "id": "HKjrdUtX_N8J" 124 | }, 125 | "source": [ 126 | "## 1.1 MNIST dataset\n", 127 | "\n", 128 | "Let's download and load the dataset and display a few random samples from it:" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "id": "p2dQsHI3_N8K" 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "mnist = tf.keras.datasets.mnist\n", 140 | "(train_images, train_labels), (test_images, test_labels) = mnist.load_data()\n", 141 | "train_images = (np.expand_dims(train_images, axis=-1)/255.).astype(np.float32)\n", 142 | "train_labels = (train_labels).astype(np.int64)\n", 143 | "test_images = (np.expand_dims(test_images, axis=-1)/255.).astype(np.float32)\n", 144 | "test_labels = (test_labels).astype(np.int64)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "id": "5ZtUqOqePsRD" 151 | }, 152 | "source": [ 153 | "Our training set is made up of 28x28 grayscale images of handwritten digits.\n", 154 | "\n", 155 | "Let's visualize what some of these images and their corresponding training labels look like." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "id": "bDBsR2lP_N8O", 163 | "scrolled": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "plt.figure(figsize=(10,10))\n", 168 | "random_inds = np.random.choice(60000,36)\n", 169 | "for i in range(36):\n", 170 | " plt.subplot(6,6,i+1)\n", 171 | " plt.xticks([])\n", 172 | " plt.yticks([])\n", 173 | " plt.grid(False)\n", 174 | " image_ind = random_inds[i]\n", 175 | " plt.imshow(np.squeeze(train_images[image_ind]), cmap=plt.cm.binary)\n", 176 | " plt.xlabel(train_labels[image_ind])\n", 177 | "comet_model_1.log_figure(figure=plt)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "id": "V6hd3Nt1_N8q" 184 | }, 185 | "source": [ 186 | "## 1.2 Neural Network for Handwritten Digit Classification\n", 187 | "\n", 188 | "We'll first build a simple neural network consisting of two fully connected layers and apply this to the digit classification task. Our network will ultimately output a probability distribution over the 10 digit classes (0-9). 
This first architecture we will be building is depicted below:\n",
189 | "\n",
190 | "![alt_text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/master/lab2/img/mnist_2layers_arch.png \"Fully connected network architecture for MNIST classification\")\n"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {
196 | "id": "rphS2rMIymyZ"
197 | },
198 | "source": [
199 | "### Fully connected neural network architecture\n",
200 | "To define the architecture of this first fully connected neural network, we'll once again use the Keras API and define the model using the [`Sequential`](https://www.tensorflow.org/api_docs/python/tf/keras/models/Sequential) class. Note how we first use a [`Flatten`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Flatten) layer, which flattens the input so that it can be fed into the model.\n",
201 | "\n",
202 | "In this next block, you'll define the fully connected layers of this simple network."
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": null,
208 | "metadata": {
209 | "id": "MMZsbjAkDKpU"
210 | },
211 | "outputs": [],
212 | "source": [
213 | "def build_fc_model():\n",
214 | " fc_model = tf.keras.Sequential([\n",
215 | " # First define a Flatten layer\n",
216 | " tf.keras.layers.Flatten(),\n",
217 | "\n",
218 | " # '''TODO: Define the activation function for the first fully connected (Dense) layer.'''\n",
219 | " tf.keras.layers.Dense(128, activation= '''TODO'''),\n",
220 | "\n",
221 | " # '''TODO: Define the second Dense layer to output the classification probabilities'''\n",
222 | " '''[TODO Dense layer to output classification probabilities]'''\n",
223 | "\n",
224 | " ])\n",
225 | " return fc_model\n",
226 | "\n",
227 | "model = build_fc_model()"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {
233 | "id": "VtGZpHVKz5Jt"
234 | },
235 | "source": [
236 | "As we progress through this next portion, you may find that you'll want to make changes to the architecture defined above. **Note that in order to update the model later on, you'll need to re-run the above cell to re-initialize the model.**"
237 | ]
238 | },
239 | {
240 | "cell_type": "markdown",
241 | "metadata": {
242 | "id": "mVN1_AeG_N9N"
243 | },
244 | "source": [
245 | "Let's take a step back and think about the network we've just created. The first layer in this network, `tf.keras.layers.Flatten`, transforms the format of the images from a 2d-array (28 x 28 pixels) to a 1d-array of 28 * 28 = 784 pixels. You can think of this layer as unstacking rows of pixels in the image and lining them up. There are no learned parameters in this layer; it only reformats the data.\n",
246 | "\n",
247 | "After the pixels are flattened, the network consists of a sequence of two `tf.keras.layers.Dense` layers. These are fully-connected neural layers. The first `Dense` layer has 128 nodes (or neurons). The second (and last) layer (which you've defined!) should return an array of probability scores that sum to 1. Each node contains a score that indicates the probability that the current image belongs to one of the handwritten digit classes.\n",
248 | "\n",
249 | "That defines our fully connected model!"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {
255 | "id": "gut8A_7rCaW6"
256 | },
257 | "source": [
258 | "\n",
259 | "\n",
260 | "### Compile the model\n",
261 | "\n",
262 | "Before training the model, we need to define a few more settings. 
These are added during the model's [`compile`](https://www.tensorflow.org/api_docs/python/tf/keras/models/Sequential#compile) step:\n",
263 | "\n",
264 | "* *Loss function* — This defines how we measure how accurate the model is during training. As was covered in lecture, during training we want to minimize this function, which will \"steer\" the model in the right direction.\n",
265 | "* *Optimizer* — This defines how the model is updated based on the data it sees and its loss function.\n",
266 | "* *Metrics* — Here we can define metrics used to monitor the training and testing steps. In this example, we'll look at the *accuracy*, the fraction of the images that are correctly classified.\n",
267 | "\n",
268 | "We'll start out by using a stochastic gradient descent (SGD) optimizer initialized with a learning rate of 0.1. Since we are performing a categorical classification task, we'll want to use the [cross entropy loss](https://www.tensorflow.org/api_docs/python/tf/keras/metrics/sparse_categorical_crossentropy).\n",
269 | "\n",
270 | "You'll want to experiment with both the choice of optimizer and learning rate and evaluate how these affect the accuracy of the trained model."
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {
277 | "id": "Lhan11blCaW7"
278 | },
279 | "outputs": [],
280 | "source": [
281 | "'''TODO: Experiment with different optimizers and learning rates. How do these affect\n",
282 | " the accuracy of the trained model? Which optimizers and/or learning rates yield\n",
283 | " the best performance?'''\n",
284 | "model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=1e-1),\n",
285 | " loss='sparse_categorical_crossentropy',\n",
286 | " metrics=['accuracy'])"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {
292 | "id": "qKF6uW-BCaW-"
293 | },
294 | "source": [
295 | "### Train the model\n",
296 | "\n",
297 | "We're now ready to train our model, which will involve feeding the training data (`train_images` and `train_labels`) into the model, and then asking it to learn the associations between images and labels. We'll also need to define the batch size and the number of epochs, or iterations over the MNIST dataset, to use during training.\n",
298 | "\n",
299 | "In Lab 1, we saw how we can use `GradientTape` to optimize losses and train models with stochastic gradient descent. After defining the model settings in the `compile` step, we can also accomplish training by calling the [`fit`](https://www.tensorflow.org/api_docs/python/tf/keras/models/Sequential#fit) method on an instance of the `Model` class. We will use this to train our fully connected model.\n"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "id": "EFMbIqIvQ2X0"
307 | },
308 | "outputs": [],
309 | "source": [
310 | "# Define the batch size and the number of epochs to use during training\n",
311 | "BATCH_SIZE = 64\n",
312 | "EPOCHS = 5\n",
313 | "\n",
314 | "model.fit(train_images, train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS)\n",
315 | "comet_model_1.end()"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "metadata": {
321 | "id": "W3ZVOhugCaXA"
322 | },
323 | "source": [
324 | "As the model trains, the loss and accuracy metrics are displayed. With five epochs and a learning rate of 0.1, this fully connected model should achieve an accuracy of approximately 0.97 (or 97%) on the training data."
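, "\n",
 "If you want intuition for the loss itself: for a single example, sparse categorical cross entropy is just the negative log-probability the model assigns to the true class. A tiny hand-worked sketch (the numbers here are made up for illustration):\n",
 "\n",
 "```python\n",
 "import numpy as np\n",
 "probs = np.array([0.05, 0.05, 0.80, 0.10])  # hypothetical softmax output over 4 classes\n",
 "true_label = 2\n",
 "loss = -np.log(probs[true_label])  # ~0.223; approaches 0 as probs[true_label] -> 1\n",
 "```"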
325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "id": "oEw4bZgGCaXB" 331 | }, 332 | "source": [ 333 | "### Evaluate accuracy on the test dataset\n", 334 | "\n", 335 | "Now that we've trained the model, we can ask it to make predictions about a test set that it hasn't seen before. In this example, the `test_images` array comprises our test dataset. To evaluate accuracy, we can check to see if the model's predictions match the labels from the `test_labels` array.\n", 336 | "\n", 337 | "Use the [`evaluate`](https://www.tensorflow.org/api_docs/python/tf/keras/models/Sequential#evaluate) method to evaluate the model on the test dataset!" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "id": "VflXLEeECaXC" 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "'''TODO: Use the evaluate method to test the model!'''\n", 349 | "test_loss, test_acc = # TODO\n", 350 | "\n", 351 | "print('Test accuracy:', test_acc)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": { 357 | "id": "yWfgsmVXCaXG" 358 | }, 359 | "source": [ 360 | "You may observe that the accuracy on the test dataset is a little lower than the accuracy on the training dataset. This gap between training accuracy and test accuracy is an example of *overfitting*, when a machine learning model performs worse on new data than on its training data.\n", 361 | "\n", 362 | "What is the highest accuracy you can achieve with this first fully connected model? Since the handwritten digit classification task is pretty straightforward, you may be wondering how we can do better...\n", 363 | "\n", 364 | "![Deeper...](https://i.kym-cdn.com/photos/images/newsfeed/000/534/153/f87.jpg)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "id": "baIw9bDf8v6Z" 371 | }, 372 | "source": [ 373 | "## 1.3 Convolutional Neural Network (CNN) for handwritten digit classification" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": { 379 | "id": "_J72Yt1o_fY7" 380 | }, 381 | "source": [ 382 | "As we saw in lecture, convolutional neural networks (CNNs) are particularly well-suited for a variety of tasks in computer vision, and have achieved near-perfect accuracies on the MNIST dataset. We will now build a CNN composed of two convolutional layers and pooling layers, followed by two fully connected layers, and ultimately output a probability distribution over the 10 digit classes (0-9). The CNN we will be building is depicted below:\n", 383 | "\n", 384 | "![alt_text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/master/lab2/img/convnet_fig.png \"CNN Architecture for MNIST Classification\")" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": { 390 | "id": "EEHqzbJJAEoR" 391 | }, 392 | "source": [ 393 | "### Define the CNN model\n", 394 | "\n", 395 | "We'll use the same training and test datasets as before, and proceed similarly as our fully connected network to define and train our new CNN model. To do this we will explore two layers we have not encountered before: you can use [`keras.layers.Conv2D` ](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv2D) to define convolutional layers and [`keras.layers.MaxPool2D`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/MaxPool2D) to define the pooling layers. Use the parameters shown in the network architecture above to define these layers and build the CNN model." 
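, "\n",
 "If the call signatures are new to you, here is a generic shape check (the values below are deliberately *not* the parameters the figure specifies -- filling those in is the TODO):\n",
 "\n",
 "```python\n",
 "conv = tf.keras.layers.Conv2D(filters=8, kernel_size=(3, 3), activation='relu')\n",
 "pool = tf.keras.layers.MaxPool2D(pool_size=(2, 2))\n",
 "print(pool(conv(tf.zeros([1, 28, 28, 1]))).shape)  # (1, 13, 13, 8)\n",
 "```"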
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": null,
401 | "metadata": {
402 | "id": "vec9qcJs-9W5"
403 | },
404 | "outputs": [],
405 | "source": [
406 | "def build_cnn_model():\n",
407 | " cnn_model = tf.keras.Sequential([\n",
408 | "\n",
409 | " # TODO: Define the first convolutional layer\n",
410 | " tf.keras.layers.Conv2D('''TODO'''),\n",
411 | "\n",
412 | " # TODO: Define the first max pooling layer\n",
413 | " tf.keras.layers.MaxPool2D('''TODO'''),\n",
414 | "\n",
415 | " # TODO: Define the second convolutional layer\n",
416 | " tf.keras.layers.Conv2D('''TODO'''),\n",
417 | "\n",
418 | " # TODO: Define the second max pooling layer\n",
419 | " tf.keras.layers.MaxPool2D('''TODO'''),\n",
420 | "\n",
421 | " tf.keras.layers.Flatten(),\n",
422 | " tf.keras.layers.Dense(128, activation=tf.nn.relu),\n",
423 | "\n",
424 | " # TODO: Define the last Dense layer to output the classification\n",
425 | " # probabilities. Pay attention to the activation needed for a probability\n",
426 | " # output\n",
427 | " '''[TODO Dense layer to output classification probabilities]'''\n",
428 | " ])\n",
429 | "\n",
430 | " return cnn_model\n",
431 | "\n",
432 | "cnn_model = build_cnn_model()\n",
433 | "# Initialize the model by passing some data through\n",
434 | "cnn_model.predict(train_images[[0]])\n",
435 | "# Print the summary of the layers in the model.\n",
436 | "print(cnn_model.summary())"
437 | ]
438 | },
439 | {
440 | "cell_type": "markdown",
441 | "metadata": {
442 | "id": "kUAXIBynCih2"
443 | },
444 | "source": [
445 | "### Train and test the CNN model\n",
446 | "\n",
447 | "Now, as before, we can define the loss function, optimizer, and metrics through the `compile` method. Compile the CNN model with an optimizer and learning rate of choice:"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {
454 | "id": "vheyanDkCg6a"
455 | },
456 | "outputs": [],
457 | "source": [
458 | "comet_ml.init(project_name=\"6.s191lab2_part1_CNN\")\n",
459 | "comet_model_2 = comet_ml.Experiment()\n",
460 | "\n",
461 | "'''TODO: Define the compile operation with your optimizer and learning rate of choice'''\n",
462 | "cnn_model.compile(optimizer='''TODO''', loss='''TODO''', metrics=['accuracy']) # TODO"
463 | ]
464 | },
465 | {
466 | "cell_type": "markdown",
467 | "metadata": {
468 | "id": "U19bpRddC7H_"
469 | },
470 | "source": [
471 | "As was the case with the fully connected model, we can train our CNN using the `fit` method via the Keras API."
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "metadata": {
478 | "id": "YdrGZVmWDK4p"
479 | },
480 | "outputs": [],
481 | "source": [
482 | "'''TODO: Use model.fit to train the CNN model, with the same batch_size and number of epochs previously used.'''\n",
483 | "cnn_model.fit('''TODO''')\n",
484 | "# comet_model_2.end()"
485 | ]
486 | },
487 | {
488 | "cell_type": "markdown",
489 | "metadata": {
490 | "id": "pEszYWzgDeIc"
491 | },
492 | "source": [
493 | "Great! 
Now that we've trained the model, let's evaluate it on the test dataset using the [`evaluate`](https://www.tensorflow.org/api_docs/python/tf/keras/models/Sequential#evaluate) method:" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": { 500 | "id": "JDm4znZcDtNl" 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "'''TODO: Use the evaluate method to test the model!'''\n", 505 | "test_loss, test_acc = # TODO\n", 506 | "\n", 507 | "print('Test accuracy:', test_acc)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": { 513 | "id": "2rvEgK82Glv9" 514 | }, 515 | "source": [ 516 | "What is the highest accuracy you're able to achieve using the CNN model, and how does the accuracy of the CNN model compare to the accuracy of the simple fully connected network? What optimizers and learning rates seem to be optimal for training the CNN model?\n", 517 | "\n", 518 | "Feel free to click the Comet links to investigate the training/accuracy curves for your model." 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": { 524 | "id": "xsoS7CPDCaXH" 525 | }, 526 | "source": [ 527 | "### Make predictions with the CNN model\n", 528 | "\n", 529 | "With the model trained, we can use it to make predictions about some images. The [`predict`](https://www.tensorflow.org/api_docs/python/tf/keras/models/Sequential#predict) function call generates the output predictions given a set of input samples.\n" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": { 536 | "id": "Gl91RPhdCaXI" 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "predictions = cnn_model.predict(test_images)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": { 546 | "id": "x9Kk1voUCaXJ" 547 | }, 548 | "source": [ 549 | "With this function call, the model has predicted the label for each image in the testing set. Let's take a look at the prediction for the first image in the test dataset:" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": { 556 | "id": "3DmJEUinCaXK" 557 | }, 558 | "outputs": [], 559 | "source": [ 560 | "predictions[0]" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": { 566 | "id": "-hw1hgeSCaXN" 567 | }, 568 | "source": [ 569 | "As you can see, a prediction is an array of 10 numbers. Recall that the output of our model is a probability distribution over the 10 digit classes. Thus, these numbers describe the model's \"confidence\" that the image corresponds to each of the 10 different digits.\n", 570 | "\n", 571 | "Let's look at the digit that has the highest confidence for the first image in the test dataset:" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": null, 577 | "metadata": { 578 | "id": "qsqenuPnCaXO" 579 | }, 580 | "outputs": [], 581 | "source": [ 582 | "'''TODO: identify the digit with the highest confidence prediction for the first\n", 583 | " image in the test dataset. '''\n", 584 | "prediction = # TODO\n", 585 | "\n", 586 | "print(prediction)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "markdown", 591 | "metadata": { 592 | "id": "E51yS7iCCaXO" 593 | }, 594 | "source": [ 595 | "So, the model is most confident that this image is a \"???\". 
We can check the test label (remember, this is the true identity of the digit) to see if this prediction is correct:" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": { 602 | "id": "Sd7Pgsu6CaXP" 603 | }, 604 | "outputs": [], 605 | "source": [ 606 | "print(\"Label of this digit is:\", test_labels[0])\n", 607 | "plt.imshow(test_images[0,:,:,0], cmap=plt.cm.binary)\n", 608 | "comet_model_2.log_figure(figure=plt)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": { 614 | "id": "ygh2yYC972ne" 615 | }, 616 | "source": [ 617 | "It is! Let's visualize the classification results on the MNIST dataset. We will plot images from the test dataset along with their predicted label, as well as a histogram that provides the prediction probabilities for each of the digits:" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": { 624 | "id": "HV5jw-5HwSmO" 625 | }, 626 | "outputs": [], 627 | "source": [ 628 | "#@title Change the slider to look at the model's predictions! { run: \"auto\" }\n", 629 | "\n", 630 | "image_index = 79 #@param {type:\"slider\", min:0, max:100, step:1}\n", 631 | "plt.subplot(1,2,1)\n", 632 | "mdl.lab2.plot_image_prediction(image_index, predictions, test_labels, test_images)\n", 633 | "plt.subplot(1,2,2)\n", 634 | "mdl.lab2.plot_value_prediction(image_index, predictions, test_labels)\n", 635 | "comet_model_2.log_figure(figure=plt)" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": { 641 | "id": "kgdvGD52CaXR" 642 | }, 643 | "source": [ 644 | "We can also plot several images along with their predictions, where correct prediction labels are blue and incorrect prediction labels are red. The number gives the percent confidence (out of 100) for the predicted label. Note the model can be very confident in an incorrect prediction!" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": null, 650 | "metadata": { 651 | "id": "hQlnbqaw2Qu_" 652 | }, 653 | "outputs": [], 654 | "source": [ 655 | "# Plots the first X test images, their predicted label, and the true label\n", 656 | "# Color correct predictions in blue, incorrect predictions in red\n", 657 | "num_rows = 5\n", 658 | "num_cols = 4\n", 659 | "num_images = num_rows*num_cols\n", 660 | "plt.figure(figsize=(2*2*num_cols, 2*num_rows))\n", 661 | "for i in range(num_images):\n", 662 | " plt.subplot(num_rows, 2*num_cols, 2*i+1)\n", 663 | " mdl.lab2.plot_image_prediction(i, predictions, test_labels, test_images)\n", 664 | " plt.subplot(num_rows, 2*num_cols, 2*i+2)\n", 665 | " mdl.lab2.plot_value_prediction(i, predictions, test_labels)\n", 666 | "comet_model_2.log_figure(figure=plt)\n", 667 | "comet_model_2.end()\n" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": { 673 | "id": "k-2glsRiMdqa" 674 | }, 675 | "source": [ 676 | "## 1.4 Training the model 2.0\n", 677 | "\n", 678 | "Earlier in the lab, we used the [`fit`](https://www.tensorflow.org/api_docs/python/tf/keras/models/Sequential#fit) function call to train the model. This function is quite high-level and intuitive, which is really useful for simpler models.
As you may be able to tell, this function abstracts away many details of the training call, leaving us with less control over training; finer-grained control could be useful in other contexts.\n", 679 | "\n", 680 | "As an alternative to this, we can use the [`tf.GradientTape`](https://www.tensorflow.org/api_docs/python/tf/GradientTape) class to record differentiation operations during training, and then call the [`tf.GradientTape.gradient`](https://www.tensorflow.org/api_docs/python/tf/GradientTape#gradient) function to actually compute the gradients. You may recall seeing this in Lab 1 Part 1, but let's take another look at this here.\n", 681 | "\n", 682 | "We'll use this framework to train our `cnn_model` using stochastic gradient descent." 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "metadata": { 689 | "id": "Wq34id-iN1Ml" 690 | }, 691 | "outputs": [], 692 | "source": [ 693 | "# Rebuild the CNN model\n", 694 | "cnn_model = build_cnn_model()\n", 695 | "\n", 696 | "batch_size = 12\n", 697 | "loss_history = mdl.util.LossHistory(smoothing_factor=0.95) # to record the evolution of the loss\n", 698 | "plotter = mdl.util.PeriodicPlotter(sec=2, xlabel='Iterations', ylabel='Loss', scale='semilogy')\n", 699 | "optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2) # define our optimizer\n", 700 | "\n", 701 | "comet_ml.init(project_name=\"6.s191lab2_part1_CNN2\")\n", 702 | "comet_model_3 = comet_ml.Experiment()\n", 703 | "\n", 704 | "if hasattr(tqdm, '_instances'): tqdm._instances.clear() # clear if it exists\n", 705 | "\n", 706 | "for idx in tqdm(range(0, train_images.shape[0], batch_size)):\n", 707 | " # First grab a batch of training data and convert the input images to tensors\n", 708 | " (images, labels) = (train_images[idx:idx+batch_size], train_labels[idx:idx+batch_size])\n", 709 | " images = tf.convert_to_tensor(images, dtype=tf.float32)\n", 710 | "\n", 711 | " # GradientTape to record differentiation operations\n", 712 | " with tf.GradientTape() as tape:\n", 713 | " #'''TODO: feed the images into the model and obtain the predictions'''\n", 714 | " logits = # TODO\n", 715 | "\n", 716 | " #'''TODO: compute the categorical cross entropy loss'''\n", 717 | " loss_value = tf.keras.backend.sparse_categorical_crossentropy('''TODO''', '''TODO''') # TODO\n", 718 | " comet_model_3.log_metric(\"loss\", loss_value.numpy().mean(), step=idx)\n", 719 | "\n", 720 | " loss_history.append(loss_value.numpy().mean()) # append the loss to the loss_history record\n", 721 | " plotter.plot(loss_history.get())\n", 722 | "\n", 723 | " # Backpropagation\n", 724 | " '''TODO: Use the tape to compute the gradient against all parameters in the CNN model.\n", 725 | " Use cnn_model.trainable_variables to access these parameters.'''\n", 726 | " grads = # TODO\n", 727 | " optimizer.apply_gradients(zip(grads, cnn_model.trainable_variables))\n", 728 | "\n", 729 | "comet_model_3.log_figure(figure=plt)\n", 730 | "comet_model_3.end()\n" 731 | ] 732 | }, 733 | { 734 | "cell_type": "markdown", 735 | "metadata": { 736 | "id": "3cNtDhVaqEdR" 737 | }, 738 | "source": [ 739 | "## 1.5 Conclusion\n", 740 | "In this part of the lab, you had the chance to play with MNIST classifiers of different architectures (fully connected layers only, CNN) and to experiment with how hyperparameters such as the learning rate affect accuracy. The next part of the lab explores another application of CNNs, facial detection, and some drawbacks of AI systems in real-world applications, like issues of bias."
741 | ] 742 | } 743 | ], 744 | "metadata": { 745 | "accelerator": "GPU", 746 | "colab": { 747 | "collapsed_sections": [ 748 | "Xmf_JRJa_N8C" 749 | ], 750 | "name": "TF_Part1_MNIST.ipynb", 751 | "provenance": [] 752 | }, 753 | "kernelspec": { 754 | "display_name": "Python 3", 755 | "name": "python3" 756 | }, 757 | "language_info": { 758 | "codemirror_mode": { 759 | "name": "ipython", 760 | "version": 3 761 | }, 762 | "file_extension": ".py", 763 | "mimetype": "text/x-python", 764 | "name": "python", 765 | "nbconvert_exporter": "python", 766 | "pygments_lexer": "ipython3", 767 | "version": "3.9.6" 768 | } 769 | }, 770 | "nbformat": 4, 771 | "nbformat_minor": 0 772 | } 773 | -------------------------------------------------------------------------------- /lab2/img/DB-VAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab2/img/DB-VAE.png -------------------------------------------------------------------------------- /lab2/img/SS-VAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab2/img/SS-VAE.png -------------------------------------------------------------------------------- /lab2/img/convnet_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab2/img/convnet_fig.png -------------------------------------------------------------------------------- /lab2/img/mnist_2layers_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab2/img/mnist_2layers_arch.png -------------------------------------------------------------------------------- /lab2/img/mnist_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab2/img/mnist_model.png -------------------------------------------------------------------------------- /lab3/README.md: -------------------------------------------------------------------------------- 1 | # MIT 6.S191 Lab 3: Fine-Tune an LLM, You Must! 2 | 3 | ![yoda](https://github.com/MITDeepLearning/introtodeeplearning/raw/2025/lab3/img/yoda_wallpaper.jpg) 4 | In this lab, you will fine-tune a multi-billion parameter large language model (LLM). We will go through several fundamental concepts of LLMs, including tokenization, templates, and fine-tuning. This lab provides a complete pipeline for fine-tuning a language model to generate responses in a specific style, and you will explore not only language model fine-tuning, but also ways to evaluate the performance of a language model. 5 | 6 | You will use Google's [Gemma 2B](https://huggingface.co/google/gemma-2b-it) model as the base language model to fine-tune; [Liquid AI's](https://www.liquid.ai/) [LFM-40B](https://www.liquid.ai/liquid-foundation-models) as an evaluation "judge" model; and Comet ML's [Opik](https://www.comet.com/site/products/opik/) as a framework for streamlined LLM evaluation. 
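As a quick, illustrative preview of the tokenization and templating steps (a minimal sketch, not the lab's actual pipeline -- the message content below is made up, and loading Gemma assumes you have accepted its license on Hugging Face):

```python
from transformers import AutoTokenizer

# Load the tokenizer of the base model used in this lab
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")

# Render a single-turn conversation through the model's chat template
messages = [{"role": "user", "content": "Speak like Yoda, you must. Why?"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Inspect how the rendered prompt is split into tokens
ids = tokenizer(prompt, return_tensors="pt").input_ids
print(tokenizer.convert_ids_to_tokens(ids[0].tolist())[:12])
```

The lab notebook walks through each of these pieces in much more detail.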
-------------------------------------------------------------------------------- /lab3/img/yoda_wallpaper.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/lab3/img/yoda_wallpaper.jpg -------------------------------------------------------------------------------- /mitdeeplearning/__init__.py: -------------------------------------------------------------------------------- 1 | import mitdeeplearning.util 2 | 3 | import mitdeeplearning.lab1 4 | import mitdeeplearning.lab2 5 | import mitdeeplearning.lab3 6 | import mitdeeplearning.lab3_old 7 | -------------------------------------------------------------------------------- /mitdeeplearning/bin/abc2wav: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | abcfile=$1 4 | suffix=${abcfile%.abc} 5 | abc2midi $abcfile -o "$suffix.mid" 6 | timidity "$suffix.mid" -Ow "$suffix.wav" 7 | rm "$suffix.abc" "$suffix.mid" 8 | -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DF/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DF/10.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DF/19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DF/19.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DF/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DF/6.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DF/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DF/7.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DF/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DF/9.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DM/20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DM/20.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DM/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DM/3.png 
-------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DM/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DM/5.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DM/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DM/8.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/DM/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/DM/9.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/LF/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LF/1.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/LF/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LF/11.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/LF/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LF/2.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/LF/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LF/4.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/LF/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LF/8.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/LM/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LM/1.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/LM/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LM/11.png -------------------------------------------------------------------------------- 
/mitdeeplearning/data/faces/LM/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LM/5.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/LM/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LM/8.png -------------------------------------------------------------------------------- /mitdeeplearning/data/faces/LM/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/mitdeeplearning/data/faces/LM/9.png -------------------------------------------------------------------------------- /mitdeeplearning/lab1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import regex as re 3 | import subprocess 4 | import urllib 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from IPython.display import Audio 9 | 10 | 11 | cwd = os.path.dirname(__file__) 12 | 13 | 14 | def load_training_data(): 15 | with open(os.path.join(cwd, "data", "irish.abc"), "r") as f: 16 | text = f.read() 17 | songs = extract_song_snippet(text) 18 | return songs 19 | 20 | 21 | def extract_song_snippet(text): 22 | pattern = "(^|\n\n)(.*?)\n\n" 23 | search_results = re.findall(pattern, text, overlapped=True, flags=re.DOTALL) 24 | songs = [song[1] for song in search_results] 25 | print("Found {} songs in text".format(len(songs))) 26 | return songs 27 | 28 | 29 | def save_song_to_abc(song, filename="tmp"): 30 | save_name = "{}.abc".format(filename) 31 | with open(save_name, "w") as f: 32 | f.write(song) 33 | return filename 34 | 35 | 36 | def abc2wav(abc_file): 37 | path_to_tool = os.path.join(cwd, "bin", "abc2wav") 38 | cmd = "{} {}".format(path_to_tool, abc_file) 39 | return os.system(cmd) 40 | 41 | 42 | def play_wav(wav_file): 43 | return Audio(wav_file) 44 | 45 | 46 | def play_song(song): 47 | basename = save_song_to_abc(song) 48 | ret = abc2wav(basename + ".abc") 49 | if ret == 0: # conversion succeeded (os.system returns 0 on success) 50 | return play_wav(basename + ".wav") 51 | return None 52 | 53 | 54 | def play_generated_song(generated_text): 55 | songs = extract_song_snippet(generated_text) 56 | if len(songs) == 0: 57 | print( 58 | "No valid songs found in generated text. Try training the \ 59 | model longer or increasing the amount of generated music to \ 60 | ensure complete songs are generated!" 61 | ) 62 | 63 | played = [play_song(song) for song in songs] # collect return values; None means the song failed to play 64 | if played and all(p is None for p in played): 65 | print( 66 | "None of the songs were valid, try training longer to improve \ 67 | syntax."
68 | ) 69 | 70 | 71 | def test_batch_func_types(func, args): 72 | ret = func(*args) 73 | assert len(ret) == 2, "[FAIL] get_batch must return two arguments (input and label)" 74 | assert type(ret[0]) == np.ndarray, "[FAIL] test_batch_func_types: x is not np.array" 75 | assert type(ret[1]) == np.ndarray, "[FAIL] test_batch_func_types: y is not np.array" 76 | print("[PASS] test_batch_func_types") 77 | return True 78 | 79 | 80 | def test_batch_func_shapes(func, args): 81 | dataset, seq_length, batch_size = args 82 | x, y = func(*args) 83 | correct = (batch_size, seq_length) 84 | assert ( 85 | x.shape == correct 86 | ), "[FAIL] test_batch_func_shapes: x {} is not correct shape {}".format( 87 | x.shape, correct 88 | ) 89 | assert ( 90 | y.shape == correct 91 | ), "[FAIL] test_batch_func_shapes: y {} is not correct shape {}".format( 92 | y.shape, correct 93 | ) 94 | print("[PASS] test_batch_func_shapes") 95 | return True 96 | 97 | 98 | def test_batch_func_next_step(func, args): 99 | x, y = func(*args) 100 | assert ( 101 | x[:, 1:] == y[:, :-1] 102 | ).all(), "[FAIL] test_batch_func_next_step: x_{t} must equal y_{t-1} for all t" 103 | print("[PASS] test_batch_func_next_step") 104 | return True 105 | 106 | 107 | def test_custom_dense_layer_output(y): 108 | # define the ground truth value for the array 109 | true_y = np.array([[0.27064407, 0.1826951, 0.50374055]], dtype="float32") 110 | assert tf.shape(y).numpy().tolist() == list( 111 | true_y.shape 112 | ), "[FAIL] output is of incorrect shape. expected {} but got {}".format( 113 | true_y.shape, y.numpy().shape 114 | ) 115 | np.testing.assert_almost_equal( 116 | y.numpy(), 117 | true_y, 118 | decimal=7, 119 | err_msg="[FAIL] output is of incorrect value. expected {} but got {}".format( 120 | true_y, y.numpy() 121 | ), 122 | verbose=True, 123 | ) 124 | print("[PASS] test_custom_dense_layer_output") 125 | return True 126 | -------------------------------------------------------------------------------- /mitdeeplearning/lab2.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import os 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import tensorflow as tf 6 | import time 7 | import h5py 8 | import sys 9 | import glob 10 | 11 | IM_SHAPE = (64, 64, 3) 12 | 13 | 14 | def plot_image_prediction(i, predictions_array, true_label, img): 15 | predictions_array, true_label, img = predictions_array[i], true_label[i], img[i] 16 | plt.grid(False) 17 | plt.xticks([]) 18 | plt.yticks([]) 19 | 20 | plt.imshow(np.squeeze(img), cmap=plt.cm.binary) 21 | 22 | predicted_label = np.argmax(predictions_array) 23 | if predicted_label == true_label: 24 | color = "blue" 25 | else: 26 | color = "red" 27 | 28 | plt.xlabel( 29 | "{} {:2.0f}% ({})".format( 30 | predicted_label, 100 * np.max(predictions_array), true_label 31 | ), 32 | color=color, 33 | ) 34 | 35 | 36 | def plot_value_prediction(i, predictions_array, true_label): 37 | predictions_array, true_label = predictions_array[i], true_label[i] 38 | plt.grid(False) 39 | plt.xticks([]) 40 | plt.yticks([]) 41 | thisplot = plt.bar(range(10), predictions_array, color="#777777") 42 | plt.ylim([0, 1]) 43 | predicted_label = np.argmax(predictions_array) 44 | 45 | thisplot[predicted_label].set_color("red") 46 | thisplot[true_label].set_color("blue") 47 | 48 | 49 | class TrainingDatasetLoader(object): 50 | def __init__(self, data_path, channels_last=True): 51 | print("Opening {}".format(data_path)) 52 | sys.stdout.flush() 53 | 54 | self.cache = h5py.File(data_path, 
"r") 55 | 56 | print("Loading data into memory...") 57 | sys.stdout.flush() 58 | self.images = self.cache["images"][:] 59 | self.channels_last = channels_last 60 | self.labels = self.cache["labels"][:].astype(np.float32) 61 | self.image_dims = self.images.shape 62 | n_train_samples = self.image_dims[0] 63 | 64 | self.train_inds = np.random.permutation(np.arange(n_train_samples)) 65 | 66 | self.pos_train_inds = self.train_inds[self.labels[self.train_inds, 0] == 1.0] 67 | self.neg_train_inds = self.train_inds[self.labels[self.train_inds, 0] != 1.0] 68 | 69 | def get_train_size(self): 70 | return self.train_inds.shape[0] 71 | 72 | def get_train_steps_per_epoch(self, batch_size, factor=10): 73 | return self.get_train_size() // factor // batch_size 74 | 75 | def get_batch(self, n, only_faces=False, p_pos=None, p_neg=None, return_inds=False): 76 | if only_faces: 77 | selected_inds = np.random.choice( 78 | self.pos_train_inds, size=n, replace=False, p=p_pos 79 | ) 80 | else: 81 | selected_pos_inds = np.random.choice( 82 | self.pos_train_inds, size=n // 2, replace=False, p=p_pos 83 | ) 84 | selected_neg_inds = np.random.choice( 85 | self.neg_train_inds, size=n // 2, replace=False, p=p_neg 86 | ) 87 | selected_inds = np.concatenate((selected_pos_inds, selected_neg_inds)) 88 | 89 | sorted_inds = np.sort(selected_inds) 90 | train_img = (self.images[sorted_inds, :, :, ::-1] / 255.0).astype(np.float32) 91 | train_label = self.labels[sorted_inds, ...] 92 | 93 | if not self.channels_last: 94 | train_img = np.ascontiguousarray( 95 | np.transpose(train_img, (0, 3, 1, 2)) 96 | ) # [B, H, W, C] -> [B, C, H, W] 97 | return ( 98 | (train_img, train_label, sorted_inds) 99 | if return_inds 100 | else (train_img, train_label) 101 | ) 102 | 103 | def get_n_most_prob_faces(self, prob, n): 104 | idx = np.argsort(prob)[::-1] 105 | most_prob_inds = self.pos_train_inds[idx[: 10 * n : 10]] 106 | return (self.images[most_prob_inds, ...] 
/ 255.0).astype(np.float32) 107 | 108 | def get_all_train_faces(self): 109 | return self.images[self.pos_train_inds] 110 | 111 | 112 | def get_test_faces(channels_last=True): 113 | cwd = os.path.dirname(__file__) 114 | images = {"LF": [], "LM": [], "DF": [], "DM": []} 115 | for key in images.keys(): 116 | files = glob.glob(os.path.join(cwd, "data", "faces", key, "*.png")) 117 | for file in sorted(files): 118 | image = cv2.resize(cv2.imread(file), (64, 64))[:, :, ::-1] / 255.0 119 | if not channels_last: 120 | image = np.transpose(image, (2, 0, 1)) 121 | images[key].append(image) 122 | 123 | return images["LF"], images["LM"], images["DF"], images["DM"] 124 | -------------------------------------------------------------------------------- /mitdeeplearning/lab3.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from openai import OpenAI 4 | from datasets import load_dataset 5 | from torch.utils.data import DataLoader 6 | 7 | 8 | cwd = os.path.dirname(__file__) 9 | 10 | def create_dataloader(style): 11 | ds = load_dataset("databricks/databricks-dolly-15k", split="train") 12 | with open(os.path.join(cwd, f"data/text_styles/{style}.txt"), "r") as f: 13 | new_responses = [line.strip().replace("\\n", "\n") for line in f] 14 | 15 | # Update the entire dataset at once with the new responses 16 | ds_ = ds.select(range(len(new_responses))) 17 | ds_ = ds_.map( 18 | lambda x, idx: {"response_style": new_responses[idx]}, 19 | with_indices=True, 20 | num_proc=1 21 | ) 22 | 23 | n = len(new_responses) 24 | ds_test = ds.select(range(n, n+n)) 25 | 26 | # Create a dataloader 27 | dataloader = DataLoader(ds_, batch_size=1, shuffle=True) 28 | dataloader_test = DataLoader(ds_test, batch_size=1, shuffle=True) 29 | return dataloader, dataloader_test 30 | 31 | 32 | 33 | class LLMClient: 34 | def __init__(self, model: str, api_key: str, api_base: str = "https://openrouter.ai/api/v1"): 35 | self.llm_client = OpenAI(api_key=api_key, base_url=api_base) 36 | self.model = model 37 | 38 | def ask(self, user: str, system: str = None, **kwargs): 39 | messages = [{"role": "user", "content": user}] 40 | if system: 41 | messages.insert(0, {"role": "system", "content": system}) 42 | res = self.llm_client.chat.completions.create( 43 | model=self.model, 44 | messages=messages, 45 | **kwargs 46 | ) 47 | return res 48 | 49 | 50 | yoda_test_text = ( 51 | "Wisdom, sought by many, found by few, it is. Haste not, patience have. " 52 | "For in stillness, answers come. Much to learn, still you have. " 53 | "Fear leads to anger; anger, to hate. Down the dark path, guide you it will. " 54 | "Trust the Force, you must. Powerful ally it is. Life it creates, surrounds, binds. " 55 | "Adventure, excitement, a Jedi craves not these things. Discipline, balance, seek you should. " 56 | "Hmm, clearer now is the path, yes? Help you more, I can, if needed it is. " 57 | "Endless, the journey of learning is. Stay true to your path, and clarity you will find. " 58 | "Remember, the Force flows through all, but your heart determines how it shapes your destiny. " 59 | "Much more to teach, I have. Ready, are you? Mmm." 60 | ) 61 | 62 | 63 | 64 | # class Llama(LLMClient): 65 | # def __init__(self, api_key: str): 66 | # """ 67 | # Initialize the LlamaFree model client. 68 | 69 | # LlamaFree is available from LlamaFree. 70 | # Provide your LlamaFree API key (`api_key`) to access. 
71 | # """ 72 | # # super().__init__(model="meta-llama/llama-3.2-3b-instruct", api_key=api_key) 73 | # super().__init__(model="meta-llama/llama-3.1-8b-instruct", api_key=api_key) 74 | 75 | 76 | # class LFM40B(LLMClient): 77 | # def __init__(self, api_key: str): 78 | # """ 79 | # Initialize the LFM-40B model client. 80 | 81 | # LFM-40B is available from Lambda Labs. 82 | # Provide your Lambda Labs API key (`api_key`) to access. 83 | # """ 84 | # api_base = "https://api.lambdalabs.com/v1" 85 | # super().__init__(model="lfm-40b", api_base=api_base, api_key=api_key) 86 | -------------------------------------------------------------------------------- /mitdeeplearning/lab3_old.py: -------------------------------------------------------------------------------- 1 | import io 2 | import base64 3 | from IPython.display import HTML 4 | import gym 5 | import numpy as np 6 | import cv2 7 | 8 | 9 | def play_video(filename, width=None): 10 | encoded = base64.b64encode(io.open(filename, "r+b").read()) 11 | video_width = 'width="' + str(width) + '"' if width is not None else "" 12 | embedded = HTML( 13 | data=""" 14 | """.format(video_width, encoded.decode("ascii")) 17 | ) 18 | 19 | return embedded 20 | 21 | 22 | def preprocess_pong(image): 23 | I = image[35:195] # Crop 24 | I = I[::2, ::2, 0] # Downsample width and height by a factor of 2 25 | I[I == 144] = 0 # Remove background type 1 26 | I[I == 109] = 0 # Remove background type 2 27 | I[I != 0] = 1 # Set remaining elements (paddles, ball, etc.) to 1 28 | I = cv2.dilate(I, np.ones((3, 3), np.uint8), iterations=1) 29 | I = I[::2, ::2, np.newaxis] 30 | return I.astype(np.float) 31 | 32 | 33 | def pong_change(prev, curr): 34 | prev = preprocess_pong(prev) 35 | curr = preprocess_pong(curr) 36 | I = prev - curr 37 | # I = (I - I.min()) / (I.max() - I.min() + 1e-10) 38 | return I 39 | 40 | 41 | class Memory: 42 | def __init__(self): 43 | self.clear() 44 | 45 | # Resets/restarts the memory buffer 46 | def clear(self): 47 | self.observations = [] 48 | self.actions = [] 49 | self.rewards = [] 50 | 51 | # Add observations, actions, rewards to memory 52 | def add_to_memory(self, new_observation, new_action, new_reward): 53 | self.observations.append(new_observation) 54 | self.actions.append(new_action) 55 | self.rewards.append(new_reward) 56 | 57 | 58 | def aggregate_memories(memories): 59 | batch_memory = Memory() 60 | 61 | for memory in memories: 62 | for step in zip(memory.observations, memory.actions, memory.rewards): 63 | batch_memory.add_to_memory(*step) 64 | 65 | return batch_memory 66 | 67 | 68 | def parallelized_collect_rollout(batch_size, envs, model, choose_action): 69 | assert ( 70 | len(envs) == batch_size 71 | ), "Number of parallel environments must be equal to the batch size." 
72 | 73 | memories = [Memory() for _ in range(batch_size)] 74 | next_observations = [single_env.reset() for single_env in envs] 75 | previous_frames = [obs for obs in next_observations] 76 | done = [False] * batch_size 77 | rewards = [0] * batch_size 78 | 79 | while True: 80 | current_frames = [obs for obs in next_observations] 81 | diff_frames = [ 82 | pong_change(prev, curr) 83 | for (prev, curr) in zip(previous_frames, current_frames) 84 | ] 85 | 86 | diff_frames_not_done = [ 87 | diff_frames[b] for b in range(batch_size) if not done[b] 88 | ] 89 | actions_not_done = choose_action( 90 | model, np.array(diff_frames_not_done), single=False 91 | ) 92 | 93 | actions = [None] * batch_size 94 | ind_not_done = 0 95 | for b in range(batch_size): 96 | if not done[b]: 97 | actions[b] = actions_not_done[ind_not_done] 98 | ind_not_done += 1 99 | 100 | for b in range(batch_size): 101 | if done[b]: 102 | continue 103 | next_observations[b], rewards[b], done[b], info = envs[b].step(actions[b]) 104 | previous_frames[b] = current_frames[b] 105 | memories[b].add_to_memory(diff_frames[b], actions[b], rewards[b]) 106 | 107 | if all(done): 108 | break 109 | 110 | return memories 111 | 112 | 113 | def save_video_of_model(model, env_name, suffix=""): 114 | import skvideo.io 115 | from pyvirtualdisplay import Display 116 | 117 | display = Display(visible=0, size=(400, 300)) 118 | display.start() 119 | 120 | env = gym.make(env_name) 121 | obs = env.reset() 122 | prev_obs = obs 123 | 124 | filename = env_name + suffix + ".mp4" 125 | output_video = skvideo.io.FFmpegWriter(filename) 126 | 127 | counter = 0 128 | done = False 129 | while not done: 130 | frame = env.render(mode="rgb_array") 131 | output_video.writeFrame(frame) 132 | 133 | if "CartPole" in env_name: 134 | input_obs = obs 135 | elif "Pong" in env_name: 136 | input_obs = pong_change(prev_obs, obs) 137 | else: 138 | raise ValueError(f"Unknown env for saving: {env_name}") 139 | 140 | action = model(np.expand_dims(input_obs, 0)).numpy().argmax() 141 | 142 | prev_obs = obs 143 | obs, reward, done, info = env.step(action) 144 | counter += 1 145 | 146 | output_video.close() 147 | print("Successfully saved {} frames into {}!".format(counter, filename)) 148 | return filename 149 | 150 | 151 | def save_video_of_memory(memory, filename, size=(512, 512)): 152 | import skvideo.io 153 | 154 | output_video = skvideo.io.FFmpegWriter(filename) 155 | 156 | for observation in memory.observations: 157 | output_video.writeFrame(cv2.resize(255 * observation, size)) 158 | 159 | output_video.close() 160 | return filename 161 | -------------------------------------------------------------------------------- /mitdeeplearning/util.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import time 3 | import numpy as np 4 | 5 | from IPython import display as ipythondisplay 6 | from string import Formatter 7 | 8 | 9 | def display_model(model): 10 | import tensorflow as tf 11 | tf.keras.utils.plot_model(model, to_file="tmp.png", show_shapes=True) 12 | return ipythondisplay.Image("tmp.png") 13 | 14 | 15 | def plot_sample(x, y, vae, backend='tf'): 16 | """Plot original and reconstructed images side by side. 
17 | 18 | Args: 19 | x: Input images array of shape [B, H, W, C] (TF) or [B, C, H, W] (PT) 20 | y: Labels array of shape [B] where 1 indicates a face 21 | vae: VAE model (TensorFlow or PyTorch) 22 | backend: 'tf' or 'pt' indicating which framework to use 23 | """ 24 | plt.figure(figsize=(2, 1)) 25 | 26 | if backend == 'tf': 27 | idx = np.where(y == 1)[0][0] 28 | _, _, _, recon = vae(x) 29 | recon = np.clip(recon, 0, 1) 30 | 31 | elif backend == 'pt': 32 | import torch 33 | y = y.detach().cpu().numpy() 34 | face_indices = np.where(y == 1)[0] 35 | idx = face_indices[0] if len(face_indices) > 0 else 0 36 | 37 | with torch.inference_mode(): 38 | _, _, _, recon = vae(x) 39 | recon = torch.clamp(recon, 0, 1) 40 | recon = recon.permute(0, 2, 3, 1).detach().cpu().numpy() 41 | x = x.permute(0, 2, 3, 1).detach().cpu().numpy() 42 | 43 | else: 44 | raise ValueError("backend must be 'tf' or 'pt'") 45 | 46 | plt.subplot(1, 2, 1) 47 | plt.imshow(x[idx]) 48 | plt.grid(False) 49 | 50 | plt.subplot(1, 2, 2) 51 | plt.imshow(recon[idx]) 52 | plt.grid(False) 53 | 54 | if backend == 'pt': 55 | plt.show() 56 | 57 | 58 | class LossHistory: 59 | def __init__(self, smoothing_factor=0.0): 60 | self.alpha = smoothing_factor 61 | self.loss = [] 62 | 63 | def append(self, value): 64 | self.loss.append( 65 | self.alpha * self.loss[-1] + (1 - self.alpha) * value 66 | if len(self.loss) > 0 67 | else value 68 | ) 69 | 70 | def get(self): 71 | return self.loss 72 | 73 | 74 | class PeriodicPlotter: 75 | def __init__(self, sec, xlabel="", ylabel="", scale=None): 76 | self.xlabel = xlabel 77 | self.ylabel = ylabel 78 | self.sec = sec 79 | self.scale = scale 80 | 81 | self.tic = time.time() 82 | 83 | def plot(self, data): 84 | if time.time() - self.tic > self.sec: 85 | plt.cla() 86 | 87 | if self.scale is None: 88 | plt.plot(data) 89 | elif self.scale == "semilogx": 90 | plt.semilogx(data) 91 | elif self.scale == "semilogy": 92 | plt.semilogy(data) 93 | elif self.scale == "loglog": 94 | plt.loglog(data) 95 | else: 96 | raise ValueError("unrecognized parameter scale {}".format(self.scale)) 97 | 98 | plt.xlabel(self.xlabel) 99 | plt.ylabel(self.ylabel) 100 | ipythondisplay.clear_output(wait=True) 101 | ipythondisplay.display(plt.gcf()) 102 | 103 | self.tic = time.time() 104 | 105 | 106 | def create_grid_of_images(xs, size=(5, 5)): 107 | """Combine a list of images into a single image grid by stacking them into an array of shape `size`""" 108 | 109 | grid = [] 110 | counter = 0 111 | for i in range(size[0]): 112 | row = [] 113 | for j in range(size[1]): 114 | row.append(xs[counter]) 115 | counter += 1 116 | row = np.hstack(row) 117 | grid.append(row) 118 | grid = np.vstack(grid) 119 | return grid 120 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # Inside of setup.cfg 2 | [metadata] 3 | description_file = README.md 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import DistributionNotFound, get_distribution 2 | from distutils.core import setup 3 | 4 | 5 | def get_dist(pkgname): 6 | try: 7 | return get_distribution(pkgname) 8 | except DistributionNotFound: 9 | return None 10 | 11 | install_deps = [ 12 | 'comet_ml', 13 | 'numpy', 14 | 'regex', 15 | 'tqdm', 16 | 'gym', 17 | 'opik', 18 | 'openai', 19 | 'transformers', 20 | 'datasets', 21 | 'peft', 22
| 'lion-pytorch', 23 | ] 24 | tf_ver = '2.0.0a' 25 | if get_dist('tensorflow>='+tf_ver) is None and get_dist('tensorflow_gpu>='+tf_ver) is None: 26 | install_deps.append('tensorflow>='+tf_ver) 27 | 28 | setup( 29 | name = 'mitdeeplearning', # How you named your package folder (MyLib) 30 | packages = ['mitdeeplearning'], # Choose the same as "name" 31 | version = '0.7.5', # Start with a small number and increase it with every change you make 32 | license='MIT', # Choose a license from here: https://help.github.com/articles/licensing-a-repository 33 | description = 'Official software labs for MIT Introduction to Deep Learning (http://introtodeeplearning.com)', # Give a short description about your library 34 | author = 'Alexander Amini', # Type in your name 35 | author_email = 'introtodeeplearning-staff@mit.edu', # Type in your E-Mail 36 | url = 'http://introtodeeplearning.com', # Provide either the link to your github or to your website 37 | download_url = 'https://github.com/MITDeepLearning/introtodeeplearning/archive/v0.7.5.tar.gz', # I explain this later on 38 | keywords = ['deep learning', 'neural networks', 'tensorflow', 'introduction'], # Keywords that define your package best 39 | install_requires=install_deps, 40 | classifiers=[ 41 | 'Development Status :: 3 - Alpha', # Choose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package 42 | 'License :: OSI Approved :: MIT License', # Again, pick a license 43 | 'Programming Language :: Python :: 3', # Specify which Python versions you want to support 44 | 'Programming Language :: Python :: 3.6', 45 | ], 46 | package_data={ 47 | 'mitdeeplearning': ['bin/*', 'data/*', 'data/text_styles/*', 'data/faces/DF/*', 'data/faces/DM/*', 'data/faces/LF/*', 'data/faces/LM/*'], 48 | }, 49 | 50 | ) 51 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import mitdeeplearning as mdl 2 | 3 | songs = mdl.lab1.load_training_data() 4 | 5 | basename = mdl.lab1.save_song_to_abc(songs[0]) 6 | ret = mdl.lab1.abc2wav(basename+'.abc') 7 | 8 | import pdb; pdb.set_trace() 9 | -------------------------------------------------------------------------------- /xtra_labs/llm_finetune/NOT_FINAL: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/xtra_labs/llm_finetune/NOT_FINAL -------------------------------------------------------------------------------- /xtra_labs/llm_finetune/draft.py: -------------------------------------------------------------------------------- 1 | """ 2 | Drafting lab flow in script format using PyTorch 3 | """ 4 | from datasets import load_dataset 5 | import math 6 | import numpy as np 7 | import pandas as pd 8 | import random 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from torch.nn import CrossEntropyLoss 13 | from torch.optim import Adam 14 | import transformers 15 | from trl import SFTTrainer 16 | from tqdm import tqdm 17 | 18 | from utils import run_benchmark, make_spider_plot 19 | 20 | # Part 1 21 | 22 | # TEXT: overview of LLM lab 23 | # Load pretrained LLM (medium size model) 24 | 25 | # model_name = "facebook/opt-1.3b" 26 | model_name = "facebook/opt-125m" 27 | model = transformers.AutoModelForCausalLM.from_pretrained(model_name, device_map="auto") 28 | tokenizer =
transformers.AutoTokenizer.from_pretrained(model_name) 29 | 30 | # TEXT: explain tokenizer 31 | # Include cell for tokenizer inspection 32 | 33 | # TEXT: explain how LLMs are trained for next token prediction 34 | # Write a function to predict next token 35 | def predict_next_token(probs, tokenizer): 36 | new_token = np.random.choice(len(probs), p=probs.numpy()) 37 | print(tokenizer.decode(new_token), end='', flush=True) 38 | return new_token 39 | 40 | # TEXT: explain that next token prediction must be called multiple times for inference 41 | # Call in loop for autoregressive inference 42 | def generate(start_text, model, tokenizer, num_steps=20, temp=1.): 43 | print(start_text, end="") 44 | x = tokenizer.encode(start_text) 45 | num_start = len(x) 46 | 47 | for i in range(num_steps): 48 | input_tensor = torch.tensor(x).view(1, -1).to("cuda") 49 | logits = model(input_tensor).logits 50 | probs = F.softmax(logits/temp, dim=-1)[0, -1, :].cpu().detach() 51 | 52 | new_token = predict_next_token(probs, tokenizer) 53 | x.append(new_token) 54 | 55 | output = tokenizer.decode(x[num_start:]) 56 | return output 57 | 58 | def generate_pt(model, tokenizer, text, num_steps=50, until=None, temp=1.): 59 | device = model.device 60 | print(text, end='', flush=True) 61 | x = tokenizer.encode(text) 62 | enc_until = tokenizer.encode(until)[1:] 63 | num_start = len(x) 64 | 65 | decoded = tokenizer.decode(x) 66 | 67 | for step in range(num_steps): 68 | with torch.no_grad(): 69 | input_tensor = torch.reshape(torch.LongTensor(x), [1, -1]).to(device) 70 | logits = model(input_tensor).logits 71 | probs = F.softmax(logits/temp, dim=-1)[0, -1, :] 72 | probs = probs.detach().cpu().numpy() 73 | 74 | new_token = np.random.choice(len(probs), p=probs) 75 | x.append(new_token) 76 | 77 | new_decoded = tokenizer.decode(x) 78 | new_part = new_decoded[len(decoded):] 79 | decoded = new_decoded 80 | 81 | print(new_part, end='', flush=True) 82 | text += new_part 83 | 84 | if len(x) >= len(until) and text[-len(until):] == until: 85 | break 86 | 87 | 88 | output = tokenizer.decode(x[num_start:]) 89 | print("\n", flush=True) 90 | return output 91 | 92 | # Test autoregressive generation 93 | # while True: 94 | # print("\n\n\n\n\n") 95 | # input_text = input("Prompt: ") 96 | # output = generate(input_text, model, tokenizer) 97 | 98 | # TEXT: some background on LLM benchmarking 99 | # Load benchmark dataset and evaluate model 100 | benchmark_dataset = pd.read_csv("benchmark.csv") 101 | # category_accs_1300m, avg_acc_1300m = run_benchmark(model, tokenizer, benchmark_dataset) 102 | 103 | # TEXT: ask them to make a prediction on how accuracy will be affected by different model sizes 104 | 105 | # Benchmark smaller model 106 | # model_name_350m = "facebook/opt-350m" 107 | # model_350m = transformers.AutoModelForCausalLM.from_pretrained(model_name_350m, device_map="auto") 108 | # tokenizer_350m = transformers.AutoTokenizer.from_pretrained(model_name_350m) 109 | 110 | # category_accs_350m, avg_acc_350m = run_benchmark(model_350m, tokenizer_350m, benchmark_dataset) 111 | 112 | # Benchmark larger model 113 | # model_name_2700m = "facebook/opt-2.7b" 114 | # model_2700m = transformers.AutoModelForCausalLM.from_pretrained(model_name_2700m, device_map="auto") 115 | # tokenizer_2700m = transformers.AutoTokenizer.from_pretrained(model_name_2700m) 116 | 117 | # category_accs_2700m, avg_acc_2700m = run_benchmark(model_2700m, tokenizer_2700m, benchmark_dataset) 118 | 119 | # Spider plot 120 | 121 | # benchmark_data = {"350M-Model": category_accs_350m, 
"1300M-Model": category_accs_1300m, "2700M-Model": category_accs_2700m} 122 | # benchmark_data = {"350M-Model": category_accs_1300m} 123 | # make_spider_plot(benchmark_data) 124 | 125 | def print_lora_params(module, layer_type): 126 | summ = 0 127 | for name, child in module.named_children(): 128 | if isinstance(child, layer_type): 129 | num_params = sum(p.numel() for p in child.parameters() if p.requires_grad) 130 | 131 | print(name, num_params, child.in_features, child.out_features, (child.in_features * 8 + child.out_features * 8 == num_params)) 132 | 133 | summ += num_params 134 | else: 135 | summ += print_lora_params(child, layer_type) 136 | 137 | return summ 138 | 139 | # Part 2 140 | 141 | # inspect current model 142 | # print(model) 143 | 144 | # summ = print_lora_params(model, nn.Linear) 145 | 146 | # print("with function", summ) 147 | 148 | # print("without function", sum(p.numel() for p in model.parameters() if p.requires_grad)) 149 | 150 | # # freeze all parameter gradients 151 | for param in model.parameters(): 152 | param.requires_grad = False 153 | 154 | # new LoRA linear layer class 155 | class LoRALinear(nn.Module): 156 | def __init__( 157 | self, 158 | in_features: int, 159 | out_features: int, 160 | pretrained_weight: torch.Tensor, 161 | pretrained_bias: torch.Tensor, 162 | r: int = 8, 163 | lora_alpha: int = 8, 164 | lora_dropout: float = 0.1, 165 | **kwargs 166 | ): 167 | super(LoRALinear, self).__init__() 168 | 169 | self.r = r 170 | self.in_features = in_features 171 | self.out_features = out_features 172 | self.lora_alpha = lora_alpha 173 | 174 | self.weight = nn.Parameter(pretrained_weight) 175 | self.weight.requires_grad = False 176 | 177 | if pretrained_bias is not None: 178 | self.bias = nn.Parameter(pretrained_bias) 179 | self.bias.requires_grad = False 180 | else: 181 | self.bias = None 182 | 183 | # from https://github.com/microsoft/LoRA/blob/main/loralib/layers.py 184 | self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features))) 185 | self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r))) 186 | self.scaling = self.lora_alpha / self.r 187 | self.lora_dropout = nn.Dropout(p=lora_dropout) 188 | 189 | def forward(self, x: torch.Tensor): 190 | result = F.linear(x, self.weight, bias=self.bias) 191 | result += (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling 192 | return result 193 | 194 | # replace linear layers in model recursively 195 | def replace_linear_with_lora(module): 196 | for name, child in module.named_children(): 197 | if isinstance(child, nn.Linear): 198 | setattr(module, name, LoRALinear(child.in_features, child.out_features, child.weight, child.bias)) 199 | else: 200 | replace_linear_with_lora(child) 201 | 202 | replace_linear_with_lora(model) 203 | 204 | 205 | 206 | # summ = print_lora_params(model, LoRALinear) 207 | 208 | # print("with function", summ) 209 | 210 | # print("without function", sum(p.numel() for p in model.parameters() if p.requires_grad)) 211 | 212 | 213 | # inspect new model 214 | # print(model) 215 | 216 | # load chat dataset 217 | dataset_name = "timdettmers/openassistant-guanaco" 218 | ft_dataset = load_dataset(dataset_name, split="train") 219 | 220 | # train model (barebones loop) 221 | context_length = 768 222 | loss_fn = CrossEntropyLoss() 223 | 224 | learning_rate = 1e-4 225 | optimizer = Adam(model.parameters(), lr=learning_rate) 226 | num_epochs = 5 227 | 228 | model = model.to("cuda") 229 | 230 | ### Train the model 231 | # Define some training args 232 
| args = transformers.TrainingArguments("/home/dnori/introtodeeplearning/xtra_labs/llm_finetune/outputs", 233 | per_device_train_batch_size=1, 234 | logging_first_step=True, 235 | logging_steps=20, 236 | save_steps=100, 237 | ) 238 | 239 | # Define a callback to check the progress on a sample question 240 | class PrinterCallback(transformers.TrainerCallback): 241 | def on_log(self, args, state, control, model, logs=None, **kwargs): 242 | start_text = "### Human: When the weather is sunny, what color is the sky?### Assistant:" 243 | generate_pt(model, tokenizer, start_text, num_steps=200, until="###") 244 | 245 | # Actually train the model 246 | trainer = SFTTrainer( 247 | model, 248 | args=args, 249 | train_dataset=ft_dataset, 250 | dataset_text_field="text", 251 | max_seq_length=context_length, 252 | callbacks=[PrinterCallback()] 253 | ) 254 | trainer.train() 255 | 256 | 257 | # for epoch in range(num_epochs): 258 | # total_loss = 0 259 | # num_batches = 0 260 | 261 | # for batch in tqdm(ft_dataset): 262 | # prompt = batch["text"] 263 | 264 | # # encode with tokenizer 265 | # x = tokenizer.encode(prompt) 266 | # x_tensor = torch.tensor(x).view(1, -1).to("cuda") 267 | # max_len = min(context_length, x_tensor.shape[1]-1) 268 | # selected_len = random.randint(1,max_len) 269 | 270 | # input_tensor = x_tensor[:,:selected_len] 271 | # target_tensor = x_tensor[0,1:selected_len+1] 272 | 273 | # # zero gradients 274 | # optimizer.zero_grad() 275 | 276 | # # run through model 277 | # logits = model(input_tensor).logits[0] 278 | 279 | # # apply loss 280 | # loss = loss_fn(logits, target_tensor) 281 | 282 | # # backpropagation 283 | # loss.backward() 284 | 285 | # # optimizer step 286 | # optimizer.step() 287 | 288 | # total_loss += loss.item() 289 | # num_batches += 1 290 | 291 | # # Print average loss for the epoch 292 | # average_loss = total_loss / num_batches 293 | # print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}") 294 | 295 | # # evaluate finetuned model on benchmark 296 | # category_accs_1300m_ft, avg_acc_1300m_ft = run_benchmark(model, tokenizer, benchmark_dataset) 297 | 298 | # add to spider plot 299 | # benchmark_data = {"350M-Model": category_accs_350m, "1300M-Model": category_accs_1300m, "1300M-Model-Finetuned": category_accs_1300m_ft, "2700M-Model": category_accs_2700m} 300 | # benchmark_data = {"350M-Model": category_accs_1300m, "350M-Model-Finetuned": category_accs_1300m_ft} 301 | # make_spider_plot(benchmark_data) -------------------------------------------------------------------------------- /xtra_labs/llm_finetune/spider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/xtra_labs/llm_finetune/spider.png -------------------------------------------------------------------------------- /xtra_labs/llm_finetune/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains functions that the students will not interface with 3 | """ 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import tensorflow as tf 8 | import torch 9 | import torch.nn.functional as F 10 | from tqdm import tqdm 11 | 12 | def run_benchmark(model, tokenizer, dataset, few_shot=7, num_steps=500, verbose=False): 13 | device = model.device 14 | dataset["Correct"] = 0.0 15 | 16 | # Loop through every question in the benchmark 17 | for step, row in tqdm(dataset.iterrows(), 
total=len(dataset)): 18 | question = row['Question'] 19 | pre_text = f"### Human: {question}### Assistant:" 20 | len_prefix = len(tokenizer.encode(pre_text)) 21 | 22 | # Run the model individually with each of the four responses. 23 | # Measure the model's logprob for outputting each of the four responses. 24 | # Choose the answer with the highest logprob 25 | logprobs = [] 26 | answers = [] 27 | for choice in ["A", "B", "C", "D"]: 28 | answer = row[f'Answer {choice}'] 29 | text = f"{pre_text} {answer}" 30 | 31 | # Run the model 32 | with torch.no_grad(): 33 | x = tokenizer.encode(text, return_tensors="pt").to(device) 34 | logits = model(x).logits 35 | probs = F.softmax(logits, dim=-1)[0, :-1, :] # shape: [seq_len-1, vocab_size] 36 | y = x[0, 1:] # shape: [seq_len-1] 37 | 38 | # Compute the log probability for this answer to appear (average logprob over the answer tokens) 39 | next_token_prob = np.array([probs[i, y[i]].item() for i in range(y.shape[0])]) 40 | num_ans_tokens = x.shape[1] - len_prefix 41 | logprob = np.mean(np.log(next_token_prob[-num_ans_tokens:])) 42 | logprobs.append(logprob) 43 | answers.append(answer) 44 | 45 | # Check for the correct answer (always the zero-th index, by definition) 46 | correct = np.argmax(logprobs) == 0 47 | 48 | # Record if the model got the answer correct or not. 49 | # Optionally print the question -> prediction if verbose 50 | dataset.at[step, "Correct"] = float(correct) 51 | if verbose: 52 | print(f"[{correct}] {question} -> {answers[np.argmax(logprobs)]}") 53 | 54 | 55 | # Group by the categories and compute the average accuracy 56 | accs = dataset.groupby("Category")["Correct"].mean() 57 | sorted_accs = accs.sort_values() 58 | print(sorted_accs) 59 | 60 | return accs, dataset["Correct"].mean() 61 | 62 | def make_spider_plot(data): 63 | """ 64 | Data is a dictionary where keys are different entities 65 | Values are pd Series where series indices are plot labels and series values show performance 66 | """ 67 | colors = ['#1aaf6c', '#429bf4', '#d42cea'] 68 | i = 0 69 | fig, ax = plt.subplots(figsize=(8,6), subplot_kw=dict(polar=True)) 70 | for k,v in data.items(): 71 | labels = v.index.tolist() 72 | values = v.values.tolist() 73 | 74 | num_vars = len(labels) 75 | angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist() 76 | angles += angles[:1] 77 | values += values[:1] 78 | 79 | ax.plot(angles, values, color=colors[i], linewidth=1, label=k) 80 | ax.fill(angles, values, color=colors[i], alpha=0.25) 81 | 82 | i+=1 83 | 84 | ax.set_theta_offset(np.pi / 2) 85 | ax.set_theta_direction(-1) 86 | ax.set_thetagrids(np.degrees(angles[:-1]), labels) 87 | for label, angle in zip(ax.get_xticklabels(), angles): 88 | if angle in (0, np.pi): 89 | label.set_horizontalalignment('center') 90 | elif 0 < angle < np.pi: 91 | label.set_horizontalalignment('left') 92 | else: 93 | label.set_horizontalalignment('right') 94 | 95 | ax.set_ylim(0, 1) 96 | ax.set_rlabel_position(180 / num_vars) 97 | 98 | ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1)) 99 | 100 | plt.savefig("spider.png") 101 | 102 | 103 | -------------------------------------------------------------------------------- /xtra_labs/rl_pong/img/COMING SOON: -------------------------------------------------------------------------------- 1 | COMING SOON 2 | -------------------------------------------------------------------------------- /xtra_labs/rl_pong/img/vista_overview.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/xtra_labs/rl_pong/img/vista_overview.png -------------------------------------------------------------------------------- /xtra_labs/rl_selfdriving/img/COMING SOON: -------------------------------------------------------------------------------- 1 | COMING SOON 2 | -------------------------------------------------------------------------------- /xtra_labs/rl_selfdriving/img/vista_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2e90e0461ffa952f5855d4a89a32742796794527/xtra_labs/rl_selfdriving/img/vista_overview.png -------------------------------------------------------------------------------- /xtra_labs/uncertainty/Part1_IntroductionCapsa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": { 7 | "id": "SWa-rLfIlTaf" 8 | }, 9 | "source": [ 10 | "\n", 11 | " \n", 14 | " \n", 16 | " \n", 18 | "
\n", 12 | " \n", 13 | " Visit MIT Deep Learning\n", 15 | " Run in Google Colab\n", 17 | " View Source on GitHub
\n", 19 | "\n", 20 | "# Copyright Information" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "id": "-LohleBMlahL" 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "# Copyright 2023 MIT Introduction to Deep Learning. All Rights Reserved.\n", 32 | "# \n", 33 | "# Licensed under the MIT License. You may not use this file except in compliance\n", 34 | "# with the License. Use and/or modification of this code outside of MIT Introduction\n", 35 | "# to Deep Learning must reference:\n", 36 | "#\n", 37 | "# © MIT Introduction to Deep Learning\n", 38 | "# http://introtodeeplearning.com\n", 39 | "#" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "id": "ckzz5Hus-hJB" 46 | }, 47 | "source": [ 48 | "# Laboratory 3: Debiasing, Uncertainty, and Robustness\n", 49 | "\n", 50 | "# Part 1: Introduction to Capsa\n", 51 | "\n", 52 | "In this lab, we'll explore different ways to make deep learning models more **robust** and **trustworthy**.\n", 53 | "\n", 54 | "To achieve this it is critical to be able to identify and diagnose issues of bias and uncertainty in deep learning models, as we explored in the Facial Detection Lab 2. We need benchmarks that uniformly measure how uncertain a given model is, and we need principled ways of measuring bias and uncertainty. To that end, in this lab, we'll utilize [Capsa](https://github.com/themis-ai/capsa), a risk-estimation wrapping library developed by [Themis AI](https://themisai.io/). Capsa supports the estimation of three different types of ***risk***, defined as measures of how robust and trustworthy our model is. These are:\n", 55 | "1. **Representation bias**: reflects how likely combinations of features are to appear in a given dataset. Often, certain combinations of features are severely under-represented in datasets, which means models learn them less well and can thus lead to unwanted bias.\n", 56 | "2. **Data uncertainty**: reflects noise in the data, for example when sensors have noisy measurements, classes in datasets have low separations, and generally when very similar inputs lead to drastically different outputs. Also known as *aleatoric* uncertainty. \n", 57 | "3. **Model uncertainty**: captures the areas of our underlying data distribution that the model has not yet learned or has difficulty learning. Areas of high model uncertainty can be due to out-of-distribution (OOD) samples or data that is harder to learn. Also known as *epistemic* uncertainty." 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "id": "o02MyoDrnNqP" 64 | }, 65 | "source": [ 66 | "## CAPSA overview\n", 67 | "\n", 68 | "This lab introduces Capsa and its functionalities, to next build automated tools that use Capsa to mitigate the underlying issues of bias and uncertainty.\n", 69 | "\n", 70 | "The core idea behind [Capsa](https://themisai.io/capsa/) is that any deep learning model of interest can be ***wrapped*** -- just like wrapping a gift -- to be made ***aware of its own risks***. Risk is captured in representation bias, data uncertainty, and model uncertainty.\n", 71 | "\n", 72 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2023/lab3/img/capsa_overview.png)\n", 73 | "\n", 74 | "This means that Capsa takes the user's original model as input, and modifies it minimally to create a risk-aware variant while preserving the model's underlying structure and training pipeline. 
Capsa is a one-line addition to any training workflow in TensorFlow. In this part of the lab, we'll apply Capsa's risk estimation methods to a simple regression problem to further explore the notions of bias and uncertainty. \n", 75 | "\n", 76 | "Please refer to [Capsa's documentation](https://themisai.io/capsa/) for additional details." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "id": "hF0uSqk-nwmA" 83 | }, 84 | "source": [ 85 | "Let's get started by installing the necessary dependencies:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "id": "NdXF4Reyj6yy" 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "# Import Tensorflow 2.0\n", 97 | "%tensorflow_version 2.x\n", 98 | "import tensorflow as tf\n", 99 | "\n", 100 | "import IPython\n", 101 | "import functools\n", 102 | "import matplotlib.pyplot as plt\n", 103 | "import numpy as np\n", 104 | "from tqdm import tqdm\n", 105 | "\n", 106 | "# Download and import the MIT Introduction to Deep Learning package\n", 107 | "!pip install mitdeeplearning\n", 108 | "import mitdeeplearning as mdl\n", 109 | "\n", 110 | "# Download and import Capsa\n", 111 | "!pip install capsa\n", 112 | "import capsa" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": { 118 | "id": "xzEcxjKHn8gc" 119 | }, 120 | "source": [ 121 | "## 1.1 Dataset\n", 122 | "\n", 123 | "We will build an understanding of bias and uncertainty by training a neural network for a simple 2D regression task: modeling the cubic function $y = x^3/6$. We will use Capsa to analyze this dataset and the performance of the model. Noise and missing-ness will be injected into the dataset.\n", 124 | "\n", 125 | "Let's generate the dataset and visualize it:" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "id": "fH40EhC1j9dH" 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "# Get the data for the cubic function, injected with noise and missing-ness\n", 137 | "# This is just a toy dataset that we can use to test some of the wrappers on\n", 138 | "def gen_data(x_min, x_max, n, train=True):\n", 139 | " if train: \n", 140 | " x = np.random.triangular(x_min, 2, x_max, size=(n, 1))\n", 141 | " else: \n", 142 | " x = np.linspace(x_min, x_max, n).reshape(n, 1)\n", 143 | "\n", 144 | " sigma = 2*np.exp(-(x+1)**2/1) + 0.2 if train else np.zeros_like(x)\n", 145 | " y = x**3/6 + np.random.normal(0, sigma).astype(np.float32)\n", 146 | "\n", 147 | " return x, y\n", 148 | "\n", 149 | "# Plot the dataset and visualize the train and test datapoints\n", 150 | "x_train, y_train = gen_data(-4, 4, 2000, train=True) # train data\n", 151 | "x_test, y_test = gen_data(-6, 6, 500, train=False) # test data\n", 152 | "\n", 153 | "plt.figure(figsize=(10, 6))\n", 154 | "plt.plot(x_test, y_test, c='r', zorder=-1, label='ground truth')\n", 155 | "plt.scatter(x_train, y_train, s=1.5, label='train data')\n", 156 | "plt.legend()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": { 162 | "id": "Fz3UxT8vuN95" 163 | }, 164 | "source": [ 165 | "In the plot above, the blue points are the training data, which will be used as inputs to train the neural network model. 
The red line is the ground truth data, which will be used to evaluate the performance of the model.\n", 166 | "\n", 167 | "#### **TODO: Inspecting the 2D regression dataset**\n", 168 | "\n", 169 | "Write short (~1 sentence) answers to the questions below to complete the `TODO`s:\n", 170 | "\n", 171 | "1. What are your observations about where the train data and test data lie relative to each other?\n", 172 | "2. What, if any, areas do you expect to have high/low aleatoric (data) uncertainty?\n", 173 | "3. What, if any, areas do you expect to have high/low epistemic (model) uncertainty?" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": { 179 | "id": "mXMOYRHnv8tF" 180 | }, 181 | "source": [ 182 | "## 1.2 Regression on cubic dataset\n", 183 | "\n", 184 | "Next we will define a small dense neural network model that can predict `y` given `x`: this is a classical regression task! We will build the model and use the [`model.fit()`](https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit) function to train the model -- normally, without any risk-awareness -- using the train dataset that we visualized above." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "id": "7p1XwfZVuB68" 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "### Define and train a dense NN model for the regression task ###\n", 196 | "\n", 197 | "'''Function to define a small dense NN'''\n", 198 | "def create_dense_NN():\n", 199 | " return tf.keras.Sequential(\n", 200 | " [\n", 201 | " tf.keras.Input(shape=(1,)),\n", 202 | " tf.keras.layers.Dense(32, \"relu\"),\n", 203 | " tf.keras.layers.Dense(32, \"relu\"),\n", 204 | " tf.keras.layers.Dense(32, \"relu\"),\n", 205 | " tf.keras.layers.Dense(1),\n", 206 | " ]\n", 207 | " )\n", 208 | "\n", 209 | "dense_NN = create_dense_NN()\n", 210 | "\n", 211 | "# Build the model for regression, defining the loss function and optimizer\n", 212 | "dense_NN.compile(\n", 213 | " optimizer=tf.keras.optimizers.Adam(learning_rate=5e-3),\n", 214 | " loss=tf.keras.losses.MeanSquaredError(), # MSE loss for the regression task\n", 215 | ")\n", 216 | "\n", 217 | "# Train the model for 30 epochs using model.fit().\n", 218 | "loss_history = dense_NN.fit(x_train, y_train, epochs=30)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "id": "ovwYBUG3wTDv" 225 | }, 226 | "source": [ 227 | "Now, we are ready to evaluate our neural network. We use the test data to assess performance on the regression task, and visualize the predicted values against the true values.\n", 228 | "\n", 229 | "Given your observation of the data in the previous plot, where do you expect the model to perform well? Let's test the model and see:" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "id": "fb-EklZywR4D" 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "# Pass the test data through the network and predict the y values\n", 241 | "y_predicted = dense_NN.predict(x_test)\n", 242 | "\n", 243 | "# Visualize the true (x, y) pairs for the test data vs. 
the predicted values\n", 244 | "plt.figure(figsize=(10, 6))\n", 245 | "plt.scatter(x_train, y_train, s=1.5, label='train data')\n", 246 | "plt.plot(x_test, y_test, c='r', zorder=-1, label='ground truth')\n", 247 | "plt.plot(x_test, y_predicted, c='b', zorder=0, label='predicted')\n", 248 | "plt.legend()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "id": "7Vktjwfu0ReH" 255 | }, 256 | "source": [ 257 | "\n", 258 | "#### **TODO: Analyzing the performance of standard regression model**\n", 259 | "\n", 260 | "Write short (~1 sentence) answers to the questions below to complete the `TODO`s:\n", 261 | "\n", 262 | "1. Where does the model perform well?\n", 263 | "2. Where does the model perform poorly?" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "id": "7MzvM48JyZMO" 270 | }, 271 | "source": [ 272 | "## 1.3 Evaluating bias\n", 273 | "\n", 274 | "Now that we've seen what the predictions from this model look like, we will identify and quantify bias and uncertainty in this problem. We first consider bias.\n", 275 | "\n", 276 | "Recall that *representation bias* reflects how likely combinations of features are to appear in a given dataset. Capsa calculates how likely combinations of features are by using a histogram estimation approach: the `capsa.HistogramWrapper`. For low-dimensional data, the `capsa.HistogramWrapper` bins the input directly into discrete categories and measures the density. More details of the `HistogramWrapper` and how it can be used are [available here](https://themisai.io/capsa/api_documentation/HistogramWrapper.html).\n", 277 | "\n", 278 | "We start by taking our `dense_NN` and wrapping it with the `capsa.HistogramWrapper`:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "id": "AVv-knsCwOp9" 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "### Wrap the dense network for bias estimation ###\n", 290 | "\n", 291 | "standard_dense_NN = create_dense_NN()\n", 292 | "bias_wrapped_dense_NN = capsa.HistogramWrapper(\n", 293 | " standard_dense_NN, # the original model\n", 294 | " num_bins=20,\n", 295 | " queue_size=2000, # how many samples to track\n", 296 | " target_hidden_layer=False # for low-dimensional data (like this dataset), we can estimate biases directly from data\n", 297 | ")" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": { 303 | "id": "UFHO7LKcz8uP" 304 | }, 305 | "source": [ 306 | "Now that we've wrapped the regression model, let's re-train it to update the bias estimates as we train. We can use the exact same training pipeline, using `compile` to build the model and `model.fit()` to train the model:" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "id": "SkyD3rsqy2ff" 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "### Compile and train the wrapped model! 
###\n", 318 | "\n", 319 | "# Build the model for regression, defining the loss function and optimizer\n", 320 | "bias_wrapped_dense_NN.compile(\n", 321 | " optimizer=tf.keras.optimizers.Adam(learning_rate=2e-3),\n", 322 | " loss=tf.keras.losses.MeanSquaredError(), # MSE loss for the regression task\n", 323 | ")\n", 324 | "\n", 325 | "# Train the wrapped model for 30 epochs.\n", 326 | "loss_history_bias_wrap = bias_wrapped_dense_NN.fit(x_train, y_train, epochs=30)\n", 327 | "\n", 328 | "print(\"Done training model with Bias Wrapper!\")" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": { 334 | "id": "_6iVeeqq0f_H" 335 | }, 336 | "source": [ 337 | "We can now use our wrapped model to assess the bias for a given test input. With the wrapping capability, Capsa neatly allows us to output a *bias score* along with the predicted target value. This bias score reflects the density of data surrounding an input point -- the higher the score, the greater the data representation and density. The wrapped, risk-aware model outputs the predicted target and bias score after it is called!\n", 338 | "\n", 339 | "Let's see how it is done:" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "id": "tZ17eCbP0YM4" 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "### Generate and visualize bias scores for data in test set ###\n", 351 | "\n", 352 | "# Call the risk-aware model to generate scores\n", 353 | "predictions, bias = bias_wrapped_dense_NN(x_test)\n", 354 | "\n", 355 | "# Visualize the relationship between the input data x and the bias\n", 356 | "fig, ax = plt.subplots(2, 1, figsize=(8,6))\n", 357 | "ax[0].plot(x_test, bias, label='bias')\n", 358 | "ax[0].set_ylabel('Estimated Bias')\n", 359 | "ax[0].legend()\n", 360 | "\n", 361 | "# Let's compare against the ground truth density distribution\n", 362 | "# should roughly align with our estimated bias in this toy example\n", 363 | "ax[1].hist(x_train, 50, label='ground truth')\n", 364 | "ax[1].set_xlim(-6, 6)\n", 365 | "ax[1].set_ylabel('True Density')\n", 366 | "ax[1].legend();" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "id": "HpDMT_1FERQE" 373 | }, 374 | "source": [ 375 | "#### **TODO: Evaluating bias with wrapped regression model**\n", 376 | "\n", 377 | "Write short (~1 sentence) answers to the questions below to complete the `TODO`s:\n", 378 | "\n", 379 | "1. How does the bias score relate to the train/test data density from the first plot?\n", 380 | "2. What is one limitation of the Histogram approach that simply bins the data based on frequency?" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": { 386 | "id": "PvS8xR_q27Ec" 387 | }, 388 | "source": [ 389 | "# 1.4 Estimating data uncertainty\n", 390 | "\n", 391 | "Next we turn our attention to uncertainty, first focusing on the uncertainty in the data -- the aleatoric uncertainty.\n", 392 | "\n", 393 | "As introduced in Lecture 5 on Robust & Trustworthy Deep Learning, in regression we can estimate aleatoric uncertainty by training the model to predict both a target value and a variance for every input. Because we estimate both a mean and variance for every input, this method is called Mean Variance Estimation (MVE). 
MVE involves modifying the output layer to predict both the mean $\\mu(x)$ and the variance $\\sigma^2(x)$, and changing the loss to the negative log-likelihood of the targets under the predicted distribution -- for a Gaussian, the per-example loss is proportional to $\\log \\sigma^2(x) + (y - \\mu(x))^2 / \\sigma^2(x)$. (A minimal TensorFlow sketch of this loss appears after this notebook.)\n", 394 | "\n", 395 | "Capsa automatically implements these changes for us: we can wrap a given model using `capsa.MVEWrapper` to use MVE to estimate aleatoric uncertainty. All we have to do is define the model and the loss function to evaluate its predictions! More details of the `MVEWrapper` and how it can be used are [available here](https://themisai.io/capsa/api_documentation/MVEWrapper.html).\n", 396 | "\n", 397 | "Let's take our standard network, wrap it with `capsa.MVEWrapper`, build the wrapped model, and then train it for the regression task. Finally, we evaluate the performance of the resulting model by quantifying the aleatoric uncertainty across the data space: " 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "id": "sxmm-2sd3G9u" 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "### Estimating data uncertainty with Capsa wrapping ###\n", 409 | "\n", 410 | "standard_dense_NN = create_dense_NN()\n", 411 | "# Wrap the dense network for aleatoric uncertainty estimation\n", 412 | "mve_wrapped_NN = capsa.MVEWrapper(standard_dense_NN)\n", 413 | "\n", 414 | "# Build the model for regression, defining the loss function and optimizer\n", 415 | "mve_wrapped_NN.compile(\n", 416 | " optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2),\n", 417 | " loss=tf.keras.losses.MeanSquaredError(), # MSE loss for the regression task\n", 418 | ")\n", 419 | "\n", 420 | "# Train the wrapped model for 30 epochs.\n", 421 | "loss_history_mve_wrap = mve_wrapped_NN.fit(x_train, y_train, epochs=30)\n", 422 | "\n", 423 | "# Call the uncertainty-aware model to generate outputs for the test data\n", 424 | "x_test_clipped = np.clip(x_test, x_train.min(), x_train.max())\n", 425 | "prediction = mve_wrapped_NN(x_test_clipped)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": { 432 | "id": "dT2Rx8JCg3NR" 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "# Capsa makes the aleatoric uncertainty an attribute of the prediction!\n", 437 | "pred = np.array(prediction.y_hat).flatten()\n", 438 | "unc = np.sqrt(prediction.aleatoric).flatten() # prediction.aleatoric is the predicted variance\n", 439 | "\n", 440 | "# Visualize the aleatoric uncertainty across the data space\n", 441 | "plt.figure(figsize=(10, 6))\n", 442 | "plt.scatter(x_train, y_train, s=1.5, label='train data')\n", 443 | "plt.plot(x_test, y_test, c='r', zorder=-1, label='ground truth')\n", 444 | "plt.fill_between(x_test_clipped.flatten(), pred-2*unc, pred+2*unc, \n", 445 | " color='b', alpha=0.2, label='aleatoric')\n", 446 | "plt.legend()" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": { 452 | "id": "ZFeArgRX9U9s" 453 | }, 454 | "source": [ 455 | "#### **TODO: Estimating aleatoric uncertainty**\n", 456 | "\n", 457 | "Write short (~1 sentence) answers to the questions below to complete the `TODO`s:\n", 458 | "\n", 459 | "1. For what values of $x$ is the aleatoric uncertainty high or increasing suddenly?\n", 460 | "2. How does your answer in (1) relate to how the $x$ values are distributed?" 
461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": { 466 | "id": "6FC5WPRT5lAb" 467 | }, 468 | "source": [ 469 | "## 1.5 Estimating model uncertainty\n", 470 | "\n", 471 | "Finally, we use Capsa for estimating the uncertainty underlying the model predictions -- the epistemic uncertainty. In this example, we'll use ensembles, which essentially copy the model `N` times, average the predictions across all `N` runs for a more robust prediction, and calculate the variance across the `N` runs to estimate the uncertainty. (A minimal manual-ensembling sketch appears after this notebook.)\n", 472 | "\n", 473 | "Capsa provides a neat wrapper, `capsa.EnsembleWrapper`, to make an ensemble from an input model. Just like with aleatoric estimation, we can take our standard dense network model, wrap it with `capsa.EnsembleWrapper`, build the wrapped model, and then train it for the regression task. More details of the `EnsembleWrapper` and how it can be used are [available here](https://themisai.io/capsa/api_documentation/EnsembleWrapper.html).\n", 474 | "\n", 475 | "Finally, we evaluate the resulting model by quantifying the epistemic uncertainty on the test data:" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": { 482 | "id": "SuRlhq2c5Fob" 483 | }, 484 | "outputs": [], 485 | "source": [ 486 | "### Estimating model uncertainty with Capsa wrapping ###\n", 487 | "\n", 488 | "standard_dense_NN = create_dense_NN()\n", 489 | "# Wrap the dense network for epistemic uncertainty estimation with an Ensemble\n", 490 | "ensemble_NN = capsa.EnsembleWrapper(standard_dense_NN)\n", 491 | "\n", 492 | "# Build the model for regression, defining the loss function and optimizer\n", 493 | "ensemble_NN.compile(\n", 494 | " optimizer=tf.keras.optimizers.Adam(learning_rate=3e-3),\n", 495 | " loss=tf.keras.losses.MeanSquaredError(), # MSE loss for the regression task\n", 496 | ")\n", 497 | "\n", 498 | "# Train the wrapped model for 30 epochs.\n", 499 | "loss_history_ensemble = ensemble_NN.fit(x_train, y_train, epochs=30)\n", 500 | "\n", 501 | "# Call the uncertainty-aware model to generate outputs for the test data\n", 502 | "prediction = ensemble_NN(x_test)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": { 509 | "id": "eauNoKDOj_ZT" 510 | }, 511 | "outputs": [], 512 | "source": [ 513 | "# Capsa makes the epistemic uncertainty an attribute of the prediction!\n", 514 | "pred = np.array(prediction.y_hat).flatten()\n", 515 | "unc = np.array(prediction.epistemic).flatten()\n", 516 | "\n", 517 | "# Visualize the epistemic uncertainty across the data space\n", 518 | "plt.figure(figsize=(10, 6))\n", 519 | "plt.scatter(x_train, y_train, s=1.5, label='train data')\n", 520 | "plt.plot(x_test, y_test, c='r', zorder=-1, label='ground truth')\n", 521 | "plt.fill_between(x_test.flatten(), pred-20*unc, pred+20*unc, color='b', alpha=0.2, label='epistemic')\n", 522 | "plt.legend()" 523 | ] 524 | }, 525 | { 526 | "cell_type": "markdown", 527 | "metadata": { 528 | "id": "N4LMn2tLPBdg" 529 | }, 530 | "source": [ 531 | "#### **TODO: Estimating epistemic uncertainty**\n", 532 | "\n", 533 | "Write short (~1 sentence) answers to the questions below to complete the `TODO`s:\n", 534 | "\n", 535 | "1. For what values of $x$ is the epistemic uncertainty high or increasing suddenly?\n", 536 | "2. How does your answer in (1) relate to how the $x$ values are distributed (refer back to the original plot)? Think about both the train and test data.\n", 537 | "3. 
How could you reduce the epistemic uncertainty in regions where it is high?" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "id": "CkpvkOL06jRd" 544 | }, 545 | "source": [ 546 | "## 1.6 Conclusion\n", 547 | "\n", 548 | "You've just analyzed the bias, aleatoric uncertainty, and epistemic uncertainty for your first risk-aware model! This is a task that data scientists perform constantly to determine how to improve their models and datasets.\n", 549 | "\n", 550 | "In the next part of the lab, you'll continue to build on these concepts to study them in the context of facial detection systems: not only diagnosing issues of bias and uncertainty, but also developing solutions to *mitigate* these risks.\n", 551 | "\n", 552 | "![alt text](https://raw.githubusercontent.com/MITDeepLearning/introtodeeplearning/2023/lab3/img/solutions_toy.png)" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": { 559 | "id": "nIpfPcpjlsKK" 560 | }, 561 | "outputs": [], 562 | "source": [] 563 | } 564 | ], 565 | "metadata": { 566 | "colab": { 567 | "include_colab_link": true, 568 | "provenance": [] 569 | }, 570 | "kernelspec": { 571 | "display_name": "Python 3", 572 | "name": "python3" 573 | }, 574 | "language_info": { 575 | "name": "python" 576 | } 577 | }, 578 | "nbformat": 4, 579 | "nbformat_minor": 0 580 | } 581 | --------------------------------------------------------------------------------
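The histogram-density idea behind `capsa.HistogramWrapper` (Section 1.3 of the notebook above) can be made concrete in a few lines of NumPy. The sketch below illustrates the binning idea only -- it is not Capsa's actual implementation -- and it assumes 1-D inputs like this lab's toy dataset; the function name `histogram_bias_scores` is hypothetical.

import numpy as np

# Hypothetical sketch: bin the 1-D training inputs, then score each query
# point by the relative frequency of its bin (higher = better represented)
def histogram_bias_scores(x_train, x_query, num_bins=20):
    edges = np.linspace(x_train.min(), x_train.max(), num_bins + 1)
    counts, _ = np.histogram(x_train, bins=edges)
    density = counts / counts.sum()  # relative frequency per bin
    # Map each query point to a bin; out-of-range points are clipped
    # to the edge bins in this toy sketch
    idx = np.clip(np.digitize(x_query, edges) - 1, 0, num_bins - 1)
    return density[idx]

x_train = np.random.triangular(-4, 2, 4, size=2000)  # same shape of data as the lab
print(histogram_bias_scores(x_train, np.linspace(-4, 4, 9)))  # low scores where data is sparse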
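The MVE loss from Section 1.4 can likewise be written out directly. The sketch below assumes a Gaussian likelihood and mirrors the idea behind `capsa.MVEWrapper` rather than its internals: the final two-unit layer carries the predicted mean and log-variance, and the names `mve_head` and `gaussian_nll` are hypothetical.

import tensorflow as tf

# Sketch of an MVE-style model: predict [mu, log_var] for every input x
mve_head = tf.keras.Sequential([
    tf.keras.Input(shape=(1,)),
    tf.keras.layers.Dense(32, "relu"),
    tf.keras.layers.Dense(32, "relu"),
    tf.keras.layers.Dense(2),  # outputs: [mu, log_var]
])

def gaussian_nll(y_true, y_pred):
    mu, log_var = y_pred[:, 0:1], y_pred[:, 1:2]
    # Negative log-likelihood of y_true under N(mu, exp(log_var)),
    # dropping the constant 0.5*log(2*pi) term
    return tf.reduce_mean(0.5 * (log_var + tf.square(y_true - mu) / tf.exp(log_var)))

mve_head.compile(optimizer=tf.keras.optimizers.Adam(1e-2), loss=gaussian_nll)
# mve_head.fit(x_train, y_train, epochs=30)  # same training pipeline as the notebook

The predicted variance `exp(log_var)` then plays the role that `prediction.aleatoric` plays for the wrapped model in the notebook.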
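Finally, the ensembling idea from Section 1.5 can be reproduced by hand: train `N` independently initialized copies of the model and use the spread of their predictions as the epistemic uncertainty estimate. This sketch assumes `create_dense_NN`, `x_train`, `y_train`, and `x_test` exist as defined in the notebook, and it illustrates the idea behind `capsa.EnsembleWrapper`, not its implementation.

import numpy as np
import tensorflow as tf

# Manual ensemble sketch: N copies, each with a fresh random initialization
N = 5
members = []
for _ in range(N):
    m = create_dense_NN()
    m.compile(optimizer=tf.keras.optimizers.Adam(3e-3),
              loss=tf.keras.losses.MeanSquaredError())
    m.fit(x_train, y_train, epochs=30, verbose=0)
    members.append(m)

preds = np.stack([m.predict(x_test, verbose=0) for m in members])  # [N, n_test, 1]
mean_pred = preds.mean(axis=0)  # ensemble prediction (more robust than one model)
epistemic = preds.std(axis=0)   # disagreement across members ~ model uncertainty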