├── .DS_Store
├── Discussion
    ├── .DS_Store
    ├── discussion1.pdf
    ├── discussion10.pdf
    ├── discussion11.pdf
    ├── discussion12.pdf
    ├── discussion13.pdf
    ├── discussion2.pdf
    ├── discussion3.pdf
    ├── discussion4.pdf
    ├── discussion5.pdf
    ├── discussion6.pdf
    ├── discussion7.pdf
    ├── discussion8.pdf
    ├── final_review-finetuning.pdf
    ├── final_review-generative.pdf
    ├── final_review-premt.pdf
    ├── final_review-transformers.pdf
    ├── midterm_ae_and_rnn.pdf
    ├── midterm_basics.pdf
    └── midterm_cnn_and_gnn.pdf
├── Final
    ├── final-sp23-sol.pdf
    └── final-sp23.pdf
├── Homework
    ├── .DS_Store
    ├── Homework0
    │   ├── .DS_Store
    │   ├── Coding
    │   │   ├── color_organ_learning.ipynb
    │   │   └── color_organ_learning_solution.ipynb
    │   ├── hw0.pdf
    │   └── hw0_sol.pdf
    ├── Homework1
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── FullyConnectedNets.ipynb
    │   │   ├── hw1_coding_fully_connected_solution.zip
    │   │   ├── hw1_vis_linearization_sol_(1)_(1)-2.ipynb
    │   │   └── hw2_visualization_linearization.ipynb
    │   ├── hw1.pdf
    │   └── hw1_sol.pdf
    ├── Homework2
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── hw2_linearization_part2-2.ipynb
    │   │   ├── hw2_linearization_part2_sol (2).ipynb
    │   │   ├── hw2_momentum.ipynb
    │   │   ├── hw2_momentum_sol (2).ipynb
    │   │   ├── hw2_optim_init_sol.zip
    │   │   └── hw2_optimizer_init.ipynb
    │   ├── hw2.pdf
    │   └── hw2_sol.pdf
    ├── Homework4
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── hw4_dropout-2.ipynb
    │   │   ├── hw4_dropout_sol (1).ipynb
    │   │   ├── hw4_edge_detection-2.ipynb
    │   │   ├── hw4_edge_detection_sol (1).ipynb
    │   │   └── hw4_gpu_memory.ipynb
    │   ├── hw4.pdf
    │   └── hw4_sol.pdf
    ├── Homework5
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── .DS_Store
    │   │   ├── hw5_graph_clustering-2.ipynb
    │   │   ├── hw5_graph_clustering_solution.ipynb
    │   │   ├── hw5_zkc.ipynb
    │   │   └── hw5_zkc_solution.ipynb
    │   ├── hw5.pdf
    │   └── hw5_sol.pdf
    ├── homework10
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── hw10_lm_prompting.ipynb
    │   │   ├── hw10_maml.ipynb
    │   │   ├── hw10_maml_sol.ipynb
    │   │   ├── hw10_pruning.ipynb
    │   │   ├── hw10_pruning_sol.ipynb
    │   │   ├── hw10_quantization.ipynb
    │   │   └── hw10_quantization_sol.ipynb
    │   ├── hw10.pdf
    │   └── hw10_sol.pdf
    ├── homework11
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── .DS_Store
    │   │   ├── generative_models_sol.zip
    │   │   ├── hw11_continual_learning.ipynb
    │   │   ├── hw11_policy_gradient.ipynb
    │   │   ├── hw11_policy_gradient_sol.ipynb
    │   │   ├── hw11_summarization_part2.ipynb
    │   │   ├── hw11_summarization_part2_sol.ipynb
    │   │   └── hw11_vae_gan.ipynb
    │   ├── hw11.pdf
    │   └── hw11_sol.pdf
    ├── homework12
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── .DS_Store
    │   │   ├── Early_Exit_HW_12_Solutions_Bug_Fixed.ipynb
    │   │   ├── hw12_early_exit_sol.ipynb
    │   │   └── hw12_rlhf_sol.ipynb
    │   ├── hw12.pdf
    │   └── hw12_sol.pdf
    ├── homework3
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── .DS_Store
    │   │   ├── HandDesignFilters_Sol (1).ipynb
    │   │   ├── hw3_HandDesignFilters.ipynb
    │   │   ├── hw3_bn_drop.ipynb
    │   │   ├── hw3_cnn.ipynb
    │   │   ├── hw3_coding_bn_drop_cnn_sol.zip
    │   │   └── hw3_pytorch_cnn.ipynb
    │   ├── hw3.pdf
    │   └── hw3_sol.pdf
    ├── homework6
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── .DS_Store
    │   │   ├── hw6_rnn_and_grad.ipynb
    │   │   ├── hw6_rnn_and_grad_sol.ipynb
    │   │   ├── hw6_rnn_last_name.ipynb
    │   │   └── hw6_rnn_last_name_sol.ipynb
    │   ├── hw6.pdf
    │   └── hw6_sol.pdf
    ├── homework7
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── hw7_ae.ipynb
    │   │   └── hw7_ae_sol.ipynb
    │   ├── hw7.pdf
    │   └── hw7_sol.pdf
    ├── homework8
    │   ├── .DS_Store
    │   ├── coding
    │   │   ├── hw8_hand_transformer.ipynb
    │   │   └── hw8_hand_transformer_sol (1).ipynb
    │   ├── hw8.pdf
    │   └── hw8_sol.pdf
    └── homework9
    │   ├── .DS_Store
    │   ├── coding
    │       ├── .DS_Store
    │       ├── hw9_mae.ipynb
    │       ├── hw9_mae_sol.ipynb
    │       ├── hw9_summarization_part1.ipynb
    │       ├── hw9_summarization_part1_sol.ipynb
    │       └── hw9_visualize_attention.ipynb
    │   ├── hw9.pdf
    │   └── hw9_sol.pdf
├── LICENSE
├── Lecture
    ├── .DS_Store
    ├── Lecture 1.pdf
    ├── Lecture10.pdf
    ├── Lecture11.pdf
    ├── Lecture2.pdf
    ├── Lecture3.pdf
    ├── Lecture4.pdf
    ├── Lecture5.pdf
    ├── Lecture6.pdf
    ├── Lecture7.pdf
    ├── Lecture8.pdf
    ├── Lecture9.pdf
    ├── matrixcookbook.pdf
    ├── note10.png
    └── note9.png
├── Midterm
    ├── mt-sp23-sol.pdf
    └── mt-sp23.pdf
└── README.md


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/.DS_Store


--------------------------------------------------------------------------------
/Discussion/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/.DS_Store


--------------------------------------------------------------------------------
/Discussion/discussion1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion1.pdf


--------------------------------------------------------------------------------
/Discussion/discussion10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion10.pdf


--------------------------------------------------------------------------------
/Discussion/discussion11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion11.pdf


--------------------------------------------------------------------------------
/Discussion/discussion12.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion12.pdf


--------------------------------------------------------------------------------
/Discussion/discussion13.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion13.pdf


--------------------------------------------------------------------------------
/Discussion/discussion2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion2.pdf


--------------------------------------------------------------------------------
/Discussion/discussion3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion3.pdf


--------------------------------------------------------------------------------
/Discussion/discussion4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion4.pdf


--------------------------------------------------------------------------------
/Discussion/discussion5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion5.pdf


--------------------------------------------------------------------------------
/Discussion/discussion6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion6.pdf


--------------------------------------------------------------------------------
/Discussion/discussion7.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion7.pdf


--------------------------------------------------------------------------------
/Discussion/discussion8.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/discussion8.pdf


--------------------------------------------------------------------------------
/Discussion/final_review-finetuning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/final_review-finetuning.pdf


--------------------------------------------------------------------------------
/Discussion/final_review-generative.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/final_review-generative.pdf


--------------------------------------------------------------------------------
/Discussion/final_review-premt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/final_review-premt.pdf


--------------------------------------------------------------------------------
/Discussion/final_review-transformers.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/final_review-transformers.pdf


--------------------------------------------------------------------------------
/Discussion/midterm_ae_and_rnn.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/midterm_ae_and_rnn.pdf


--------------------------------------------------------------------------------
/Discussion/midterm_basics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/midterm_basics.pdf


--------------------------------------------------------------------------------
/Discussion/midterm_cnn_and_gnn.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Discussion/midterm_cnn_and_gnn.pdf


--------------------------------------------------------------------------------
/Final/final-sp23-sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Final/final-sp23-sol.pdf


--------------------------------------------------------------------------------
/Final/final-sp23.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Final/final-sp23.pdf


--------------------------------------------------------------------------------
/Homework/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/.DS_Store


--------------------------------------------------------------------------------
/Homework/Homework0/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework0/.DS_Store


--------------------------------------------------------------------------------
/Homework/Homework0/hw0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework0/hw0.pdf


--------------------------------------------------------------------------------
/Homework/Homework0/hw0_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework0/hw0_sol.pdf


--------------------------------------------------------------------------------
/Homework/Homework1/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework1/.DS_Store


--------------------------------------------------------------------------------
/Homework/Homework1/coding/hw1_coding_fully_connected_solution.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework1/coding/hw1_coding_fully_connected_solution.zip


--------------------------------------------------------------------------------
/Homework/Homework1/coding/hw2_visualization_linearization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "Giacfc7eHrUJ"
  7 |       },
  8 |       "source": [
  9 |         "# Visualizing features from local linearization of neural nets"
 10 |       ]
 11 |     },
 12 |     {
 13 |       "cell_type": "code",
 14 |       "execution_count": null,
 15 |       "metadata": {
 16 |         "id": "ufJzj4IOHrUO"
 17 |       },
 18 |       "outputs": [],
 19 |       "source": [
 20 |         "!pip install ipympl torchviz\n",
 21 |         "!pip install torch==1.13 --extra-index-url https://download.pytorch.org/whl/cpu\n",
 22 |         "# restart your runtime after this step"
 23 |       ]
 24 |     },
 25 |     {
 26 |       "cell_type": "code",
 27 |       "execution_count": null,
 28 |       "metadata": {
 29 |         "id": "sKsd5p0jHrUP"
 30 |       },
 31 |       "outputs": [],
 32 |       "source": [
 33 |         "import torch\n",
 34 |         "import torch.nn as nn\n",
 35 |         "import matplotlib.pyplot as plt\n",
 36 |         "import numpy as np\n",
 37 |         "import copy\n",
 38 |         "import time\n",
 39 |         "from torchvision.models.feature_extraction import create_feature_extractor\n",
 40 |         "from ipywidgets import fixed, interactive, widgets\n",
 41 |         "%matplotlib inline"
 42 |       ]
 43 |     },
 44 |     {
 45 |       "cell_type": "code",
 46 |       "execution_count": null,
 47 |       "metadata": {
 48 |         "id": "7ACCCVC-HrUQ"
 49 |       },
 50 |       "outputs": [],
 51 |       "source": [
 52 |         "# enable matplotlib widgets;\n",
 53 |         "\n",
 54 |         "# on Google Colab\n",
 55 |         "from google.colab import output\n",
 56 |         "output.enable_custom_widget_manager()\n",
 57 |         "\n",
 58 |         "%matplotlib widget"
 59 |       ]
 60 |     },
 61 |     {
 62 |       "cell_type": "code",
 63 |       "execution_count": null,
 64 |       "metadata": {
 65 |         "id": "pzhYAZbiHrUQ"
 66 |       },
 67 |       "outputs": [],
 68 |       "source": [
 69 |         "def to_torch(x):\n",
 70 |         "    return torch.from_numpy(x).float()\n",
 71 |         "\n",
 72 |         "\n",
 73 |         "def to_numpy(x):\n",
 74 |         "    return x.detach().numpy()\n",
 75 |         "\n",
 76 |         "\n",
 77 |         "def plot_data(X, y, X_test, y_test):\n",
 78 |         "    clip_bound = 2.5\n",
 79 |         "    plt.xlim(0, 1)\n",
 80 |         "    plt.ylim(-clip_bound, clip_bound)\n",
 81 |         "    plt.scatter(X[:, 0], y, c='darkorange', s=40.0, label='training data points')\n",
 82 |         "    plt.plot(X_test, y_test, '--', color='royalblue', linewidth=2.0, label='Ground truth')\n",
 83 |         "\n",
 84 |         "\n",
 85 |         "def plot_relu(bias, slope):\n",
 86 |         "    plt.scatter([-bias / slope], 0, c='darkgrey', s=40.0)\n",
 87 |         "    if slope > 0 and bias < 0:\n",
 88 |         "        plt.plot([0, -bias / slope, 1], [0, 0, slope * (1 - bias)], ':')\n",
 89 |         "    elif slope < 0 and bias > 0:\n",
 90 |         "        plt.plot([0, -bias / slope, 1], [-bias * slope, 0, 0], ':')\n",
 91 |         "\n",
 92 |         "\n",
 93 |         "def plot_relus(params):\n",
 94 |         "    slopes = to_numpy(params[0]).ravel()\n",
 95 |         "    biases = to_numpy(params[1])\n",
 96 |         "    for relu in range(biases.size):\n",
 97 |         "        plot_relu(biases[relu], slopes[relu])\n",
 98 |         "\n",
 99 |         "\n",
100 |         "def plot_function(X_test, net):\n",
101 |         "    y_pred = net(to_torch(X_test))\n",
102 |         "    plt.plot(X_test, to_numpy(y_pred), '-', color='forestgreen', label='prediction')\n",
103 |         "\n",
104 |         "\n",
105 |         "def plot_update(X, y, X_test, y_test, net, state=None):\n",
106 |         "    if state is not None:\n",
107 |         "        net.load_state_dict(state)\n",
108 |         "    plt.figure(figsize=(10, 7))\n",
109 |         "    plot_relus(list(net.parameters()))\n",
110 |         "    plot_function(X_test, net)\n",
111 |         "    plot_data(X, y, X_test, y_test)\n",
112 |         "    plt.legend()\n",
113 |         "    plt.show();\n",
114 |         "\n",
115 |         "\n",
116 |         "def train_network(X, y, X_test, y_test, net, optim, n_steps, save_every, initial_weights=None, verbose=False):\n",
117 |         "    loss = torch.nn.MSELoss()\n",
118 |         "    y_torch = to_torch(y.reshape(-1, 1))\n",
119 |         "    X_torch = to_torch(X)\n",
120 |         "    if initial_weights is not None:\n",
121 |         "        net.load_state_dict(initial_weights)\n",
122 |         "    history = {}\n",
123 |         "    for s in range(n_steps):\n",
124 |         "        subsample = np.random.choice(y.size, y.size // 5)\n",
125 |         "        step_loss = loss(y_torch[subsample], net(X_torch[subsample, :]))\n",
126 |         "        optim.zero_grad()\n",
127 |         "        step_loss.backward()\n",
128 |         "        optim.step()\n",
129 |         "        if (s + 1) % save_every == 0 or s == 0:\n",
130 |         "#             plot_update(X, y, X_test, y_test, net)\n",
131 |         "            history[s + 1] = {}\n",
132 |         "            history[s + 1]['state'] = copy.deepcopy(net.state_dict())\n",
133 |         "            with torch.no_grad():\n",
134 |         "                test_loss = loss(to_torch(y_test.reshape(-1, 1)), net(to_torch(X_test)))\n",
135 |         "            history[s + 1]['train_error'] = to_numpy(step_loss).item()\n",
136 |         "            history[s + 1]['test_error'] = to_numpy(test_loss).item()\n",
137 |         "            if verbose:\n",
138 |         "                print(\"SGD Iteration %d\" % (s + 1))\n",
139 |         "                print(\"\\tTrain Loss: %.3f\" % to_numpy(step_loss).item())\n",
140 |         "                print(\"\\tTest Loss: %.3f\" % to_numpy(test_loss).item())\n",
141 |         "            else:\n",
142 |         "                # Print update every 10th save point\n",
143 |         "                if (s + 1) % (save_every * 10) == 0:\n",
144 |         "                    print(\"SGD Iteration %d\" % (s + 1))\n",
145 |         "\n",
146 |         "    return history\n",
147 |         "\n",
148 |         "\n",
149 |         "def plot_test_train_errors(history):\n",
150 |         "    sample_points = np.array(list(history.keys()))\n",
151 |         "    etrain = [history[s]['train_error'] for s in history]\n",
152 |         "    etest = [history[s]['test_error'] for s in history]\n",
153 |         "    plt.plot(sample_points / 1e3, etrain, label='Train Error')\n",
154 |         "    plt.plot(sample_points / 1e3, etest, label='Test Error')\n",
155 |         "    plt.xlabel(\"Iterations (1000's)\")\n",
156 |         "    plt.ylabel(\"MSE\")\n",
157 |         "    plt.yscale('log')\n",
158 |         "    plt.legend()\n",
159 |         "    plt.show();\n",
160 |         "\n",
161 |         "\n",
162 |         "def make_iter_slider(iters):\n",
163 |         "    # print(iters)\n",
164 |         "    return widgets.SelectionSlider(\n",
165 |         "        options=iters,\n",
166 |         "        value=1,\n",
167 |         "        description='SGD Iterations: ',\n",
168 |         "        disabled=False\n",
169 |         "    )\n",
170 |         "\n",
171 |         "\n",
172 |         "def history_interactive(history, idx, X, y, X_test, y_test, net):\n",
173 |         "    plot_update(X, y, X_test, y_test, net, state=history[idx]['state'])\n",
174 |         "    plt.show()\n",
175 |         "    print(\"Train Error: %.3f\" % history[idx]['train_error'])\n",
176 |         "    print(\"Test Error: %.3f\" % history[idx]['test_error'])\n",
177 |         "\n",
178 |         "\n",
179 |         "def make_history_interactive(history, X, y, X_test, y_test, net):\n",
180 |         "    sample_points = list(history.keys())\n",
181 |         "    return interactive(history_interactive,\n",
182 |         "                       history=fixed(history),\n",
183 |         "                       idx=make_iter_slider(sample_points),\n",
184 |         "                       X=fixed(X),\n",
185 |         "                       y=fixed(y),\n",
186 |         "                       X_test=fixed(X_test),\n",
187 |         "                       y_test=fixed(y_test),\n",
188 |         "                       net=fixed(net))\n",
189 |         "\n",
190 |         "\n",
191 |         "%matplotlib inline"
192 |       ]
193 |     },
194 |     {
195 |       "cell_type": "markdown",
196 |       "metadata": {
197 |         "id": "8InbMWunHrUV"
198 |       },
199 |       "source": [
200 |         "# Generate Training and Test Data\n",
201 |         "\n",
202 |         "We are using piecewise linear function. Our training data has added noise $y = f(x) + \\epsilon,\\, \\epsilon \\sim \\mathcal{N}(0, \\sigma^2)$. The test data is noise free.\n",
203 |         "\n",
204 |         "_Once you have gone through the discussion once you may wish to adjust the number of training samples and noise variance to see how gradient descent behaves under the new conditions._"
205 |       ]
206 |     },
207 |     {
208 |       "cell_type": "code",
209 |       "execution_count": null,
210 |       "metadata": {
211 |         "id": "CgiG1--NHrUV"
212 |       },
213 |       "outputs": [],
214 |       "source": [
215 |         "f_type = 'piecewise_linear'\n",
216 |         "\n",
217 |         "def f_true(X, f_type):\n",
218 |         "    if f_type == 'sin(20x)':\n",
219 |         "        return np.sin(20 * X[:,0])\n",
220 |         "    else:\n",
221 |         "        TenX = 10 * X[:,0]\n",
222 |         "        _ = 12345\n",
223 |         "        return (TenX - np.floor(TenX)) * np.sin(_ * np.ceil(TenX)) - (TenX - np.ceil(TenX)) * np.sin(_ * np.floor(TenX)) \n",
224 |         "    \n",
225 |         "n_features = 1\n",
226 |         "n_samples = 200\n",
227 |         "sigma = 0.1\n",
228 |         "rng = np.random.RandomState(1)\n",
229 |         "\n",
230 |         "# Generate train data\n",
231 |         "X = np.sort(rng.rand(n_samples, n_features), axis=0)\n",
232 |         "y = f_true(X, f_type) + rng.randn(n_samples) * sigma\n",
233 |         "\n",
234 |         "# Generate NOISELESS test data\n",
235 |         "X_test = np.concatenate([X.copy(), np.expand_dims(np.linspace(0., 1., 1000), axis=1)])\n",
236 |         "X_test = np.sort(X_test, axis=0)\n",
237 |         "y_test = f_true(X_test, f_type)"
238 |       ]
239 |     },
240 |     {
241 |       "cell_type": "code",
242 |       "execution_count": null,
243 |       "metadata": {
244 |         "id": "THtHT2T4HrUX"
245 |       },
246 |       "outputs": [],
247 |       "source": [
248 |         "plt.scatter(X, y)\n",
249 |         "plt.show()"
250 |       ]
251 |     },
252 |     {
253 |       "cell_type": "code",
254 |       "execution_count": null,
255 |       "metadata": {
256 |         "id": "xSyroCMtHrUY"
257 |       },
258 |       "outputs": [],
259 |       "source": [
260 |         "plt.scatter(X_test, y_test)\n",
261 |         "plt.show()"
262 |       ]
263 |     },
264 |     {
265 |       "cell_type": "markdown",
266 |       "metadata": {
267 |         "id": "sVUXmkd3HrUZ"
268 |       },
269 |       "source": [
270 |         "# Define the Neural Networks\n",
271 |         "\n",
272 |         "We will learn the piecewise linear target function using a simple 1-hidden layer neural network with ReLU non-linearity, defined by\n",
273 |         "$$ \\hat{y} = \\mathbf{W}^{(2)} \\Phi \\left( \\mathbf{W}^{(1)} x + \\mathbf{b}^{(1)} \\right) + \\mathbf{b}^{(2)} $$\n",
274 |         "where $\\Phi(x) = ReLU(x)$ and superscripts refer to indices, not the power operator.\n",
275 |         "\n",
276 |         "We will also create two SGD optimizers to allow us to choose whether to train all parameters or only the linear output layer's parameters. Note that we use separate learning rates for the two version of training. There is too much variance in the gradients when training all layers to use a large learning rate, so we have to decrease it.\n",
277 |         "\n",
278 |         "We will modify the default initialization of the biases so that the ReLU elbows are all inside the region we are interested in.\n",
279 |         "\n",
280 |         "We create several versions of this network with varying widths to explore how hidden layer width impacts learning performance.\n",
281 |         "\n",
282 |         "_Once you have gone through the discussion once you may wish to train networks with even larger widths to see how they behave under the three different training paradigms in this notebook._"
283 |       ]
284 |     },
285 |     {
286 |       "cell_type": "code",
287 |       "execution_count": null,
288 |       "metadata": {
289 |         "id": "Z5gY3Ou3HrUZ"
290 |       },
291 |       "outputs": [],
292 |       "source": [
293 |         "# Don't rerun this cell after training or you will lose all your work\n",
294 |         "nets_by_size = {}"
295 |       ]
296 |     },
297 |     {
298 |       "cell_type": "code",
299 |       "execution_count": null,
300 |       "metadata": {
301 |         "id": "dqAZu5-WHrUa"
302 |       },
303 |       "outputs": [],
304 |       "source": [
305 |         "widths = [10, 20, 40]\n",
306 |         "for width in widths:\n",
307 |         "    # Define a 1-hidden layer ReLU nonlinearity network\n",
308 |         "    net = nn.Sequential(nn.Linear(1, width),\n",
309 |         "                        nn.ReLU(),\n",
310 |         "                        nn.Linear(width, 1))\n",
311 |         "    loss = nn.MSELoss()\n",
312 |         "    # Get trainable parameters\n",
313 |         "    weights_all = list(net.parameters())\n",
314 |         "    # Get the output weights alone\n",
315 |         "    weights_out = weights_all[2:]\n",
316 |         "    # Adjust initial biases so elbows are in [0,1]\n",
317 |         "    elbows = np.sort(np.random.rand(width))\n",
318 |         "    new_biases = -elbows * to_numpy(weights_all[0]).ravel()\n",
319 |         "    weights_all[1].data = to_torch(new_biases)\n",
320 |         "    # Create SGD optimizers for outputs alone and for all weights\n",
321 |         "    lr_out = 0.2\n",
322 |         "    lr_all = 0.02\n",
323 |         "    opt_all = torch.optim.SGD(params=weights_all, lr=lr_all)\n",
324 |         "    opt_out = torch.optim.SGD(params=weights_out, lr=lr_out)\n",
325 |         "    # Save initial state for comparisons\n",
326 |         "    initial_weights = copy.deepcopy(net.state_dict())\n",
327 |         "    # print(\"Initial Weights\", initial_weights)\n",
328 |         "    nets_by_size[width] = {'net': net, 'opt_all': opt_all, \n",
329 |         "                           'opt_out': opt_out, 'init': initial_weights}"
330 |       ]
331 |     },
332 |     {
333 |       "cell_type": "markdown",
334 |       "metadata": {
335 |         "id": "kXk00JfvHrUb"
336 |       },
337 |       "source": [
338 |         "# Train the neural networks"
339 |       ]
340 |     },
341 |     {
342 |       "cell_type": "code",
343 |       "execution_count": null,
344 |       "metadata": {
345 |         "id": "jVMdbT03HrUb"
346 |       },
347 |       "outputs": [],
348 |       "source": [
349 |         "n_steps = 150000\n",
350 |         "save_every = 1000\n",
351 |         "t0 = time.time()\n",
352 |         "for w in widths:\n",
353 |         "    print(\"-\"*40)\n",
354 |         "    print(\"Width\", w)\n",
355 |         "    new_net = nn.Sequential(nn.Linear(1, w),\n",
356 |         "                        nn.ReLU(),\n",
357 |         "                        nn.Linear(w, 1))\n",
358 |         "    new_net.load_state_dict(nets_by_size[w]['net'].state_dict().copy())\n",
359 |         "    opt_all = torch.optim.SGD(params=new_net.parameters(), lr=lr_all)\n",
360 |         "    initial_weights = nets_by_size[w]['init']\n",
361 |         "    history_all = train_network(X, y, X_test, y_test, \n",
362 |         "                            new_net, optim=opt_all, \n",
363 |         "                            n_steps=n_steps, save_every=save_every, \n",
364 |         "                            initial_weights=initial_weights,\n",
365 |         "                            verbose=False)\n",
366 |         "    nets_by_size[w]['trained_net'] = new_net\n",
367 |         "    nets_by_size[w]['hist_all'] = history_all\n",
368 |         "    print(\"Width\", w)\n",
369 |         "    plot_test_train_errors(history_all)\n",
370 |         "t1 = time.time()\n",
371 |         "print(\"-\"*40)\n",
372 |         "print(\"Trained all layers in %.1f minutes\" % ((t1 - t0) / 60))"
373 |       ]
374 |     },
375 |     {
376 |       "cell_type": "markdown",
377 |       "metadata": {
378 |         "id": "S5u1kSTSHrUc"
379 |       },
380 |       "source": [
381 |         "# (a) Visualize Gradients\n",
382 |         "\n",
383 |         "Visualize the features corresponding to\n",
384 |         "$\\frac{\\partial}{\\partial w_i^{(1)}} y(x)$\n",
385 |         "and\n",
386 |         "$\\frac{\\partial}{\\partial b_i^{(1)}} y(x)$\n",
387 |         "where\n",
388 |         "$w^{(1)}_i$\n",
389 |         "are\n",
390 |         "the first hidden layer's weights and the\n",
391 |         "$b^{(1)}_i$\n",
392 |         "are the first hidden layer's biases. These derivatives should be evaluated at\n",
393 |         "at least both the random initialization and the final trained\n",
394 |         "network. When visualizing these features, plot them as a function\n",
395 |         "of the scalar input $x$, the same way that the notebook plots the\n",
396 |         "constituent \"elbow\" features that are the outputs of the\n",
397 |         "penultimate layer."
398 |       ]
399 |     },
400 |     {
401 |       "cell_type": "code",
402 |       "execution_count": null,
403 |       "metadata": {
404 |         "id": "sw9_omspHrUc"
405 |       },
406 |       "outputs": [],
407 |       "source": [
408 |         "def backward_and_plot_grad(X, model, vis_name='all', title='', legend=False):\n",
409 |         "    \"\"\"\n",
410 |         "    Run backpropagation on `model` using `X` as the input\n",
411 |         "    to compute the gradient w.r.t. parameters of `y`,\n",
412 |         "    and then visualize collected gradients according to `vis_name`\n",
413 |         "    \"\"\"\n",
414 |         "    width = model[0].out_features  # the width is the number of hidden units.\n",
415 |         "    gradients = np.zeros((width, X.shape[0]))\n",
416 |         "    num_pts = 0\n",
417 |         "    gradient_collect, vis_collect = { }, { }\n",
418 |         "    for x in X:\n",
419 |         "        y = model(to_torch(x))\n",
420 |         "\n",
421 |         "        ########################################################################\n",
422 |         "        # TODO: Complete the following part to run backpropagation. (2 lines)\n",
423 |         "        # Hint: Remember to set grad to zero before backpropagation\n",
424 |         "        ########################################################################\n",
425 |         "        pass\n",
426 |         "        ########################################################################\n",
427 |         "        \n",
428 |         "        # collect gradients from `p.grad.data`\n",
429 |         "        for n, p in model.named_parameters():\n",
430 |         "            for w_idx, w_grad in enumerate( p.grad.data.reshape(-1) ):\n",
431 |         "                if f'{n}.{w_idx}' not in gradient_collect:\n",
432 |         "                    gradient_collect[ f'{n}.{w_idx}' ] = {'x':[], 'y': []}\n",
433 |         "                if vis_name == 'all' or vis_name == n:\n",
434 |         "                    if f'{n}.{w_idx}' not in vis_collect:\n",
435 |         "                        vis_collect[f'{n}.{w_idx}'] = True\n",
436 |         "                gradient_collect[ f'{n}.{w_idx}' ]['y'].append( w_grad.item() )\n",
437 |         "                gradient_collect[ f'{n}.{w_idx}' ]['x'].append( x )\n",
438 |         "    \n",
439 |         "    for w_n in vis_collect:\n",
440 |         "        # we assume that X is sorted, so we use line plot\n",
441 |         "        plt.plot( X, gradient_collect[w_n]['y'], label=w_n )\n",
442 |         "        \n",
443 |         "    plt.xlabel('Data Points (X)')\n",
444 |         "    plt.ylabel(f'Gradient for {vis_name} of {width}-width Net')\n",
445 |         "    if legend:\n",
446 |         "        plt.legend()\n",
447 |         "    plt.title(title)\n",
448 |         "    plt.show()\n",
449 |         "\n",
450 |         "\n",
451 |         "for width in nets_by_size:\n",
452 |         "    backward_and_plot_grad(X, nets_by_size[width]['net'], '0.weight', 'Random Init')\n",
453 |         "    backward_and_plot_grad(X, nets_by_size[width]['trained_net'], '0.weight', 'Trained')\n",
454 |         "    backward_and_plot_grad(X, nets_by_size[width]['net'], '0.bias', 'Random Init')\n",
455 |         "    backward_and_plot_grad(X, nets_by_size[width]['trained_net'], '0.bias', 'Trained')"
456 |       ]
457 |     },
458 |     {
459 |       "cell_type": "markdown",
460 |       "metadata": {
461 |         "id": "faJqnWTtHrUd"
462 |       },
463 |       "source": [
464 |         "# (b) SVD for feature matrix\n",
465 |         "\n",
466 |         "During training, we can imagine that we have a generalized\n",
467 |         "linear model with a feature matrix corresponding to the linearized\n",
468 |         "features corresponding to each learnable parameter. We know from\n",
469 |         "our analysis of gradient descent, that the singular values and\n",
470 |         "singular vectors corresponding to this feature matrix are\n",
471 |         "important.\n",
472 |         "\n",
473 |         "Use the SVD of this feature matrix to plot both the singular values and visualize the “principle\n",
474 |         "features” that correspond to the d-dimensional singular vectors multiplied by all the features\n",
475 |         "corresponding to the parameters\n",
476 |         "\n",
477 |         "(HINT: Remember that the feature matrix whose SVD you are\n",
478 |         "taking has $n$ rows where each row corresponds to one training\n",
479 |         "point and $d$ columns where each column corresponds to each of\n",
480 |         "the learnable features. Meanwhile, you are going to be\n",
481 |         "plotting/visualizing the \"principle features\" as functions of\n",
482 |         "$x$ even at places where you don't have training points.)"
483 |       ]
484 |     },
485 |     {
486 |       "cell_type": "code",
487 |       "execution_count": null,
488 |       "metadata": {
489 |         "id": "Kytqv9qqHrUf"
490 |       },
491 |       "outputs": [],
492 |       "source": [
493 |         "def compute_svd_plot_features(X, y, X_test, y_test, model):\n",
494 |         "    width = model[0].out_features  # the width is the number of hidden units.\n",
495 |         "    gradients = np.zeros((width, X.shape[0]))\n",
496 |         "    num_pts = 0\n",
497 |         "    gradient_collect, vis_collect = { }, { }\n",
498 |         "    for x in X:\n",
499 |         "        y = model(to_torch(x))\n",
500 |         "\n",
501 |         "        ########################################################################\n",
502 |         "        # TODO: Complete the following part to run backpropagation. (2 lines)\n",
503 |         "        # Hint: The same as part (a)\n",
504 |         "        ########################################################################\n",
505 |         "        pass\n",
506 |         "        ########################################################################\n",
507 |         "\n",
508 |         "        for n, p in model.named_parameters():\n",
509 |         "            for w_idx, w_grad in enumerate( p.grad.view(-1).data ):\n",
510 |         "                if f'{n}.{w_idx}' not in gradient_collect:\n",
511 |         "                    gradient_collect[ f'{n}.{w_idx}' ] = {'x':[], 'y': []}\n",
512 |         "                gradient_collect[ f'{n}.{w_idx}' ]['y'].append( w_grad.item() )\n",
513 |         "                gradient_collect[ f'{n}.{w_idx}' ]['x'].append( x )\n",
514 |         "\n",
515 |         "    feature_matrix = []\n",
516 |         "    for w_n in gradient_collect:\n",
517 |         "        feature_matrix.append( gradient_collect[w_n]['y'] )\n",
518 |         "    feature_matrix = np.array( feature_matrix ).T\n",
519 |         "\n",
520 |         "    ############################################################################\n",
521 |         "    # TODO: Complete the following part to SVD-decompose the feature matrix.\n",
522 |         "    #       (1 line)\n",
523 |         "    # Hint: the shape of u, s, vh should be [n, d], [d], and [d, d]\n",
524 |         "    #       respectively\n",
525 |         "    ############################################################################\n",
526 |         "    u, s, vh = ?\n",
527 |         "    ############################################################################\n",
528 |         "\n",
529 |         "    plt.scatter(np.arange(s.shape[0]), s, c='darkorange', s=40.0, label='singular values')\n",
530 |         "    plt.legend()\n",
531 |         "    plt.show()\n",
532 |         "\n",
533 |         "    # Construct more training matrix\n",
534 |         "    ############################################################################\n",
535 |         "    # TODO: Complete the following part to compute the pricipal feature\n",
536 |         "    #       (1 line)\n",
537 |         "    ############################################################################\n",
538 |         "    princple_feature = ?\n",
539 |         "    ############################################################################\n",
540 |         "\n",
541 |         "\n",
542 |         "    for w_idx in range(feature_matrix.shape[1]):\n",
543 |         "        plt.plot( X, princple_feature.T[w_idx] )\n",
544 |         "        \n",
545 |         "    plt.xlabel('Data Points (X)')\n",
546 |         "    plt.ylabel(f'Princeple Feature of {width}-width Net')\n",
547 |         "    plt.show()\n",
548 |         "\n",
549 |         "for w in widths:\n",
550 |         "    net = nets_by_size[w]['net']\n",
551 |         "    print(\"Width\", w)\n",
552 |         "    compute_svd_plot_features(X, y, X_test, y_test, net)\n"
553 |       ]
554 |     },
555 |     {
556 |       "cell_type": "markdown",
557 |       "metadata": {
558 |         "id": "KIIS_WUwHrUg"
559 |       },
560 |       "source": [
561 |         "# (c) Two-layer Network\n",
562 |         "\n",
563 |         "Augment the jupyter notebook to add a second hidden\n",
564 |         "layer of the same size as the first hidden layer, fully connected\n",
565 |         "to the first hidden layer.\n",
566 |         "\n",
567 |         "Allow the visualization of the features corresponding\n",
568 |         "to the parameters in both hidden layers, as well as the\n",
569 |         "\"principle features\" and the singular values."
570 |       ]
571 |     },
572 |     {
573 |       "cell_type": "code",
574 |       "execution_count": null,
575 |       "metadata": {
576 |         "id": "VBE5XB24HrUg"
577 |       },
578 |       "outputs": [],
579 |       "source": [
580 |         "############################################################################\n",
581 |         "# TODO: Write your code here\n",
582 |         "############################################################################"
583 |       ]
584 |     }
585 |   ],
586 |   "metadata": {
587 |     "language_info": {
588 |       "name": "python"
589 |     },
590 |     "colab": {
591 |       "provenance": []
592 |     },
593 |     "kernelspec": {
594 |       "name": "python3",
595 |       "display_name": "Python 3"
596 |     },
597 |     "gpuClass": "standard"
598 |   },
599 |   "nbformat": 4,
600 |   "nbformat_minor": 0
601 | }


--------------------------------------------------------------------------------
/Homework/Homework1/hw1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework1/hw1.pdf


--------------------------------------------------------------------------------
/Homework/Homework1/hw1_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework1/hw1_sol.pdf


--------------------------------------------------------------------------------
/Homework/Homework2/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework2/.DS_Store


--------------------------------------------------------------------------------
/Homework/Homework2/coding/hw2_momentum.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "id": "600af611-161a-41fb-87fa-1a3188c3b35d",
  6 |       "metadata": {
  7 |         "id": "600af611-161a-41fb-87fa-1a3188c3b35d"
  8 |       },
  9 |       "source": [
 10 |         "# CS182 HW2 - Accelerating Gradient Descent with Momentum (coding part)\n",
 11 |         "\n",
 12 |         "In this notebook, we will understand gradient descent and  gradient descent with momentum in practice."
 13 |       ]
 14 |     },
 15 |     {
 16 |       "cell_type": "code",
 17 |       "execution_count": null,
 18 |       "id": "180f3378-b2cf-44c1-91c6-d4961d282cdb",
 19 |       "metadata": {
 20 |         "id": "180f3378-b2cf-44c1-91c6-d4961d282cdb"
 21 |       },
 22 |       "outputs": [],
 23 |       "source": [
 24 |         "import numpy as np\n",
 25 |         "import matplotlib.pyplot as plt\n",
 26 |         "\n",
 27 |         "np.random.seed(0)"
 28 |       ]
 29 |     },
 30 |     {
 31 |       "cell_type": "markdown",
 32 |       "id": "ec3c2d38-1039-408c-a3a5-7efb371fec28",
 33 |       "metadata": {
 34 |         "id": "ec3c2d38-1039-408c-a3a5-7efb371fec28"
 35 |       },
 36 |       "source": [
 37 |         "## Generate and Visualize Data\n",
 38 |         "\n",
 39 |         "We generate a dataset of 2D datapoints from the gaussian distribution with a mean of $(-3, 0)$ and covariance matrix of $\\begin{pmatrix}3 & 0 \\\\ 0 & 1\\end{pmatrix}$. The binary labels $y$ indicate whether the second dimension is greater than 0 (positive) or not (negative). The data is visualized using a scatter plot with different colors representing the different labels."
 40 |       ]
 41 |     },
 42 |     {
 43 |       "cell_type": "code",
 44 |       "execution_count": null,
 45 |       "id": "743c76f1-6f99-4ad7-b7b2-f32a17d5229e",
 46 |       "metadata": {
 47 |         "id": "743c76f1-6f99-4ad7-b7b2-f32a17d5229e"
 48 |       },
 49 |       "outputs": [],
 50 |       "source": [
 51 |         "def gen_gaussian_points(n, mean, sigma):\n",
 52 |         "    return np.random.normal(mean, sigma, [n, 2])\n",
 53 |         "\n",
 54 |         "N = 500\n",
 55 |         "\n",
 56 |         "X = gen_gaussian_points(N, [-3, 0], [3, 1])\n",
 57 |         "y = (X[:,1]>0).astype(float)\n",
 58 |         "y = np.expand_dims(y, axis=-1)\n",
 59 |         "\n",
 60 |         "\n",
 61 |         "plt.scatter(*X[y.squeeze()==0].T)\n",
 62 |         "plt.scatter(*X[y.squeeze()==1].T)\n",
 63 |         "plt.title(\"Visualization of Data\")\n",
 64 |         "plt.show()"
 65 |       ]
 66 |     },
 67 |     {
 68 |       "cell_type": "markdown",
 69 |       "id": "oDuTuhNajI_z",
 70 |       "metadata": {
 71 |         "id": "oDuTuhNajI_z"
 72 |       },
 73 |       "source": [
 74 |         "## Visualizing the Loss Landscape\n",
 75 |         "\n",
 76 |         "The following contour plot visualizes the loss landscape of this optimization task. It's important to note that the data has been generated such that the correlation coefficient between dimension 0 and dimension 1 is zero, aligning dimension 0 and dimension 1 with the two singular vectors in the Singular Value Decomposition (SVD) of the data matrix."
 77 |       ]
 78 |     },
 79 |     {
 80 |       "cell_type": "code",
 81 |       "execution_count": null,
 82 |       "id": "MVjQIX8Mjt2t",
 83 |       "metadata": {
 84 |         "id": "MVjQIX8Mjt2t"
 85 |       },
 86 |       "outputs": [],
 87 |       "source": [
 88 |         "w0_s, w1_s = np.meshgrid(np.linspace(-0.5, 0.5, 100), np.linspace(-0.5, 0.5, 100))\n",
 89 |         "w_s = np.stack([w0_s.reshape(-1), w1_s.reshape(-1)], axis=1)\n",
 90 |         "loss_s = ((X @ w_s.T - y) ** 2).sum(axis=0).reshape(100, 100)\n",
 91 |         "from matplotlib import ticker, cm\n",
 92 |         "plt.contourf(w0_s, w1_s, loss_s, cmap=cm.PuBu_r, levels=40)\n",
 93 |         "plt.colorbar()\n",
 94 |         "plt.xlabel(\"w0\")\n",
 95 |         "plt.ylabel(\"w1\")\n",
 96 |         "plt.show()"
 97 |       ]
 98 |     },
 99 |     {
100 |       "cell_type": "markdown",
101 |       "id": "ce3e8be6-c6d6-4fb2-946a-b4d6530fe5a6",
102 |       "metadata": {
103 |         "id": "ce3e8be6-c6d6-4fb2-946a-b4d6530fe5a6"
104 |       },
105 |       "source": [
106 |         "## (Plain) Gradient Descent\n",
107 |         "We will implement gradient descent *without* momentum below."
108 |       ]
109 |     },
110 |     {
111 |       "cell_type": "code",
112 |       "execution_count": null,
113 |       "id": "08ecbc62-3958-46e3-aeea-167e4fefcd2a",
114 |       "metadata": {
115 |         "id": "08ecbc62-3958-46e3-aeea-167e4fefcd2a"
116 |       },
117 |       "outputs": [],
118 |       "source": [
119 |         "def runGD(maxiter,stepsize):\n",
120 |         "    w = np.zeros((2, 1))\n",
121 |         "    grads = []\n",
122 |         "    ws = []\n",
123 |         "    losses = []\n",
124 |         "    for i in range(maxiter):\n",
125 |         "        grad = 2 * (X.T @ X @ w) - 2 * X.T @ y\n",
126 |         "        w = w - stepsize * grad\n",
127 |         "        grads.append(grad)\n",
128 |         "        ws.append(w)\n",
129 |         "        loss = np.linalg.norm(y - X @ w) ** 2\n",
130 |         "        losses.append(loss)\n",
131 |         "    print(\"Final loss =\", loss)\n",
132 |         "    grads = np.array(grads).squeeze()\n",
133 |         "    ws = np.array(ws).squeeze()\n",
134 |         "    return grads, ws, losses \n",
135 |         "\n",
136 |         "maxiter = 100\n",
137 |         "stepsize = 1e-4\n",
138 |         "grads, ws, losses = runGD(maxiter,stepsize)"
139 |       ]
140 |     },
141 |     {
142 |       "cell_type": "markdown",
143 |       "id": "6f85f532-3ac4-402e-9c96-2f0c76acd9c0",
144 |       "metadata": {
145 |         "id": "6f85f532-3ac4-402e-9c96-2f0c76acd9c0"
146 |       },
147 |       "source": [
148 |         "## Gradient Descent with Momentum\n",
149 |         "Implement the gradient descent with momentum algorithm. **Fill in the missing code** for updating the parameters. As a verification step, compare the final loss with the previous part to ensure it is reasonable and not significantly different."
150 |       ]
151 |     },
152 |     {
153 |       "cell_type": "code",
154 |       "execution_count": null,
155 |       "id": "9fded0aa-b390-4f67-8522-fe924ccccf02",
156 |       "metadata": {
157 |         "id": "9fded0aa-b390-4f67-8522-fe924ccccf02"
158 |       },
159 |       "outputs": [],
160 |       "source": [
161 |         "def runGDM(maxiter, stepsize, beta):\n",
162 |         "    w = np.zeros((2, 1))\n",
163 |         "    grads_m = []\n",
164 |         "    ws_m = []\n",
165 |         "    losses_m = []\n",
166 |         "    for i in range(maxiter):\n",
167 |         "        grad = 2 * (X.T @ X @ w) - 2 * X.T @ y\n",
168 |         "        if i == 0:\n",
169 |         "            smoothed_grad = grad\n",
170 |         "        ###############################################\n",
171 |         "        ###       TODO: YOUR CODE HERE              ###\n",
172 |         "        ###############################################\n",
173 |         "        smoothed_grad = ?\n",
174 |         "        ###############################################\n",
175 |         "        ###       END OF YOUR CODE                  ###\n",
176 |         "        ###############################################\n",
177 |         "        w = w - stepsize * smoothed_grad\n",
178 |         "        grads_m.append(grad)\n",
179 |         "        ws_m.append(w)\n",
180 |         "        loss = np.linalg.norm(y - X @ w) ** 2\n",
181 |         "        losses_m.append(loss)\n",
182 |         "    print(\"Final loss =\", loss)\n",
183 |         "    grads_m = np.array(grads_m).squeeze()\n",
184 |         "    ws_m = np.array(ws_m).squeeze()\n",
185 |         "    return grads_m, ws_m, losses_m\n",
186 |         "\n",
187 |         "maxiter = 100\n",
188 |         "stepsize = 1e-4\n",
189 |         "beta = 0.6\n",
190 |         "grads_m, ws_m, losses_m = runGDM(maxiter, stepsize, beta)"
191 |       ]
192 |     },
193 |     {
194 |       "cell_type": "markdown",
195 |       "id": "d525ecbf-7348-477f-a8eb-c069b6e9d349",
196 |       "metadata": {
197 |         "id": "d525ecbf-7348-477f-a8eb-c069b6e9d349"
198 |       },
199 |       "source": [
200 |         "## Visualize the Parameters and Gradients of Different Dimensions\n",
201 |         "\n",
202 |         "In this section, we will visualize the gradients and parameters of two gradient descent methods in each iteration of training.\n",
203 |         "\n",
204 |         "### Gradient Descent w/o Momentum"
205 |       ]
206 |     },
207 |     {
208 |       "cell_type": "code",
209 |       "execution_count": null,
210 |       "id": "186cdf96-f350-4110-a7fd-b616ea07e0f8",
211 |       "metadata": {
212 |         "id": "186cdf96-f350-4110-a7fd-b616ea07e0f8"
213 |       },
214 |       "outputs": [],
215 |       "source": [
216 |         "plt.figure(figsize=(12, 4))\n",
217 |         "plt.plot(range(maxiter), np.abs(grads)[:,0], 'r', label=\"Dimension 0\")\n",
218 |         "plt.plot(range(maxiter), np.abs(grads)[:,1], 'b', label=\"Dimension 1\")\n",
219 |         "plt.title(\"Gradients\")\n",
220 |         "plt.xlabel(\"Iterations\")\n",
221 |         "plt.legend()\n",
222 |         "plt.show()\n",
223 |         "\n",
224 |         "plt.figure(figsize=(12, 4))\n",
225 |         "plt.plot(range(maxiter), np.abs(ws)[:,0], 'r', label=\"Dimension 0\")\n",
226 |         "plt.plot(range(maxiter), np.abs(ws)[:,1], 'b', label=\"Dimension 1\")\n",
227 |         "plt.title(\"Parameters\")\n",
228 |         "plt.xlabel(\"Iterations\")\n",
229 |         "plt.legend()\n",
230 |         "plt.show()"
231 |       ]
232 |     },
233 |     {
234 |       "cell_type": "markdown",
235 |       "id": "94ec155b-3782-434f-97cc-be617dcb6b0d",
236 |       "metadata": {
237 |         "id": "94ec155b-3782-434f-97cc-be617dcb6b0d"
238 |       },
239 |       "source": [
240 |         "### Gradient Descent with Momentum"
241 |       ]
242 |     },
243 |     {
244 |       "cell_type": "code",
245 |       "execution_count": null,
246 |       "id": "0933f153-76de-466c-a098-9ec046cdbfaa",
247 |       "metadata": {
248 |         "id": "0933f153-76de-466c-a098-9ec046cdbfaa"
249 |       },
250 |       "outputs": [],
251 |       "source": [
252 |         "plt.figure(figsize=(12, 4))\n",
253 |         "plt.plot(range(maxiter), np.abs(grads_m)[:,0], 'r', label=\"Dimension 0\")\n",
254 |         "plt.plot(range(maxiter), np.abs(grads_m)[:,1], 'b', label=\"Dimension 1\")\n",
255 |         "plt.title(\"Gradients\")\n",
256 |         "plt.xlabel(\"Iterations\")\n",
257 |         "plt.legend()\n",
258 |         "plt.show()\n",
259 |         "\n",
260 |         "plt.figure(figsize=(12, 4))\n",
261 |         "plt.plot(range(maxiter), np.abs(ws_m)[:,0], 'r', label=\"Dimension 0\")\n",
262 |         "plt.plot(range(maxiter), np.abs(ws_m)[:,1], 'b', label=\"Dimension 1\")\n",
263 |         "plt.title(\"Parameters\")\n",
264 |         "plt.xlabel(\"Iterations\")\n",
265 |         "plt.legend()\n",
266 |         "plt.show()"
267 |       ]
268 |     },
269 |     {
270 |       "cell_type": "markdown",
271 |       "id": "d68bde13-460a-462b-8f94-17ca0cde41c6",
272 |       "metadata": {
273 |         "id": "d68bde13-460a-462b-8f94-17ca0cde41c6"
274 |       },
275 |       "source": [
276 |         "**Question: How does $\\sigma_i$ (the eigenvalues) influence the gradients and paramters updates?** Please answer this question in your written assignment."
277 |       ]
278 |     },
279 |     {
280 |       "cell_type": "markdown",
281 |       "id": "028dba9f-956c-4d77-b2c4-9c76baec1052",
282 |       "metadata": {
283 |         "id": "028dba9f-956c-4d77-b2c4-9c76baec1052"
284 |       },
285 |       "source": [
286 |         "## Compare gradient descent and gradient desent with momentum\n",
287 |         "### Comparing gradient changes with different iterations"
288 |       ]
289 |     },
290 |     {
291 |       "cell_type": "code",
292 |       "execution_count": null,
293 |       "id": "5d841787-7a12-4c99-9ec5-2bff041bb6ef",
294 |       "metadata": {
295 |         "id": "5d841787-7a12-4c99-9ec5-2bff041bb6ef"
296 |       },
297 |       "outputs": [],
298 |       "source": [
299 |         "plt.figure(figsize=(12, 4))\n",
300 |         "plt.plot(range(maxiter), np.abs(grads)[:,0], 'r', label=\"GD\")\n",
301 |         "plt.plot(range(maxiter), np.abs(grads_m)[:,0], 'b', label=\"momentum\")\n",
302 |         "plt.title(\"Gradients of Dimension 0\")\n",
303 |         "plt.xlabel(\"Iterations\")\n",
304 |         "plt.legend()\n",
305 |         "plt.show()\n",
306 |         "\n",
307 |         "plt.figure(figsize=(12, 4))\n",
308 |         "plt.plot(range(maxiter), np.abs(grads)[:,1], 'r', label=\"GD\")\n",
309 |         "plt.plot(range(maxiter), np.abs(grads_m)[:,1], 'b', label=\"momentum\")\n",
310 |         "plt.title(\"Gradients of Dimension 1\")\n",
311 |         "plt.xlabel(\"Iterations\")\n",
312 |         "plt.legend()\n",
313 |         "plt.show()"
314 |       ]
315 |     },
316 |     {
317 |       "cell_type": "markdown",
318 |       "id": "77e76fe9-151e-4b02-bd78-d7065d55b96b",
319 |       "metadata": {
320 |         "id": "77e76fe9-151e-4b02-bd78-d7065d55b96b"
321 |       },
322 |       "source": [
323 |         "### Comparing parameter changes with different iterations"
324 |       ]
325 |     },
326 |     {
327 |       "cell_type": "code",
328 |       "execution_count": null,
329 |       "id": "d8e77003-4328-4a3b-a232-fb6d3f2e8d14",
330 |       "metadata": {
331 |         "id": "d8e77003-4328-4a3b-a232-fb6d3f2e8d14"
332 |       },
333 |       "outputs": [],
334 |       "source": [
335 |         "plt.figure(figsize=(12, 4))\n",
336 |         "plt.plot(range(maxiter), np.abs(ws)[:,0], 'r', label=\"GD\")\n",
337 |         "plt.plot(range(maxiter), np.abs(ws_m)[:,0], 'b', label=\"momentum\")\n",
338 |         "plt.title(\"Parameters of Dimension 0\")\n",
339 |         "plt.xlabel(\"Iterations\")\n",
340 |         "plt.legend()\n",
341 |         "plt.show()\n",
342 |         "\n",
343 |         "plt.figure(figsize=(12, 4))\n",
344 |         "plt.plot(range(maxiter), np.abs(ws)[:,1], 'r', label=\"GD\")\n",
345 |         "plt.plot(range(maxiter), np.abs(ws_m)[:,1], 'b', label=\"momentum\")\n",
346 |         "plt.title(\"Parameters of Dimension 1\")\n",
347 |         "plt.xlabel(\"Iterations\")\n",
348 |         "plt.legend()\n",
349 |         "plt.show()"
350 |       ]
351 |     },
352 |     {
353 |       "cell_type": "markdown",
354 |       "id": "d3252810-248b-4cfb-9c49-db7b3156a25b",
355 |       "metadata": {
356 |         "id": "d3252810-248b-4cfb-9c49-db7b3156a25b"
357 |       },
358 |       "source": [
359 |         "### Comparing loss with different iterations\n",
360 |         "Note that to maximize the visibiity, we will visualize $\\log (\\text{loss}_i-loss_*)$, where $\\text{loss}_i$ is the loss at iteration $i$ and $loss_*$ is the optimal loss.\n",
361 |         "\n"
362 |       ]
363 |     },
364 |     {
365 |       "cell_type": "code",
366 |       "execution_count": null,
367 |       "id": "2974b998-323a-4534-bfd4-a420d5b83389",
368 |       "metadata": {
369 |         "id": "2974b998-323a-4534-bfd4-a420d5b83389"
370 |       },
371 |       "outputs": [],
372 |       "source": [
373 |         "plt.figure(figsize=(12, 4))\n",
374 |         "plt.plot(range(maxiter), np.log(np.abs(losses)-losses[-1]), 'r', label=\"GD\")\n",
375 |         "plt.plot(range(maxiter), np.log(np.abs(losses_m)-losses_m[-1]), 'b', label=\"momentum\")\n",
376 |         "plt.title(\"Loss changes as iterations increase\")\n",
377 |         "plt.legend()\n",
378 |         "plt.ylabel(\"Log(loss(at iteration $i$) - optimal loss)\")\n",
379 |         "plt.xlabel(\"Iterations\")\n",
380 |         "plt.show()"
381 |       ]
382 |     },
383 |     {
384 |       "cell_type": "markdown",
385 |       "id": "6ef83e79-4576-4949-ab4a-365c90ca6d4e",
386 |       "metadata": {
387 |         "id": "6ef83e79-4576-4949-ab4a-365c90ca6d4e"
388 |       },
389 |       "source": [
390 |         "**Question 1: Comparing gradient descent and gradient descent with momentums, which one converges faster for this task? Why?** Please answer this question in your written assignment."
391 |       ]
392 |     },
393 |     {
394 |       "cell_type": "markdown",
395 |       "id": "jGkfVQlyGPNT",
396 |       "metadata": {
397 |         "id": "jGkfVQlyGPNT"
398 |       },
399 |       "source": [
400 |         "**Question 2: If one method converges faster, can you try change the learning rate to further accelerate the convergence? Please re-run one method with different learning rate, and compare the gradients, parameters and loss with the other method.**\n",
401 |         "*Hint: The learning rate cannot be too large, otherwise the function may not converge.*"
402 |       ]
403 |     },
404 |     {
405 |       "cell_type": "code",
406 |       "execution_count": null,
407 |       "id": "2Dx8AFXhG0mk",
408 |       "metadata": {
409 |         "id": "2Dx8AFXhG0mk"
410 |       },
411 |       "outputs": [],
412 |       "source": [
413 |         "maxiter = 100\n",
414 |         "###############################################\n",
415 |         "###       TODO: YOUR CODE HERE              ###\n",
416 |         "###############################################\n",
417 |         "stepsize = ?\n",
418 |         "beta = ?\n",
419 |         "###############################################\n",
420 |         "###       END OF YOUR CODE                  ###\n",
421 |         "###############################################\n",
422 |         "grads_m, ws_m, losses_m = runGDM(maxiter, stepsize, beta)"
423 |       ]
424 |     },
425 |     {
426 |       "cell_type": "markdown",
427 |       "id": "SZSyuep_I1q4",
428 |       "metadata": {
429 |         "id": "SZSyuep_I1q4"
430 |       },
431 |       "source": [
432 |         "### After changing learning rate, compare gradient changes with different iterations\n",
433 |         "*Hint: You should see now that a dimension will have much larger gap between two methods. That is a benefit from larger learning rate.*"
434 |       ]
435 |     },
436 |     {
437 |       "cell_type": "code",
438 |       "execution_count": null,
439 |       "id": "C0-S2V0BIwNc",
440 |       "metadata": {
441 |         "id": "C0-S2V0BIwNc"
442 |       },
443 |       "outputs": [],
444 |       "source": [
445 |         "plt.figure(figsize=(12, 4))\n",
446 |         "plt.plot(range(maxiter), np.abs(grads)[:,0], 'r', label=\"GD\")\n",
447 |         "plt.plot(range(maxiter), np.abs(grads_m)[:,0], 'b', label=\"momentum\")\n",
448 |         "plt.title(\"Gradients of Dimension 0\")\n",
449 |         "plt.xlabel(\"Iterations\")\n",
450 |         "plt.legend()\n",
451 |         "plt.show()\n",
452 |         "\n",
453 |         "plt.figure(figsize=(12, 4))\n",
454 |         "plt.plot(range(maxiter), np.abs(grads)[:,1], 'r', label=\"GD\")\n",
455 |         "plt.plot(range(maxiter), np.abs(grads_m)[:,1], 'b', label=\"momentum\")\n",
456 |         "plt.title(\"Gradients of Dimension 1\")\n",
457 |         "plt.xlabel(\"Iterations\")\n",
458 |         "plt.legend()\n",
459 |         "plt.show()"
460 |       ]
461 |     },
462 |     {
463 |       "cell_type": "markdown",
464 |       "id": "CQ0LKrxJI2gW",
465 |       "metadata": {
466 |         "id": "CQ0LKrxJI2gW"
467 |       },
468 |       "source": [
469 |         "### After changing learning rate, compare parameter changes with different iterations\n",
470 |         "*Hint: You should see now that a dimension will have much larger gap between two methods. That is a benefit from larger learning rate.*"
471 |       ]
472 |     },
473 |     {
474 |       "cell_type": "code",
475 |       "execution_count": null,
476 |       "id": "jMIE_asEIshn",
477 |       "metadata": {
478 |         "id": "jMIE_asEIshn"
479 |       },
480 |       "outputs": [],
481 |       "source": [
482 |         "plt.figure(figsize=(12, 4))\n",
483 |         "plt.plot(range(maxiter), np.abs(ws)[:,0], 'r', label=\"GD\")\n",
484 |         "plt.plot(range(maxiter), np.abs(ws_m)[:,0], 'b', label=\"momentum\")\n",
485 |         "plt.title(\"Parameters of Dimension 0\")\n",
486 |         "plt.xlabel(\"Iterations\")\n",
487 |         "plt.legend()\n",
488 |         "plt.show()\n",
489 |         "\n",
490 |         "plt.figure(figsize=(12, 4))\n",
491 |         "plt.plot(range(maxiter), np.abs(ws)[:,1], 'r', label=\"GD\")\n",
492 |         "plt.plot(range(maxiter), np.abs(ws_m)[:,1], 'b', label=\"momentum\")\n",
493 |         "plt.title(\"Parameters of Dimension 1\")\n",
494 |         "plt.xlabel(\"Iterations\")\n",
495 |         "plt.legend()\n",
496 |         "plt.show()"
497 |       ]
498 |     },
499 |     {
500 |       "cell_type": "markdown",
501 |       "id": "VI951JPKI6CX",
502 |       "metadata": {
503 |         "id": "VI951JPKI6CX"
504 |       },
505 |       "source": [
506 |         "### After changing learning rate, compare loss changes with different iterations\n",
507 |         "*Hint: You should see now one method is much faster than the other one.*"
508 |       ]
509 |     },
510 |     {
511 |       "cell_type": "code",
512 |       "execution_count": null,
513 |       "id": "MJyvowdaG947",
514 |       "metadata": {
515 |         "id": "MJyvowdaG947"
516 |       },
517 |       "outputs": [],
518 |       "source": [
519 |         "plt.figure(figsize=(12, 4))\n",
520 |         "plt.plot(range(maxiter), np.log(np.abs(losses)-losses[-1]), 'r', label=\"GD\")\n",
521 |         "plt.plot(range(maxiter), np.log(np.abs(losses_m)-losses_m[-1]), 'b', label=\"momentum\")\n",
522 |         "plt.title(\"Loss changes as iterations increase\")\n",
523 |         "plt.legend()\n",
524 |         "plt.ylabel(\"Log(loss(at iteration $i$) - optimal loss)\")\n",
525 |         "plt.xlabel(\"Iterations\")\n",
526 |         "plt.show()"
527 |       ]
528 |     }
529 |   ],
530 |   "metadata": {
531 |     "language_info": {
532 |       "name": "python"
533 |     },
534 |     "colab": {
535 |       "provenance": []
536 |     }
537 |   },
538 |   "nbformat": 4,
539 |   "nbformat_minor": 5
540 | }


--------------------------------------------------------------------------------
/Homework/Homework2/coding/hw2_optim_init_sol.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework2/coding/hw2_optim_init_sol.zip


--------------------------------------------------------------------------------
/Homework/Homework2/hw2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework2/hw2.pdf


--------------------------------------------------------------------------------
/Homework/Homework2/hw2_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework2/hw2_sol.pdf


--------------------------------------------------------------------------------
/Homework/Homework4/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework4/.DS_Store


--------------------------------------------------------------------------------
/Homework/Homework4/coding/hw4_dropout-2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "code",
  5 |       "execution_count": null,
  6 |       "metadata": {
  7 |         "id": "9qD-DMsfqS6i"
  8 |       },
  9 |       "outputs": [],
 10 |       "source": [
 11 |         "# Imports\n",
 12 |         "import numpy as np\n",
 13 |         "import torch\n",
 14 |         "import torch.nn as nn\n",
 15 |         "import matplotlib.pyplot as plt"
 16 |       ]
 17 |     },
 18 |     {
 19 |       "cell_type": "markdown",
 20 |       "metadata": {
 21 |         "id": "50CZrmaRqS6l"
 22 |       },
 23 |       "source": [
 24 |         "# Problem Intro\n",
 25 |         "\n",
 26 |         "We will explore the effect of dropout on a simple gradient descent problem. We will train weights $w_1$ and $w_2$ to solve the linear equation $10w_1 + w_2 = 11$, where $w_1$ and $w_2$ are initialized at 0.\n",
 27 |         "\n",
 28 |         "We formulate this question as an OLS:\n",
 29 |         "\n",
 30 |         "$$\\min_{\\mathbf{w}} \\lVert \\mathbf{Xw} - \\mathbf{y} \\rVert^2 $$,\n",
 31 |         "\n",
 32 |         "where $\\mathbf{X}, \\mathbf{y}$ are:"
 33 |       ]
 34 |     },
 35 |     {
 36 |       "cell_type": "code",
 37 |       "execution_count": null,
 38 |       "metadata": {
 39 |         "id": "86ZHrhIwqS6m"
 40 |       },
 41 |       "outputs": [],
 42 |       "source": [
 43 |         "x = np.array([[10, 1]])\n",
 44 |         "y = np.array([[11]])"
 45 |       ]
 46 |     },
 47 |     {
 48 |       "cell_type": "markdown",
 49 |       "metadata": {
 50 |         "id": "SrbNiwZkqS6n"
 51 |       },
 52 |       "source": [
 53 |         "## No Dropout, Least-Square\n",
 54 |         "\n",
 55 |         "Analytically show what solution we will converge to if we train with gradient descent and an appropriately small learning rate. Take advantage of the fact that when you initialize weights to 0 and train linear regression with gradient descent, you recover the least-squares solution.\n",
 56 |         "\n",
 57 |         "**Complete the following code** to calculate this solution in python, but you can also use another tool and insert your answer. \n",
 58 |         "(HINT: use `np.linalg.pinv`)"
 59 |       ]
 60 |     },
 61 |     {
 62 |       "cell_type": "code",
 63 |       "execution_count": null,
 64 |       "metadata": {
 65 |         "id": "7mXRO5WeqS6n"
 66 |       },
 67 |       "outputs": [],
 68 |       "source": [
 69 |         "################################################################################\n",
 70 |         "# YOUR CODE HERE\n",
 71 |         "################################################################################\n",
 72 |         "w = ?\n",
 73 |         "################################################################################\n",
 74 |         "\n",
 75 |         "print(w)"
 76 |       ]
 77 |     },
 78 |     {
 79 |       "cell_type": "markdown",
 80 |       "metadata": {
 81 |         "id": "9VXPh1k_qS6o"
 82 |       },
 83 |       "source": [
 84 |         "### Question\n",
 85 |         "\n",
 86 |         "Please **include the mathematical expression in your written** assignment submission, and **copy and paste the output of the previous cell** into your submission as well."
 87 |       ]
 88 |     },
 89 |     {
 90 |       "cell_type": "markdown",
 91 |       "metadata": {
 92 |         "id": "UY8WoHWyqS6o"
 93 |       },
 94 |       "source": [
 95 |         "## No Dropout, Gradient Descent\n",
 96 |         "\n",
 97 |         "Show training with gradient descent recovers the expected solution. A training loop has been provided for you."
 98 |       ]
 99 |     },
100 |     {
101 |       "cell_type": "code",
102 |       "execution_count": null,
103 |       "metadata": {
104 |         "id": "mZREQjhSqS6o"
105 |       },
106 |       "outputs": [],
107 |       "source": [
108 |         "def train_simple(net, lr=.001, batch_size=1, itrs=1000, plot=True, optim_class=torch.optim.SGD, x=None, y=None):\n",
109 |         "    optimizer = optim_class(net.parameters(), lr=lr)\n",
110 |         "\n",
111 |         "    losses = []\n",
112 |         "    if x is None:\n",
113 |         "        x = torch.FloatTensor([[10, 1]])\n",
114 |         "        y = torch.FloatTensor([[11]])\n",
115 |         "    else:\n",
116 |         "        x = torch.FloatTensor(x)\n",
117 |         "        y = torch.FloatTensor(y)\n",
118 |         "    # Repeat element batch_size times\n",
119 |         "    x = x.repeat(batch_size, 1)\n",
120 |         "    y = y.repeat(batch_size, 1)\n",
121 |         "    for i in range(itrs):\n",
122 |         "        y_hat = net(x)\n",
123 |         "        loss = torch.nn.MSELoss()(y_hat, y)\n",
124 |         "        optimizer.zero_grad()\n",
125 |         "        loss.backward()\n",
126 |         "        optimizer.step()\n",
127 |         "        losses.append(loss.item())\n",
128 |         "    if plot:\n",
129 |         "        plt.plot(losses)\n",
130 |         "        plt.show()\n",
131 |         "        print_weights(net)\n",
132 |         "    return losses\n",
133 |         "\n",
134 |         "def print_weights(net):\n",
135 |         "  print(f'Weights: {net.state_dict().values()}')"
136 |       ]
137 |     },
138 |     {
139 |       "cell_type": "markdown",
140 |       "metadata": {
141 |         "id": "N5MXjOWbqS6o"
142 |       },
143 |       "source": [
144 |         "**Complete the following code to create the linear network for the OLS in PyTorch.**"
145 |       ]
146 |     },
147 |     {
148 |       "cell_type": "code",
149 |       "execution_count": null,
150 |       "metadata": {
151 |         "id": "HqpnBdGOqS6p"
152 |       },
153 |       "outputs": [],
154 |       "source": [
155 |         "################################################################################\n",
156 |         "# YOUR CODE HERE\n",
157 |         "################################################################################\n",
158 |         "net = ?\n",
159 |         "################################################################################\n",
160 |         "\n",
161 |         "# Initialize weights with 0\n",
162 |         "net.load_state_dict({k: v * 0 for k, v in net.state_dict().items()})\n",
163 |         "losses = train_simple(net)"
164 |       ]
165 |     },
166 |     {
167 |       "cell_type": "markdown",
168 |       "metadata": {
169 |         "id": "_64E9gmVqS6p"
170 |       },
171 |       "source": [
172 |         "### Question\n",
173 |         "\n",
174 |         "Please **copy and paste the output of the previous cell** (text only) into your submission of the written assignment. **Are the weights obtained by training with gradient descent the same as those calculated using the closed-form least squares method?** Answer this question in your written assignment."
175 |       ]
176 |     },
177 |     {
178 |       "cell_type": "markdown",
179 |       "metadata": {
180 |         "id": "xXNWC2LzqS6p"
181 |       },
182 |       "source": [
183 |         "## Dropout, Least-Square"
184 |       ]
185 |     },
186 |     {
187 |       "cell_type": "markdown",
188 |       "metadata": {
189 |         "id": "1qVYs3VdqS6p"
190 |       },
191 |       "source": [
192 |         "Now we add a dropout rate of `p=0.5`, which means that during each forward pass, each input to the network has a 50% probability of being set to `0`. To account for this reduction in the number of inputs, we also need to scale the inputs by `2`. However, during testing, we do not apply any dropout, nor do we scale the inputs.\n",
193 |         "\n",
194 |         "By dropping out each element in the input with a 50% probability, we create a dataset with *four* equally likely inputs, in which $w_1$ is dropped out, $w_2$ is dropped out, both are dropped out, or neither is dropped out. This is our new dataset, represented by `x` and `y`. Using this dataset, we can compute the analytic solution to improve our network's performance.\n",
195 |         "\n",
196 |         "**Complete the following code according to the instructions above:**"
197 |       ]
198 |     },
199 |     {
200 |       "cell_type": "code",
201 |       "execution_count": null,
202 |       "metadata": {
203 |         "id": "p4w_TOVKqS6q"
204 |       },
205 |       "outputs": [],
206 |       "source": [
207 |         "################################################################################\n",
208 |         "# YOUR CODE HERE\n",
209 |         "################################################################################\n",
210 |         "x = ?\n",
211 |         "y = ?\n",
212 |         "w = ?\n",
213 |         "################################################################################\n",
214 |         "print(\"x =\", x)\n",
215 |         "print(\"y =\", y)\n",
216 |         "print(\"w =\", w)"
217 |       ]
218 |     },
219 |     {
220 |       "cell_type": "markdown",
221 |       "metadata": {
222 |         "id": "QMMXBAA5qS6q"
223 |       },
224 |       "source": [
225 |         "### Question\n",
226 |         "\n",
227 |         "Please **copy and paste the output of the previous cell** (text only) into your submission of the written assignment."
228 |       ]
229 |     },
230 |     {
231 |       "cell_type": "markdown",
232 |       "metadata": {
233 |         "id": "2-BAnabqqS6q"
234 |       },
235 |       "source": [
236 |         "## Dropout, Gradient Descent\n",
237 |         "**Add dropout to your network. Implement the Dropout layer below, then run with dropout.** "
238 |       ]
239 |     },
240 |     {
241 |       "cell_type": "code",
242 |       "execution_count": null,
243 |       "metadata": {
244 |         "id": "1EBkYciXqS6q"
245 |       },
246 |       "outputs": [],
247 |       "source": [
248 |         "class Dropout(torch.nn.Module):\n",
249 |         "    def __init__(self, p=0.5):\n",
250 |         "        super().__init__()\n",
251 |         "        self.p = p\n",
252 |         "\n",
253 |         "    def forward(self, x):\n",
254 |         "        if self.training:\n",
255 |         "            ####################################################################\n",
256 |         "            # YOUR CODE HERE\n",
257 |         "            ####################################################################\n",
258 |         "            raise NotImplementedError()\n",
259 |         "            ####################################################################\n",
260 |         "        else:\n",
261 |         "            return x\n",
262 |         "\n",
263 |         "def init_with_dropout(p):\n",
264 |         "    net = torch.nn.Sequential(\n",
265 |         "        Dropout(p),\n",
266 |         "        torch.nn.Linear(2, 1, bias=False)\n",
267 |         "    )\n",
268 |         "    # Initialize weights with 0\n",
269 |         "    net.load_state_dict({k: v * 0 for k, v in net.state_dict().items()})\n",
270 |         "    return net\n",
271 |         "\n",
272 |         "net = init_with_dropout(0.5)\n",
273 |         "losses = train_simple(net)\n",
274 |         "plt.title('losses zoomed in')\n",
275 |         "plt.plot(losses[:100])\n",
276 |         "plt.show()"
277 |       ]
278 |     },
279 |     {
280 |       "cell_type": "markdown",
281 |       "metadata": {
282 |         "id": "NVJAaAu_qS6r"
283 |       },
284 |       "source": [
285 |         "### Question\n",
286 |         "\n",
287 |         "**Describe the shape of the training curve. Are the weights obtained by training with gradient descent the same as those calculated using the closed-form least squares method?** Answer this question in your written assignment."
288 |       ]
289 |     },
290 |     {
291 |       "cell_type": "markdown",
292 |       "metadata": {
293 |         "id": "TfVujONVqS6r"
294 |       },
295 |       "source": [
296 |         "## Dropout, Gradient Descent with Larger Batch Sizes\n",
297 |         "\n",
298 |         "Run the cell below, which uses a larger batch."
299 |       ]
300 |     },
301 |     {
302 |       "cell_type": "code",
303 |       "execution_count": null,
304 |       "metadata": {
305 |         "id": "QYQlV_w0qS6r"
306 |       },
307 |       "outputs": [],
308 |       "source": [
309 |         "net = init_with_dropout(0.5)\n",
310 |         "losses = train_simple(net, batch_size=1024)"
311 |       ]
312 |     },
313 |     {
314 |       "cell_type": "markdown",
315 |       "metadata": {
316 |         "id": "YhrE94gIqS6r"
317 |       },
318 |       "source": [
319 |         "### Question\n",
320 |         "\n",
321 |         "**Describe the loss curve and compare it with the loss curve in the last part. Why are they different? Also compare the trained weights with the one calculated by the least-square formula.** Answer this question in your written assignment."
322 |       ]
323 |     },
324 |     {
325 |       "cell_type": "markdown",
326 |       "metadata": {
327 |         "id": "-5OLr-MXqS6s"
328 |       },
329 |       "source": [
330 |         "# (G) [OPTIONAL]: Sweeping over dropout rate\n",
331 |         "\n",
332 |         "Now, let's see how different dropout rates affect the final solution. Run the cell below to sweep over dropout rates. Since the 4 data points we considered in part (C) are no longer equally likely, we need to weight each data point by its probability of occuring. This turns it into a weighted linear regression problem. The analytic solution for this problem is:\n",
333 |         "\n",
334 |         "$$w = (X^\\top S X)^{-1} X^\\top S y$$\n",
335 |         "\n",
336 |         "where $S$ is the diagonal matrix of probabilities of each data point occuring.\n",
337 |         "\n",
338 |         "Implement the analytic solution in the cell below, and show that the analytic solution matches the empirical solution. You should see that as the dropout rate changes, $w_1$ and $w_2$ change smoothly, except for a discontinuity when dropout rates are 0. Explain this discontinuity."
339 |       ]
340 |     },
341 |     {
342 |       "cell_type": "code",
343 |       "execution_count": null,
344 |       "metadata": {
345 |         "id": "aFcMWci9qS6s"
346 |       },
347 |       "outputs": [],
348 |       "source": [
349 |         "def init_with_dropout(p):\n",
350 |         "    net = torch.nn.Sequential(\n",
351 |         "        Dropout(p),\n",
352 |         "        torch.nn.Linear(2, 1, bias=False)\n",
353 |         "    )\n",
354 |         "    net.load_state_dict({k: v * 0 for k, v in net.state_dict().items()})\n",
355 |         "    return net\n",
356 |         "\n",
357 |         "empirical_dropout_rates = [0, .1, .3, .5, .7]\n",
358 |         "analytical_dropout_rates = np.arange(0, .99, .01)\n",
359 |         "losses_empirical, losses_analytical = [], []\n",
360 |         "w1_empirical, w2_empirical, w1_analytical, w2_analytical = [], [], [], []\n",
361 |         "for p in analytical_dropout_rates:\n",
362 |         "    # compute analytical solution\n",
363 |         "    ############################################################################\n",
364 |         "    # YOUR CODE HERE\n",
365 |         "    ############################################################################\n",
366 |         "    x = ?\n",
367 |         "    y = ?\n",
368 |         "    s = np.diag(?)\n",
369 |         "    w_analytic = ?\n",
370 |         "    ############################################################################\n",
371 |         "    x = np.array([[10, 1]])\n",
372 |         "    y = np.array([[11]])\n",
373 |         "    l_analytic = ((x @ w_analytic - y) ** 2).item()\n",
374 |         "    w1_analytical.append(w_analytic[0][0])\n",
375 |         "    w2_analytical.append(w_analytic[1][0])\n",
376 |         "    losses_analytical.append(l_analytic)\n",
377 |         "for p in empirical_dropout_rates:\n",
378 |         "    net = init_with_dropout(p)\n",
379 |         "    # Initialize weights with 0\n",
380 |         "    losses = train_simple(net, batch_size=1024, itrs=10000, plot=False)\n",
381 |         "    net.eval()\n",
382 |         "    losses_empirical.append(((net(torch.FloatTensor(x)) - torch.FloatTensor(y)) ** 2).item())\n",
383 |         "    w1_empirical.append(net.state_dict()['1.weight'][0][0].item())\n",
384 |         "    w2_empirical.append(net.state_dict()['1.weight'][0][1].item())\n",
385 |         "# Plot all saved values\n",
386 |         "plt.figure(figsize=(10, 5))\n",
387 |         "plt.subplot(1, 2, 1)\n",
388 |         "plt.plot(analytical_dropout_rates, losses_analytical, label='analytical')\n",
389 |         "plt.scatter(empirical_dropout_rates, losses_empirical, label='empirical')\n",
390 |         "plt.legend()\n",
391 |         "\n",
392 |         "plt.subplot(1, 2, 2)\n",
393 |         "plt.plot(analytical_dropout_rates, w1_analytical, label='w1 analytical')\n",
394 |         "plt.scatter(empirical_dropout_rates, w1_empirical, label='w1 empirical')\n",
395 |         "plt.plot(analytical_dropout_rates, w2_analytical, label='w2 analytical')\n",
396 |         "plt.scatter(empirical_dropout_rates, w2_empirical, label='w2 empirical')\n",
397 |         "plt.legend()\n",
398 |         "plt.show()\n"
399 |       ]
400 |     },
401 |     {
402 |       "cell_type": "markdown",
403 |       "metadata": {
404 |         "id": "vqJ432VOqS6s"
405 |       },
406 |       "source": [
407 |         "# (H) [OPTIONAL]: Adding Adam\n",
408 |         "\n",
409 |         "Now, let's add Adam to our network. Run the cell below to train with Adam with and without dropout. Does the solution change? Why or why not?"
410 |       ]
411 |     },
412 |     {
413 |       "cell_type": "code",
414 |       "execution_count": null,
415 |       "metadata": {
416 |         "id": "1375v38dqS6s"
417 |       },
418 |       "outputs": [],
419 |       "source": [
420 |         "dropout_rates = [0, .5]\n",
421 |         "optim_classes = [torch.optim.SGD, torch.optim.Adam]\n",
422 |         "\n",
423 |         "# Two plots, one for w1 and one for w2\n",
424 |         "fig, axs = plt.subplots(figsize=(5, 5))\n",
425 |         "\n",
426 |         "for optim_class in optim_classes:\n",
427 |         "    w1_list = []\n",
428 |         "    w2_list = []\n",
429 |         "    for p in dropout_rates:\n",
430 |         "        net = init_with_dropout(p).train()\n",
431 |         "        losses = train_simple(net, batch_size=1024, itrs=10000, optim_class=optim_class, plot=False)\n",
432 |         "        net.eval()\n",
433 |         "        w1_list.append(net.state_dict()['1.weight'][0][0].item())\n",
434 |         "        w2_list.append(net.state_dict()['1.weight'][0][1].item())\n",
435 |         "    axs.plot(dropout_rates, w1_list, label=f'{optim_class.__name__} w1')\n",
436 |         "    axs.plot(dropout_rates, w2_list, label=f'{optim_class.__name__} w2')\n",
437 |         "axs.legend()\n",
438 |         "axs.set_ylim(0, 4)\n",
439 |         "plt.show()\n",
440 |         "\n"
441 |       ]
442 |     },
443 |     {
444 |       "cell_type": "markdown",
445 |       "metadata": {
446 |         "id": "1MVvFnZIqS6t"
447 |       },
448 |       "source": [
449 |         "## (I): Dropout on real data\n",
450 |         "\n",
451 |         "There are some unusual features of our previous problem:\n",
452 |         "- We only used a single datapoint\n",
453 |         "- We applied dropout to the inputs to the network, whereas in real problems it's typically applied to hidden units\n",
454 |         "- The network was so small that dropout significantly hurt performance. Typically, networks are large enough that they can fit the data well even with dropout.\n",
455 |         "\n",
456 |         "To see the effect of dropout on a more realistic problem, we'll train a network on the CIFAR10 dataset and add a \"cheating feature.\" In this case, the cheating feature consists of a few pixels in the bottom-right corner of the image which encode the class label*. We want to see how dropout helps the network learn to rely less heavily on this cheating feature. Run the next few cells and comment on how dropout affects the degree to which the network relies on the cheating feature. Which model does better on clean data?\n",
457 |         "\n",
458 |         "*This is obviously a contrived cheating feature, but they can appear in real data -- for instance, if a particular camera was used to capture all images of a certain class, the model might learn to rely on subtle camera artifacts rather than the acutal image."
459 |       ]
460 |     },
461 |     {
462 |       "cell_type": "code",
463 |       "execution_count": null,
464 |       "metadata": {
465 |         "id": "ra9xuQ2fqS6t"
466 |       },
467 |       "outputs": [],
468 |       "source": [
469 |         "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
470 |         "\n",
471 |         "class ConvNet(torch.nn.Module):\n",
472 |         "    def __init__(self, dropout_rate=0):\n",
473 |         "        super(ConvNet, self).__init__()\n",
474 |         "        in_channels = 3 \n",
475 |         "        self.conv1 = torch.nn.Conv2d(3, 16, kernel_size=3, padding='same')\n",
476 |         "        self.conv2 = torch.nn.Conv2d(16, 32, kernel_size=3, padding='same')\n",
477 |         "        img_size = 8\n",
478 |         "        self.fc1 = torch.nn.Linear(32 * img_size * img_size, 10)\n",
479 |         "        self.dropout_rate = dropout_rate\n",
480 |         "\n",
481 |         "    def forward(self, x):\n",
482 |         "        # NOTE: we apply more dropout to this network than is typical so we can emphasize the effect.\n",
483 |         "        # It's more typical to apply dropout to only the fully connected layers.\n",
484 |         "        x = torch.nn.functional.relu(torch.nn.functional.max_pool2d(self.conv1(x), 2))\n",
485 |         "        x = torch.nn.functional.dropout(x, training=self.training, p=self.dropout_rate)\n",
486 |         "        x = torch.nn.functional.relu(torch.nn.functional.max_pool2d(self.conv2(x), 2))\n",
487 |         "        x = torch.nn.functional.dropout(x, training=self.training, p=self.dropout_rate)\n",
488 |         "        img_size = 8\n",
489 |         "        x = x.view(-1, 32 * img_size * img_size)\n",
490 |         "        x = torch.nn.functional.relu(self.fc1(x))\n",
491 |         "        return torch.nn.functional.log_softmax(x, dim=1)"
492 |       ]
493 |     },
494 |     {
495 |       "cell_type": "code",
496 |       "execution_count": null,
497 |       "metadata": {
498 |         "id": "Ci71FqajqS6t"
499 |       },
500 |       "outputs": [],
501 |       "source": [
502 |         "def add_cheating_feature(x_batch, y_batch):\n",
503 |         "    # Add the label on the bottom-right corner of the image, encoded in binary\n",
504 |         "    for i in range(x_batch.shape[0]):\n",
505 |         "        binary_list = [int(x) for x in bin(y_batch[i].item())[2:]]\n",
506 |         "        if len(binary_list) < 4:\n",
507 |         "            binary_list = [0] * (4 - len(binary_list)) + binary_list\n",
508 |         "        binary_label = torch.FloatTensor(binary_list) * 3\n",
509 |         "        x_batch[i, 0, -1, -4:] = binary_label\n",
510 |         "        x_batch[i, 1:, -1, -4:] = 1 - binary_label\n",
511 |         "    return x_batch"
512 |       ]
513 |     },
514 |     {
515 |       "cell_type": "code",
516 |       "execution_count": null,
517 |       "metadata": {
518 |         "id": "B6Jeg0BOqS6t"
519 |       },
520 |       "outputs": [],
521 |       "source": [
522 |         "# Load CIFAR10 data\n",
523 |         "from torchvision import datasets, transforms\n",
524 |         "# Nomalizing constants for CIFAR10\n",
525 |         "MEAN = [0.4914, 0.4822, 0.4465]\n",
526 |         "STD = [0.2023, 0.1994, 0.2010]\n",
527 |         "train_loader = torch.utils.data.DataLoader(\n",
528 |         "    datasets.CIFAR10('data', train=True, download=True,\n",
529 |         "                        transform=transforms.Compose([\n",
530 |         "                            transforms.ToTensor(),\n",
531 |         "                            transforms.Normalize(MEAN, STD)\n",
532 |         "                        ])),\n",
533 |         "    batch_size=64, shuffle=True)\n",
534 |         "test_loader = torch.utils.data.DataLoader(\n",
535 |         "    datasets.CIFAR10('data', train=False, transform=transforms.Compose([\n",
536 |         "                            transforms.ToTensor(),\n",
537 |         "                            transforms.Normalize(MEAN, STD)\n",
538 |         "                        ])),    \n",
539 |         "    batch_size=1000, shuffle=True)\n",
540 |         "\n"
541 |       ]
542 |     },
543 |     {
544 |       "cell_type": "code",
545 |       "execution_count": null,
546 |       "metadata": {
547 |         "id": "Z6IF-qwvqS6t"
548 |       },
549 |       "outputs": [],
550 |       "source": [
551 |         "# Visualize the data (note the black and white pixels in the corner)\n",
552 |         "# Images will appear to be overly saturated since matplotlib clips values outside of [0, 1]\n",
553 |         "def visualize_data():\n",
554 |         "    for _ in range(5):\n",
555 |         "        # Get a batch of training data\n",
556 |         "        x_batch, y_batch = next(iter(train_loader))\n",
557 |         "        # Add the cheating feature\n",
558 |         "        x_batch = add_cheating_feature(x_batch, y_batch)\n",
559 |         "        # Plot the first image in the batch, with the cheating feature\n",
560 |         "        # Move the channels to the end\n",
561 |         "        x_batch = x_batch.permute(0, 2, 3, 1)\n",
562 |         "        # Undo the normalization\n",
563 |         "        x_batch = x_batch * torch.FloatTensor(STD).view(1, 1, 1, 3) + torch.FloatTensor(MEAN).view(1, 1, 1, 3)\n",
564 |         "        plt.imshow(x_batch[0])\n",
565 |         "        plt.show()\n",
566 |         "visualize_data()"
567 |       ]
568 |     },
569 |     {
570 |       "cell_type": "code",
571 |       "execution_count": null,
572 |       "metadata": {
573 |         "id": "vY_4F2pEqS6t"
574 |       },
575 |       "outputs": [],
576 |       "source": [
577 |         "# Train the model\n",
578 |         "def train(model, num_epochs=15, lr=1e-3):\n",
579 |         "    all_train_losses = []\n",
580 |         "    all_val_losses = []\n",
581 |         "    optimizer = torch.optim.SGD(model.parameters(), lr=lr)\n",
582 |         "    for epoch in range(1, num_epochs + 1):\n",
583 |         "        train_losses = []\n",
584 |         "        model.train()\n",
585 |         "        for (data, target) in train_loader:\n",
586 |         "            # Put the data on the same device as the model\n",
587 |         "            data = data.to(device)\n",
588 |         "            target = target.to(device)\n",
589 |         "            optimizer.zero_grad()\n",
590 |         "            # add cheating feature\n",
591 |         "            data = add_cheating_feature(data, target)\n",
592 |         "            output = model(data)\n",
593 |         "            loss = torch.nn.CrossEntropyLoss()(output, target)\n",
594 |         "            loss.backward()\n",
595 |         "            train_losses.append(loss.item())\n",
596 |         "            train_losses = train_losses[-100:]\n",
597 |         "            optimizer.step() \n",
598 |         "        model.eval()\n",
599 |         "        test_loss = 0\n",
600 |         "        correct = 0\n",
601 |         "        with torch.no_grad():\n",
602 |         "            for data, target in test_loader:\n",
603 |         "                # Put the data on the same device as the model\n",
604 |         "                data = data.to(device)\n",
605 |         "                target = target.to(device)\n",
606 |         "                # add cheating feature\n",
607 |         "                data = add_cheating_feature(data, target)\n",
608 |         "                output = model(data)\n",
609 |         "                test_loss += torch.nn.CrossEntropyLoss(reduction='sum')(output, target).item() # sum up batch loss\n",
610 |         "                pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability\n",
611 |         "                correct += pred.eq(target.data.view_as(pred)).cpu().sum()\n",
612 |         "\n",
613 |         "        test_loss /= len(test_loader.dataset)\n",
614 |         "        train_loss = np.mean(train_losses)\n",
615 |         "        print('Train Epoch: {} of {} Train Loss: {:.3f}, Val Loss: {:3f}, Val Accuracy: {:3f}'.format(\n",
616 |         "                    epoch, num_epochs, train_loss, test_loss, 100. * correct / len(test_loader.dataset)))\n",
617 |         "        all_train_losses.append(train_loss)\n",
618 |         "        all_val_losses.append(test_loss)\n",
619 |         "    plt.plot(all_train_losses)\n",
620 |         "    plt.plot(all_val_losses)\n",
621 |         "    plt.legend(['train', 'val'])\n",
622 |         "    plt.show()\n",
623 |         "    return all_train_losses, all_val_losses"
624 |       ]
625 |     },
626 |     {
627 |       "cell_type": "code",
628 |       "execution_count": null,
629 |       "metadata": {
630 |         "id": "TbnzGivaqS6u"
631 |       },
632 |       "outputs": [],
633 |       "source": [
634 |         "# Test how much the model uses the cheating feature\n",
635 |         "def test_cheating(model):\n",
636 |         "    model.eval()\n",
637 |         "    correct_cheating = 0\n",
638 |         "    correct_not_cheating = 0\n",
639 |         "    correct_random = 0\n",
640 |         "    for data, target in test_loader:\n",
641 |         "        # Put the data on the same device as the model\n",
642 |         "        data = data.to(device)\n",
643 |         "        target = target.to(device)\n",
644 |         "        # Test on clean data\n",
645 |         "        output = model(data)\n",
646 |         "        pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability\n",
647 |         "        correct_not_cheating += pred.eq(target.data.view_as(pred)).cpu().sum()\n",
648 |         "        # Test on data with cheating feature\n",
649 |         "        data_modified = add_cheating_feature(data.clone(), target)\n",
650 |         "        output = model(data_modified)\n",
651 |         "        pred = output.data.max(1, keepdim=True)[1] # get the index of the max log-probability\n",
652 |         "        correct_cheating += pred.eq(target.data.view_as(pred)).cpu().sum()\n",
653 |         "        correct_random += pred.eq(target.data.view_as(pred)).cpu().sum()\n",
654 |         "    print('Accuracy on clean data: {}/{} ({:.0f}%)'.format(\n",
655 |         "        correct_not_cheating, len(test_loader.dataset),\n",
656 |         "        100. * correct_not_cheating / len(test_loader.dataset)))\n",
657 |         "    print('Accuracy on data with cheating feature: {}/{} ({:.0f}%)'.format(\n",
658 |         "        correct_cheating, len(test_loader.dataset),\n",
659 |         "        100. * correct_cheating / len(test_loader.dataset)))"
660 |       ]
661 |     },
662 |     {
663 |       "cell_type": "code",
664 |       "execution_count": null,
665 |       "metadata": {
666 |         "id": "bjKV1Fn8qS6u"
667 |       },
668 |       "outputs": [],
669 |       "source": [
670 |         "model_no_dropout = ConvNet(dropout_rate=0)\n",
671 |         "# Put the model on the GPU, if available\n",
672 |         "model_no_dropout.to(device)\n",
673 |         "train_loss, val_loss = train(model_no_dropout, num_epochs=10, lr=3e-3)\n",
674 |         "test_cheating(model_no_dropout)"
675 |       ]
676 |     },
677 |     {
678 |       "cell_type": "code",
679 |       "execution_count": null,
680 |       "metadata": {
681 |         "id": "CF9F5ViNqS6v"
682 |       },
683 |       "outputs": [],
684 |       "source": [
685 |         "model_dropout = ConvNet(dropout_rate=0.75)\n",
686 |         "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
687 |         "model_dropout.to(device)\n",
688 |         "train_loss, val_loss = train(model_dropout, num_epochs=10, lr=3e-3)\n",
689 |         "test_cheating(model_dropout)"
690 |       ]
691 |     }
692 |   ],
693 |   "metadata": {
694 |     "language_info": {
695 |       "name": "python"
696 |     },
697 |     "colab": {
698 |       "provenance": []
699 |     }
700 |   },
701 |   "nbformat": 4,
702 |   "nbformat_minor": 0
703 | }


--------------------------------------------------------------------------------
/Homework/Homework4/hw4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework4/hw4.pdf


--------------------------------------------------------------------------------
/Homework/Homework4/hw4_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework4/hw4_sol.pdf


--------------------------------------------------------------------------------
/Homework/Homework5/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework5/.DS_Store


--------------------------------------------------------------------------------
/Homework/Homework5/coding/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework5/coding/.DS_Store


--------------------------------------------------------------------------------
/Homework/Homework5/hw5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework5/hw5.pdf


--------------------------------------------------------------------------------
/Homework/Homework5/hw5_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/Homework5/hw5_sol.pdf


--------------------------------------------------------------------------------
/Homework/homework10/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework10/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework10/hw10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework10/hw10.pdf


--------------------------------------------------------------------------------
/Homework/homework10/hw10_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework10/hw10_sol.pdf


--------------------------------------------------------------------------------
/Homework/homework11/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework11/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework11/coding/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework11/coding/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework11/coding/generative_models_sol.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework11/coding/generative_models_sol.zip


--------------------------------------------------------------------------------
/Homework/homework11/coding/hw11_continual_learning.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "0ZJn_1HDt4II"
  7 |       },
  8 |       "source": [
  9 |         "# Continual Learning with PyTorch\n",
 10 |         "\n",
 11 |         "This notebook is a homework assignment for the course [CS182/282A](https://inst.eecs.berkeley.edu/~cs182/fa22/). The goal of this assignment is to get familiar with the concept of continual learning and how to implement it with PyTorch. We will use the MNIST benchmark for this assignment. Many parts of this notebook are based on the [ContinualAI](https://github.com/ContinualAI)\n",
 12 |         "\n",
 13 |         "---\n",
 14 |         "\n",
 15 |         "\n",
 16 |         "**Requisites**\n",
 17 |         "\n",
 18 |         "*   Python 3.x\n",
 19 |         "*   Jupyter\n",
 20 |         "*   PyTorch >= 1.8\n",
 21 |         "*   NumPy\n",
 22 |         "*   Matplotlib\n",
 23 |         "---"
 24 |       ]
 25 |     },
 26 |     {
 27 |       "cell_type": "code",
 28 |       "execution_count": null,
 29 |       "metadata": {
 30 |         "id": "b9ysnsfZt4IL"
 31 |       },
 32 |       "outputs": [],
 33 |       "source": [
 34 |         "!free -m\n",
 35 |         "!df -h\n",
 36 |         "!nvidia-smi"
 37 |       ]
 38 |     },
 39 |     {
 40 |       "cell_type": "code",
 41 |       "execution_count": null,
 42 |       "metadata": {
 43 |         "id": "61amzXYWt4IN"
 44 |       },
 45 |       "outputs": [],
 46 |       "source": [
 47 |         "import torch\n",
 48 |         "import torch.nn as nn\n",
 49 |         "import torchvision.datasets as datasets\n",
 50 |         "import torchvision.transforms as transforms\n",
 51 |         "import torch.optim as optim\n",
 52 |         "import torch.nn.functional as F\n",
 53 |         "import numpy as np\n",
 54 |         "import matplotlib.pyplot as plt"
 55 |       ]
 56 |     },
 57 |     {
 58 |       "cell_type": "markdown",
 59 |       "metadata": {
 60 |         "id": "_ligXRUxt4IN"
 61 |       },
 62 |       "source": [
 63 |         "## Downloading the dataset\n",
 64 |         "\n",
 65 |         "We will use the MNIST dataset for this assignment. The dataset is already available in PyTorch, so we just need to download it."
 66 |       ]
 67 |     },
 68 |     {
 69 |       "cell_type": "code",
 70 |       "execution_count": null,
 71 |       "metadata": {
 72 |         "id": "arv09vNot4IO"
 73 |       },
 74 |       "outputs": [],
 75 |       "source": [
 76 |         "# download mnist\n",
 77 |         "train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())\n",
 78 |         "test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())\n",
 79 |         "\n",
 80 |         "# preprocess mnist\n",
 81 |         "train_dataset.data = train_dataset.data.float() / 255\n",
 82 |         "train_dataset.data = train_dataset.data.reshape(-1, 1, 28, 28)\n",
 83 |         "test_dataset.data = test_dataset.data.float() / 255\n",
 84 |         "test_dataset.data = test_dataset.data.reshape(-1, 1, 28, 28)\n",
 85 |         "\n",
 86 |         "print('Train dataset shape: ', train_dataset.data.shape)\n",
 87 |         "print('Test dataset shape: ', test_dataset.data.shape)"
 88 |       ]
 89 |     },
 90 |     {
 91 |       "cell_type": "code",
 92 |       "execution_count": null,
 93 |       "metadata": {
 94 |         "id": "GNKr9uBRt4IO"
 95 |       },
 96 |       "outputs": [],
 97 |       "source": [
 98 |         "use_cuda = torch.cuda.is_available()\n",
 99 |         "device = torch.device(\"cuda\" if use_cuda else \"cpu\");\n",
100 |         "torch.manual_seed(1)"
101 |       ]
102 |     },
103 |     {
104 |       "cell_type": "markdown",
105 |       "metadata": {
106 |         "id": "rdmQJ1Eet4IO"
107 |       },
108 |       "source": [
109 |         "### Define Network\n",
110 |         "\n",
111 |         "We will use a simple 5-layer convolutional neural network for this assignment. The network is defined in the `Net` class below. The network is composed of 3 convolutional layers and 2 fully connected layers. "
112 |       ]
113 |     },
114 |     {
115 |       "cell_type": "code",
116 |       "execution_count": null,
117 |       "metadata": {
118 |         "id": "vdPVbgnpt4IO"
119 |       },
120 |       "outputs": [],
121 |       "source": [
122 |         "class Net(nn.Module):\n",
123 |         "    def __init__(self):\n",
124 |         "        super(Net, self).__init__()\n",
125 |         "        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)\n",
126 |         "        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)\n",
127 |         "        self.conv2_drop = nn.Dropout2d()\n",
128 |         "        self.fc1 = nn.Linear(320, 50)\n",
129 |         "        self.fc2 = nn.Linear(50, 10)\n",
130 |         "\n",
131 |         "    def forward(self, x):\n",
132 |         "        x = F.relu(F.max_pool2d(self.conv1(x), 2))\n",
133 |         "        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))\n",
134 |         "        x = x.view(-1, 320)\n",
135 |         "        x = F.relu(self.fc1(x))\n",
136 |         "        x = F.dropout(x, training=self.training)\n",
137 |         "        x = self.fc2(x)\n",
138 |         "        return x"
139 |       ]
140 |     },
141 |     {
142 |       "cell_type": "markdown",
143 |       "metadata": {
144 |         "id": "51Boy3Unt4IP"
145 |       },
146 |       "source": [
147 |         "### Training and Testing\n",
148 |         "\n",
149 |         "We will use the `train` and `test` functions to train and test the network. The `train` function takes as input the network, the training data, the optimizer, the loss function, and the number of epochs. The `test` function takes as input the network and the test data. The `train` function returns the training loss and accuracy, and the `test` function returns the test accuracy.\n",
150 |         "\n",
151 |         "Note that we are not using DataLoaders for simplicity in this assignment."
152 |       ]
153 |     },
154 |     {
155 |       "cell_type": "code",
156 |       "execution_count": null,
157 |       "metadata": {
158 |         "id": "fXZhSb3Lt4IP"
159 |       },
160 |       "outputs": [],
161 |       "source": [
162 |         "def train(model, device, x_train, t_train, optimizer, epoch):\n",
163 |         "    model.train()\n",
164 |         "    \n",
165 |         "    for start in range(0, len(t_train)-1, 256): # batch size = 256\n",
166 |         "      end = start + 256\n",
167 |         "      x, y = torch.from_numpy(x_train[start:end]), torch.from_numpy(t_train[start:end]).long()\n",
168 |         "      x, y = x.to(device), y.to(device)\n",
169 |         "      \n",
170 |         "      optimizer.zero_grad()\n",
171 |         "\n",
172 |         "      output = model(x)\n",
173 |         "      loss = F.cross_entropy(output, y)\n",
174 |         "      loss.backward()\n",
175 |         "      optimizer.step()\n",
176 |         "    print('Train Epoch: {} \\tLoss: {:.6f}'.format(epoch, loss.item()))\n",
177 |         "\n",
178 |         "def test(model, device, x_test, t_test):\n",
179 |         "    model.eval()\n",
180 |         "    test_loss = 0\n",
181 |         "    correct = 0\n",
182 |         "    for start in range(0, len(t_test)-1, 256):\n",
183 |         "      end = start + 256\n",
184 |         "      with torch.no_grad():\n",
185 |         "        x, y = torch.from_numpy(x_test[start:end]), torch.from_numpy(t_test[start:end]).long()\n",
186 |         "        x, y = x.to(device), y.to(device)\n",
187 |         "        output = model(x)\n",
188 |         "        test_loss += F.cross_entropy(output, y).item() # sum up batch loss\n",
189 |         "        pred = output.max(1, keepdim=True)[1] # get the index of the max logit\n",
190 |         "        correct += pred.eq(y.view_as(pred)).sum().item()\n",
191 |         "\n",
192 |         "    test_loss /= len(t_test)\n",
193 |         "    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n'.format(\n",
194 |         "        test_loss, correct, len(t_test),\n",
195 |         "        100. * correct / len(t_test)))\n",
196 |         "    return 100. * correct / len(t_test)"
197 |       ]
198 |     },
199 |     {
200 |       "cell_type": "markdown",
201 |       "metadata": {
202 |         "id": "Pl0Aq4eMt4IQ"
203 |       },
204 |       "source": [
205 |         "Let's instantiate the network, the optimizer, and then train and test the network."
206 |       ]
207 |     },
208 |     {
209 |       "cell_type": "code",
210 |       "execution_count": null,
211 |       "metadata": {
212 |         "id": "Kq0ex5Ult4IQ"
213 |       },
214 |       "outputs": [],
215 |       "source": [
216 |         "model = Net().to(device)\n",
217 |         "optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)\n",
218 |         "\n",
219 |         "# train and test\n",
220 |         "for epoch in range(3): \n",
221 |         "    train(model, device, train_dataset.data.numpy(), train_dataset.targets.numpy(), optimizer, epoch)\n",
222 |         "    test(model, device, test_dataset.data.numpy(), test_dataset.targets.numpy())"
223 |       ]
224 |     },
225 |     {
226 |       "cell_type": "markdown",
227 |       "metadata": {
228 |         "id": "981FULu3t4IQ"
229 |       },
230 |       "source": [
231 |         "# Permuted MNIST\n",
232 |         "\n",
233 |         "Permuted MNIST is one of basic benchmarks for continual learning. In this benchmark, the pixels of the MNIST images are permuted randomly. The goal of the network is to learn to classify the images despite the permutation of the pixels. This benchmark is the example of domain continual learning, where the input domain changes."
234 |       ]
235 |     },
236 |     {
237 |       "cell_type": "code",
238 |       "execution_count": null,
239 |       "metadata": {
240 |         "id": "boOTjWztt4IQ"
241 |       },
242 |       "outputs": [],
243 |       "source": [
244 |         "def permute_mnist(mnist, seed):\n",
245 |         "    \"\"\" Given the training set, permute pixels of each img the same way. \"\"\"\n",
246 |         "\n",
247 |         "    np.random.seed(seed)\n",
248 |         "    print(\"starting permutation...\")\n",
249 |         "    h = w = 28\n",
250 |         "    perm_inds = list(range(h*w))\n",
251 |         "    np.random.shuffle(perm_inds)\n",
252 |         "    # print(perm_inds)\n",
253 |         "    perm_mnist = []\n",
254 |         "    for set in mnist:\n",
255 |         "        num_img = set.shape[0]\n",
256 |         "        flat_set = set.reshape(num_img, w * h)\n",
257 |         "        perm_mnist.append(flat_set[:, perm_inds].reshape(num_img, 1, w, h))\n",
258 |         "    print(\"done.\")\n",
259 |         "    return perm_mnist"
260 |       ]
261 |     },
262 |     {
263 |       "cell_type": "code",
264 |       "execution_count": null,
265 |       "metadata": {
266 |         "id": "wLq206KWt4IR"
267 |       },
268 |       "outputs": [],
269 |       "source": [
270 |         "x_train2, x_test2 = permute_mnist([train_dataset.data.numpy(), test_dataset.data.numpy()], 0)"
271 |       ]
272 |     },
273 |     {
274 |       "cell_type": "code",
275 |       "execution_count": null,
276 |       "metadata": {
277 |         "id": "cPD0BUuDt4IR"
278 |       },
279 |       "outputs": [],
280 |       "source": [
281 |         "f, axarr = plt.subplots(1,2)\n",
282 |         "axarr[0].imshow(train_dataset.data.numpy()[1, 0], cmap=\"gray\")\n",
283 |         "axarr[1].imshow(x_train2[2, 0], cmap=\"gray\")\n",
284 |         "np.vectorize(lambda ax:ax.axis('off'))(axarr)"
285 |       ]
286 |     },
287 |     {
288 |       "cell_type": "markdown",
289 |       "metadata": {
290 |         "id": "hv00bhevt4IR"
291 |       },
292 |       "source": [
293 |         "Let's test our pretrained model is still working on both the original and the permuted MNIST datasets."
294 |       ]
295 |     },
296 |     {
297 |       "cell_type": "code",
298 |       "execution_count": null,
299 |       "metadata": {
300 |         "id": "jYxxUnqQt4IR"
301 |       },
302 |       "outputs": [],
303 |       "source": [
304 |         "print(\"Testing on the first task:\")\n",
305 |         "test(model, device, test_dataset.data.numpy(), test_dataset.targets.numpy())\n",
306 |         "\n",
307 |         "print(\"Testing on the second task:\")\n",
308 |         "test(model, device, x_test2, test_dataset.targets.numpy())"
309 |       ]
310 |     },
311 |     {
312 |       "cell_type": "markdown",
313 |       "metadata": {
314 |         "id": "TKinj1qLt4IR"
315 |       },
316 |       "source": [
317 |         "The newtork is unable to classify the permuted MNIST images. This isn't unexpected, since we did not train the network to classify the permuted MNIST images. Now let's fine-tune the network on the permuted MNIST dataset."
318 |       ]
319 |     },
320 |     {
321 |       "cell_type": "code",
322 |       "execution_count": null,
323 |       "metadata": {
324 |         "id": "UcJpmhjPt4IS"
325 |       },
326 |       "outputs": [],
327 |       "source": [
328 |         "for epoch in range(1, 3):\n",
329 |         "    train(model, device, x_train2, train_dataset.targets.numpy(), optimizer, epoch)\n",
330 |         "    test(model, device, x_test2, test_dataset.targets.numpy())"
331 |       ]
332 |     },
333 |     {
334 |       "cell_type": "code",
335 |       "execution_count": null,
336 |       "metadata": {
337 |         "id": "GFO-UyfKt4IS"
338 |       },
339 |       "outputs": [],
340 |       "source": [
341 |         "print(\"Testing on the first task:\")\n",
342 |         "test(model, device, test_dataset.data.numpy(), test_dataset.targets.numpy())\n",
343 |         "\n",
344 |         "print(\"Testing on the second task:\")\n",
345 |         "test(model, device, x_test2, test_dataset.targets.numpy())"
346 |       ]
347 |     },
348 |     {
349 |       "cell_type": "markdown",
350 |       "metadata": {
351 |         "id": "Edawg4rGt4IS"
352 |       },
353 |       "source": [
354 |         "We observe that the network performs very well on the new task but poorly on the original MNIST task. Catastrophic forgetting occurs here: the network forgets the original MNIST task when it is trained on the permuted MNIST task. Now let's see how can we mitigate the effect of catastrophic forgetting."
355 |       ]
356 |     },
357 |     {
358 |       "cell_type": "markdown",
359 |       "metadata": {
360 |         "id": "bLOyzNQbt4IS"
361 |       },
362 |       "source": [
363 |         "## Continual Learning Strategies\n",
364 |         "\n",
365 |         "Continual learning strategies are methods that allow a network to learn multiple tasks without forgetting the previous tasks. There are many different strategies, and we will implement 3 of them in this assignment. The strategies are: \n",
366 |         "\n",
367 |         "*   **Naive**: Naive fine tuning. Train the network on each task separately.\n",
368 |         "*   **EWC**: Elastic Weight Consolidation\n",
369 |         "*   **Rehearsal**: Store some examples from previous tasks and use them to train the network on the current task.\n",
370 |         "\n",
371 |         "Let's implement the strategies. We will use the `train` and `test` functions defined above to train and test the network. "
372 |       ]
373 |     },
374 |     {
375 |       "cell_type": "code",
376 |       "execution_count": null,
377 |       "metadata": {
378 |         "id": "ILjWYww7t4IS"
379 |       },
380 |       "outputs": [],
381 |       "source": [
382 |         "# task 1\n",
383 |         "x_train = train_dataset.data.numpy()\n",
384 |         "t_train = train_dataset.targets.numpy()\n",
385 |         "x_test = test_dataset.data.numpy()\n",
386 |         "t_test = test_dataset.targets.numpy()\n",
387 |         "\n",
388 |         "task_1 = [(x_train, t_train), (x_test, t_test)]\n",
389 |         "\n",
390 |         "# task 2\n",
391 |         "x_train2, x_test2 = permute_mnist([x_train, x_test], 1)\n",
392 |         "task_2 = [(x_train2, t_train), (x_test2, t_test)]\n",
393 |         "\n",
394 |         "# task 3\n",
395 |         "x_train3, x_test3 = permute_mnist([x_train, x_test], 2)\n",
396 |         "task_3 = [(x_train3, t_train), (x_test3, t_test)]\n",
397 |         "\n",
398 |         "# task list\n",
399 |         "tasks = [task_1, task_2, task_3]"
400 |       ]
401 |     },
402 |     {
403 |       "cell_type": "markdown",
404 |       "metadata": {
405 |         "id": "JxqhMVcat4IT"
406 |       },
407 |       "source": [
408 |         "### Naive\n",
409 |         "\n",
410 |         "The naive strategy is the simplest strategy. We just train the network on each task separately. Let's see how well the network performs on each task and how much it forgets from the previous tasks."
411 |       ]
412 |     },
413 |     {
414 |       "cell_type": "code",
415 |       "execution_count": null,
416 |       "metadata": {
417 |         "id": "L91dIksNt4IT"
418 |       },
419 |       "outputs": [],
420 |       "source": [
421 |         "# Define the model and optimizer\n",
422 |         "model = Net().to(device)\n",
423 |         "optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)"
424 |       ]
425 |     },
426 |     {
427 |       "cell_type": "code",
428 |       "execution_count": null,
429 |       "metadata": {
430 |         "id": "4lq0fxtCt4IT"
431 |       },
432 |       "outputs": [],
433 |       "source": [
434 |         "naive_accs = []\n",
435 |         "num_tasks = len(tasks)\n",
436 |         "\n",
437 |         "for id, task in enumerate(tasks):\n",
438 |         "    avg_acc = 0 # average accuracy on task 1, 2, ..., 5\n",
439 |         "    (x_train, t_train), _ = task\n",
440 |         "    print(\"Training on task: \", id+1)\n",
441 |         "\n",
442 |         "    for epoch in range(3): \n",
443 |         "        train(model, device, x_train, t_train, optimizer, epoch)\n",
444 |         "\n",
445 |         "    for id_test, task in enumerate(tasks):\n",
446 |         "        print('Test on task {}:'.format(id_test+1))\n",
447 |         "        _, (x_test, t_test) = task\n",
448 |         "        acc = test(model, device, x_test, t_test)\n",
449 |         "        avg_acc += acc\n",
450 |         "    \n",
451 |         "    naive_accs.append(avg_acc/num_tasks)\n",
452 |         "    print('Average accuracy on each task: ', avg_acc/num_tasks)\n",
453 |         "    print('-----------------------------------')"
454 |       ]
455 |     },
456 |     {
457 |       "cell_type": "markdown",
458 |       "metadata": {
459 |         "id": "nuwwFJuCt4IT"
460 |       },
461 |       "source": [
462 |         "Q1: What do you observe? How much does the network forget from the previous tasks? Why do you think this happens?\n",
463 |         "\n",
464 |         "Q2: (Open-ended question) We are using CNN. Does MLP perform better or worse than CNN? Try it out and report your results."
465 |       ]
466 |     },
467 |     {
468 |       "cell_type": "markdown",
469 |       "metadata": {
470 |         "id": "6Q491OGPt4IT"
471 |       },
472 |       "source": [
473 |         "### EWC\n",
474 |         "\n",
475 |         "Elastic Weights Consolidation (EWC) strategy is proposed in this paper: \"[Overcoming catastrophic forgetting in neural networks](https://arxiv.org/abs/1612.00796)\" This method is a regularization strategy that penalizes the network for changing the weights of the previous tasks. \n",
476 |         "\n",
477 |         "It is based on the computation of the importance of each weight (fisher information) and a squared regularization loss, penalizing changes in the most important wheights for the previous tasks.\n",
478 |         "\n",
479 |         "$\\mathcal{L}_{\\text{EWC}}(\\theta) = \\mathcal{L}(\\theta) + \\lambda / 2 \\sum_i F_i \\left(\\theta_i - \\theta_i^{\\text{old}}\\right)^2$\n",
480 |         "\n",
481 |         "where $\\theta$ is the current network parameters, $\\theta^{\\text{old}}$ is the network parameters from the previous task, $F_i$ is the diagonal value of fisher information matrix , and $\\lambda$ is a hyperparameter. Informally speaking, Fisher information is the approximation of the Hessian matrix of the loss function with respect to the weights. Therefore, the above equation is 2nd order Taylor expansion of the loss function around the previous task parameters. \n",
482 |         "\n",
483 |         "However, computing the fisher information matrix is not trivial. We will use the diagonal approximation of the fisher information matrix, which is the square of the gradient of the loss function with respect to the old weights."
484 |       ]
485 |     },
486 |     {
487 |       "cell_type": "code",
488 |       "execution_count": null,
489 |       "metadata": {
490 |         "id": "QH4NlRqUt4IU"
491 |       },
492 |       "outputs": [],
493 |       "source": [
494 |         "fisher_dict = {}\n",
495 |         "optpar_dict = {}\n",
496 |         "ewc_lambda = 0.4"
497 |       ]
498 |     },
499 |     {
500 |       "cell_type": "code",
501 |       "execution_count": null,
502 |       "metadata": {
503 |         "id": "hWyLwAXGt4IU"
504 |       },
505 |       "outputs": [],
506 |       "source": [
507 |         "model = Net().to(device)\n",
508 |         "optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)"
509 |       ]
510 |     },
511 |     {
512 |       "cell_type": "markdown",
513 |       "metadata": {
514 |         "id": "toZ8DJH7t4IU"
515 |       },
516 |       "source": [
517 |         "Helper function to compute the fisher information matrix for each weight. This function is called after each task is trained."
518 |       ]
519 |     },
520 |     {
521 |       "cell_type": "code",
522 |       "execution_count": null,
523 |       "metadata": {
524 |         "id": "xoPV7QTit4IV"
525 |       },
526 |       "outputs": [],
527 |       "source": [
528 |         "def on_task_update(task_id, x_mem, t_mem):\n",
529 |         "\n",
530 |         "    model.train()\n",
531 |         "    optimizer.zero_grad()\n",
532 |         "\n",
533 |         "    # accumulating gradients\n",
534 |         "    for start in range(0, len(t_mem)-1, 256):\n",
535 |         "        end = start + 256\n",
536 |         "        x, y = torch.from_numpy(x_mem[start:end]), torch.from_numpy(t_mem[start:end]).long()\n",
537 |         "        x, y = x.to(device), y.to(device)\n",
538 |         "        output = model(x)\n",
539 |         "        loss = F.cross_entropy(output, y)\n",
540 |         "        loss.backward()\n",
541 |         "\n",
542 |         "    fisher_dict[task_id] = {}\n",
543 |         "    optpar_dict[task_id] = {}\n",
544 |         "\n",
545 |         "    # gradients accumulated can be used to calculate fisher\n",
546 |         "    for name, param in model.named_parameters():\n",
547 |         "        optpar_dict[task_id][name] = param.data.clone()\n",
548 |         "        fisher_dict[task_id][name] = param.grad.data.clone().pow(2)"
549 |       ]
550 |     },
551 |     {
552 |       "cell_type": "markdown",
553 |       "metadata": {
554 |         "id": "yPMldBhut4IV"
555 |       },
556 |       "source": [
557 |         "We have to change the `train` function to compute the fisher information matrix for each weight. We will use the `on_task_update` function defined above to compute the fisher information matrix."
558 |       ]
559 |     },
560 |     {
561 |       "cell_type": "code",
562 |       "execution_count": null,
563 |       "metadata": {
564 |         "id": "sbehJ7rHt4IV"
565 |       },
566 |       "outputs": [],
567 |       "source": [
568 |         "def train_ewc(model, device, task_id, x_train, t_train, optimizer, epoch):\n",
569 |         "    model.train()\n",
570 |         "\n",
571 |         "    for start in range(0, len(t_train)-1, 256):\n",
572 |         "        end = start + 256\n",
573 |         "        x, y = torch.from_numpy(x_train[start:end]), torch.from_numpy(t_train[start:end]).long()\n",
574 |         "        x, y = x.to(device), y.to(device)\n",
575 |         "        \n",
576 |         "        optimizer.zero_grad()\n",
577 |         "\n",
578 |         "        output = model(x)\n",
579 |         "        loss = F.cross_entropy(output, y)\n",
580 |         "        \n",
581 |         "        for task in range(task_id):\n",
582 |         "            for name, param in model.named_parameters():\n",
583 |         "                fisher = fisher_dict[task][name]\n",
584 |         "                optpar = optpar_dict[task][name]\n",
585 |         "                loss += (fisher * (optpar - param).pow(2)).sum() * ewc_lambda\n",
586 |         "        \n",
587 |         "        loss.backward()\n",
588 |         "        optimizer.step()\n",
589 |         "    print('Train Epoch: {} \\tLoss: {:.6f}'.format(epoch, loss.item()))"
590 |       ]
591 |     },
592 |     {
593 |       "cell_type": "code",
594 |       "execution_count": null,
595 |       "metadata": {
596 |         "id": "lzD3uPKOt4IW"
597 |       },
598 |       "outputs": [],
599 |       "source": [
600 |         "ewc_accs = []\n",
601 |         "num_tasks = len(tasks)\n",
602 |         "\n",
603 |         "for id, task in enumerate(tasks):\n",
604 |         "    avg_acc = 0 # average accuracy on task 1, 2, ..., 5\n",
605 |         "    (x_train, t_train), _ = task\n",
606 |         "    print(\"Training on task: \", id)\n",
607 |         "\n",
608 |         "    for epoch in range(3): \n",
609 |         "        train_ewc(model, device, id, x_train, t_train, optimizer, epoch)\n",
610 |         "    on_task_update(id, x_train, t_train)\n",
611 |         "\n",
612 |         "    for id_test, task in enumerate(tasks):\n",
613 |         "        print('Test on task {}:'.format(id_test+1))\n",
614 |         "        _, (x_test, t_test) = task\n",
615 |         "        acc = test(model, device, x_test, t_test)\n",
616 |         "        avg_acc += acc\n",
617 |         "    \n",
618 |         "    ewc_accs.append(avg_acc/num_tasks)\n",
619 |         "    print('Average accuracy on each task: ', avg_acc/num_tasks)\n",
620 |         "    print('-----------------------------------')"
621 |       ]
622 |     },
623 |     {
624 |       "cell_type": "markdown",
625 |       "metadata": {
626 |         "id": "Yxp7KZMct4IW"
627 |       },
628 |       "source": [
629 |         "Q1. Hyperparameter is underexplored in this assignment. Try different values of $\\lambda$ and report your results.\n",
630 |         "\n",
631 |         "Q2. What is the role of $\\lambda$? What happens if $\\lambda$ is too small or too large? Explain the results with plasticity and stability of the network."
632 |       ]
633 |     },
634 |     {
635 |       "cell_type": "markdown",
636 |       "metadata": {
637 |         "id": "admc6Z4Lt4IW"
638 |       },
639 |       "source": [
640 |         "### Rehearsal\n",
641 |         "\n",
642 |         "Another strategy to mitigate catastrophic forgetting is to store some examples from previous tasks and use them to train the network on the current task. This strategy is called \"rehearsal\". Storing all the examples would perform best but is not feasible. Therefore, we will use a subset of the examples from the previous tasks. "
643 |       ]
644 |     },
645 |     {
646 |       "cell_type": "code",
647 |       "execution_count": null,
648 |       "metadata": {
649 |         "id": "v7t4sYURt4IX"
650 |       },
651 |       "outputs": [],
652 |       "source": [
653 |         "def shuffle_in_unison(dataset, seed, in_place=False):\n",
654 |         "    \"\"\" Shuffle two (or more) list in unison. \"\"\"\n",
655 |         "\n",
656 |         "    np.random.seed(seed)\n",
657 |         "    rng_state = np.random.get_state()\n",
658 |         "    new_dataset = []\n",
659 |         "    for x in dataset:\n",
660 |         "        if in_place:\n",
661 |         "            np.random.shuffle(x)\n",
662 |         "        else:\n",
663 |         "            new_dataset.append(np.random.permutation(x))\n",
664 |         "        np.random.set_state(rng_state)\n",
665 |         "\n",
666 |         "    if not in_place:\n",
667 |         "        return new_dataset"
668 |       ]
669 |     },
670 |     {
671 |       "cell_type": "code",
672 |       "execution_count": null,
673 |       "metadata": {
674 |         "id": "w3RIdXQot4IX"
675 |       },
676 |       "outputs": [],
677 |       "source": [
678 |         "model = Net().to(device)\n",
679 |         "optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)"
680 |       ]
681 |     },
682 |     {
683 |       "cell_type": "code",
684 |       "execution_count": null,
685 |       "metadata": {
686 |         "id": "jnHV8xfht4IX"
687 |       },
688 |       "outputs": [],
689 |       "source": [
690 |         "rehe_accs = []\n",
691 |         "num_tasks = len(tasks)\n",
692 |         "\n",
693 |         "for id, task in enumerate(tasks):\n",
694 |         "    avg_acc = 0\n",
695 |         "    print(\"Training on task: \", id)\n",
696 |         "\n",
697 |         "    (x_train, t_train), _ = task\n",
698 |         "\n",
699 |         "    # for previous task\n",
700 |         "    for i in range(id):\n",
701 |         "        (past_x_train, past_t_train), _ = tasks[i]\n",
702 |         "        x_train = np.concatenate((x_train, past_x_train))\n",
703 |         "        t_train = np.concatenate((t_train, past_t_train))\n",
704 |         "\n",
705 |         "    x_train, t_train = shuffle_in_unison([x_train, t_train], 0)\n",
706 |         "\n",
707 |         "    for epoch in range(3):\n",
708 |         "        train(model, device, x_train, t_train, optimizer, epoch)\n",
709 |         "\n",
710 |         "    for id_test, task in enumerate(tasks):\n",
711 |         "        print(\"Testing on task: \", id_test)\n",
712 |         "        _, (x_test, t_test) = task\n",
713 |         "        acc = test(model, device, x_test, t_test)\n",
714 |         "        avg_acc = avg_acc + acc\n",
715 |         "\n",
716 |         "    print(\"Avg acc: \", avg_acc / num_tasks)\n",
717 |         "    rehe_accs.append(avg_acc / num_tasks)"
718 |       ]
719 |     },
720 |     {
721 |       "cell_type": "markdown",
722 |       "metadata": {
723 |         "id": "epRffel4t4IY"
724 |       },
725 |       "source": [
726 |         "Q1. What would be the pros and cons of rehearsal?"
727 |       ]
728 |     },
729 |     {
730 |       "cell_type": "markdown",
731 |       "metadata": {
732 |         "id": "7Y_CWmB5t4IY"
733 |       },
734 |       "source": [
735 |         "## Conclusion\n",
736 |         "\n",
737 |         "Let's compare the performance of the 3 strategies on the permuted MNIST dataset."
738 |       ]
739 |     },
740 |     {
741 |       "cell_type": "code",
742 |       "execution_count": null,
743 |       "metadata": {
744 |         "id": "zetlL3ozt4IZ"
745 |       },
746 |       "outputs": [],
747 |       "source": [
748 |         "plt.plot([1, 2, 3], naive_accs, '-o', label=\"Naive\")\n",
749 |         "plt.plot([1, 2, 3], rehe_accs, '-o', label=\"Rehearsal\")\n",
750 |         "plt.plot([1, 2, 3], ewc_accs, '-o', label=\"EWC\")\n",
751 |         "plt.xlabel('Tasks Encountered', fontsize=14)\n",
752 |         "plt.ylabel('Average Accuracy', fontsize=14)\n",
753 |         "plt.title('CL Strategies Comparison on MNIST', fontsize=14);\n",
754 |         "plt.xticks([1, 2, 3])\n",
755 |         "plt.legend(prop={'size': 16});"
756 |       ]
757 |     }
758 |   ],
759 |   "metadata": {
760 |     "colab": {
761 |       "provenance": []
762 |     }
763 |   },
764 |   "nbformat": 4,
765 |   "nbformat_minor": 0
766 | }


--------------------------------------------------------------------------------
/Homework/homework11/coding/hw11_policy_gradient.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "provenance": []
  7 |     }
  8 |   },
  9 |   "cells": [
 10 |     {
 11 |       "cell_type": "markdown",
 12 |       "source": [
 13 |         "#**Q. Zero order optimization (Policy Gradient)**\n",
 14 |         "\n",
 15 |         "We will now talk about $0^{th}$ order optimization, also known as Policy Gradient in a Reinforcement Learning context. Although this method is primarily used in an RL context we will be adapting this method to do $0^{th}$ order optimization on a Neural Network.\n",
 16 |         "\n",
 17 |         "$k^{th}$ order optimization means that in the optimization, we use a $k^{th}$ order derivative ($\\frac{δL^k}{δ^kw}$) to do the optimization. So we can see that gradient descent is a first order optimization method, while Newton's method is a second order optimization method.\n",
 18 |         "\n",
 19 |         "Polciy gradient is a $0^{th}$ order optimization method - which means that you use no derivative for the optimzation. This is used in contexts in which the loss is a **blackboxed** function, hence propogating a gradient through it is impossible.\n",
 20 |         "\n",
 21 |         "Policy gradient at a high level approximates the gradient and then does gradient descent using this approximated gradient."
 22 |       ],
 23 |       "metadata": {
 24 |         "id": "ZBeJ1fQyahdK"
 25 |       }
 26 |     },
 27 |     {
 28 |       "cell_type": "markdown",
 29 |       "source": [
 30 |         "##**a) A handy derivation**\n",
 31 |         "Prove that $p_{\\theta}(x) \\nabla_θlog(p_{\\theta}(x)) = \\nabla_θp_{\\theta}(x)$\n"
 32 |       ],
 33 |       "metadata": {
 34 |         "id": "LtPpvzjUahdM"
 35 |       }
 36 |     },
 37 |     {
 38 |       "cell_type": "markdown",
 39 |       "source": [
 40 |         "##**b) Approximating the derivative**\n",
 41 |         "Let's say we have a neural network $f(x)$ which takes in a $x$ and uses the weights($w$) to output 2 logits <br> ($P = [P(y = 0)$, $P(y = 1)]$). <br> Let $p(x, y)$ be the joint distribution of the input and output data according to **our model**. Hence $p_w(x, y) = p(x)p_w(y|x)$, where p(x) is the ground distribution of x, while $p_w(y|x) = f(x)[y]$ is what our model predicts. \n",
 42 |         "<br><br>\n",
 43 |         "\n",
 44 |         "Similarly we have a **blackboxed** loss function $L(x, f(x))$ which outputs a loss. <br>\n",
 45 |         "For example if i wanted to learn to classify y = 1 if x > 5 and y = 0 otherwise, L(4, (0.1, 0.9)) would be small while L(4, (0.9, 0.1)) would be very high. As we already discussed, since this loss is blackboxed we can't take the derivative through it.\n",
 46 |         "<br><br>\n",
 47 |         "We want to optimize the following objective function <br>\n",
 48 |         "$w^* = argmin_wJ(w)$ <br> where $J(w) = E_{(x, f(x)) \\sim p_w(x, y)}[L(x, f(x))]$. \n",
 49 |         "<br>\n",
 50 |         "To do this optimization we want to approximate $\\nabla_{w} J(w)$ so that we could use an optimization method like gradient descent to find $w^*$ \n",
 51 |         "<br><br>\n",
 52 |         "**Prove that $\\nabla_{w} J(w)$ can be approximated as $\\frac{1}{N}∑_{i=1}^{i=N}(\\nabla_wlog(p_w(y_i|x_i))L(x_i, f(x_i))$**\n",
 53 |         "<br><br>\n",
 54 |         "**HINTS:**\n",
 55 |         "<ol>\n",
 56 |         "<li>Try creating a $\\tau = (x, f(x))$</li>\n",
 57 |         "<li>$E[X] = \\int_xxP(X=x)dx$ </li>\n",
 58 |         "<li>Use the result from part a which was $p_{\\theta}(x) \\nabla_θlog(p_{\\theta}(x)) = \\nabla_θp_{\\theta}(x)$</li>\n",
 59 |         "<li>$p_w(x, y) = p(x)p_w(y|x)$</li>\n",
 60 |         "</ol>"
 61 |       ],
 62 |       "metadata": {
 63 |         "id": "YyhoFOTSahdM"
 64 |       }
 65 |     },
 66 |     {
 67 |       "cell_type": "code",
 68 |       "execution_count": null,
 69 |       "metadata": {
 70 |         "id": "RVkC60-2ahdN"
 71 |       },
 72 |       "outputs": [],
 73 |       "source": [
 74 |         "from collections import OrderedDict\n",
 75 |         "\n",
 76 |         "import matplotlib.pyplot as plt\n",
 77 |         "import numpy as np\n",
 78 |         "import torch as torch\n",
 79 |         "import torch.nn.functional as F\n",
 80 |         "from torch import nn"
 81 |       ]
 82 |     },
 83 |     {
 84 |       "cell_type": "code",
 85 |       "source": [
 86 |         "# Determine the device\n",
 87 |         "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
 88 |         "print(\"Using device\", device)"
 89 |       ],
 90 |       "metadata": {
 91 |         "id": "kljXk-yoahdO"
 92 |       },
 93 |       "execution_count": null,
 94 |       "outputs": []
 95 |     },
 96 |     {
 97 |       "cell_type": "markdown",
 98 |       "source": [
 99 |         "### Data Generation\n",
100 |         "\n",
101 |         "In this question, each datapoint is a 8 dimensional vector assigned to one of the four labels depending on their distance to two points $\\mathbf{1}$ and $-\\mathbf{1}$."
102 |       ],
103 |       "metadata": {
104 |         "id": "hNPIU6vVahdO"
105 |       }
106 |     },
107 |     {
108 |       "cell_type": "code",
109 |       "source": [
110 |         "def generate_data(num_samples, input_dim):\n",
111 |         "    @torch.no_grad()\n",
112 |         "    def true_y(x):\n",
113 |         "        dp = ((x - 1) ** 2).sum(dim=-1)\n",
114 |         "        dn = ((x + 1) ** 2).sum(dim=-1)\n",
115 |         "        zp = dp <= input_dim * 2.5\n",
116 |         "        zn = dn <= input_dim * 2.5\n",
117 |         "\n",
118 |         "        return torch.stack([\n",
119 |         "            zp & zn,\n",
120 |         "            zp & ~zn,\n",
121 |         "            ~zp & zn,\n",
122 |         "            ~zp & ~zn\n",
123 |         "        ], dim=1).long().argmax(dim=-1)\n",
124 |         "    \n",
125 |         "    x = torch.rand((num_samples, input_dim)) * 4.4 - 2.2\n",
126 |         "    y = true_y(x)\n",
127 |         "    return x, y, true_y"
128 |       ],
129 |       "metadata": {
130 |         "id": "lwGg9fg_ahdO"
131 |       },
132 |       "execution_count": null,
133 |       "outputs": []
134 |     },
135 |     {
136 |       "cell_type": "markdown",
137 |       "source": [
138 |         "Here is a visualization when the input dimension is 2:"
139 |       ],
140 |       "metadata": {
141 |         "id": "Gl8NwUF5ahdO"
142 |       }
143 |     },
144 |     {
145 |       "cell_type": "code",
146 |       "source": [
147 |         "x, y, true_y = generate_data(1000, 2)\n",
148 |         "plt.scatter(x[:, 0].numpy(), x[:, 1].numpy(), c=y.numpy(), s=5)"
149 |       ],
150 |       "metadata": {
151 |         "id": "RZg3QuQuahdO"
152 |       },
153 |       "execution_count": null,
154 |       "outputs": []
155 |     },
156 |     {
157 |       "cell_type": "markdown",
158 |       "source": [
159 |         "Let's generate data for input dimension of 8."
160 |       ],
161 |       "metadata": {
162 |         "id": "vh2vExpXahdO"
163 |       }
164 |     },
165 |     {
166 |       "cell_type": "code",
167 |       "source": [
168 |         "torch.manual_seed(73)\n",
169 |         "np.random.seed(73)\n",
170 |         "x, y, true_y = generate_data(1000, 8)\n",
171 |         "x_test, y_test, _ = generate_data(1000, 8)\n",
172 |         "x, y = x.to(device), y.to(device)\n",
173 |         "x_test, y_test = x_test.to(device), y_test.to(device)"
174 |       ],
175 |       "metadata": {
176 |         "id": "np2220ueahdO"
177 |       },
178 |       "execution_count": null,
179 |       "outputs": []
180 |     },
181 |     {
182 |       "cell_type": "markdown",
183 |       "source": [
184 |         "Here is the definition of our model:"
185 |       ],
186 |       "metadata": {
187 |         "id": "iJ4nsJsSahdO"
188 |       }
189 |     },
190 |     {
191 |       "cell_type": "code",
192 |       "source": [
193 |         "def get_model(seed):\n",
194 |         "    torch.manual_seed(seed)\n",
195 |         "    np.random.seed(seed)\n",
196 |         "    return nn.Sequential(\n",
197 |         "        OrderedDict(\n",
198 |         "            [  # randomly initialized NN\n",
199 |         "                ('fc1', nn.Linear(8, 32)),\n",
200 |         "                ('relu1', nn.ReLU()),\n",
201 |         "                ('output', nn.Linear(32, 4))]\n",
202 |         "        )\n",
203 |         "    ).to(device)"
204 |       ],
205 |       "metadata": {
206 |         "id": "kMCfuLxeahdP"
207 |       },
208 |       "execution_count": null,
209 |       "outputs": []
210 |     },
211 |     {
212 |       "cell_type": "markdown",
213 |       "source": [
214 |         "Here is the definition of our metric: accuracy."
215 |       ],
216 |       "metadata": {
217 |         "id": "HyWzJ9EKahdP"
218 |       }
219 |     },
220 |     {
221 |       "cell_type": "code",
222 |       "source": [
223 |         "@torch.no_grad()\n",
224 |         "def accuracy(model, x, y):\n",
225 |         "    pred = torch.argmax(model(x), dim=1)\n",
226 |         "    correct = torch.sum(pred == y).item()\n",
227 |         "    acc = (correct / len(y)) * 100\n",
228 |         "    return acc"
229 |       ],
230 |       "metadata": {
231 |         "id": "kRpt_cfUahdP"
232 |       },
233 |       "execution_count": null,
234 |       "outputs": []
235 |     },
236 |     {
237 |       "cell_type": "markdown",
238 |       "source": [
239 |         "### Supervised Learning"
240 |       ],
241 |       "metadata": {
242 |         "id": "3EPhZcWiahdP"
243 |       }
244 |     },
245 |     {
246 |       "cell_type": "code",
247 |       "source": [
248 |         "lr = 0.002\n",
249 |         "num_iters = 3000\n",
250 |         "model = get_model(73)\n",
251 |         "optimizer = torch.optim.Adam(model.parameters(), lr)\n",
252 |         "train_accs = []\n",
253 |         "train_accs.append(accuracy(model, x, y))\n",
254 |         "valid_accs = []\n",
255 |         "valid_accs.append(accuracy(model, x_test, y_test))\n",
256 |         "for epoch in range(num_iters):\n",
257 |         "    logits = model(x)\n",
258 |         "    criterion = nn.CrossEntropyLoss()\n",
259 |         "    loss = criterion(logits, y)\n",
260 |         "\n",
261 |         "    optimizer.zero_grad()\n",
262 |         "    loss.backward()\n",
263 |         "    optimizer.step()\n",
264 |         "    train_accs.append(accuracy(model, x, y))\n",
265 |         "    valid_accs.append(accuracy(model, x_test, y_test))"
266 |       ],
267 |       "metadata": {
268 |         "id": "kqqeLgHzahdP"
269 |       },
270 |       "execution_count": null,
271 |       "outputs": []
272 |     },
273 |     {
274 |       "cell_type": "code",
275 |       "source": [
276 |         "plt.plot([i for i in range(len(train_accs))], train_accs, label=\"Train\")\n",
277 |         "plt.plot([i for i in range(len(valid_accs))], valid_accs, label=\"Test\")\n",
278 |         "plt.xlabel(\"Iterations\")\n",
279 |         "plt.ylabel(\"Accuracy\")\n",
280 |         "plt.legend()\n",
281 |         "plt.show()\n",
282 |         "print(\"Final test accuracy:\", valid_accs[-1])"
283 |       ],
284 |       "metadata": {
285 |         "id": "xL6CrWIFahdP"
286 |       },
287 |       "execution_count": null,
288 |       "outputs": []
289 |     },
290 |     {
291 |       "cell_type": "markdown",
292 |       "source": [
293 |         "### Policy Gradient"
294 |       ],
295 |       "metadata": {
296 |         "id": "4181Vf4ZahdP"
297 |       }
298 |     },
299 |     {
300 |       "cell_type": "code",
301 |       "source": [
302 |         "# Reward Oracle Function (Black-boxed)\n",
303 |         "# This function calculates the reward, returning 3 for correct predictions and -1 for incorrect ones.\n",
304 |         "# Usage guidelines:\n",
305 |         "# - Call this function only; do not rely on its internal implementation details.\n",
306 |         "# - Gradients are not calculated within this function due to the `@torch.no_grad()` decorator.\n",
307 |         "@torch.no_grad()\n",
308 |         "def reward_oracle(x, pred):\n",
309 |         "    return torch.where(true_y(x) == pred, 3, -1)"
310 |       ],
311 |       "metadata": {
312 |         "id": "4Fxj50JRahdP"
313 |       },
314 |       "execution_count": null,
315 |       "outputs": []
316 |     },
317 |     {
318 |       "cell_type": "markdown",
319 |       "source": [
320 |         "**Implement Policy Gradient Algorithm:** Based on the formulas derived in part (b), complete the policy gradient implementation. For this task, we will use the Adam optimizer and process the full dataset in a single batch. The reward oracle has been invoked for you in the provided code. Remember, do not use the true labels directly in your implementation.\n",
321 |         "\n",
322 |         "*Hint: there are two approaches to get `logprob_on_predicted`: (a) use `torch.gather`. (b) use `nn.CrossEntropyLoss` (or its equivalent in `torch.nn.functional`), but with predicted labels instead of true labels.*"
323 |       ],
324 |       "metadata": {
325 |         "id": "rCxct1tAahdP"
326 |       }
327 |     },
328 |     {
329 |       "cell_type": "code",
330 |       "source": [
331 |         "lr = 0.002\n",
332 |         "num_iters = 3000\n",
333 |         "model = get_model(73)\n",
334 |         "optimizer = torch.optim.Adam(model.parameters(), lr)\n",
335 |         "train_accs = []\n",
336 |         "train_accs.append(accuracy(model, x, y))\n",
337 |         "valid_accs = []\n",
338 |         "valid_accs.append(accuracy(model, x_test, y_test))\n",
339 |         "for epoch in range(num_iters):\n",
340 |         "    logits = model(x)\n",
341 |         "    logprob = F.log_softmax(logits, dim=-1)\n",
342 |         "\n",
343 |         "    predicted = logits.detach().argmax(dim=-1)\n",
344 |         "    reward = reward_oracle(x, predicted)\n",
345 |         "\n",
346 |         "    ############################################################################\n",
347 |         "    # TODO: Calculate the loss function of policy gradient\n",
348 |         "    ############################################################################\n",
349 |         "    logprob_on_predicted = None\n",
350 |         "\n",
351 |         "    loss = None\n",
352 |         "    ############################################################################\n",
353 |         "\n",
354 |         "    optimizer.zero_grad()\n",
355 |         "    loss.backward()\n",
356 |         "    optimizer.step()\n",
357 |         "    train_accs.append(accuracy(model, x, y))\n",
358 |         "    valid_accs.append(accuracy(model, x_test, y_test))"
359 |       ],
360 |       "metadata": {
361 |         "id": "Yp7o8V4cahdP"
362 |       },
363 |       "execution_count": null,
364 |       "outputs": []
365 |     },
366 |     {
367 |       "cell_type": "markdown",
368 |       "source": [
369 |         "#### Question\n",
370 |         "\n",
371 |         "**Include the screenshot of the accuracy plot** in your written assignment submission. With a correct implementation, you should observe a test accuracy of approximately 75% after the final iteration."
372 |       ],
373 |       "metadata": {
374 |         "id": "dI6ah63CahdP"
375 |       }
376 |     },
377 |     {
378 |       "cell_type": "code",
379 |       "source": [
380 |         "plt.plot([i for i in range(len(train_accs))], train_accs, label=\"Train\")\n",
381 |         "plt.plot([i for i in range(len(valid_accs))], valid_accs, label=\"Test\")\n",
382 |         "plt.xlabel(\"Iterations\")\n",
383 |         "plt.ylabel(\"Accuracy\")\n",
384 |         "plt.legend()\n",
385 |         "plt.show()\n",
386 |         "print(\"Final test accuracy:\", valid_accs[-1])"
387 |       ],
388 |       "metadata": {
389 |         "id": "_r7vs-pLahdP"
390 |       },
391 |       "execution_count": null,
392 |       "outputs": []
393 |     },
394 |     {
395 |       "cell_type": "markdown",
396 |       "source": [
397 |         "#### Question\n",
398 |         "\n",
399 |         "**Compare the policy gradient and supervised learning approaches for this classification task, focusing on their convergence speed, stability, and final performance. Explain any observed differences.** Include your response to this question in your written assignment submission."
400 |       ],
401 |       "metadata": {
402 |         "id": "r5eGnPnuahdP"
403 |       }
404 |     }
405 |   ]
406 | }


--------------------------------------------------------------------------------
/Homework/homework11/coding/hw11_vae_gan.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "MOVA7zaO8Sx7"
  7 |       },
  8 |       "source": [
  9 |         "## Homework 11\n",
 10 |         "\n",
 11 |         "In this notebook, you will implement VAEs and GANs using a common framework.\n",
 12 |         "\n",
 13 |         "The code is cloned from the following GitHub repo to the **temporary** directory of your Colab runtime, **WHICH WILL BE DELETED IF YOUR RUNTIME IS DISCONNECTED**. It is recommended to edit your code offline and paste it into the Colab environment."
 14 |       ]
 15 |     },
 16 |     {
 17 |       "cell_type": "code",
 18 |       "source": [
 19 |         "%load_ext autoreload\n",
 20 |         "%autoreload 2"
 21 |       ],
 22 |       "metadata": {
 23 |         "id": "fl34gY-wjnJE"
 24 |       },
 25 |       "execution_count": null,
 26 |       "outputs": []
 27 |     },
 28 |     {
 29 |       "cell_type": "code",
 30 |       "execution_count": null,
 31 |       "metadata": {
 32 |         "id": "T6di8XyD8Sx_"
 33 |       },
 34 |       "outputs": [],
 35 |       "source": [
 36 |         "!git clone https://github.com/Berkeley-CS182/cs182hw11\n",
 37 |         "%cd cs182hw11"
 38 |       ]
 39 |     },
 40 |     {
 41 |       "cell_type": "markdown",
 42 |       "metadata": {
 43 |         "id": "AKzef8fz8SyA"
 44 |       },
 45 |       "source": [
 46 |         "### VAEs"
 47 |       ]
 48 |     },
 49 |     {
 50 |       "cell_type": "markdown",
 51 |       "metadata": {
 52 |         "id": "1Ugu2veI8SyA"
 53 |       },
 54 |       "source": [
 55 |         "##### (a) Implement the reparametrization trick in `sample_gaussian` in `utils.py`."
 56 |       ]
 57 |     },
 58 |     {
 59 |       "cell_type": "markdown",
 60 |       "metadata": {
 61 |         "id": "49JIJAJR8SyA"
 62 |       },
 63 |       "source": [
 64 |         "##### (b) Implement `negative_elbo_bound` in `models/vae.py`"
 65 |       ]
 66 |     },
 67 |     {
 68 |       "cell_type": "markdown",
 69 |       "metadata": {
 70 |         "id": "lH-Bk3UW8SyB"
 71 |       },
 72 |       "source": [
 73 |         "##### (c) Run your VAE!\n",
 74 |         "\n",
 75 |         "If you are using the shell, run:\n",
 76 |         "\n",
 77 |         "```\n",
 78 |         "python experiment.py --model vae\n",
 79 |         "```\n",
 80 |         "\n",
 81 |         "If you are on Colab, run the following cells:"
 82 |       ]
 83 |     },
 84 |     {
 85 |       "cell_type": "code",
 86 |       "execution_count": null,
 87 |       "metadata": {
 88 |         "id": "BouQlNOL8SyB"
 89 |       },
 90 |       "outputs": [],
 91 |       "source": [
 92 |         "from experiment import build_config_from_args, experiment\n",
 93 |         "\n",
 94 |         "config = build_config_from_args(is_jupyter=True)\n",
 95 |         "config.model = 'vae'\n",
 96 |         "## TIP: For debugging, feel free to use a smaller iter_max to inspect loss and samples.\n",
 97 |         "config.iter_max = 1000\n",
 98 |         "print(config)\n",
 99 |         "experiment(config)"
100 |       ]
101 |     },
102 |     {
103 |       "cell_type": "markdown",
104 |       "metadata": {
105 |         "id": "NJrny-m-8SyB"
106 |       },
107 |       "source": [
108 |         "##### (d) Visualize samples from your VAE.\n",
109 |         "\n",
110 |         "Find your visualizations in `results/generated_vae_<checkpoint_id>.png`"
111 |       ]
112 |     },
113 |     {
114 |       "cell_type": "markdown",
115 |       "metadata": {
116 |         "id": "8QMrqC1S8SyC"
117 |       },
118 |       "source": [
119 |         "### GANs"
120 |       ]
121 |     },
122 |     {
123 |       "cell_type": "markdown",
124 |       "metadata": {
125 |         "id": "I39JbGUU8SyC"
126 |       },
127 |       "source": [
128 |         "##### (a) See PDF."
129 |       ]
130 |     },
131 |     {
132 |       "cell_type": "markdown",
133 |       "metadata": {
134 |         "id": "nsXIUfV88SyC"
135 |       },
136 |       "source": [
137 |         "##### (b) See PDF."
138 |       ]
139 |     },
140 |     {
141 |       "cell_type": "markdown",
142 |       "metadata": {
143 |         "id": "2L_XPBL78SyC"
144 |       },
145 |       "source": [
146 |         "##### (c) Implement GAN training and visualize your GAN!\n",
147 |         "\n",
148 |         "If you are using the shell, run:\n",
149 |         "\n",
150 |         "```\n",
151 |         "python experiment.py --model gan\n",
152 |         "```\n",
153 |         "\n",
154 |         "If you are on Colab, run the following cells:"
155 |       ]
156 |     },
157 |     {
158 |       "cell_type": "code",
159 |       "execution_count": null,
160 |       "metadata": {
161 |         "id": "9VVLpPEk8SyD"
162 |       },
163 |       "outputs": [],
164 |       "source": [
165 |         "from experiment import build_config_from_args, experiment\n",
166 |         "\n",
167 |         "config = build_config_from_args(is_jupyter=True)\n",
168 |         "config.model = 'gan'\n",
169 |         "## TIP: For debugging, feel free to use a smaller iter_max to inspect loss and samples.\n",
170 |         "# config.iter_max = 500\n",
171 |         "print(config)\n",
172 |         "experiment(config)"
173 |       ]
174 |     },
175 |     {
176 |       "cell_type": "markdown",
177 |       "metadata": {
178 |         "id": "RZeFGNaV8SyD"
179 |       },
180 |       "source": [
181 |         "Find your visualizations in `results/generated_gan_<checkpoint_id>.png`"
182 |       ]
183 |     }
184 |   ],
185 |   "metadata": {
186 |     "kernelspec": {
187 |       "display_name": "cs182",
188 |       "language": "python",
189 |       "name": "python3"
190 |     },
191 |     "language_info": {
192 |       "codemirror_mode": {
193 |         "name": "ipython",
194 |         "version": 3
195 |       },
196 |       "file_extension": ".py",
197 |       "mimetype": "text/x-python",
198 |       "name": "python",
199 |       "nbconvert_exporter": "python",
200 |       "pygments_lexer": "ipython3",
201 |       "version": "3.8.16"
202 |     },
203 |     "orig_nbformat": 4,
204 |     "colab": {
205 |       "provenance": []
206 |     },
207 |     "accelerator": "GPU",
208 |     "gpuClass": "standard"
209 |   },
210 |   "nbformat": 4,
211 |   "nbformat_minor": 0
212 | }


--------------------------------------------------------------------------------
/Homework/homework11/hw11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework11/hw11.pdf


--------------------------------------------------------------------------------
/Homework/homework11/hw11_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework11/hw11_sol.pdf


--------------------------------------------------------------------------------
/Homework/homework12/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework12/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework12/coding/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework12/coding/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework12/hw12.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework12/hw12.pdf


--------------------------------------------------------------------------------
/Homework/homework12/hw12_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework12/hw12_sol.pdf


--------------------------------------------------------------------------------
/Homework/homework3/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework3/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework3/coding/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework3/coding/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework3/coding/hw3_cnn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "source": [
  6 |         "# Setup Environment\n",
  7 |         "\n",
  8 |         "If you are working on this assignment using Google Colab, please execute the codes below.\n",
  9 |         "\n",
 10 |         "Alternatively, you can also do this assignment using a local anaconda environment (or a Python virtualenv). Please clone the GitHub repo by running `git clone https://github.com/Berkeley-CS182/cs182hw3.git` and refer to `README.md` for further details."
 11 |       ],
 12 |       "metadata": {
 13 |         "id": "nufgmc-ppd0d"
 14 |       }
 15 |     },
 16 |     {
 17 |       "cell_type": "code",
 18 |       "source": [
 19 |         "#@title Mount your Google Drive\n",
 20 |         "\n",
 21 |         "import os\n",
 22 |         "from google.colab import drive\n",
 23 |         "drive.mount('/content/gdrive')"
 24 |       ],
 25 |       "metadata": {
 26 |         "id": "wYqx6VYApd0e"
 27 |       },
 28 |       "execution_count": null,
 29 |       "outputs": []
 30 |     },
 31 |     {
 32 |       "cell_type": "code",
 33 |       "source": [
 34 |         "#@title Set up mount symlink\n",
 35 |         "\n",
 36 |         "DRIVE_PATH = '/content/gdrive/My\\ Drive/cs182hw3_sp23'\n",
 37 |         "DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\\\', '')\n",
 38 |         "if not os.path.exists(DRIVE_PYTHON_PATH):\n",
 39 |         "  %mkdir $DRIVE_PATH\n",
 40 |         "\n",
 41 |         "## the space in `My Drive` causes some issues,\n",
 42 |         "## make a symlink to avoid this\n",
 43 |         "SYM_PATH = '/content/cs182hw3'\n",
 44 |         "if not os.path.exists(SYM_PATH):\n",
 45 |         "  !ln -s $DRIVE_PATH $SYM_PATH"
 46 |       ],
 47 |       "metadata": {
 48 |         "id": "ucL2lphHpd0f"
 49 |       },
 50 |       "execution_count": null,
 51 |       "outputs": []
 52 |     },
 53 |     {
 54 |       "cell_type": "code",
 55 |       "source": [
 56 |         "#@title Install dependencies\n",
 57 |         "\n",
 58 |         "!pip install numpy==1.21.6 imageio==2.9.0 matplotlib==3.2.2"
 59 |       ],
 60 |       "metadata": {
 61 |         "id": "BRx0rvMHpd0g"
 62 |       },
 63 |       "execution_count": null,
 64 |       "outputs": []
 65 |     },
 66 |     {
 67 |       "cell_type": "code",
 68 |       "source": [
 69 |         "#@title Clone homework repo\n",
 70 |         "\n",
 71 |         "%cd $SYM_PATH\n",
 72 |         "if not os.path.exists(\"cs182hw3\"):\n",
 73 |         "  !git clone https://github.com/Berkeley-CS182/cs182hw3.git\n",
 74 |         "%cd cs182hw3"
 75 |       ],
 76 |       "metadata": {
 77 |         "id": "T4_GbNUVpd0g"
 78 |       },
 79 |       "execution_count": null,
 80 |       "outputs": []
 81 |     },
 82 |     {
 83 |       "cell_type": "code",
 84 |       "source": [
 85 |         "#@title Download datasets (Skip if you did it in the last part)\n",
 86 |         "\n",
 87 |         "%cd deeplearning/datasets/\n",
 88 |         "!bash ./get_datasets.sh\n",
 89 |         "%cd ../.."
 90 |       ],
 91 |       "metadata": {
 92 |         "id": "OdUhhxtXpd0g"
 93 |       },
 94 |       "execution_count": null,
 95 |       "outputs": []
 96 |     },
 97 |     {
 98 |       "cell_type": "code",
 99 |       "source": [
100 |         "#@title Configure Jupyter Notebook\n",
101 |         "\n",
102 |         "import matplotlib\n",
103 |         "%matplotlib inline\n",
104 |         "%load_ext autoreload\n",
105 |         "%autoreload 2"
106 |       ],
107 |       "metadata": {
108 |         "id": "620hkpvdpd0g"
109 |       },
110 |       "execution_count": null,
111 |       "outputs": []
112 |     },
113 |     {
114 |       "cell_type": "markdown",
115 |       "source": [
116 |         "# Convolutional Networks\n",
117 |         "So far we have worked with deep fully-connected networks, using them to explore different optimization strategies and network architectures. Fully-connected networks are a good testbed for experimentation because they are very computationally efficient, but in practice all state-of-the-art results use convolutional networks instead.\n",
118 |         "\n",
119 |         "First you will implement several layer types that are used in convolutional networks. You will then use these layers to train a convolutional network on the CIFAR-10 dataset."
120 |       ],
121 |       "metadata": {
122 |         "id": "jPOzEoR8pd0h"
123 |       }
124 |     },
125 |     {
126 |       "cell_type": "code",
127 |       "source": [
128 |         "# As usual, a bit of setup\n",
129 |         "\n",
130 |         "import os\n",
131 |         "import time\n",
132 |         "import numpy as np\n",
133 |         "import matplotlib.pyplot as plt\n",
134 |         "from deeplearning.classifiers.fc_net import *\n",
135 |         "from deeplearning.data_utils import get_CIFAR10_data\n",
136 |         "from deeplearning.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array\n",
137 |         "from deeplearning.solver import Solver\n",
138 |         "import random \n",
139 |         "import torch\n",
140 |         "seed = 7\n",
141 |         "torch.manual_seed(seed)\n",
142 |         "random.seed(seed)\n",
143 |         "np.random.seed(seed)\n",
144 |         "\n",
145 |         "plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots\n",
146 |         "plt.rcParams['image.interpolation'] = 'nearest'\n",
147 |         "plt.rcParams['image.cmap'] = 'gray'\n",
148 |         "\n",
149 |         "os.makedirs(\"submission_logs\", exist_ok=True)\n",
150 |         "\n",
151 |         "def abs_error(x, y):\n",
152 |         "    return np.max(np.abs(x - y))\n",
153 |         "\n",
154 |         "def rel_error(x, y):\n",
155 |         "    \"\"\" returns relative error \"\"\"\n",
156 |         "    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))"
157 |       ],
158 |       "metadata": {
159 |         "id": "aB32YfDMpd0i"
160 |       },
161 |       "execution_count": null,
162 |       "outputs": []
163 |     },
164 |     {
165 |       "cell_type": "code",
166 |       "source": [
167 |         "# Load the (preprocessed) CIFAR10 data.\n",
168 |         "\n",
169 |         "data = get_CIFAR10_data()\n",
170 |         "for k, v in data.items():\n",
171 |         "    print ('%s: ' % k, v.shape)"
172 |       ],
173 |       "metadata": {
174 |         "id": "AOgTLXl_pd0i"
175 |       },
176 |       "execution_count": null,
177 |       "outputs": []
178 |     },
179 |     {
180 |       "cell_type": "markdown",
181 |       "metadata": {
182 |         "id": "NZoSvnUupd0j"
183 |       },
184 |       "source": [
185 |         "## Convolution: Naive forward pass\n",
186 |         "The core of a convolutional network is the convolution operation. In the file `deeplearning/layers.py`, implement the forward pass for the convolution layer in the function `conv_forward_naive`. \n",
187 |         "\n",
188 |         "You don't have to worry too much about efficiency at this point; just write the code in whatever way you find most clear.\n",
189 |         "\n",
190 |         "You can test your implementation by running the following:"
191 |       ]
192 |     },
193 |     {
194 |       "cell_type": "code",
195 |       "execution_count": null,
196 |       "metadata": {
197 |         "id": "TL6fJNtspd0j"
198 |       },
199 |       "outputs": [],
200 |       "source": [
201 |         "x_shape = (2, 3, 4, 4)\n",
202 |         "w_shape = (3, 3, 4, 4)\n",
203 |         "x = np.linspace(-0.1, 0.5, num=np.prod(x_shape)).reshape(x_shape)\n",
204 |         "w = np.linspace(-0.2, 0.3, num=np.prod(w_shape)).reshape(w_shape)\n",
205 |         "b = np.linspace(-0.1, 0.2, num=3)\n",
206 |         "\n",
207 |         "conv_param = {'stride': 2, 'pad': 1}\n",
208 |         "out, _ = conv_forward_naive(x, w, b, conv_param)\n",
209 |         "correct_out = np.array([[[[[-0.08759809, -0.10987781],\n",
210 |         "                           [-0.18387192, -0.2109216 ]],\n",
211 |         "                          [[ 0.21027089,  0.21661097],\n",
212 |         "                           [ 0.22847626,  0.23004637]],\n",
213 |         "                          [[ 0.50813986,  0.54309974],\n",
214 |         "                           [ 0.64082444,  0.67101435]]],\n",
215 |         "                         [[[-0.98053589, -1.03143541],\n",
216 |         "                           [-1.19128892, -1.24695841]],\n",
217 |         "                          [[ 0.69108355,  0.66880383],\n",
218 |         "                           [ 0.59480972,  0.56776003]],\n",
219 |         "                          [[ 2.36270298,  2.36904306],\n",
220 |         "                           [ 2.38090835,  2.38247847]]]]])\n",
221 |         "\n",
222 |         "# Compare your output to ours; difference should be around 1e-8\n",
223 |         "print ('Testing conv_forward_naive')\n",
224 |         "print ('difference: ', rel_error(out, correct_out))"
225 |       ]
226 |     },
227 |     {
228 |       "cell_type": "markdown",
229 |       "source": [
230 |         "## Convolution: naive backpropagation\n",
231 |         "\n",
232 |         "In `deeplearning/layers.py`, implement the backpropagation for the convolution layer in the function `conv_backward_naive`.\n",
233 |         "\n",
234 |         "The gradient check below will take 30s~1min depending on the efficiency of your code."
235 |       ],
236 |       "metadata": {
237 |         "id": "pJkyxjc-pd0j"
238 |       }
239 |     },
240 |     {
241 |       "cell_type": "code",
242 |       "source": [
243 |         "x = np.random.randn(10, 3, 5, 5)\n",
244 |         "w = np.random.randn(16, 3, 3, 3)\n",
245 |         "b = np.random.randn(16,)\n",
246 |         "conv_param = {'stride': 2, 'pad': 1}\n",
247 |         "dout = np.random.randn(10, 16, 3, 3)\n",
248 |         "out, cache = conv_forward_naive(x, w, b, conv_param)\n",
249 |         "dx, dw, db = conv_backward_naive(dout, cache)\n",
250 |         "dx_num = eval_numerical_gradient_array(lambda xx: conv_forward_naive(xx, w, b, conv_param)[0], x, dout)\n",
251 |         "dw_num = eval_numerical_gradient_array(lambda ww: conv_forward_naive(x, ww, b, conv_param)[0], w, dout)\n",
252 |         "db_num = eval_numerical_gradient_array(lambda bb: conv_forward_naive(x, w, bb, conv_param)[0], b, dout)\n",
253 |         "\n",
254 |         "print ('dx relative error: ', rel_error(dx, dx_num))\n",
255 |         "print ('dw relative error: ', rel_error(dw, dw_num))\n",
256 |         "print ('db relative error: ', rel_error(db, db_num))"
257 |       ],
258 |       "metadata": {
259 |         "id": "xxmAa9T6pd0j"
260 |       },
261 |       "execution_count": null,
262 |       "outputs": []
263 |     },
264 |     {
265 |       "cell_type": "markdown",
266 |       "metadata": {
267 |         "id": "5uSKAFfupd0j"
268 |       },
269 |       "source": [
270 |         "## Max pooling: Naive forward\n",
271 |         "Implement the forward pass for the max-pooling operation in the function `max_pool_forward_naive` in the file `deeplearning/layers.py`. Again, don't worry too much about computational efficiency.\n",
272 |         "\n",
273 |         "Check your implementation by running the following:"
274 |       ]
275 |     },
276 |     {
277 |       "cell_type": "code",
278 |       "execution_count": null,
279 |       "metadata": {
280 |         "id": "4cK4WYxXpd0k"
281 |       },
282 |       "outputs": [],
283 |       "source": [
284 |         "x_shape = (2, 3, 4, 4)\n",
285 |         "x = np.linspace(-0.3, 0.4, num=np.prod(x_shape)).reshape(x_shape)\n",
286 |         "pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}\n",
287 |         "\n",
288 |         "out, _ = max_pool_forward_naive(x, pool_param)\n",
289 |         "\n",
290 |         "correct_out = np.array([[[[-0.26315789, -0.24842105],\n",
291 |         "                          [-0.20421053, -0.18947368]],\n",
292 |         "                         [[-0.14526316, -0.13052632],\n",
293 |         "                          [-0.08631579, -0.07157895]],\n",
294 |         "                         [[-0.02736842, -0.01263158],\n",
295 |         "                          [ 0.03157895,  0.04631579]]],\n",
296 |         "                        [[[ 0.09052632,  0.10526316],\n",
297 |         "                          [ 0.14947368,  0.16421053]],\n",
298 |         "                         [[ 0.20842105,  0.22315789],\n",
299 |         "                          [ 0.26736842,  0.28210526]],\n",
300 |         "                         [[ 0.32631579,  0.34105263],\n",
301 |         "                          [ 0.38526316,  0.4       ]]]])\n",
302 |         "\n",
303 |         "# Compare your output with ours. Difference should be around 1e-8.\n",
304 |         "print ('Testing max_pool_forward_naive function:')\n",
305 |         "print ('difference: ', rel_error(out, correct_out))"
306 |       ]
307 |     },
308 |     {
309 |       "cell_type": "markdown",
310 |       "metadata": {
311 |         "id": "4jB312yGpd0k"
312 |       },
313 |       "source": [
314 |         "## Max pooling: Naive backward\n",
315 |         "In `deeplearning/layers.py`, implement the backpropagation for the max pooling layer in the function `max_pool_backward_naive`."
316 |       ]
317 |     },
318 |     {
319 |       "cell_type": "code",
320 |       "source": [
321 |         "x = np.random.randn(10, 3, 8, 7)\n",
322 |         "pool_param = {'pool_height': 2, 'pool_width': 3, 'stride': 2}\n",
323 |         "dout = np.random.randn(10, 3, 4, 3)\n",
324 |         "out, cache = max_pool_forward_naive(x, pool_param)\n",
325 |         "dx = max_pool_backward_naive(dout, cache)\n",
326 |         "dx_num = eval_numerical_gradient_array(lambda xx: max_pool_forward_naive(xx, pool_param)[0], x, dout)\n",
327 |         "\n",
328 |         "print ('dx relative error: ', rel_error(dx, dx_num))"
329 |       ],
330 |       "metadata": {
331 |         "id": "Ba4AhiNTpd0k"
332 |       },
333 |       "execution_count": null,
334 |       "outputs": []
335 |     },
336 |     {
337 |       "cell_type": "markdown",
338 |       "metadata": {
339 |         "id": "Qu6qFlWipd0k"
340 |       },
341 |       "source": [
342 |         "## Convolutional \"sandwich\" layers\n",
343 |         "Previously we introduced the concept of \"sandwich\" layers that combine multiple operations into commonly used patterns. In the file `deeplearning/layer_utils.py` you will find sandwich layers that implement a few commonly used patterns for convolutional networks.\n",
344 |         "\n",
345 |         "The gradient check below will take 45s~1min30s depending on the efficiency of your code."
346 |       ]
347 |     },
348 |     {
349 |       "cell_type": "code",
350 |       "execution_count": null,
351 |       "metadata": {
352 |         "id": "2U4TErMMpd0k"
353 |       },
354 |       "outputs": [],
355 |       "source": [
356 |         "from deeplearning.layer_utils import conv_relu_pool_forward, conv_relu_pool_backward\n",
357 |         "\n",
358 |         "x = np.random.randn(2, 3, 16, 16)\n",
359 |         "w = np.random.randn(3, 3, 3, 3)\n",
360 |         "b = np.random.randn(3,)\n",
361 |         "dout = np.random.randn(2, 3, 8, 8)\n",
362 |         "conv_param = {'stride': 1, 'pad': 1}\n",
363 |         "pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}\n",
364 |         "\n",
365 |         "out, cache = conv_relu_pool_forward(x, w, b, conv_param, pool_param)\n",
366 |         "dx, dw, db = conv_relu_pool_backward(dout, cache)\n",
367 |         "\n",
368 |         "dx_num = eval_numerical_gradient_array(lambda x: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], x, dout)\n",
369 |         "dw_num = eval_numerical_gradient_array(lambda w: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], w, dout)\n",
370 |         "db_num = eval_numerical_gradient_array(lambda b: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], b, dout)\n",
371 |         "\n",
372 |         "print ('Testing conv_relu_pool')\n",
373 |         "print ('dx error: ', rel_error(dx_num, dx))\n",
374 |         "print ('dw error: ', rel_error(dw_num, dw))\n",
375 |         "print ('db error: ', rel_error(db_num, db))"
376 |       ]
377 |     },
378 |     {
379 |       "cell_type": "markdown",
380 |       "metadata": {
381 |         "id": "tCvxdByUpd0k"
382 |       },
383 |       "source": [
384 |         "## Three-layer ConvNet\n",
385 |         "Now that you have implemented all the necessary layers, we can put them together into a simple convolutional network.\n",
386 |         "\n",
387 |         "Open the file `deeplearning/classifiers/cnn.py` and complete the implementation of the `ThreeLayerConvNet` class. Run the following cells to help you debug:"
388 |       ]
389 |     },
390 |     {
391 |       "cell_type": "markdown",
392 |       "metadata": {
393 |         "id": "MaQM4xyXpd0k"
394 |       },
395 |       "source": [
396 |         "### Sanity check loss\n",
397 |         "After you build a new network, one of the first things you should do is sanity check the loss. When we use the softmax loss, we expect the loss for random weights (and no regularization) to be about `log(C)` for `C` classes. When we add regularization this should go up."
398 |       ]
399 |     },
400 |     {
401 |       "cell_type": "markdown",
402 |       "metadata": {
403 |         "id": "qmhSlabOpd0k"
404 |       },
405 |       "source": [
406 |         "### Gradient check\n",
407 |         "After the loss looks reasonable, use numeric gradient checking to make sure that your backward pass is correct. When you use numeric gradient checking you should use a small amount of artifical data and a small number of neurons at each layer."
408 |       ]
409 |     },
410 |     {
411 |       "cell_type": "code",
412 |       "execution_count": null,
413 |       "metadata": {
414 |         "id": "vAx2x1fLpd0l"
415 |       },
416 |       "outputs": [],
417 |       "source": [
418 |         "from deeplearning.classifiers.cnn import ThreeLayerConvNet\n",
419 |         "\n",
420 |         "np.random.seed(seed)\n",
421 |         "model = ThreeLayerConvNet(num_filters=3, filter_size=1)\n",
422 |         "\n",
423 |         "N = 50\n",
424 |         "X = np.random.randn(N, 3, 32, 32)\n",
425 |         "y = np.random.randint(10, size=N)\n",
426 |         "\n",
427 |         "loss, grads = model.loss(X, y)\n",
428 |         "print ('Initial loss (no regularization): ', loss)\n",
429 |         "# expected: (approx.) 2.302585092994046\n",
430 |         "\n",
431 |         "model.reg = 0.5\n",
432 |         "loss, grads = model.loss(X, y)\n",
433 |         "print ('Initial loss (with regularization): ', loss)\n",
434 |         "# expected: (approx.) 2.322037342994046"
435 |       ]
436 |     },
437 |     {
438 |       "cell_type": "markdown",
439 |       "source": [
440 |         "The following gradient check will take 1min30s to 3min to run. The max relative error of every parameter tensor should be less than `1e-2`."
441 |       ],
442 |       "metadata": {
443 |         "id": "ZalT9992pd0l"
444 |       }
445 |     },
446 |     {
447 |       "cell_type": "code",
448 |       "execution_count": null,
449 |       "metadata": {
450 |         "id": "yQPcuBbupd0l"
451 |       },
452 |       "outputs": [],
453 |       "source": [
454 |         "num_inputs = 5\n",
455 |         "input_dim = (3, 12, 12)\n",
456 |         "reg = 0.0\n",
457 |         "num_classes = 10\n",
458 |         "np.random.seed(seed)\n",
459 |         "X = np.random.randn(num_inputs, *input_dim)\n",
460 |         "y = np.random.randint(num_classes, size=num_inputs)\n",
461 |         "\n",
462 |         "model = ThreeLayerConvNet(num_filters=3, filter_size=3,\n",
463 |         "                          input_dim=input_dim, hidden_dim=7,\n",
464 |         "                          weight_scale=0.01, reg=0.001, dtype=np.float64)\n",
465 |         "loss, grads = model.loss(X, y)\n",
466 |         "for param_name in sorted(grads):\n",
467 |         "    f = lambda _: model.loss(X, y)[0]\n",
468 |         "    param_grad_num = eval_numerical_gradient(f, model.params[param_name], verbose=False, h=1e-6)\n",
469 |         "    e = rel_error(param_grad_num, grads[param_name])\n",
470 |         "    print ('%s max relative error: %e' % (param_name, rel_error(param_grad_num, grads[param_name])))"
471 |       ]
472 |     },
473 |     {
474 |       "cell_type": "markdown",
475 |       "metadata": {
476 |         "id": "tepQMZ7xpd0l"
477 |       },
478 |       "source": [
479 |         "# Spatial Batch Normalization\n",
480 |         "We already saw that batch normalization is a very useful technique for training deep fully-connected networks. Batch normalization can also be used for convolutional networks, but we need to tweak it a bit; the modification will be called \"spatial batch normalization.\"\n",
481 |         "\n",
482 |         "Normally batch-normalization accepts inputs of shape `(N, D)` and produces outputs of shape `(N, D)`, where we normalize across the minibatch dimension `N`. For data coming from convolutional layers, batch normalization needs to accept inputs of shape `(N, C, H, W)` and produce outputs of shape `(N, C, H, W)` where the `N` dimension gives the minibatch size and the `(H, W)` dimensions give the spatial size of the feature map.\n",
483 |         "\n",
484 |         "If the feature map was produced using convolutions, then we expect the statistics of each feature channel to be relatively consistent both between different imagesand different locations within the same image. Therefore spatial batch normalization computes a mean and variance for each of the `C` feature channels by computing statistics over both the minibatch dimension `N` and the spatial dimensions `H` and `W`."
485 |       ]
486 |     },
487 |     {
488 |       "cell_type": "markdown",
489 |       "metadata": {
490 |         "id": "pO3t3xQepd0l"
491 |       },
492 |       "source": [
493 |         "## Spatial batch normalization: forward\n",
494 |         "\n",
495 |         "In the file `deeplearning/layers.py`, implement the forward pass for spatial batch normalization in the function `spatial_batchnorm_forward`. Check your implementation by running the following:"
496 |       ]
497 |     },
498 |     {
499 |       "cell_type": "code",
500 |       "execution_count": null,
501 |       "metadata": {
502 |         "id": "DQScsktDpd0l"
503 |       },
504 |       "outputs": [],
505 |       "source": [
506 |         "# Check the training-time forward pass by checking means and variances\n",
507 |         "# of features both before and after spatial batch normalization\n",
508 |         "N, C, H, W = 2, 3, 4, 5\n",
509 |         "x = 4 * np.random.randn(N, C, H, W) + 10\n",
510 |         "\n",
511 |         "print ('Before spatial batch normalization:')\n",
512 |         "print ('  Shape: ', x.shape)\n",
513 |         "print ('  Means: ', x.mean(axis=(0, 2, 3)))\n",
514 |         "print ('  Stds: ', x.std(axis=(0, 2, 3)))\n",
515 |         "\n",
516 |         "# Means should be close to zero and stds close to one. Shape should be unchanged.\n",
517 |         "gamma, beta = np.ones(C), np.zeros(C)\n",
518 |         "bn_param = {'mode': 'train'}\n",
519 |         "out, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)\n",
520 |         "print ('After spatial batch normalization:')\n",
521 |         "print ('  Shape: ', out.shape)\n",
522 |         "print ('  Means: ', out.mean(axis=(0, 2, 3)))\n",
523 |         "print ('  Stds: ', out.std(axis=(0, 2, 3)))\n",
524 |         "\n",
525 |         "# Means should be close to beta and stds close to gamma. Shape should be unchnaged.\n",
526 |         "gamma, beta = np.asarray([3, 4, 5]), np.asarray([6, 7, 8])\n",
527 |         "out, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)\n",
528 |         "print ('After spatial batch normalization (nontrivial gamma, beta):')\n",
529 |         "print ('  Shape: ', out.shape)\n",
530 |         "print ('  Means: ', out.mean(axis=(0, 2, 3)))\n",
531 |         "print ('  Stds: ', out.std(axis=(0, 2, 3)))"
532 |       ]
533 |     },
534 |     {
535 |       "cell_type": "code",
536 |       "execution_count": null,
537 |       "metadata": {
538 |         "id": "H3tY5FG5pd0l"
539 |       },
540 |       "outputs": [],
541 |       "source": [
542 |         "# Check the test-time forward pass by running the training-time\n",
543 |         "# forward pass many times to warm up the running averages, and then\n",
544 |         "# checking the means and variances of activations after a test-time\n",
545 |         "# forward pass.\n",
546 |         "\n",
547 |         "N, C, H, W = 10, 4, 11, 12\n",
548 |         "\n",
549 |         "bn_param = {'mode': 'train'}\n",
550 |         "gamma = np.ones(C)\n",
551 |         "beta = np.zeros(C)\n",
552 |         "for t in range(50):\n",
553 |         "    x = 2.3 * np.random.randn(N, C, H, W) + 13\n",
554 |         "    spatial_batchnorm_forward(x, gamma, beta, bn_param)\n",
555 |         "bn_param['mode'] = 'test'\n",
556 |         "x = 2.3 * np.random.randn(N, C, H, W) + 13\n",
557 |         "a_norm, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)\n",
558 |         "\n",
559 |         "# Means should be close to zero and stds close to one, but will be\n",
560 |         "# noisier than training-time forward passes.\n",
561 |         "print ('After spatial batch normalization (test-time):')\n",
562 |         "print ('  means: ', a_norm.mean(axis=(0, 2, 3)))\n",
563 |         "print ('  stds: ', a_norm.std(axis=(0, 2, 3)))"
564 |       ]
565 |     },
566 |     {
567 |       "cell_type": "markdown",
568 |       "metadata": {
569 |         "id": "LqIrnR9qpd0l"
570 |       },
571 |       "source": [
572 |         "## Spatial batch normalization: backward\n",
573 |         "In the file `deeplearning/layers.py`, implement the backward pass for spatial batch normalization in the function `spatial_batchnorm_backward`. Run the following to check your implementation using a numeric gradient check:"
574 |       ]
575 |     },
576 |     {
577 |       "cell_type": "code",
578 |       "execution_count": null,
579 |       "metadata": {
580 |         "id": "jqrurKXYpd0m"
581 |       },
582 |       "outputs": [],
583 |       "source": [
584 |         "N, C, H, W = 2, 3, 4, 5\n",
585 |         "x = 5 * np.random.randn(N, C, H, W) + 12\n",
586 |         "gamma = np.random.randn(C)\n",
587 |         "beta = np.random.randn(C)\n",
588 |         "dout = np.random.randn(N, C, H, W)\n",
589 |         "\n",
590 |         "bn_param = {'mode': 'train'}\n",
591 |         "fx = lambda x: spatial_batchnorm_forward(x, gamma, beta, bn_param)[0]\n",
592 |         "fg = lambda a: spatial_batchnorm_forward(x, gamma, beta, bn_param)[0]\n",
593 |         "fb = lambda b: spatial_batchnorm_forward(x, gamma, beta, bn_param)[0]\n",
594 |         "\n",
595 |         "dx_num = eval_numerical_gradient_array(fx, x, dout)\n",
596 |         "da_num = eval_numerical_gradient_array(fg, gamma, dout)\n",
597 |         "db_num = eval_numerical_gradient_array(fb, beta, dout)\n",
598 |         "\n",
599 |         "_, cache = spatial_batchnorm_forward(x, gamma, beta, bn_param)\n",
600 |         "dx, dgamma, dbeta = spatial_batchnorm_backward(dout, cache)\n",
601 |         "print ('dx error: ', rel_error(dx_num, dx))\n",
602 |         "print ('dgamma error: ', rel_error(da_num, dgamma))\n",
603 |         "print ('dbeta error: ', rel_error(db_num, dbeta))"
604 |       ]
605 |     }
606 |   ],
607 |   "metadata": {
608 |     "colab": {
609 |       "provenance": []
610 |     }
611 |   },
612 |   "nbformat": 4,
613 |   "nbformat_minor": 0
614 | }


--------------------------------------------------------------------------------
/Homework/homework3/coding/hw3_coding_bn_drop_cnn_sol.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework3/coding/hw3_coding_bn_drop_cnn_sol.zip


--------------------------------------------------------------------------------
/Homework/homework3/coding/hw3_pytorch_cnn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "k5E-3ERJpjHj"
  7 |       },
  8 |       "source": [
  9 |         "# Setup Environment\n",
 10 |         "\n",
 11 |         "If you are working on this assignment using Google Colab, please execute the codes below."
 12 |       ]
 13 |     },
 14 |     {
 15 |       "cell_type": "code",
 16 |       "execution_count": null,
 17 |       "metadata": {
 18 |         "id": "fDF4Q4nnpjHm"
 19 |       },
 20 |       "outputs": [],
 21 |       "source": [
 22 |         "#@title Mount your Google Drive\n",
 23 |         "\n",
 24 |         "import os\n",
 25 |         "from google.colab import drive\n",
 26 |         "drive.mount('/content/gdrive')"
 27 |       ]
 28 |     },
 29 |     {
 30 |       "cell_type": "code",
 31 |       "execution_count": null,
 32 |       "metadata": {
 33 |         "id": "70gkC4XapjHo"
 34 |       },
 35 |       "outputs": [],
 36 |       "source": [
 37 |         "#@title Set up mount symlink\n",
 38 |         "\n",
 39 |         "DRIVE_PATH = '/content/gdrive/My\\ Drive/cs182hw3_sp23'\n",
 40 |         "DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\\\', '')\n",
 41 |         "if not os.path.exists(DRIVE_PYTHON_PATH):\n",
 42 |         "  %mkdir $DRIVE_PATH\n",
 43 |         "\n",
 44 |         "## the space in `My Drive` causes some issues,\n",
 45 |         "## make a symlink to avoid this\n",
 46 |         "SYM_PATH = '/content/cs182hw3'\n",
 47 |         "if not os.path.exists(SYM_PATH):\n",
 48 |         "  !ln -s $DRIVE_PATH $SYM_PATH"
 49 |       ]
 50 |     },
 51 |     {
 52 |       "cell_type": "code",
 53 |       "execution_count": null,
 54 |       "metadata": {
 55 |         "id": "xsRSwWw3pjHo"
 56 |       },
 57 |       "outputs": [],
 58 |       "source": [
 59 |         "#@title Install dependencies\n",
 60 |         "\n",
 61 |         "!pip install numpy==1.21.6 imageio==2.9.0 matplotlib==3.2.2"
 62 |       ]
 63 |     },
 64 |     {
 65 |       "cell_type": "code",
 66 |       "execution_count": null,
 67 |       "metadata": {
 68 |         "id": "03zMODLepjHp"
 69 |       },
 70 |       "outputs": [],
 71 |       "source": [
 72 |         "#@title Clone homework repo\n",
 73 |         "\n",
 74 |         "%cd $SYM_PATH\n",
 75 |         "if not os.path.exists(\"cs182hw3\"):\n",
 76 |         "  !git clone https://github.com/Berkeley-CS182/cs182hw3.git\n",
 77 |         "%cd cs182hw3"
 78 |       ]
 79 |     },
 80 |     {
 81 |       "cell_type": "code",
 82 |       "execution_count": null,
 83 |       "metadata": {
 84 |         "id": "aE3DxNYupjHp"
 85 |       },
 86 |       "outputs": [],
 87 |       "source": [
 88 |         "#@title Configure Jupyter Notebook\n",
 89 |         "\n",
 90 |         "import matplotlib\n",
 91 |         "%matplotlib inline\n",
 92 |         "%load_ext autoreload\n",
 93 |         "%autoreload 2"
 94 |       ]
 95 |     },
 96 |     {
 97 |       "cell_type": "markdown",
 98 |       "metadata": {
 99 |         "id": "_UxcjyMQpjHp"
100 |       },
101 |       "source": [
102 |         "# Train Convolutional Neural Networks using PyTorch\n",
103 |         "\n",
104 |         "In this notebook we will put everything together you've learned: affine layers, relu layers, conv layers, max-pooling, (spatial) batch norm, and dropout, and train CNNs on CIFAR-100.\n",
105 |         "\n",
106 |         "However, our implementation of these modules in NumPy are quite inefficient---especially convolutional layers. Therefore, we use PyTorch with GPU in this coding assignment.\n",
107 |         "\n",
108 |         "Make sure you have access to GPUs when running this notebook. On Google Colab, you can switch to a GPU runtime by clicking \"Runtime\" - \"Change Runtime Type\" - \"GPU\" in the menu on the top of the webpage."
109 |       ]
110 |     },
111 |     {
112 |       "cell_type": "code",
113 |       "execution_count": null,
114 |       "metadata": {
115 |         "id": "7RVNc9KGpjHq"
116 |       },
117 |       "outputs": [],
118 |       "source": [
119 |         "import os\n",
120 |         "import json\n",
121 |         "import numpy as np\n",
122 |         "import torch\n",
123 |         "import torch.nn as nn\n",
124 |         "import torch.nn.functional as F\n",
125 |         "import torch.utils as utils\n",
126 |         "import torch.optim as optim\n",
127 |         "import torchvision\n",
128 |         "from torchvision import datasets, transforms\n",
129 |         "import matplotlib.pyplot as plt\n",
130 |         "\n",
131 |         "os.makedirs(\"submission_logs\", exist_ok=True)"
132 |       ]
133 |     },
134 |     {
135 |       "cell_type": "code",
136 |       "execution_count": null,
137 |       "metadata": {
138 |         "id": "YAbWHSFBpjHq"
139 |       },
140 |       "outputs": [],
141 |       "source": [
142 |         "torch.cuda.is_available()\n",
143 |         "# make sure GPU is enabled"
144 |       ]
145 |     },
146 |     {
147 |       "cell_type": "code",
148 |       "execution_count": null,
149 |       "metadata": {
150 |         "id": "VlR-vQyZpjHr"
151 |       },
152 |       "outputs": [],
153 |       "source": [
154 |         "seed = 227"
155 |       ]
156 |     },
157 |     {
158 |       "cell_type": "markdown",
159 |       "metadata": {
160 |         "id": "z4wPezL2pjHr"
161 |       },
162 |       "source": [
163 |         "## Load and Visualize Data\n",
164 |         "In this cell, we load and visualize the CIFAR100 dataset. Note that we apply data augmentation (random horizontal flip) to the training dataset:\n",
165 |         "```pythontransforms.RandomHorizontalFlip()```\n",
166 |         "Data augmentation is a popular technique in machine learning and computer vision that involves generating additional training data to improve the performance of a model. One common form of data augmentation for image data is random horizontal flipping, which involves flipping an image horizontally with a 50% chance during training. This technique is often used to increase the variability of the training data and to help the model generalize better to new, unseen images. By randomly flipping images, the model is exposed to a wider range of orientations and can better learn to recognize features that are invariant to horizontal flipping."
167 |       ]
168 |     },
169 |     {
170 |       "cell_type": "code",
171 |       "execution_count": null,
172 |       "metadata": {
173 |         "id": "YeI6GfGmpjHr"
174 |       },
175 |       "outputs": [],
176 |       "source": [
177 |         "valid_test_transform = transforms.Compose(\n",
178 |         "    [\n",
179 |         "        transforms.ToTensor(),   # convert image to PyTorch Tensor\n",
180 |         "        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n",
181 |         "        # normalize to [-1.0, 1.0] (originally [0.0, 1.0])\n",
182 |         "    ]\n",
183 |         ")\n",
184 |         "\n",
185 |         "train_transform = transforms.Compose(\n",
186 |         "    [\n",
187 |         "        transforms.ToTensor(),\n",
188 |         "        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),\n",
189 |         "        transforms.RandomHorizontalFlip()   # data augmentation\n",
190 |         "    ]\n",
191 |         ")\n",
192 |         "\n",
193 |         "# Download training data from open datasets.\n",
194 |         "training_data = datasets.CIFAR100(\n",
195 |         "    root=\"data\",\n",
196 |         "    train=True,\n",
197 |         "    download=True,\n",
198 |         "    transform=train_transform,\n",
199 |         ")\n",
200 |         "\n",
201 |         "# Download test data from open datasets.\n",
202 |         "valid_test_data = datasets.CIFAR100(\n",
203 |         "    root=\"data\",\n",
204 |         "    train=False,\n",
205 |         "    download=True,\n",
206 |         "    transform=valid_test_transform,\n",
207 |         ")\n",
208 |         "\n",
209 |         "# split original test data to valid data and test data\n",
210 |         "valid_data = list(valid_test_data)[::2]\n",
211 |         "test_data = list(valid_test_data)[1::2]\n",
212 |         "\n",
213 |         "classes = [\n",
214 |         "    \"apple\",\n",
215 |         "    \"aquarium_fish\",\n",
216 |         "    \"baby\",\n",
217 |         "    \"bear\",\n",
218 |         "    \"beaver\",\n",
219 |         "    \"bed\",\n",
220 |         "    \"bee\",\n",
221 |         "    \"beetle\",\n",
222 |         "    \"bicycle\",\n",
223 |         "    \"bottle\",\n",
224 |         "    \"bowl\",\n",
225 |         "    \"boy\",\n",
226 |         "    \"bridge\",\n",
227 |         "    \"bus\",\n",
228 |         "    \"butterfly\",\n",
229 |         "    \"camel\",\n",
230 |         "    \"can\",\n",
231 |         "    \"castle\",\n",
232 |         "    \"caterpillar\",\n",
233 |         "    \"cattle\",\n",
234 |         "    \"chair\",\n",
235 |         "    \"chimpanzee\",\n",
236 |         "    \"clock\",\n",
237 |         "    \"cloud\",\n",
238 |         "    \"cockroach\",\n",
239 |         "    \"couch\",\n",
240 |         "    \"cra\",\n",
241 |         "    \"crocodile\",\n",
242 |         "    \"cup\",\n",
243 |         "    \"dinosaur\",\n",
244 |         "    \"dolphin\",\n",
245 |         "    \"elephant\",\n",
246 |         "    \"flatfish\",\n",
247 |         "    \"forest\",\n",
248 |         "    \"fox\",\n",
249 |         "    \"girl\",\n",
250 |         "    \"hamster\",\n",
251 |         "    \"house\",\n",
252 |         "    \"kangaroo\",\n",
253 |         "    \"keyboard\",\n",
254 |         "    \"lamp\",\n",
255 |         "    \"lawn_mower\",\n",
256 |         "    \"leopard\",\n",
257 |         "    \"lion\",\n",
258 |         "    \"lizard\",\n",
259 |         "    \"lobster\",\n",
260 |         "    \"man\",\n",
261 |         "    \"maple_tree\",\n",
262 |         "    \"motorcycle\",\n",
263 |         "    \"mountain\",\n",
264 |         "    \"mouse\",\n",
265 |         "    \"mushroom\",\n",
266 |         "    \"oak_tree\",\n",
267 |         "    \"orange\",\n",
268 |         "    \"orchid\",\n",
269 |         "    \"otter\",\n",
270 |         "    \"palm_tree\",\n",
271 |         "    \"pear\",\n",
272 |         "    \"pickup_truck\",\n",
273 |         "    \"pine_tree\",\n",
274 |         "    \"plain\",\n",
275 |         "    \"plate\",\n",
276 |         "    \"poppy\",\n",
277 |         "    \"porcupine\",\n",
278 |         "    \"possum\",\n",
279 |         "    \"rabbit\",\n",
280 |         "    \"raccoon\",\n",
281 |         "    \"ray\",\n",
282 |         "    \"road\",\n",
283 |         "    \"rocket\",\n",
284 |         "    \"rose\",\n",
285 |         "    \"sea\",\n",
286 |         "    \"seal\",\n",
287 |         "    \"shark\",\n",
288 |         "    \"shrew\",\n",
289 |         "    \"skunk\",\n",
290 |         "    \"skyscraper\",\n",
291 |         "    \"snail\",\n",
292 |         "    \"snake\",\n",
293 |         "    \"spider\",\n",
294 |         "    \"squirrel\",\n",
295 |         "    \"streetcar\",\n",
296 |         "    \"sunflower\",\n",
297 |         "    \"sweet_pepper\",\n",
298 |         "    \"table\",\n",
299 |         "    \"tank\",\n",
300 |         "    \"telephone\",\n",
301 |         "    \"television\",\n",
302 |         "    \"tiger\",\n",
303 |         "    \"tractor\",\n",
304 |         "    \"train\",\n",
305 |         "    \"trout\",\n",
306 |         "    \"tulip\",\n",
307 |         "    \"turtle\",\n",
308 |         "    \"wardrobe\",\n",
309 |         "    \"whale\",\n",
310 |         "    \"willow_tree\",\n",
311 |         "    \"wolf\",\n",
312 |         "    \"woman\",\n",
313 |         "    \"worm\",\n",
314 |         "]"
315 |       ]
316 |     },
317 |     {
318 |       "cell_type": "code",
319 |       "execution_count": null,
320 |       "metadata": {
321 |         "id": "W5X6ugvopjHs"
322 |       },
323 |       "outputs": [],
324 |       "source": [
325 |         "# Create data loaders.\n",
326 |         "valid_dataloader = utils.data.DataLoader(valid_data, batch_size=5)\n",
327 |         "\n",
328 |         "for X, y in valid_dataloader:\n",
329 |         "    print(f\"Shape of X [N, C, H, W]: {X.shape}\")\n",
330 |         "    print(f\"Shape of y: {y.shape} {y.dtype}\")\n",
331 |         "    break"
332 |       ]
333 |     },
334 |     {
335 |       "cell_type": "markdown",
336 |       "metadata": {
337 |         "id": "U6Vp8H54pjHs"
338 |       },
339 |       "source": [
340 |         "Here is a visualization of 5 images in the validation dataset:"
341 |       ]
342 |     },
343 |     {
344 |       "cell_type": "code",
345 |       "execution_count": null,
346 |       "metadata": {
347 |         "id": "k_dh77hYpjHt"
348 |       },
349 |       "outputs": [],
350 |       "source": [
351 |         "# functions to show an image\n",
352 |         "\n",
353 |         "\n",
354 |         "def imshow(img):\n",
355 |         "    img = img / 2 + 0.5     # unnormalize\n",
356 |         "    npimg = img.numpy()\n",
357 |         "    plt.imshow(np.transpose(npimg, (1, 2, 0)))\n",
358 |         "    plt.show()\n",
359 |         "\n",
360 |         "\n",
361 |         "# get some random training images\n",
362 |         "dataiter = iter(valid_dataloader)\n",
363 |         "images, labels = next(dataiter)\n",
364 |         "\n",
365 |         "# show images\n",
366 |         "imshow(torchvision.utils.make_grid(images))\n",
367 |         "# print labels\n",
368 |         "print('   '.join(f'{classes[labels[j]]:5s}' for j in range(5)))"
369 |       ]
370 |     },
371 |     {
372 |       "cell_type": "markdown",
373 |       "metadata": {
374 |         "id": "9wnVZs2apjHt"
375 |       },
376 |       "source": [
377 |         "## Define the Neural Network Architecture\n",
378 |         "\n",
379 |         "**Complete the code in `dl_pytorch/model.py`** to finish the implementation of a convolutional neural network with batch normalization and dropout."
380 |       ]
381 |     },
382 |     {
383 |       "cell_type": "code",
384 |       "execution_count": null,
385 |       "metadata": {
386 |         "id": "Us2L7PEGpjHt"
387 |       },
388 |       "outputs": [],
389 |       "source": [
390 |         "from dl_pytorch.model import NeuralNetwork\n",
391 |         "\n",
392 |         "model = NeuralNetwork()\n",
393 |         "print(model)\n",
394 |         "\n",
395 |         "assert len(model.state_dict()) == 10\n",
396 |         "assert model.conv1.weight.shape == torch.Size([16, 3, 3, 3])\n",
397 |         "assert model.conv1.bias.shape == torch.Size([16])\n",
398 |         "assert model.conv2.weight.shape == torch.Size([32, 16, 3, 3])\n",
399 |         "assert model.conv2.bias.shape == torch.Size([32])\n",
400 |         "assert model.conv3.weight.shape == torch.Size([64, 32, 3, 3])\n",
401 |         "assert model.conv3.bias.shape == torch.Size([64])\n",
402 |         "assert model.fc1.weight.shape == torch.Size([256, 1024])\n",
403 |         "assert model.fc1.bias.shape == torch.Size([256])\n",
404 |         "assert model.fc2.weight.shape == torch.Size([100, 256])\n",
405 |         "assert model.fc2.bias.shape == torch.Size([100])\n",
406 |         "assert model(torch.randn(9, 3, 32, 32)).shape == torch.Size([9, 100])\n",
407 |         "\n",
408 |         "model = NeuralNetwork(do_batchnorm=True, p_dropout=0.1)\n",
409 |         "assert len(model.state_dict()) == 25\n",
410 |         "assert model.bn1.weight.shape == model.bn1.bias.shape == torch.Size([16])\n",
411 |         "assert model.bn2.weight.shape == model.bn2.bias.shape == torch.Size([32])\n",
412 |         "assert model.bn3.weight.shape == model.bn3.bias.shape == torch.Size([64])\n",
413 |         "assert model(torch.randn(11, 3, 32, 32)).shape == torch.Size([11, 100])"
414 |       ]
415 |     },
416 |     {
417 |       "cell_type": "markdown",
418 |       "metadata": {
419 |         "id": "aO83vd3kpjHu"
420 |       },
421 |       "source": [
422 |         "## Train the Neural Network\n",
423 |         "\n",
424 |         "Complete the code cells below to train your neural network."
425 |       ]
426 |     },
427 |     {
428 |       "cell_type": "code",
429 |       "execution_count": null,
430 |       "metadata": {
431 |         "id": "8CBlHTs-pjHu"
432 |       },
433 |       "outputs": [],
434 |       "source": [
435 |         "def train(dataloader, model, loss_fn, optimizer):\n",
436 |         "    size = len(dataloader.dataset)\n",
437 |         "    model.train()\n",
438 |         "    for batch, (X, y) in enumerate(dataloader):\n",
439 |         "        X, y = X.cuda(), y.cuda()\n",
440 |         "\n",
441 |         "        pred = model(X)\n",
442 |         "        loss = loss_fn(pred, y)\n",
443 |         "\n",
444 |         "        ########################################################################\n",
445 |         "        # TODO: complete the following code for backpropagation and gradient\n",
446 |         "        #  update of a single step.\n",
447 |         "        # Hint: 3 lines\n",
448 |         "        ########################################################################\n",
449 |         "        raise NotImplementedError()\n",
450 |         "        ########################################################################\n",
451 |         "\n",
452 |         "        if batch % 100 == 0:\n",
453 |         "            loss, current = loss.item(), batch * len(X)\n",
454 |         "            print(f\"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]\")"
455 |       ]
456 |     },
457 |     {
458 |       "cell_type": "code",
459 |       "execution_count": null,
460 |       "metadata": {
461 |         "id": "qGs0O5gmpjHu"
462 |       },
463 |       "outputs": [],
464 |       "source": [
465 |         "def test(dataloader, model, loss_fn):\n",
466 |         "    size = len(dataloader.dataset)\n",
467 |         "    num_batches = len(dataloader)\n",
468 |         "    model.eval()\n",
469 |         "    test_loss, correct = 0, 0\n",
470 |         "    with torch.no_grad():\n",
471 |         "        for X, y in dataloader:\n",
472 |         "            X, y = X.cuda(), y.cuda()\n",
473 |         "            pred = model(X)\n",
474 |         "            test_loss += loss_fn(pred, y).item()\n",
475 |         "            correct += (pred.argmax(1) == y).type(torch.float).sum().item()\n",
476 |         "    test_loss /= num_batches\n",
477 |         "    correct /= size\n",
478 |         "    print(f\"Evaluation Error: \\n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \\n\")\n",
479 |         "    return 100*correct"
480 |       ]
481 |     },
482 |     {
483 |       "cell_type": "code",
484 |       "execution_count": null,
485 |       "metadata": {
486 |         "id": "NHZQ1IGQpjHu"
487 |       },
488 |       "outputs": [],
489 |       "source": [
490 |         "def get_optimizer(params, optim_type, lr, momentum, lr_decay, l2_reg):\n",
491 |         "    if optim_type == \"sgd\":\n",
492 |         "        optimizer = optim.SGD(params, lr=lr, momentum=0.0, weight_decay=l2_reg)\n",
493 |         "    elif optim_type == \"sgd_momentum\":\n",
494 |         "        optimizer = optim.SGD(params, lr=lr, momentum=momentum,\n",
495 |         "                              weight_decay=l2_reg)\n",
496 |         "    elif optim_type == \"adam\":\n",
497 |         "        optimizer = optim.AdamW(params, lr=lr, betas=(momentum, 0.999),\n",
498 |         "                                weight_decay=l2_reg)\n",
499 |         "    else:\n",
500 |         "        raise ValueError(optim_type)\n",
501 |         "    scheduler = optim.lr_scheduler.ExponentialLR(optimizer, lr_decay)\n",
502 |         "    return optimizer, scheduler"
503 |       ]
504 |     },
505 |     {
506 |       "cell_type": "markdown",
507 |       "metadata": {
508 |         "id": "SQwcinGOpjHu"
509 |       },
510 |       "source": [
511 |         "Train the neural network. It should achieve at least 35% accuracy on the test set."
512 |       ]
513 |     },
514 |     {
515 |       "cell_type": "code",
516 |       "execution_count": null,
517 |       "metadata": {
518 |         "id": "8yRZp-OppjHv"
519 |       },
520 |       "outputs": [],
521 |       "source": [
522 |         "def run_training(hp, nn_cls, save_prefix):\n",
523 |         "    print(\"Hyperparameters:\", hp)\n",
524 |         "    torch.manual_seed(seed)\n",
525 |         "    torch.cuda.manual_seed(seed)\n",
526 |         "    np.random.seed(seed)\n",
527 |         "\n",
528 |         "    model = nn_cls(do_batchnorm=hp.do_batchnorm, p_dropout=hp.p_dropout).cuda()\n",
529 |         "\n",
530 |         "    # Create data loaders.\n",
531 |         "    train_dataloader = utils.data.DataLoader(\n",
532 |         "        training_data, batch_size=hp.batch_size)\n",
533 |         "    valid_dataloader = utils.data.DataLoader(\n",
534 |         "        valid_data, batch_size=hp.batch_size)\n",
535 |         "    \n",
536 |         "    loss_fn = nn.CrossEntropyLoss()\n",
537 |         "    optimizer, scheduler = get_optimizer(\n",
538 |         "        model.parameters(), hp.optim_type, hp.lr, hp.momentum, hp.lr_decay,\n",
539 |         "        hp.l2_reg)\n",
540 |         "    \n",
541 |         "    for t in range(hp.epochs):\n",
542 |         "        print(f\"Epoch {t+1}\\n-------------------------------\")\n",
543 |         "        train(train_dataloader, model, loss_fn, optimizer)\n",
544 |         "        test(valid_dataloader, model, loss_fn)\n",
545 |         "        scheduler.step()\n",
546 |         "\n",
547 |         "    \n",
548 |         "    print(f\"Saving the model to submission_logs/{save_prefix}.pt\")\n",
549 |         "    torch.save(model.state_dict(), f\"submission_logs/{save_prefix}.pt\")\n",
550 |         "    return model\n",
551 |         "\n",
552 |         "def eval_on_test(hp, model, save_prefix):\n",
553 |         "    train_dataloader = utils.data.DataLoader(\n",
554 |         "        training_data, batch_size=hp.batch_size)\n",
555 |         "    test_dataloader = utils.data.DataLoader(\n",
556 |         "        test_data, batch_size=hp.batch_size)\n",
557 |         "    loss_fn = nn.CrossEntropyLoss()\n",
558 |         "    print(\"Evaluating on the test set\")\n",
559 |         "    test_acc = test(test_dataloader, model, loss_fn)\n",
560 |         "    n_params = sum(p.numel() for p in model.parameters())\n",
561 |         "    print(\"Parameter count: {}\".format(n_params))\n",
562 |         "    n_steps = len(train_dataloader) * hp.epochs\n",
563 |         "    print(\"Training steps: {}\".format(n_steps))\n",
564 |         "    with open(f\"submission_logs/{save_prefix}.json\", \"w\", encoding=\"utf-8\") as f:\n",
565 |         "        json.dump({\n",
566 |         "            \"test_acc\": test_acc,\n",
567 |         "            \"hparams\": hp.__dict__,\n",
568 |         "            \"n_params\": n_params,\n",
569 |         "            \"n_steps\": n_steps\n",
570 |         "        }, f)"
571 |       ]
572 |     },
573 |     {
574 |       "cell_type": "code",
575 |       "execution_count": null,
576 |       "metadata": {
577 |         "id": "C2WYhpODpjHv"
578 |       },
579 |       "outputs": [],
580 |       "source": [
581 |         "from dl_pytorch.hparams import HP as hp_base\n",
582 |         "\n",
583 |         "model = run_training(hp_base, NeuralNetwork, \"model\")\n",
584 |         "eval_on_test(hp_base, model, \"model\")"
585 |       ]
586 |     },
587 |     {
588 |       "cell_type": "markdown",
589 |       "metadata": {
590 |         "id": "9ZCNF2FtpjHv"
591 |       },
592 |       "source": [
593 |         "Train the neural network with batch norm and dropout. It should achieve at least 38% accuracy on the test set."
594 |       ]
595 |     },
596 |     {
597 |       "cell_type": "code",
598 |       "execution_count": null,
599 |       "metadata": {
600 |         "id": "X8dyVSsfpjHv"
601 |       },
602 |       "outputs": [],
603 |       "source": [
604 |         "from dl_pytorch.hparams_bn_drop import HP as hp_bn_drop\n",
605 |         "\n",
606 |         "model = run_training(hp_bn_drop, NeuralNetwork, \"model_bn_drop\")\n",
607 |         "eval_on_test(hp_bn_drop, model, \"model_bn_drop\")"
608 |       ]
609 |     },
610 |     {
611 |       "cell_type": "markdown",
612 |       "metadata": {
613 |         "id": "zNGhm3HPpjHv"
614 |       },
615 |       "source": [
616 |         "## Design your own neural network\n",
617 |         "\n",
618 |         "\n",
619 |         "It's time to showcase your deep learning skills! In this assignment, you will be designing your own neural network using PyTorch. Your task is to **implement your neural network design in the files `dl_pytorch/my_model.py` and `dl_pytorch/hparams_my_model.py`**. The goal is to achieve a test accuracy of **44%** or higher.\n",
620 |         "\n",
621 |         "To ensure reproducibility and to maintain the focus of the assignment, please adhere to the following rules:\n",
622 |         "\n",
623 |         "1. Do not modify the code in the Jupyter Notebook cell or other cells that this cell depends on. It means that you cannot change data processing, the training loop, and the random seed. The emphasis of this assignment is on the model architecture and hyperparameter tuning.\n",
624 |         "\n",
625 |         "1. The number of model parameters must not exceed `1,000,000`.\n",
626 |         "\n",
627 |         "1. The total number of training steps should be no more than `20,000`.\n",
628 |         "\n",
629 |         "1. The maximum number of training epochs is `10`.\n",
630 |         "\n",
631 |         "1. Please refrain from using any pre-trained models or other downloaded assets.\n",
632 |         "\n",
633 |         "Your test accuracy will be displayed on the Gradescope leaderboard. Please note that your rank on the leaderboard does not affect your grade. In order to receive full credit for this part of the assignment, you only need to abide by the rules outlined above and achieve a minimum test accuracy of 44%. Your grade will be scaled linearly, with a score of 0 for a test accuracy of 38% and full credit for a test accuracy of 44% or higher."
634 |       ]
635 |     },
636 |     {
637 |       "cell_type": "code",
638 |       "execution_count": null,
639 |       "metadata": {
640 |         "id": "7Y_TVYpKpjHw"
641 |       },
642 |       "outputs": [],
643 |       "source": [
644 |         "from dl_pytorch.my_model import MyNeuralNetwork\n",
645 |         "from dl_pytorch.hparams_my_model import HP as hp_my_model\n",
646 |         "\n",
647 |         "model = run_training(hp_my_model, MyNeuralNetwork, \"model_my_model\")"
648 |       ]
649 |     },
650 |     {
651 |       "cell_type": "code",
652 |       "execution_count": null,
653 |       "metadata": {
654 |         "id": "rAJVb8ilpjHw"
655 |       },
656 |       "outputs": [],
657 |       "source": [
658 |         "# When you are ready to eval on test set, run this cell\n",
659 |         "# WARNING: In real-world applications, it is a bad practice to evaluate\n",
660 |         "#          frequently on the test set because the model will then perform poorly\n",
661 |         "#          on new, unseen data even if it achieves a high test accuracy.\n",
662 |         "eval_on_test(hp_my_model, model, \"model_my_model\")"
663 |       ]
664 |     },
665 |     {
666 |       "cell_type": "markdown",
667 |       "source": [
668 |         "### Question:\n",
669 |         "\n",
670 |         "**Briefly describe your neural network design and the procedure of hyperparameter tuning.** Please include the answer of this question in your written assignment."
671 |       ],
672 |       "metadata": {
673 |         "collapsed": false,
674 |         "id": "-vwwTiDZpjHw"
675 |       }
676 |     },
677 |     {
678 |       "cell_type": "markdown",
679 |       "metadata": {
680 |         "id": "Oi2CQzrgpjHw"
681 |       },
682 |       "source": [
683 |         "# Collect your submissions\n",
684 |         "\n",
685 |         "The following command will collect your solutions generated by both notebooks.\n",
686 |         "\n",
687 |         "On Colab, after running the following cell, you can download your submissions from the `Files` tab, which can be opened by clicking the file icon on the left hand side of the screen."
688 |       ]
689 |     },
690 |     {
691 |       "cell_type": "code",
692 |       "execution_count": null,
693 |       "metadata": {
694 |         "id": "y_l9HfvppjHx"
695 |       },
696 |       "outputs": [],
697 |       "source": [
698 |         "!rm -f cs182hw3_submission.zip\n",
699 |         "!zip -r cs182hw3_submission.zip . -x \"*.git*\" \"*deeplearning/datasets*\" \"data*\" \"*.ipynb_checkpoints*\" \"*README.md\" \".env/*\" \"*.pyc\" \"*deeplearning/build/*\" \"*__pycache__/*\""
700 |       ]
701 |     }
702 |   ],
703 |   "metadata": {
704 |     "language_info": {
705 |       "name": "python"
706 |     },
707 |     "colab": {
708 |       "provenance": []
709 |     }
710 |   },
711 |   "nbformat": 4,
712 |   "nbformat_minor": 0
713 | }


--------------------------------------------------------------------------------
/Homework/homework3/hw3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework3/hw3.pdf


--------------------------------------------------------------------------------
/Homework/homework3/hw3_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework3/hw3_sol.pdf


--------------------------------------------------------------------------------
/Homework/homework6/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework6/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework6/coding/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework6/coding/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework6/hw6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework6/hw6.pdf


--------------------------------------------------------------------------------
/Homework/homework6/hw6_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework6/hw6_sol.pdf


--------------------------------------------------------------------------------
/Homework/homework7/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework7/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework7/hw7.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework7/hw7.pdf


--------------------------------------------------------------------------------
/Homework/homework7/hw7_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework7/hw7_sol.pdf


--------------------------------------------------------------------------------
/Homework/homework8/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework8/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework8/hw8.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework8/hw8.pdf


--------------------------------------------------------------------------------
/Homework/homework8/hw8_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework8/hw8_sol.pdf


--------------------------------------------------------------------------------
/Homework/homework9/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework9/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework9/coding/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework9/coding/.DS_Store


--------------------------------------------------------------------------------
/Homework/homework9/hw9.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework9/hw9.pdf


--------------------------------------------------------------------------------
/Homework/homework9/hw9_sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Homework/homework9/hw9_sol.pdf


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Leo-Adventure
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Lecture/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/.DS_Store


--------------------------------------------------------------------------------
/Lecture/Lecture 1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture 1.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture10.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture11.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture2.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture3.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture4.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture5.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture6.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture7.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture7.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture8.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture8.pdf


--------------------------------------------------------------------------------
/Lecture/Lecture9.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/Lecture9.pdf


--------------------------------------------------------------------------------
/Lecture/matrixcookbook.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/matrixcookbook.pdf


--------------------------------------------------------------------------------
/Lecture/note10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/note10.png


--------------------------------------------------------------------------------
/Lecture/note9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Lecture/note9.png


--------------------------------------------------------------------------------
/Midterm/mt-sp23-sol.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Midterm/mt-sp23-sol.pdf


--------------------------------------------------------------------------------
/Midterm/mt-sp23.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leo-Adventure/CS182-Berkeley-Deep-Learning-23Spring/c70df32185f7a1e38e3287dbc6d2d0b08f9e98d2/Midterm/mt-sp23.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Berkeley-CS182-Deep-Learning
2 | This repository contains materials from the author's deep learning course at UC Berkeley lectured by [Prof. Sahai](https://www2.eecs.berkeley.edu/Faculty/Homepages/sahai.html), including coursework, assignments, code, and notes, among other materials
3 | 


--------------------------------------------------------------------------------