├── .gitignore ├── .gitmodules ├── 04-spiral_classification.ipynb ├── 10-autoencoder.ipynb ├── 11-VAE.ipynb ├── 17-optimal_control.ipynb ├── README.md ├── docs ├── 404.html ├── _config.yml ├── _layouts │ ├── custom.html │ └── default.html ├── en │ ├── faq.md │ ├── week01 │ │ └── 01.md │ ├── week02 │ │ ├── 02-3.md │ │ └── 02.md │ ├── week03 │ │ ├── 03-3.md │ │ └── 03.md │ ├── week04 │ │ └── 04.md │ ├── week05 │ │ └── 05.md │ ├── week06 │ │ └── 06.md │ ├── week07 │ │ ├── 07-3.md │ │ └── 07.md │ ├── week08 │ │ ├── 08-3.md │ │ └── 08.md │ ├── week09 │ │ ├── 09-3.md │ │ └── 09.md │ ├── week10 │ │ ├── 10-1.md │ │ ├── 10-2.md │ │ ├── 10-3.md │ │ ├── 10.md │ │ └── lecture10.sbv │ ├── week11 │ │ ├── 11-1.md │ │ ├── 11-2.md │ │ └── 11.md │ ├── week12 │ │ ├── 12-1.md │ │ ├── 12-2.md │ │ ├── 12-3.md │ │ └── 12.md │ ├── week13 │ │ └── 13.md │ ├── week14 │ │ └── 14.md │ └── week15 │ │ ├── 15-1.md │ │ ├── 15-2.md │ │ └── 15.md ├── fr │ ├── README-FR.md │ ├── faq.md │ ├── index.md │ ├── week01 │ │ └── 01.md │ ├── week02 │ │ ├── 02-3.md │ │ └── 02.md │ ├── week03 │ │ ├── 03-3.md │ │ └── 03.md │ ├── week04 │ │ └── 04.md │ ├── week05 │ │ └── 05.md │ ├── week06 │ │ └── 06.md │ ├── week07 │ │ ├── 07-3.md │ │ └── 07.md │ ├── week08 │ │ ├── 08-3.md │ │ └── 08.md │ ├── week09 │ │ ├── 09-3.md │ │ └── 09.md │ ├── week10 │ │ ├── 10-1.md │ │ ├── 10-2.md │ │ ├── 10-3.md │ │ ├── 10.md │ │ └── lecture10.sbv │ ├── week11 │ │ ├── 11-1.md │ │ ├── 11-2.md │ │ ├── 11.md │ │ └── lecture11.sbv │ ├── week12 │ │ ├── 12-1.md │ │ ├── 12-2.md │ │ ├── 12-3.md │ │ ├── 12.md │ │ └── lecture12.sbv │ ├── week13 │ │ └── 13.md │ ├── week14 │ │ └── 14.md │ └── week15 │ │ ├── 15-1.md │ │ ├── 15-2.md │ │ ├── 15.md │ │ ├── practicum09.sbv │ │ └── practicum10.sbv ├── images │ ├── week02 │ │ └── 02-3 │ │ │ └── figure1.png │ ├── week03 │ │ └── 03-3 │ │ │ ├── figure1.png │ │ │ ├── figure10.png │ │ │ ├── figure7.png │ │ │ └── figure9.png │ ├── week07 │ │ └── 07-3 │ │ │ ├── Autoencoder_Arch.png │ │ │ ├── DAEOutput.png │ │ │ ├── DALL-E.png │ │ │ ├── DenoisingAutoEncoder.png │ │ │ └── def.png │ ├── week08 │ │ └── 08-3 │ │ │ ├── AE.png │ │ │ ├── DAE.png │ │ │ ├── VAE.png │ │ │ ├── VAE_DAE.png │ │ │ ├── VAEloss.png │ │ │ ├── bubbles_z.png │ │ │ ├── contractiveAE.png │ │ │ └── target_prop.png │ ├── week09 │ │ └── 09-3 │ │ │ ├── 10_autoencoder_cell_12_output_2.png │ │ │ ├── 10_autoencoder_cell_12_output_3.png │ │ │ ├── dae_noise.png │ │ │ ├── dae_output.png │ │ │ ├── fig_10_cluster_samples.png │ │ │ ├── fig_11_gan_vs_dae.png │ │ │ ├── fig_12_gan_vs_vae.png │ │ │ ├── fig_1_ae.png │ │ │ ├── fig_2_under_over.png │ │ │ ├── fig_3_ae_outputs.png │ │ │ ├── fig_4_autoencoder_kernel.png │ │ │ ├── fig_5_dae.png │ │ │ ├── fig_6_dae_kernels.png │ │ │ ├── fig_7_dae_comparison.png │ │ │ ├── fig_8_merged_imgs.png │ │ │ ├── fig_9_vae.png │ │ │ ├── noise_input.png │ │ │ ├── ns_output.png │ │ │ └── telea_output.png │ ├── week10 │ │ ├── 10-1 │ │ │ ├── CL_objective.png │ │ │ ├── cl_loss_fn.png │ │ │ ├── clustering.png │ │ │ ├── con_learning.png │ │ │ ├── contrastive-learning.png │ │ │ ├── equipartition.png │ │ │ ├── moco.png │ │ │ ├── non-imagenet.png │ │ │ ├── pirl.png │ │ │ ├── semantic_features.png │ │ │ ├── soft-assignment.png │ │ │ ├── ssl_trivial.png │ │ │ └── swav.png │ │ ├── 10-2 │ │ │ ├── avid.png │ │ │ ├── byol.png │ │ │ ├── cma.png │ │ │ ├── figure_1.png │ │ │ ├── seer_1.png │ │ │ ├── seer_2.png │ │ │ └── simsiam.png │ │ └── 10-3 │ │ │ ├── autoencoder.png │ │ │ ├── decoder.png │ │ │ ├── ebm.png │ │ │ ├── predictor.png │ │ │ ├── transformer.png │ │ │ └── 
unit_delay.png │ ├── week11 │ │ ├── 11-1 │ │ │ └── figure1.png │ │ └── 11-2 │ │ │ ├── Screenshot (85).png │ │ │ ├── bs1.png │ │ │ ├── bs2.png │ │ │ ├── bs3.png │ │ │ ├── figure10.png │ │ │ ├── figure11.png │ │ │ ├── figure12.png │ │ │ ├── figure13.png │ │ │ ├── figure14.png │ │ │ ├── figure5.png │ │ │ ├── figure6.png │ │ │ ├── figure7.png │ │ │ ├── figure8.png │ │ │ ├── figure9.png │ │ │ └── greedy.png │ ├── week12 │ │ ├── 12-1 │ │ │ ├── figure1.png │ │ │ ├── figure10.png │ │ │ ├── figure11.png │ │ │ ├── figure12.png │ │ │ ├── figure13.png │ │ │ ├── figure14.png │ │ │ ├── figure15.png │ │ │ ├── figure16.png │ │ │ ├── figure17.png │ │ │ ├── figure2.png │ │ │ ├── figure3.png │ │ │ ├── figure4.png │ │ │ ├── figure5.png │ │ │ ├── figure6.png │ │ │ ├── figure7.png │ │ │ ├── figure8.png │ │ │ └── figure9.png │ │ ├── 12-2 │ │ │ ├── figure1.png │ │ │ ├── figure10.png │ │ │ ├── figure10_1.png │ │ │ ├── figure11.png │ │ │ ├── figure12.png │ │ │ ├── figure13.png │ │ │ ├── figure14.png │ │ │ ├── figure15.png │ │ │ ├── figure16.png │ │ │ ├── figure17.png │ │ │ ├── figure18.png │ │ │ ├── figure19.png │ │ │ ├── figure2.png │ │ │ ├── figure2_1.png │ │ │ ├── figure2_2.png │ │ │ ├── figure3.png │ │ │ ├── figure3_1.png │ │ │ ├── figure3_2.png │ │ │ ├── figure4.png │ │ │ ├── figure4_2.png │ │ │ ├── figure4_3.png │ │ │ ├── figure5.png │ │ │ ├── figure6.png │ │ │ ├── figure7.png │ │ │ ├── figure8.png │ │ │ ├── figure8_1.png │ │ │ ├── figure8_2.png │ │ │ └── figure9.png │ │ └── 12-3 │ │ │ ├── figure1.png │ │ │ ├── figure10.png │ │ │ ├── figure11.png │ │ │ ├── figure12.png │ │ │ ├── figure13.png │ │ │ ├── figure14.png │ │ │ ├── figure15.png │ │ │ ├── figure15.svg │ │ │ ├── figure16.png │ │ │ ├── figure16.svg │ │ │ ├── figure2.png │ │ │ ├── figure3.svg │ │ │ ├── figure4.svg │ │ │ ├── figure5.svg │ │ │ ├── figure6.svg │ │ │ ├── figure7.svg │ │ │ ├── figure8.svg │ │ │ └── figure9.png │ └── week15 │ │ ├── 15-1 │ │ ├── 1_fig0.png │ │ ├── 1_fig1.png │ │ ├── 1_fig3.png │ │ ├── 1_fig4.png │ │ ├── 1_fig7.png │ │ └── 1_fig9.png │ │ └── 15-2 │ │ ├── 2_fig1.png │ │ ├── 2_fig2.png │ │ ├── 2_fig3.png │ │ ├── 2_fig4.png │ │ ├── 2_fig5.png │ │ ├── 2_fig6.png │ │ └── 2_fig7.png ├── index.md ├── serve.sh └── static └── res └── plot_lib.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Remove [I]Python caching 2 | __pycache__ 3 | .ipynb_checkpoints 4 | 5 | # Remove Mac shit 6 | .DS_Store 7 | 8 | # Remove Vim temp files 9 | *sw* 10 | 11 | # Ignore Data files 12 | *.tar.gz 13 | *.feat 14 | *.txt 15 | *.data 16 | .idea/ 17 | *.pth 18 | *-ubyte 19 | *.pt 20 | *.png 21 | !docs/**/*.png 22 | imdb 23 | data 24 | .jekyll-cache 25 | _site 26 | .vscode 27 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "docs/jekyllbook"] 2 | path = docs/jekyllbook 3 | url = https://github.com/ebetica/jekyllbook 4 | -------------------------------------------------------------------------------- /04-spiral_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Spiral classification" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import torch\n", 17 | "from torch import nn, optim\n", 18 | "from math import pi as π" 19 | ] 20 | }, 
21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from res.plot_lib import *" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "set_default()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Create the data" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "seed = 12345\n", 62 | "torch.manual_seed(seed)\n", 63 | "N = 1000 # num_samples_per_class\n", 64 | "n = 2 # input dimensions\n", 65 | "K = 5 # num_classes\n", 66 | "d = 100 # num_hidden_units" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# Generate spirals\n", 76 | "\n", 77 | "t = torch.linspace(0, 1, N)\n", 78 | "a = 0.8 * t + 0.2 # amplitude 0.2 → 1.0\n", 79 | "X = list()\n", 80 | "y = list()\n", 81 | "for k in range(K):\n", 82 | " θ = (2 * t + k) * 2 * π / K + 0.2 * torch.randn(N)\n", 83 | " X.append(torch.stack((a * θ.sin(), a * θ.cos()), dim=1))\n", 84 | " y.append(torch.zeros(N, dtype=torch.long).fill_(k))\n", 85 | "X = torch.cat(X)\n", 86 | "y = torch.cat(y)\n", 87 | "\n", 88 | "print(\"Shapes:\")\n", 89 | "print(\"X:\", tuple(X.size()))\n", 90 | "print(\"y:\", tuple(y.size()))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# And visualise them\n", 100 | "plot_data(X, y)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Build and train a neural net" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "learning_rate = 1e-3\n", 117 | "lambda_l2 = 1e-5" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "# Model definition\n", 127 | "model = nn.Sequential(\n", 128 | " nn.Linear(n, d),\n", 129 | " # nn.ReLU(), # Comment this line for a linear model\n", 130 | " nn.Linear(d, K) # (Optional) Comment this line and uncomment the next one to display 2D embeddings below\n", 131 | " # nn.Linear(d, 2), nn.Linear(2, K)\n", 132 | ")\n", 133 | "model.to(device) # possibly send to CUDA\n", 134 | "\n", 135 | "# Cross entropy given the linear output\n", 136 | "C = torch.nn.CrossEntropyLoss(reduction='none')\n", 137 | "\n", 138 | "# Using Adam optimiser\n", 139 | "optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=lambda_l2) # built-in L2\n", 140 | "\n", 141 | "# Full-batch training loop\n", 142 | "for t in range(2_000):\n", 143 | " \n", 144 | " # Feed forward to get the linear sum s\n", 145 | " s = model(X)\n", 146 | " \n", 147 | " # Compute the free energy F and loss L\n", 148 | " F = C(s, y)\n", 149 | " L = F.mean()\n", 150 | " \n", 151 | " # Zero the gradients\n", 152 | " optimiser.zero_grad()\n", 153 | " \n", 154 | " # Backward pass to compute and accumulate the gradient\n", 155 | " # of the free energy w.r.t our learnable params\n", 156 | " 
L.backward()\n", 157 | " \n", 158 | " # Update params\n", 159 | " optimiser.step()\n", 160 | " \n", 161 | " # Display epoch, L, and accuracy\n", 162 | " overwrite(f'[EPOCH]: {t}, [LOSS]: {L.item():.6f}, [ACCURACY]: {acc(s, y):.3f}')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# Plot trained model\n", 172 | "print(model)\n", 173 | "plot_model(X, y, model)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "# (Optional) Plot internal 2D embeddings if available\n", 183 | "plot_embeddings(X, y, model, zoom=10)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "# Compute linear output s for a fine grid over the input space\n", 193 | "\n", 194 | "mesh = torch.arange(-1.5, 1.5, 0.01)\n", 195 | "xx, yy = torch.meshgrid(mesh, mesh)\n", 196 | "grid = torch.stack((xx.reshape(-1), yy.reshape(-1)), dim=1)\n", 197 | "with torch.no_grad():\n", 198 | " s = model(grid)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# Choice of free energy\n", 208 | "\n", 209 | "fe = 'cross-entropy'\n", 210 | "fe = 'negative linear output'" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "# Switch to non-interactive matplotlib\n", 220 | "%matplotlib inline\n", 221 | "set_default()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "# ! mkdir {m}-levels" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "scrolled": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "# Plot 2d energy levels\n", 242 | "\n", 243 | "for k in range(K):\n", 244 | " if fe == 'cross-entropy':\n", 245 | " F = C(s, torch.LongTensor(1).fill_(k).expand(s.size(0)))\n", 246 | " F = F.reshape(xx.shape)\n", 247 | " plot_2d_energy_levels(X, y, (xx, yy, F, k, K), (0, 35), (1, 35, 4))\n", 248 | "\n", 249 | " elif fe == 'negative linear output':\n", 250 | " F = -s[:, k]\n", 251 | " F = F.reshape(xx.shape)\n", 252 | " plot_2d_energy_levels(X, y, (xx, yy, F, k, K), (-20, 20), (-20, 21, 2.5))\n", 253 | " \n", 254 | "# plt.savefig(f'{m}-levels/{k}.png', bbox_inches='tight')" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# ! 
ffmpeg -framerate 1 -i {m}-levels/%d.png -r 25 -vf \"crop=trunc(iw/2)*2:trunc(ih/2)*2\" -pix_fmt yuv420p {m}-levels.mp4" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "# Switch to interactive matplotlib\n", 273 | "%matplotlib widget" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "# Cross-entropy\n", 283 | "if fe == 'cross-entropy':\n", 284 | " fig, ax = plot_3d_energy_levels(X, y, (xx, yy, F, k, K), (0, 18), (0, 19, 1), (0, 19, 2))\n", 285 | "elif fe == 'negative linear output':\n", 286 | " fig, ax = plot_3d_energy_levels(X, y, (xx, yy, F, k, K), (-30, 20), (-30, 20, 1), (-30, 21, 5))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# ! mkdir {m}-3d-levels" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "# Spin it around (and maybe save to disk)\n", 305 | "δ = 10\n", 306 | "for angle in range(0, 360, δ):\n", 307 | " ax.view_init(30, -60 + angle)\n", 308 | " fig.canvas.draw()\n", 309 | "# plt.pause(.001)\n", 310 | "# plt.savefig(f'{m}-3d-levels/{angle:03d}.png', bbox_inches='tight')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "# ! ffmpeg -i {m}-3d-levels/%03d.png -vf \"crop=trunc(iw/2)*2:trunc(ih/2)*2\" -pix_fmt yuv420p {m}-3d-levels.mp4" 320 | ] 321 | } 322 | ], 323 | "metadata": { 324 | "kernelspec": { 325 | "display_name": "Python 3 (ipykernel)", 326 | "language": "python", 327 | "name": "python3" 328 | }, 329 | "language_info": { 330 | "codemirror_mode": { 331 | "name": "ipython", 332 | "version": 3 333 | }, 334 | "file_extension": ".py", 335 | "mimetype": "text/x-python", 336 | "name": "python", 337 | "nbconvert_exporter": "python", 338 | "pygments_lexer": "ipython3", 339 | "version": "3.10.13" 340 | } 341 | }, 342 | "nbformat": 4, 343 | "nbformat_minor": 4 344 | } 345 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NYU Deep Learning Spring 2021 (NYU-DLSP21) 2 | 3 | 4 | [🇬🇧](https://github.com/Atcold/NYU-DLSP21/blob/master/README.md)   [🇫🇷](https://github.com/Atcold/NYU-DLSP21/blob/master/docs/fr/README-FR.md) 5 | 6 | 7 | ## Content new organisation 8 | 9 | This semester we have reorganised the didactic material. 10 | In the first half of the semester we covered 3 topics, spanning two weeks, each followed by an assignment. 11 | Moreover, each lecture had a corresponding practicum. 12 | 13 | 1. History, backpropagation, and gradient descent 14 | 2. Parameter sharing: recurrent and convolutional networks 15 | 3. Latent variable (LV) energy based models (EBMs) 16 | 17 | Pay attention that we have redesigned the curriculum and lectures' content. 18 | We've treated LV-EBM as a *basic* module, which to build upon. 19 | 20 | 21 | ## Enters the semester's second half 22 | 23 | I thought I was going to repropose the same practica I've used during [NYU-DLSP20](https://github.com/Atcold/NYU-DLSP20), last year edition, just in different order. 24 | 25 | But I couldn't. 26 | 27 | This year's students have LV-EBMs on their side. 
28 | We told them about *the cake* and now I cannot pretend it doesn't exist and teach as if they were unaware of the elephant in the room. 29 | It would have been intellectually dishonest. 30 | Henceforth, I've redesigned my whole deck of slides. 31 | 32 | 33 | ## This semester repository 34 | 35 | That's why this repo has been created. 36 | I'm **not** going to try to do the same insane work I've put up with last year, but I need a space where to post updated slides, notebooks, and host new transcriptions. 37 | Last year material is still valid. 38 | This year you have a different take. 39 | A more powerful one. 40 | 41 | 42 | ## Previous releases 43 | 44 | Before NYU-DLSP21 there were… 45 | 46 | - [NYU-DLSP20](https://github.com/Atcold/NYU-DLSP20) (major release) 47 | - [NYU-DLSP19](https://github.com/Atcold/NYU-DLSP20/releases/tag/dlsp19) 48 | - [AIMS-DLFL19](https://github.com/Atcold/NYU-DLSP20/releases/tag/aims-fl18) 49 | - [CoDaS-HEP18](https://github.com/Atcold/NYU-DLSP20/releases/tag/v1.0.0) 50 | - [NYU-DLSP18](https://docs.google.com/document/d/1_p1Mw-NtMGN_vpas_pchLsQC2u0NM5mTnRapBrQ2ivk/) 51 | - [Purdue-DLFL16](https://docs.google.com/document/d/1ugJRMqQ_cCUQC1B8mSE0iro7sKrDT8-BnppTZv0rA08/) 52 | - [torch-Video-Tutorials](https://github.com/Atcold/torch-Video-Tutorials) 53 | 54 | ## More info 55 | 56 | Keep reading on the [class website](https://atcold.github.io/NYU-DLSP21/). 57 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | jekyllbook/404.html -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | permalink: pretty 2 | 3 | # Setup 4 | title: 'Deep Learning' 5 | url: https://atcold.github.io/NYU-DLSP21/ 6 | baseurl: '/NYU-DLSP21' 7 | homepage_title: Home 8 | default_lang: 'en' 9 | 10 | # About/contact 11 | author: 12 | name: atcold 13 | url: https://twitter.com/alfcnz 14 | github: 15 | repo: https://github.com/atcold/NYU-DLSP21 16 | 17 | # Custom vars 18 | version: dlsp21 19 | 20 | src: "." 
21 | default_theme: "ayu" 22 | 23 | defaults: 24 | - scope: 25 | path: "" # an empty string here means all files in the project 26 | values: 27 | layout: "custom" 28 | 29 | # For Maths 30 | markdown: kramdown 31 | 32 | # To use hljs, disable the default highlighter 33 | kramdown: 34 | syntax_highlighter_opts: 35 | disable: true 36 | math_engine: null 37 | 38 | exclude: 39 | - jekyllbook 40 | - en/index.md 41 | - vendor 42 | 43 | 44 | ################################### English #################################### 45 | prologues: 46 | - path: en/faq.md 47 | chapters: 48 | - path: en/week01/01.md 49 | - path: en/week02/02.md 50 | sections: 51 | - path: en/week02/02-3.md 52 | - path: en/week03/03.md 53 | sections: 54 | - path: en/week03/03-3.md 55 | - path: en/week04/04.md 56 | - path: en/week05/05.md 57 | - path: en/week06/06.md 58 | - path: en/week07/07.md 59 | sections: 60 | - path: en/week07/07-3.md 61 | - path: en/week08/08.md 62 | sections: 63 | - path: en/week08/08-3.md 64 | - path: en/week09/09.md 65 | sections: 66 | - path: en/week09/09-3.md 67 | - path: en/week10/10.md 68 | sections: 69 | - path: en/week10/10-1.md 70 | - path: en/week10/10-2.md 71 | - path: en/week10/10-3.md 72 | - path: en/week11/11.md 73 | sections: 74 | - path: en/week11/11-1.md 75 | - path: en/week11/11-2.md 76 | - path: en/week12/12.md 77 | sections: 78 | - path: en/week12/12-1.md 79 | - path: en/week12/12-2.md 80 | - path: en/week12/12-3.md 81 | - path: en/week13/13.md 82 | - path: en/week14/14.md 83 | - path: en/week15/15.md 84 | sections: 85 | - path: en/week15/15-1.md 86 | - path: en/week15/15-2.md 87 | 88 | 89 | 90 | ################################### French #################################### 91 | fr: 92 | title: 'Apprentissage Profond' 93 | prologues: 94 | - path: fr/faq.md 95 | chapters: 96 | - path: fr/week01/01.md 97 | - path: fr/week02/02.md 98 | sections: 99 | - path: fr/week02/02-3.md 100 | - path: fr/week03/03.md 101 | sections: 102 | - path: fr/week03/03-3.md 103 | - path: fr/week04/04.md 104 | - path: fr/week05/05.md 105 | - path: fr/week06/06.md 106 | - path: fr/week07/07.md 107 | sections: 108 | - path: fr/week07/07-3.md 109 | - path: fr/week08/08.md 110 | sections: 111 | - path: fr/week08/08-3.md 112 | - path: fr/week09/09.md 113 | sections: 114 | - path: fr/week09/09-3.md 115 | - path: fr/week10/10.md 116 | sections: 117 | - path: fr/week10/10-1.md 118 | - path: fr/week10/10-2.md 119 | - path: fr/week10/10-3.md 120 | - path: fr/week11/11.md 121 | sections: 122 | - path: fr/week11/11-1.md 123 | - path: fr/week11/11-2.md 124 | - path: fr/week12/12.md 125 | sections: 126 | - path: fr/week12/12-1.md 127 | - path: fr/week12/12-2.md 128 | - path: fr/week12/12-3.md 129 | - path: fr/week13/13.md 130 | - path: fr/week14/14.md 131 | - path: fr/week15/15.md 132 | sections: 133 | - path: fr/week15/15-1.md 134 | - path: fr/week15/15-2.md 135 | -------------------------------------------------------------------------------- /docs/_layouts/custom.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 |
6 | $$\gdef \sam #1 {\mathrm{softargmax}(#1)}$$ 7 | $$\gdef \vect #1 {\boldsymbol{#1}} $$ 8 | $$\gdef \matr #1 {\boldsymbol{#1}} $$ 9 | $$\gdef \E {\mathbb{E}} $$ 10 | $$\gdef \V {\mathbb{V}} $$ 11 | $$\gdef \R {\mathbb{R}} $$ 12 | $$\gdef \N {\mathbb{N}} $$ 13 | $$\gdef \relu #1 {\texttt{ReLU}(#1)} $$ 14 | $$\gdef \D {\,\mathrm{d}} $$ 15 | $$\gdef \deriv #1 #2 {\frac{\D #1}{\D #2}}$$ 16 | $$\gdef \pd #1 #2 {\frac{\partial #1}{\partial #2}}$$ 17 | $$\gdef \set #1 {\left\lbrace #1 \right\rbrace} $$ 18 | 19 | % My colours 20 | 21 | $$\gdef \aqua #1 {\textcolor{8dd3c7}{#1}} $$ 22 | $$\gdef \yellow #1 {\textcolor{ffffb3}{#1}} $$ 23 | $$\gdef \lavender #1 {\textcolor{bebada}{#1}} $$ 24 | $$\gdef \red #1 {\textcolor{fb8072}{#1}} $$ 25 | $$\gdef \blue #1 {\textcolor{80b1d3}{#1}} $$ 26 | $$\gdef \orange #1 {\textcolor{fdb462}{#1}} $$ 27 | $$\gdef \green #1 {\textcolor{b3de69}{#1}} $$ 28 | $$\gdef \pink #1 {\textcolor{fccde5}{#1}} $$ 29 | $$\gdef \vgrey #1 {\textcolor{d9d9d9}{#1}} $$ 30 | $$\gdef \violet #1 {\textcolor{bc80bd}{#1}} $$ 31 | $$\gdef \unka #1 {\textcolor{ccebc5}{#1}} $$ 32 | $$\gdef \unkb #1 {\textcolor{ffed6f}{#1}} $$ 33 | 34 | % Vectors 35 | $$\gdef \vx {\pink{\vect{x }}} $$ 36 | $$\gdef \vy {\blue{\vect{y }}} $$ 37 | $$\gdef \vb {\vect{b}} $$ 38 | $$\gdef \vz {\orange{\vect{z }}} $$ 39 | $$\gdef \vtheta {\vect{\theta }} $$ 40 | $$\gdef \vh {\green{\vect{h }}} $$ 41 | $$\gdef \vq {\aqua{\vect{q }}} $$ 42 | $$\gdef \vk {\yellow{\vect{k }}} $$ 43 | $$\gdef \vv {\green{\vect{v }}} $$ 44 | $$\gdef \vytilde {\violet{\tilde{\vect{y}}}} $$ 45 | $$\gdef \vyhat {\red{\hat{\vect{y}}}} $$ 46 | $$\gdef \vycheck {\blue{\check{\vect{y}}}} $$ 47 | $$\gdef \vzcheck {\blue{\check{\vect{z}}}} $$ 48 | $$\gdef \vztilde {\green{\tilde{\vect{z}}}} $$ 49 | $$\gdef \vmu {\green{\vect{\mu}}} $$ 50 | $$\gdef \vu {\orange{\vect{u}}} $$ 51 | 52 | % Matrices 53 | $$\gdef \mW {\matr{W}} $$ 54 | $$\gdef \mA {\matr{A}} $$ 55 | $$\gdef \mX {\pink{\matr{X}}} $$ 56 | $$\gdef \mY {\blue{\matr{Y}}} $$ 57 | $$\gdef \mQ {\aqua{\matr{Q }}} $$ 58 | $$\gdef \mK {\yellow{\matr{K }}} $$ 59 | $$\gdef \mV {\lavender{\matr{V }}} $$ 60 | $$\gdef \mH {\green{\matr{H }}} $$ 61 | 62 | % Coloured math 63 | $$\gdef \cx {\pink{x}} $$ 64 | $$\gdef \ctheta {\orange{\theta}} $$ 65 | $$\gdef \cz {\orange{z}} $$ 66 | $$\gdef \Enc {\lavender{\text{Enc}}} $$ 67 | $$\gdef \Dec {\aqua{\text{Dec}}}$$ 68 | 69 | 70 |
71 | 72 | {% if page.lecturer %} 73 | 🎙️ {{page.lecturer}} 74 | {% endif %} 75 | 76 | {{ content }} 77 | 78 | 79 | {% if page.authors or page.date or page.translator %} 80 |
81 | {% endif %} 82 | 83 | {% if page.authors %} 84 | 📝 {{ page.authors }} 85 | {% endif %} 86 | 87 | {% if page.translator %} 88 | {% if page.lang %} 89 | {% assign thislang = page.lang %} 90 | {% else %} 91 | {% assign thislang = "en" %} 92 | {% endif %} 93 |
94 | {{ page.translator }} 95 | {% endif %} 96 | 97 | {% if page.date %} 98 |
99 | {{ page.date }} 100 | {% endif %} 101 | -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | ../jekyllbook/_layouts/default.html -------------------------------------------------------------------------------- /docs/en/faq.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Foreword, FAQ and disclaimer 3 | author: Loïck Bourdois 4 | date: 07 Jul 2021 5 | lang-ref: faq 6 | --- 7 | 8 | 9 | # Foreword 10 | 11 | This course concerns the latest techniques in deep learning and representation learning, focusing on supervised and unsupervised deep learning, embedding methods, metric learning, convolutional and recurrent nets, with applications to computer vision, natural language understanding, and speech recognition. 12 | The prerequisites include: [DS-GA 1001 Intro to Data Science](https://cds.nyu.edu/academics/ms-curriculum/) or a graduate-level machine learning course. 13 | 14 | We invite you to prefer the videos on the [YouTube channel](https://www.youtube.com/playlist?list=PLLHTzKZzVU9e6xUfG10TkTWApKSZCzuBI) ("official" content) since the course is given by the teaching staff, unlike the website where it is the notes taken by the students during the course. 15 | The website is summaries of the videos, so the videos usually include additional information compared to the website. For example: 16 | - anecdotes about the different concepts discussed, 17 | - jokes, 18 | - the repetition of the same concept but in the form of different formulations, thus generally making it possible to understand an idea if a first formulation is not understood, 19 | - the students' questions, which can be the ones you have yourself during the viewing, 20 | If concepts are still not understood at the end of the video, you have the possibility to ask a question in the commentary of the YouTube video, which the website does not allow. 21 | - the references of the articles on which the course is based are present on the slides of the videos whereas they are absent from the website. 22 | 23 | The website thus serves more as a summary of the videos or as a basis for your personal notes that you take while watching the videos. 24 | Note that you can easily switch from the site to a moment of a given video by clicking on the paragraph titles of the web pages. 25 | 26 | 27 | # FAQ 28 | 29 | Here are some answers to frequently asked questions: 30 | - **Does taking this course lead to certification?** 31 | > No, it does not. In order to offer a certification, we would have to be able to evaluate you, but the content has not been designed for this (unlike a MOOC for example). As this is a frequent request, we are thinking about proposing a certification for future editions of the course. 32 | - **How much time should I spend on this course?** 33 | > For each week, there is approximately 2h30/3h of video content. With the time dedicated to note taking and playing with the notebooks, a total estimate of 5 hours per week seems reasonable. For the rest, it depends on the level of immersion you want to achieve in a given topic (reading the referenced articles, applying what was seen in class to your own projects, etc.). 34 | - **Where to ask a question after watching a video?** 35 | > You can ask it directly in the comments section under the YouTube video in question, and Alfredo will be happy to answer it. 
If the question is about a specific point in the video, please include the time stamp. 36 | > You can also do this on the class [Discord](https://discord.gg/CthuqsX8Pb) specifically for students. It is also used to coordinate viewing groups, discuss assignments, suggest improvements, or generally discuss any topic related to the course. 37 | - **Can I use this course?** 38 | > Of course, the course is under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-nc-sa/4.0/). 39 | > This means that: 40 | > - You may not use the material for commercial purposes. 41 | > - You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. 42 | > - If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original. 43 | > 44 | > For credit, you can use the following BibTeX: 45 | > ```bibtex 46 | > @misc{canziani2020nyudlsp21, 47 | > author = {Canziani, Alfredo and LeCun, Yann}, 48 | > title = {{NYU Deep Learning, Spring 2021}}, 49 | > howpublished = "\url{https://atcold.github.io/NYU-DLSP21}", 50 | > year = {2021}, 51 | > note = "[Online; accessed ]" 52 | > } 53 | > ``` 54 | 55 | 56 | 57 | 58 | # Disclaimer 59 | 60 | All other texts found on this site are lecture notes taken by students of the New York University during lectures given by Yann Le Cun, Alfredo Canziani, Ishan Misra, Awni Hannun and Marc'Aurelio Ranzato. 61 | Thus the texts in English were written by several people, which has an impact on the homogeneity of the texts (some write in the past tense, others in the present tense; the abbreviations used are not always the same; some write short sentences, while others write sentences of up to 5 or 6 lines, etc.). 62 | It is possible that there may be some omissions: typing errors, spelling mistakes, etc. 63 | If you notice any, we invite you to submit a PR on the [GitHub directory of the site](https://github.com/Atcold/NYU-DLSP21/pulls) specifying with an `[EN]` that it concerns the English translation. 64 | 65 | Wishing you a deep reading ! 66 | -------------------------------------------------------------------------------- /docs/en/week01/01.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.01 3 | title: Week 1 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Some history can be found [here](https://atcold.github.io/NYU-DLSP20/en/week01/01-1/), while gradient descent can be found [here](https://atcold.github.io/NYU-DLSP20/en/week02/02-1/). 10 | 11 | 12 | ## Practicum 13 | 14 | This plus the next practicum's summary can be found [here](https://atcold.github.io/NYU-DLSP20/en/week01/01-3/). 15 | -------------------------------------------------------------------------------- /docs/en/week02/02-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.02-3 3 | title: Problem Motivation, Linear Algebra, and Visualization 4 | lecturer: Alfredo Canziani 5 | authors: Rajashekar Vasantha 6 | date: 04 Feb 2021 7 | --- 8 | 9 | 10 | ## Resources 11 | 12 | Please follow Alfredo Canziani [on Twitter @alfcnz](https://twitter.com/alfcnz). 
Videos and textbooks with relevant details on linear algebra and singular value decomposition (SVD) can be found by searching Alfredo's Twitter, for example type `linear algebra (from:alfcnz)` in the search box. 13 | 14 | 15 | ## [Neural Nets: Rotation and Squashing](https://youtu.be/0TdAmZUMj2k) 16 | A traditional neural network is an alternating collection of two blocks - the linear blocks and the non-linear blocks. Given below is a block diagram of a traditional neural network. 17 |
18 |
19 |
20 | 21 | Figure 1: Block Diagram of a Traditional Neural Network 22 |
23 |
24 | The linear blocks (Rotations, for simplicity) are given by: 25 | 26 | $$ 27 | \vect{s}_{k+1} = \mW_k z_k 28 | $$ 29 | 30 | And the non-linear blocks (Squashing functions for intuitive understanding) are given by: 31 | 32 | $$ \vect{z}_k = h(\vect{s}_k) $$ 33 | 34 | In the above diagram and equations, $$\vx \in \mathbb{R}^n$$ represents the input vector. $$\mW_k \in \mathbb{R}^{n_{k} \times n_{k-1}}$$ represents the matrix of an affine transformation corresponding to the $$k^{\text{th}}$$ block and is described below in further detail. The function $h$ is called the activation function and this function forms the non-linear block of the neural network. Sigmoid, ReLu and tanh are some of the common activation functions and we will look at them in the later parts of this section. After alternate applications of linear and non-linear blocks, the above network produces an output vector $$\vect{s}_k \in \mathbb{R}^{n_{k-1}}$$. 35 | 36 | Let us first have a look at the linear block to gain some intuition on affine transformations. As a motivating example, let us consider image classification. Suppose we take a picture with a 1 megapixel camera. This image will have about 1,000 pixels vertically and 1,000 pixels horizontally, and each pixel will have three colour dimensions for red, green, and blue (RGB). Each particular image can then be considered as one point in a 3 million-dimensional space. With such massive dimensionality, many interesting images we might want to classify -- such as a dog *vs.* a cat -- will essentially be in the same region of the space. 37 | 38 | In order to effectively separate these images, we consider ways of transforming the data in order to move the points. Recall that in 2-D space, a linear transformation is the same as matrix multiplication. For example, the following are transformations, which can be obtained by changing matrix characteristics: 39 | 40 | - Rotation (when the matrix is orthonormal). 41 | - Scaling (when the matrix is diagonal). 42 | - Reflection (when the determinant is negative). 43 | - Shearing. 44 | - Translation. 45 | 46 | Note that translation alone is not linear since 0 will not always be mapped to 0, but it is an affine transformation. Returning to our image example, we can transform the data points by translating such that the points are clustered around 0 and scaling with a diagonal matrix such that we "zoom in" to that region. Finally, we can do classification by finding lines across the space which separate the different points into their respective classes. In other words, the idea is to use linear and nonlinear transformations to map the points into a space such that they are linearly separable. This idea will be made more concrete in the following sections. 47 | 48 | In the next part, we visualize how a neural network separates points and a few linear and non-linear transformations. This can be accessed [here](https://atcold.github.io/NYU-DLSP20/en/week01/01-3/). 49 | -------------------------------------------------------------------------------- /docs/en/week02/02.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.02 3 | title: Week 2 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Similar to [this](https://atcold.github.io/NYU-DLSP20/en/week11/11-1/), [this](https://atcold.github.io/NYU-DLSP20/en/week11/11-2/), and possibly more. 10 | 11 | 12 | ## Practicum 13 | 14 | We discuss the motivation for applying transformations to data points visualized in space. 
We talk about Linear Algebra and the application of linear and non-linear transformations. We discuss the use of visualization to understand the function and effects of these transformations. We walk through examples in a Jupyter Notebook and conclude with a discussion of functions represented by neural networks. 15 | -------------------------------------------------------------------------------- /docs/en/week03/03-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.03-3 3 | title: Spiral classification 4 | lecturer: Alfredo Canziani 5 | authors: Wenhao Li 6 | date: 6 May 2021 7 | --- 8 | 9 | ## [Typora](https://typora.io/) 10 | Typora is a useful tool to write markdown with the addition of formulae in LaTeX. It is convenient to write paper and homework, and generating pdf file with Typora. 11 | 12 | ## [Notion](https://www.notion.so/) 13 |
14 |
15 |
16 | Here you can place all your favorite stuff. This includes but is not limited to recipes, music, books, notes. Everything in one place, simple and powerful. 17 | 18 | When you find some useful article regarding Deep Learning, you may want to collect it for future review. The database is just all you need. You can find [more information](https://www.notion.so/Intro-to-databases-fd8cd2d212f74c50954c11086d85997e) about how to use the database. 19 | 20 | First you need to create a database by "Workspace" -> "Add a new page". Inside this page, choose "/table" -> "Table - Full Page". In addition to filling out the information related to the paper, we usually want to cover "The Golden Circle" aka "What? Why? How?" in our summary. 21 | 22 | This is an [example](https://www.notion.so/When-to-use-parametric-models-in-reinforcement-learning-d4c5e586677e49338a41b663231c0633) of how to organize your summary. 23 | 24 | 25 | 26 | 27 | ## [Diagram.net](https://app.diagrams.net/) 28 | 29 | Diagrams.net is a great tool to draw neural network diagrams. Next we will introduce a few rules to make our diagrams more consistent with the ones in lecture. 30 | 31 | 32 | 33 |
34 |
35 |
36 | 37 | The grayscale background means this is an observation, which means they are data points from a given dataset. You can check the input and labels by going to the directory of the dataset if you want. 38 | 39 |
40 |
41 |
42 | 43 | We use "Delay" to denote the encoder(e.g., neural network). 44 | 45 | 46 |
47 |
48 |
49 | 50 | In this example, $\vx$ and $\vy$ are observations. 51 | 52 | In the half above, we feed the $\vx$ to a given encoder to get a prediction $\bar {\vy}$. This is called forward propagation. 53 | 54 | In the half below, we want to get the prediction $\bar{\vx}$ given observation $\vy$. We keep doing gradient descent to make the network output as close as to $\vy$. This is called amortizing inference. 55 | 56 | Usually, we use backpropagation to compute the gradient, then we apply gradient descent with those computed values to train the model. This example shows that backpropagation is NOT only used during training. Backpropagation can also be used for inference. 57 | 58 | 59 | 60 | ## Spiral Classification 61 | The following content is mostly the same, so [here](https://atcold.github.io/NYU-DLSP20/en/week02/02-3/) you can find what you need. 62 | -------------------------------------------------------------------------------- /docs/en/week03/03.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.03 3 | title: Week 3 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Parts can be found [here](https://atcold.github.io/NYU-DLSP20/en/week03/03-1/) and part [here](https://atcold.github.io/NYU-DLSP20/en/week06/06-2/). 10 | 11 | 12 | ## Practicum 13 | 14 | We introduced how to draw deep network schematics conveniently using diagrams.net. Then we showed the different effect of using only linear transformation, and the effect of combining linear and non-linear transformation together on spiral classification. Finally, we showed the mathematical principles underlying neural networks, including chain rule derivation, back propagation, and gradient descent. 15 | -------------------------------------------------------------------------------- /docs/en/week04/04.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.04 3 | title: Week 4 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Similar to [last year's edition](https://atcold.github.io/NYU-DLSP20/en/week06/06-1/). 10 | 11 | ## Practicum A & B 12 | 13 | Similar to last year's edition of [CNN](https://atcold.github.io/NYU-DLSP20/en/week03/03-3/) and [RNN](https://atcold.github.io/NYU-DLSP20/en/week06/06-3/). 14 | -------------------------------------------------------------------------------- /docs/en/week05/05.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.05 3 | title: Week 5 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Similar to [last year's](https://atcold.github.io/NYU-DLSP20/en/week07/07-1/) but different. 10 | 11 | ## Practicum 12 | 13 | Same as [last year](https://atcold.github.io/NYU-DLSP20/en/week15/15-1/). 14 | -------------------------------------------------------------------------------- /docs/en/week06/06.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.06 3 | title: Week 6 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Similar to [this](https://atcold.github.io/NYU-DLSP20/en/week14/14-1/) and [this](https://atcold.github.io/NYU-DLSP20/en/week14/14-2/). 10 | 11 | ## Practicum 12 | 13 | Same as [last year](https://atcold.github.io/NYU-DLSP20/en/week15/15-2/). 
14 | -------------------------------------------------------------------------------- /docs/en/week07/07-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.07-3 3 | title: Introduction to Autoencoders 4 | lecturer: Alfredo Canziani 5 | authors: Vidit Bhargava, Monika Dagar 6 | date: 18 March 2021 7 | --- 8 | ## Applications of Autoencoder 9 | 10 | 11 | ### DALL-E: Creating Images from Text 12 | 13 | DALL-E (released by OpenAI) is a neural network based on the Transformers architecture, that creates images from text captions. It is a 12-billion parameter version of GPT-3, trained on a dataset of text-image pairs. 14 | 15 |
16 |
17 | Figure 1: DALL-E: Input-Output 18 |
19 | 20 | Go to the [website](https://openai.com/blog/dall-e/) and play with the captions! 21 | 22 | 23 | ## Autoencoder 24 | Let's start with some definitions: 25 | 26 | 27 | ### Definitions 28 | 29 | 30 | #### Input 31 | 32 | $\vx$: is observed during both training and testing 33 | 34 | $\vy$: is observed during training but not testing 35 | 36 | $\vz$: is not observed (neither during training nor during testing). 37 | 38 | 39 | #### Output 40 | 41 | $\vh$: is computed from the input (hidden/internal) 42 | 43 | $\vytilde$: is computed from the hidden (predicted $\vy$, ~ means *circa*) 44 | 45 | Confused? 46 | Refer to the below figure to understand the use of different variables in different machine learning techniques. 47 | 48 |
49 |
50 | Figure 2: Variable definitions in different machine learning techniques 51 |
52 | 53 | 54 | ### Introduction 55 | 56 | These kinds of networks are used to learn the internal structure of some input and encode it in a hidden internal representation $\vh$, which expresses the input. 57 | 58 | We already learned how to train energy-based models, let's look at the below network: 59 | 60 |
61 |
62 | Figure 3: Autoencoder Architecture 63 |
64 | 65 | Here, instead of computing the minimization of the energy $\red{E}$ for $\vz$, we use an encoder that approximates the minimization and provides a hidden representation $\vh$ for a given $\vy$. 66 | 67 | $$ 68 | \vh = \Enc(\vy) 69 | $$ 70 | 71 | Then the hidden representation is converted into $\vytilde$ (here we don't have a predictor, we have an encoder). 72 | 73 | $$ 74 | \vytilde= \Dec (\vh) 75 | $$ 76 | 77 | Basically, $\vh$ is the output of a squashing function $f$ applied to a rotation of our input/observation $\vy$, and $\vytilde$ is the output of a squashing function $g$ applied to a rotation of our hidden representation $\vh$. 78 | 79 | $$ 80 | \vh = f(\mW_h \vy + \vb_h) \\ 81 | \vytilde = g(\mW_y \vh + \vb_y) 82 | $$ 83 | 84 | Note that here $\vy$ and $\vytilde$ both belong to the same input space, while $\vh$ belongs to $\mathbb{R}^d$, the internal representation space. $\mW_h$ and $\mW_y$ are the rotation matrices. 85 | 86 | $$ 87 | \vy, \vytilde \in \mathbb{R}^n \\ 88 | \vh \in \mathbb{R}^d \\ 89 | \mW_h \in \mathbb{R}^{d \times n} \\ 90 | \mW_y \in \mathbb{R}^{n \times d} 91 | $$ 92 | 93 | This is called an autoencoder. The encoder performs amortized inference, so we no longer minimize the energy $\red{E}$ but rather $\red{F}$: 94 | 95 | $$ 96 | \red{F}(\vy) = \red{C}(\vy,\vytilde) + \red{R}(\vh) 97 | $$ 98 | 99 | 100 | ### Reconstruction Costs 101 | 102 | Below are two examples of reconstruction energies: 103 | 104 | 105 | #### Real-Valued Input: 106 | 107 | $$ 108 | \red{C}(\vy,\vytilde) = \Vert{\vy-\vytilde}\Vert^2 = \Vert \vy-\Dec[\Enc(\vy)] \Vert^2 109 | $$ 110 | 111 | This is the squared Euclidean distance between $\vy$ and $\vytilde$. 112 | 113 | 114 | #### Binary input 115 | 116 | In the case of binary input, we can simply use binary cross-entropy: 117 | 118 | $$ 119 | \red{C}(\vy,\vytilde) = - \sum_{i=1}^n{\vy{_i}\log(\vytilde{_i}) + (1-\vy{_i})\log(1-\vytilde{_i})} 120 | $$ 121 | 122 | 123 | ### Loss Functionals 124 | 125 | The loss functional is the average, across all training samples, of the per-sample loss function: 126 | 127 | $$ 128 | \mathcal{L}(\red{F}(\cdot),\mY) = \frac{1}{m}\sum_{j=1}^m{\ell(\red{F}(\cdot),\vy^{(j)})} \in \mathbb{R} 129 | $$ 130 | 131 | We take the energy loss and try to push the energy down on $\vytilde$: 132 | 133 | $$ 134 | \ell_{\text{energy}}(\red{F}(\cdot),\vy) = \red{F}(\vy) 135 | $$ 136 | 137 | 138 | ### Use-cases 139 | 140 | The size of the hidden representation $\vh$ obtained with these networks can be either smaller or larger than the input size. 141 | 142 | If we choose a smaller $\vh$, the network can be used for non-linear dimensionality reduction. 143 | 144 | In some situations it can be useful to have a $\vh$ that is larger than the input; however, in this scenario, a plain autoencoder would collapse. In other words, since we are trying to reconstruct the input, the model is prone to copying all the input features into the hidden layer and passing them along as the output, thus essentially behaving as an identity function. This needs to be avoided, as it would imply that our model fails to learn anything. 145 | 146 | To prevent the model from collapsing, we have to employ techniques that constrain the volume of the region that can take zero or low energy values. These techniques can be some form of regularization, such as sparsity constraints, adding noise, or sampling.
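For concreteness, below is a minimal PyTorch sketch of the autoencoder described above; the sizes $n$ and $d$, the choice of squashing functions, and the random batch are stand-ins chosen for illustration, not values prescribed by the lecture.

```python
import torch
from torch import nn

n, d = 784, 30                   # assumed input and hidden dimensions

# h = f(W_h y + b_h)  and  y_tilde = g(W_y h + b_y)
encoder = nn.Sequential(nn.Linear(n, d), nn.Tanh())
decoder = nn.Sequential(nn.Linear(d, n), nn.Sigmoid())

criterion = nn.MSELoss()         # squared Euclidean reconstruction cost C(y, y_tilde)
optimiser = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3
)

y = torch.rand(64, n)            # a random batch standing in for real observations

h = encoder(y)                   # hidden representation
y_tilde = decoder(h)             # reconstruction
loss = criterion(y_tilde, y)

optimiser.zero_grad()
loss.backward()
optimiser.step()
```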
147 | 148 | 149 | ### Denoising autoencoder 150 | 151 | We add some augmentation/corruption like Gaussian noise to an input sampled from the training manifold $\vyhat$ before feeding it into the model and expect the reconstructed input $\vytilde$ to be similar to the original input $\vy$. 152 | 153 |
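A minimal sketch of this corruption step is given below; the noise level $\sigma$ and the encoder/decoder sizes are assumptions made for the example.

```python
import torch
from torch import nn

n, d, sigma = 784, 30, 0.3                  # assumed sizes and noise level
encoder = nn.Sequential(nn.Linear(n, d), nn.Tanh())
decoder = nn.Sequential(nn.Linear(d, n), nn.Sigmoid())

y = torch.rand(16, n)                       # clean samples from the training manifold
y_hat = y + sigma * torch.randn_like(y)     # corrupted input fed to the model
y_tilde = decoder(encoder(y_hat))           # reconstruction
loss = ((y_tilde - y) ** 2).mean()          # compared against the *clean* target y
```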
154 |
155 | Figure 4: Denoising Autoencoder Network architecture. 156 |
157 | 158 | An important note: The noise added to the original input should be similar to what we expect in reality, so the model can easily recover from it. 159 | 160 |
161 |
162 | Figure 5: Measuring the traveling distance of the input data 163 |
164 | 165 | In the image above, the light colour points on the spiral represent the original data manifold. As we add noise, we go farther from the original points. These noise-added points are fed into the auto-encoder to generate this graph. 166 | The direction of each arrow points to the original datapoint the model pushes the noise-added point towards; whereas the size of the arrow shows by how much. 167 | We also see a dark purple spiral region which exists because the points in this region are equidistant from two points on the data manifold. 168 | 169 | -------------------------------------------------------------------------------- /docs/en/week07/07.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.07 3 | title: Week 7 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum 14 | We started with an application of autoencoders: DALL-E. We discussed Autoencoders (in terms of Energy-Based Models) and their use cases. Next, we discussed the reconstruction costs and the loss functions we should use. Finally, we discussed a particular type of autoencoder, i.e., denoising autoencoder. 15 | -------------------------------------------------------------------------------- /docs/en/week08/08.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.08 3 | title: Week 8 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum 14 | In this section, we introduced some Generative Models including Denoising AE, Contractive AE and Variational AE. We compared the functionalities and advantages of Variational AEs over Basic Autoencoders. We explored the objective function of VAE in detail, understanding how it enforced some structure in the latent space. -------------------------------------------------------------------------------- /docs/en/week09/09.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.09 3 | title: Week 9 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum 14 | In this section, we covered the implementation of *Generative models* viz. **Undercomplete Autoencoder**, **Denoising Autoencoders**, **Variational Autoencoders** and **Generative Adversarial Networks**. We analyze these models from the perspective of the framework of Energy Based Models (EBM). In doing so, we realize that these generative models can be considered as extensions of EBMs and differ from each other with subtle architectural adjustments. -------------------------------------------------------------------------------- /docs/en/week10/10-2.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.10-2 3 | title: SEER, AVID + CMA, Distillation, Barlow Twins 4 | lecturer: Ishan Misra 5 | authors: Duc Anh Phi, Krishna Karthik Reddy Jonnala 6 | date: 17 May 2021 7 | --- 8 | 9 | ## SEER: Learning from uncharted Images 10 | Compared to Imagenet dataset, real world images may have different distributions (cartoons, memes) and may or may not have a prominent object. In order to verify if the models work well on images outside of Imagenet dataset we decided to test *Swav* method on large scale data. SEER is *Swav* method tested on billions of unfiltered images. 11 | 12 | Following graph compares the fine tune performance of the four models when transferred to Imagenet. 
Using SEER method, a model can be trained with more than a billion parameters which are going to transfer really well to Imagenet. 13 |
14 |
15 | Figure 1 Comparing SEER to other methods on ImageNet data 16 |
17 | 18 | As shown in the following table, the performance of SEER is comparable to the networks trained on curated data with weak supervision. 19 |
20 |
21 | Figure 2 SEER performance vs weak supervision model 22 |
23 | 24 | ## AVID + CMA 25 | Audio Visual Instance Discrimination with Cross Modal Agreement is a method that combines *contrastive learning* and *clustering* techniques. 26 | 27 | For contrastive learning on an audio-video dataset, when the (audio, video) inputs are passed to the two encoders ($f_a, f_v$) we get two embeddings, one for the audio and one for the video. Embeddings from the same sample should be close in feature space compared to embeddings from different samples. 28 | 29 |
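A minimal sketch of such a cross-modal contrastive objective is given below: an InfoNCE-style loss applied in both directions, where the batch size, embedding dimension, and temperature are assumptions made for illustration.

```python
import torch
import torch.nn.functional as F

B, D, tau = 32, 128, 0.07                       # assumed batch size, embedding dim, temperature
z_a = F.normalize(torch.randn(B, D), dim=1)     # audio embeddings, standing in for f_a(audio)
z_v = F.normalize(torch.randn(B, D), dim=1)     # video embeddings, standing in for f_v(video)

logits = z_a @ z_v.t() / tau                    # similarity of every audio to every video clip
targets = torch.arange(B)                       # the matching pair sits on the diagonal
loss = (F.cross_entropy(logits, targets)        # audio -> video direction
        + F.cross_entropy(logits.t(), targets)) / 2  # video -> audio direction
```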
30 |
31 | Figure 3 AVID: Audio Video Instance Discrimination 32 |
33 | 34 | To introduce the *clustering*, the notion of the positives and negatives is expanded as shown in the following image. Computing the similarities in the video and audio embeddings from a reference point to all the other samples results in *Positive Set* and *Negative Set*. A sample falls into positive set when both its audio and video embeddings are similar to the reference embeddings. 35 |
36 |
37 | Figure 4 CMA: Cross-Modal Agreements 38 |
39 | 40 | ## Distillation 41 | Distillation methods are similarity-maximization-based methods. Like other SSL methods, distillation tries to prevent trivial solutions. It does so through asymmetry, in two different ways: 42 | * Asymmetric *learning rule* between student and teacher 43 | * Asymmetric *architecture* between student and teacher 44 | 45 | $$ f_{\vtheta}^{\text{student}}(I) = f_{\vtheta}^{\text{teacher}}(\text{augment}(I))$$ 46 | 47 | ### BYOL 48 | BYOL is a distillation technique whose architecture is shown below. 49 |
50 |
51 | Figure 5 BYOL architecture 52 |
53 | 54 | There is an asymmetry in architecture between student and teacher, as the student has an additional prediction head. Gradient backpropagation only happens through the student encoder, clearly creating an asymmetry in the learning rule. In BYOL there is an additional source of asymmetry in the weights of the student and teacher encoders: the teacher encoder is created as a moving average of the student encoder. These asymmetries prevent the model from collapsing to trivial solutions. 55 | 56 | ### SimSiam 57 | Recent studies showed that not all three sources of asymmetry discussed in BYOL are needed to prevent trivial solutions. In the *SimSiam* architecture the student and teacher share the same set of weights, and there are two sources of asymmetry: 58 | * In the architecture, the student encoder has an additional predictor head. 59 | * In the learning rule, the backpropagated gradients are passed only through the student encoder, not the teacher encoder. After each epoch, the weights of the student encoder are copied to the teacher encoder. 60 | 61 |
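Below is a minimal sketch of one SimSiam-style update; the tiny linear backbone and predictor are placeholders, and the essential ingredients are the shared weights and the stop-gradient on the branch that plays the teacher.

```python
import torch
from torch import nn
import torch.nn.functional as F

D = 128                                            # assumed embedding size
backbone  = nn.Sequential(nn.Linear(784, D))       # shared encoder (stand-in for a ConvNet)
predictor = nn.Sequential(nn.Linear(D, D))         # extra head on the student branch only

x1, x2 = torch.rand(32, 784), torch.rand(32, 784)  # two augmented views of the same images

def neg_cosine(p, z):
    # negative cosine similarity; z.detach() is the stop-gradient (teacher side)
    return -F.cosine_similarity(p, z.detach(), dim=1).mean()

z1, z2 = backbone(x1), backbone(x2)
p1, p2 = predictor(z1), predictor(z2)
loss = 0.5 * (neg_cosine(p1, z2) + neg_cosine(p2, z1))  # symmetrised loss
loss.backward()
```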
62 |
63 | Figure 6 SimSiam architecture 64 |
65 | 66 | ## Barlow Twins 67 | 68 | ### Hypothesis from information theory 69 | The efficient coding hypothesis was proposed by Horace Barlow in 1961 as a theoretical model of sensory coding in the brain. Within the brain, neurons communicate with each other by sending electrical impulses called spikes. Barlow hypothesised that the spikes in the sensory system form a neural code for efficiently representing sensory information. By efficient, Barlow meant that the code minimises the number of spikes needed to transmit a given signal. 70 | 71 | ### Implementation 72 | A successful approach to Self-Supervised-Learning (SSL) is to learn representations which are invariant to distortions of the input sample. However, a recurring problem with this approach is the existence of trivial constant solutions. 73 | 74 | The Barlow Twins method proposes an objective function that naturally avoids such collapse by measuring the cross-correlation matrix between the outputs of two identical networks fed with distorted versions of a sample and making them as close as possible to the identity matrix. 75 | 76 | Barlow's redundancy-reduction principle applied to a pair of identical networks. The objective function measures the cross-correlation matrix between the output features of two identical networks fed with distorted versions of a batch of samples and attempts to bring this matrix close to the identity. This causes the representation vectors of distorted versions of a sample to be similar, while minimizing the redundancy between the components of these vectors (Figure 7). 77 | 78 |
79 |
80 | Figure 7 Barlow-Twins Architecture 81 |
82 | 83 | More formally, it produces two distorted views for all images of a batch $X$. The distorted views are obtained via a distribution of data augmentations $\mathcal{T}$. The two batches of distorted views $Y^A$ and $Y^B$ are then fed to a function $f_{\vtheta}$, typically a deep network with trainable parameters $\vtheta$, producing batches of representations $Z^{A}$ and $Z^{B}$ respectively. 84 | 85 | The loss function $\mathcal{L_{BT}}$ contains a invariance and redundancy reduction: 86 | 87 | $$ 88 | \mathcal{L_{BT}} \triangleq \underbrace{\sum_i (1-\mathcal{C}_{ii})^2}_\text{invariance term} + ~~\lambda \underbrace{\sum_{i}\sum_{j \neq i} {\mathcal{C}_{ij}}^2}_\text{redundancy reduction term} 89 | $$ 90 | 91 | where $\lambda$ is a constant controlling the importance of the first and second terms of the loss, and where $\mathcal{C}$ is the cross-correlation matrix computed between the outputs of the two identical networks along the batch dimension: 92 | 93 | $$ 94 | \mathcal{C}_{ij} \triangleq \frac{ 95 | \sum_b z^A_{b,i} z^B_{b,j}} 96 | {\sqrt{\sum_b {(z^A_{b,i})}^2} \sqrt{\sum_b {(z^B_{b,j})}^2}} 97 | $$ 98 | 99 | where $b$ indexes batch samples and $i,j$ index the vector dimension of the networks' outputs. $\mathcal{C}$ is a square matrix with size the dimensionality of the network's output. In other words 100 | 101 | Intuitively, the invariance term of the objective, by trying to equate the diagonal elements of the cross-correlation matrix to 1, makes the representation invariant to the distortions applied. The redundancy reduction term, by trying to equate the off-diagonal elements of the cross-correlation matrix to 0, decorrelates the different vector components of the representation. This decorrelation reduces the redundancy between output units, so that the output units contain non-redundant information about the sample. 102 | -------------------------------------------------------------------------------- /docs/en/week10/10-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.10-3 3 | lecturer: Alfredo Canziani 4 | title: Transformer Encoder-predictor-decoder architecture 5 | authors: Rahul Ahuja, jingshuai jiang 6 | date: 15 Apr 2021 7 | --- 8 | 9 | 10 | ## The Transformer 11 | 12 | Before elaborating the encoder-predictor-decoder architecture, we are going to review two models we've seen before. 13 | 14 | 15 | ### Conditional EBM latent variable architecture 16 | 17 | 18 | We should be familiar with the terminology of these modules from the previous lectures. 19 | In the conditional EBM latent variable architecture, we have $x$ the conditional variable which goes into a predictor. We have $\vy$ which is the target value. The decoder modules will produce $\vytilde$ when fed with a latent variable $z$ and the output of the predictor. $\red{E}$ is the energy function which minimizes the energy between $\vytilde$ and $\vy$. 20 | 21 | 22 |
23 |
24 | Figure 1: (From the EBM lecture) Diagram above depicting the architecture of a conditional EBM latent variable model. 25 |
26 | 27 | ### Autoencoder architecture 28 | 29 | In the autoencoder architecture, we observed that there is no conditional input, but only a target variable. The entire architecture is trying to learn the structure in these target variables. The target value $\vy$ is fed through an encoder module, which transforms it into a hidden representation space, forcing only the most important information through. The decoder then maps these hidden variables back to the original target space, producing $\vytilde$, and the cost function tries to minimize the distance between $\vytilde$ and $\vy$. 30 | 31 | 32 | 33 |
34 |
35 | Figure 2: (From the autoencoder lecture) Architecture of a basic Autoencoder consisting of encoder and decoder modules. 36 |
37 | 38 | 39 | 40 | ### Encoder-predictor-decoder architecture 41 | 42 |
43 |
44 | Figure 3: The transformer architecture with a unit delay module. 45 |
46 | 47 | 48 | In a transformer, $\vy$ (the target sentence) is a discrete-time signal: it has a discrete representation over a time index. The $\vy$ is fed into a unit delay module followed by an encoder. The unit delay here transforms $\vy[j] \mapsto \vy[j-1]$. The only difference with the autoencoder is this delayed variable, and it is what lets us use this structure as a language model, producing the future when given the past. 49 | 50 | 51 | 52 |
53 |
54 | Figure 4: A unit delay module transforms $\vy[j] \mapsto \vy[j-1]$ 55 |
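In code, the unit delay is just a one-step shift of the target sequence, with a start-of-sequence token filling the first position. A minimal sketch, assuming `y` is a batch of token indices of shape `(batch, time)` and `sos` is a hypothetical start-of-sequence index:

```python
import torch

def unit_delay(y: torch.Tensor, sos: int = 0) -> torch.Tensor:
    """Map y[j] -> y[j-1]: prepend a start token and drop the last token."""
    start = torch.full((y.size(0), 1), sos, dtype=y.dtype, device=y.device)
    return torch.cat([start, y[:, :-1]], dim=1)
```

This delayed copy of $\vy$ is what the target-side encoder consumes, so position $j$ of the predictor only sees targets up to $j-1$.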
56 | 57 | The observed signal, $\vx$ (the source sentence), is also fed through an encoder. The outputs of both the encoder and the delayed encoder are fed into the predictor, which gives a hidden representation $\vh$. This is very similar to a denoising autoencoder, as the delay module acts as the noise in this case. And $\vx$ here makes this entire architecture a conditional delayed denoising autoencoder. 58 | 59 | ### Encoder module 60 | You can see the detailed explanation of these modules from last year's slides [here](https://atcold.github.io/NYU-DLSP20/en/week12/12-3/). 61 | 62 | 63 | ### Predictor Module 64 | 65 | The transformer predictor module follows a procedure similar to the encoder's. However, there is one additional sub-block (i.e. cross-attention) to take into account. Additionally, the output of the encoder modules acts as the input to this module. 66 | 67 | 68 |
69 |
70 | Figure 5: The predictor module consisting of a cross attention block 71 |
72 | 73 | ### Cross attention 74 | You can see the detailed explanation of cross attention from last year's slides [cross-attention](https://atcold.github.io/NYU-DLSP20/en/week12/12-3/). 75 | 76 | 77 | ### Decoder module 78 | 79 | Contrary to what the authors of the Transformer paper define, the decoder module here consists of `1D-convolution` and `Add, Norm` blocks. The output of the predictor module is fed to the decoder module, and the output of the decoder module is the predicted sentence. We can train this by providing the delayed target sequence. 80 | 81 | 82 |
83 |
84 | Figure 6: The correct notation of the encoder, predictor and decoder modules in a transformer 85 |
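Putting the pieces together, the forward pass described in this section can be sketched as follows. This is a schematic illustration rather than the lecture's reference code: the layer sizes, the `sos` index, the use of `nn.TransformerEncoder` for both encoders, and the reduction of the predictor to a single cross-attention block are simplifying assumptions.

```python
import torch
from torch import nn

class EncoderPredictorDecoder(nn.Module):
    def __init__(self, vocab, d=256, heads=8, sos=0):
        super().__init__()
        self.sos = sos
        self.emb_x = nn.Embedding(vocab, d)
        self.emb_y = nn.Embedding(vocab, d)
        make_layer = lambda: nn.TransformerEncoderLayer(d, heads, batch_first=True)
        self.enc_x = nn.TransformerEncoder(make_layer(), num_layers=3)  # source encoder
        self.enc_y = nn.TransformerEncoder(make_layer(), num_layers=3)  # delayed-target encoder
        self.cross = nn.MultiheadAttention(d, heads, batch_first=True)  # predictor's cross-attention
        # Simplified decoder: the lecture's decoder uses 1D-convolution and Add, Norm blocks
        self.dec = nn.Sequential(nn.Conv1d(d, d, 1), nn.ReLU(), nn.Conv1d(d, vocab, 1))

    def forward(self, x, y):
        # Unit delay: y[j] -> y[j-1], with a start token in front
        start = torch.full((y.size(0), 1), self.sos, dtype=y.dtype, device=y.device)
        y_delayed = torch.cat([start, y[:, :-1]], dim=1)
        hx = self.enc_x(self.emb_x(x))           # encode the source sentence
        hy = self.enc_y(self.emb_y(y_delayed))   # encode the delayed target (causal mask omitted for brevity)
        h, _ = self.cross(query=hy, key=hx, value=hx)        # predictor: cross-attention over the source
        return self.dec(h.transpose(1, 2)).transpose(1, 2)   # per-position logits over the vocabulary
```

Training then amounts to comparing these logits with the (non-delayed) target $\vy$, exactly as in the denoising-autoencoder picture above.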
86 | -------------------------------------------------------------------------------- /docs/en/week10/10.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.10 3 | title: Week 10 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | A brief introduction to self-supervised learning and pretext tasks, and a discussion of the associated trivial solutions. Categorization of recent self-supervised methods: introduction to contrastive learning and the loss function used. Brief overviews of PIRL, SimCLR and MoCo, followed by SwAV, which is a clustering-based method. Pretraining on ImageNet and non-ImageNet data is also discussed towards the end. 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum 14 | We introduce attention, focusing on self-attention and its hidden-layer representations of the inputs. Then, we introduce the key-value store paradigm and discuss how to represent queries, keys, and values as rotations of an input. Finally, we use attention to interpret the transformer architecture, taking a forward pass through a basic transformer from an EBM perspective, and comparing the encoder-predictor-decoder paradigm to sequential architectures. -------------------------------------------------------------------------------- /docs/en/week11/11-1.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.11-1 3 | lecturer: Awni Hannun 4 | title: Speech Recognition and Graph Transformer Network I 5 | authors: Cal Peyser, Kevin Chang 6 | date: 14 Apr 2021 7 | --- 8 | 9 | ## Modern Speech Recognition 10 | 11 | This section is a high-level introduction to modern speech recognition: specifically, why it has become so good, and which problems remain. 12 | 13 | * Automatic speech recognition has greatly improved since 2012 14 | * Machine performance can be as good as or better than human-level performance 15 | * Speech recognition still struggles with 16 | * conversational speech 17 | * multiple speakers 18 | * lots of background noise 19 | * the accent of the speakers 20 | * certain features not well represented in the training data 21 | * Pre-2012 speech recognition systems consisted of many hand-engineered components 22 | * larger datasets were not useful, so datasets remained small 23 | * combining modules only at inference time instead of learning them together allowed errors to cascade 24 | * it was hard for researchers to know how to improve such complex systems 25 | 26 | * Post-2012 improvements to speech recognition systems 27 | * replaced a lot of the traditional components 28 | * added more data 29 | * the two above work together in a virtuous cycle 30 | 31 | 32 | ## The CTC Loss 33 | 34 | Given some input speech utterance $\mX$, which consists of $T$ frames of audio, we desire to produce a transcription $\mY$, and we'll think of our transcription as consisting of the letters of a sentence, so $y_1$ is the first letter and $y_U$ is the last letter. 35 | 36 | $$ 37 | \mX=[x_1,...,x_T],\ \mY=[y_1,...,y_U] 38 | $$ 39 | 40 | We compute the conditional probability (the score) to evaluate a transcription, and we want to maximize this probability. 41 | 42 | $$\log{P(\mY \mid \mX;\theta)}$$ 43 | 44 | 45 | ### Example 1 46 | 47 | $$ 48 | \mX=[x_1, x_2, x_3],\ \mY=[c,a,t] 49 | $$ 50 | 51 | $\mX$ has three frames and $\mY$ has three letters; the number of inputs matches the number of outputs, so it's easy to compute the probability with a one-to-one mapping.
52 | 53 | $$\log{P(c \mid x_1)} + \log{P(a \mid x_2)} + \log{P(t \mid x_3)}$$ 54 | 55 | 56 | ### Example 2 57 | 58 | $$ 59 | \mX=[x_1, x_2, x_3, x_4],\ \mY=[c,a,t] 60 | $$ 61 | 62 | * Alignment: three possible ways 63 | * $A_1$: $x_1\rightarrow c$, $x_2\rightarrow a$, $x_3\rightarrow t$, $x_4\rightarrow t$ 64 | * $A_2$: $x_1\rightarrow c$, $x_2\rightarrow a$, $x_3\rightarrow a$, $x_4\rightarrow t$ 65 | * $A_3$: $x_1\rightarrow c$, $x_2\rightarrow c$, $x_3\rightarrow a$, $x_4\rightarrow t$ 66 | 67 | * Which alignment should we use to compute the score? 68 | * All of them. We're going to try to increase the score of all alignments and then hope the model sorts things out internally. The model can decide to optimize these different alignments, weight them accordingly, and learn which one is the best. 69 | 70 | $$\log{P(\mY \mid \mX)}=\log{[P(A_1 \mid \mX)+P(A_2 \mid \mX)+P(A_3 \mid \mX)]}$$ 71 | 72 | **Reminder**: use the actual-softmax (log-sum-exp) to sum probabilities in the log domain. 73 | 74 | We want $\log{(P_1+P_2)}$ given only $\log{P_1}$ and $\log{P_2}$: 75 | 76 | $$ 77 | \begin{aligned} 78 | \text{actual-softmax}(\log{P_1}, \log{P_2}) 79 | &= \log{(P_1+P_2)} \\ 80 | &= \log{(e^{\log{P_1}}+e^{\log{P_2}})} 81 | \end{aligned} 82 | $$ 83 | 84 | ### Alignment graph 85 | 86 | An alignment graph is a way to encode the set of possible alignments for an arbitrary-length input. 87 | 88 |
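Before moving on to the alignment graph, note that the "actual-softmax" trick above is just `torch.logsumexp`; a quick sketch, where the three alignment scores are made-up numbers for illustration only:

```python
import torch

# Hypothetical alignment scores log P(A_i | X) for Example 2 (not real model outputs)
scores = torch.tensor([-2.1, -2.7, -3.0])
# "actual-softmax": log(P(A_1) + P(A_2) + P(A_3)), computed stably in the log domain
log_p_y_given_x = torch.logsumexp(scores, dim=0)
# Equivalent (but underflow-prone for long utterances) naive computation
assert torch.isclose(log_p_y_given_x, torch.log(scores.exp().sum()))
```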
89 |
90 | Figure 1: Alignment graph
91 |
92 |
93 | 94 | This graph is sometimes called a weighted finite-state acceptor (WFSA). The bold state marked 0 at the beginning is the start state, and the concentric circle marked 3 is an accepting state. Each edge carries a label and a weight, one on each side of a slash. Any path in this graph is an encoding of an alignment. 95 | 96 | 97 | ### Problem: too many alignments 98 | 99 | There's a problem when using all of the alignments. The input audio $\mX$ can have lots of frames — in practice, thousands of them. The transcription $\mY$ can have lots of letters — in practice, hundreds or more. This gives an astronomically large number of alignments, so we can't compute each individual score and sum them all. 100 | 101 | 102 | ### Solution: the forward algorithm (dynamic programming) 103 | 104 | Define the forward variable $\alpha_t^u$, where the subscript $t$ is where we are in the input and the superscript $u$ is where we are in the output. This represents the score for all alignments of length $t$ which end in the output $y_u$. 105 | 106 | Suppose $\mX=[x_1,x_2,x_3,x_4]$ and $\mY=[c,a,t]$. The forward variable $\alpha_2^c$ represents the score of all possible alignments of length two, over the first two frames, that end in $c$, the first output of the transcription. There's only one possible alignment for that: $x_1\rightarrow c$, $x_2\rightarrow c$. This is simple to compute. 107 | 108 | $$\alpha_2^c=\log{P(c \mid x_1)}+\log{P(c \mid x_2)}$$ 109 | 110 | Similarly, $\alpha_2^a$ has only one possibility. 111 | 112 | $$\alpha_2^a=\log{P(c \mid x_1)}+\log{P(a \mid x_2)}$$ 113 | 114 | For $\alpha_3^a$, there are two possible alignments: 115 | 116 | * $A_1$: $x_1\rightarrow c$, $x_2\rightarrow c$, $x_3\rightarrow a$ 117 | * $A_2$: $x_1\rightarrow c$, $x_2\rightarrow a$, $x_3\rightarrow a$ 118 | 119 | $$ 120 | \alpha_3^a=\text{actual-softmax}[\log{P(A_1)}, \log{P(A_2)}] \\ 121 | \log{P(A_1)}=\log{P(c \mid x_1)}+\log{P(c \mid x_2)}+\log{P(a \mid x_3)} \\ 122 | \log{P(A_2)}=\log{P(c \mid x_1)}+\log{P(a \mid x_2)}+\log{P(a \mid x_3)} 123 | $$ 124 | 125 | This is the naive approach to computing $\alpha_3^a$. 126 | 127 | Using this forward variable, we seek to model the probability distribution $P(\mY \mid \mX) = \sum_{a \in A} P(a \mid \mX)$, where $A$ is the set of all possible alignments from $\mY$ to $\mX$. This decomposes as 128 | 129 | $$P(\mY \mid \mX) = \sum_{a \in A} \prod_{t=1}^T P(a_t \mid \mX)$$ 130 | 131 | where the $P(a_t \mid \mX)$ are the per-frame output probabilities (computed from the logits) of a system such as an RNN. That is, to compute the likelihood of the transcript $\mY$ we must marginalize over an intractably large number of alignments. We may do this with a recursive decomposition of the forward variable. The presentation below is inspired by https://distill.pub/2017/ctc/, which is an excellent introduction to the algorithm. 132 | 133 | First, we permit an alignment to contain the empty output $\epsilon$ in order to account for the fact that audio sequences are longer than their corresponding transcripts. We also collapse repetitions, so that $\{a, \epsilon, a, a, \epsilon, a\}$ corresponds to the sequence $aaa$. We will also define $\alpha$ using an alternative transcript $Z$, which is equal to $\mY$ but is interspersed with $\epsilon$. That is, $Z = \{\epsilon, y_1, \epsilon, y_2, ..., y_n, \epsilon \}$. 134 | 135 | Now, suppose $y_i = y_{i+1}$, so that $Z$ contains a subsequence $y_i, \epsilon, y_{i+1}$, and suppose $y_{i+1}$ occurs at position $s$ in $Z$.
Then the alignment for $\alpha_{s,t}$ can be arrived at in one of two ways: either the prediction at time $t-1$ is $y_{i+1}$ (in which case the repetition is collapsed) or else the prediction at time $t-1$ is $\epsilon$. So, we may decompose: 136 | 137 | $$\alpha_{s,t} = (\alpha_{s, t-1} + \alpha_{s-1, t-1}) P(z_s \mid \mX)$$ 138 | 139 | where the elements of the sum represent the two possible prefixes to the alignment. If, on the other hand, we have $y_i \ne y_{i+1}$, then there is the additional third possibility that the prediction at time $t-1$ is equal to $y_i$. So, we have the decomposition 140 | 141 | $$\alpha_{s,t} = (\alpha_{s, t-1} + \alpha_{s-1, t-1} + \alpha_{s-2, t-1}) P(z_s \mid \mX)$$ 142 | 143 | By computing $\alpha_{\vert Z\vert, T}$, we may effectively marginalize over all possible alignments between the transcript $\mY$ and the audio $\mX$, allowing efficient training and inference. This is called Connectionist Temporal Classification, or CTC. 144 | 145 | 146 | -------------------------------------------------------------------------------- /docs/en/week11/11.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.11 3 | title: Week 11 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | We provide an introduction to the problem of speech recognition using neural models, emphasizing the CTC loss for training and inference when input and output sequences are of different lengths. 9 | 10 | 11 | ## Lecture part B 12 | We discuss beam search for use during inference, and how that procedure may be modeled at training time using a Graph Transformer Network. Graph transformer networks are basically weighted finite-state automata with automatic differentiation, which allow us to encode priors into a graph. There are different types of weighted finite-state automata and different operations on them, including union, Kleene closure, intersection, composition, and forward score. The loss function is usually the difference between two functions. We can easily implement these networks using the GTN library. 13 | 14 | 15 | ## Practicum 16 | -------------------------------------------------------------------------------- /docs/en/week12/12-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.12-3 3 | title: MPC (EBM version) 4 | lecturer: Alfredo Canziani 5 | authors: Yang Zhou, Daniel Yao 6 | date: 28 Apr 2021 7 | --- 8 | 9 | 10 | ## Action plan 11 | - Model predictive control **[Here we are today]** 12 | - Backprop through kinematic equation 13 | - Minimisation of the latent 14 | - Truck backer-upper 15 | - Learning an emulator of the kinematics from observations 16 | - Training a policy 17 | - PPUU 18 | - Stochastic environment 19 | - Uncertainty minimisation 20 | - Latent decoupling 21 | 22 | 23 | ## State transition equations -- Evolution of the state 24 | 25 | Here we discuss a state transition equation where $\vx$ represents the state and $\vu$ represents the control. We can formulate the state transition function in a continuous-time system, where $\vx(t)$ is a function of the continuous variable $t$. 26 | 27 |
28 | $$ 29 | \begin{aligned} 30 | \dot{\vx} &= f(\vx,\vu)\\ 31 | \frac{\partial \vx(t)}{\partial t} &= f(\vx(t), \vu(t)) 32 | \end{aligned} 33 | $$ 34 |
35 | 36 |
37 |
38 | Figure 1: State and control illustration of a tri-cycle 39 |
40 | 41 | We use a tri-cycle as the example to study this. The orange wheel is the control $\vu$, and $(x_c,y_c)$ is the instantaneous center of rotation. You can also have two wheels in the front; for simplicity, we use one wheel as the example. 42 | 43 | In this example, $\vx=(x,y,\theta,s)$ is the state and $\vu=(\phi,a)$ is the control. 44 | 45 | $$ 46 | \left\{\begin{array}{l} 47 | \dot{x}=s \cos \theta \\ 48 | \dot{y}=s \sin \theta \\ 49 | \dot{\theta}=\frac{s}{L} \tan \phi \\ 50 | \dot{s}=a 51 | \end{array}\right. 52 | $$ 53 | 54 | 55 | We can reformulate the differential equation from the continuous-time system to a discrete-time system: 56 | 57 | $$ 58 | \vx[t]=\vx[t-1]+f(\vx[t-1], \vu[t]) \mathrm{d} t 59 | $$ 60 | 61 | To be clear, we show the units of $\vx$ and $\vu$. 62 | 63 | $$ 64 | \begin{array}{l} 65 | {[\vu]=\left(\mathrm{rad}\ \frac{\mathrm{m}}{\mathrm{s}^{2}}\right)} \\ 66 | {[\vx]=\left(\mathrm{m} \ \mathrm{m} \ \mathrm{rad} \ \frac{\mathrm{m}}{\mathrm{s}}\right)} 67 | \end{array} 68 | $$ 69 | 70 | Let's take a look at different examples. We use different colors for the variables we care about. 71 | 72 |
73 |
74 | Figure 2: State Formulation 75 |
76 | 77 | Example 1: Uniform Linear Motion: No acceleration, no steering 78 |
79 |
80 | Figure 3: Control of Uniform Linear Motion 81 |
82 |
83 |
84 | Figure 4: State of Uniform Linear Motion 85 |
86 | 87 | 88 | Example 2: Crush into itself: Negative acceleration, no steering 89 |
90 |
91 | Figure 5: Control of Crushing into itself 92 |
93 |
94 |
95 | Figure 6: State of Crushing into itself 96 |
97 | 98 | 99 | Example 3: Sine wave: Positive steering for the first part, negative steering for the second part 100 |
101 |
102 | Figure 7: Control of Sine Wave 103 |
104 |
105 |
106 | Figure 8: State of Sine Wave 107 |
108 | 109 | 110 | ## Kelley-Bryson algorithm 111 | What if we want the tri-cycle to reach a specified destination with a specified speed? 112 | - This can be achieved by inference using the **Kelley-Bryson algorithm**, which utilizes **backprop through time** and **gradient descent**. 113 | 114 | 115 | ### Recap of RNN 116 | We can compare the inference process here with the training process of an RNN. 117 | 118 | Below is an RNN schematic chart. We feed the variable $\vx[t]$ and the previous state $\vh[t-1]$ into the predictor, while $\vh[0]$ is set to zero. The predictor outputs the hidden representation $\vh[t]$. 119 |
120 |
121 | Figure 9: RNN schematic chart 122 |
123 | 124 | 125 | ### Optimal control (inference) 126 | In optimal control (inference), shown below, we feed the latent variable (the control) $\vz[t]$ and the previous state $\vx[t-1]$ into the predictor, while $\vx[0]$ is set to $\vx_0$. The predictor outputs the state $\vx[t]$. 127 | 128 |
129 |
130 | Figure 10: Optimal Control schematic chart 131 |
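Concretely, the inference loop of Figure 10 can be written in a few lines. This is a minimal sketch (the full notebook version appears later on this page): `f` is the kinematic model, `cost_f` is one of the cost functions discussed below, and the controls `z` are the only variables being optimised:

```python
import torch

def infer_controls(f, cost_f, x0, target, T=5, steps=100, lr=1e-2, dt=1.0):
    z = torch.zeros(T, 2, requires_grad=True)       # latent controls z[1], ..., z[T]
    optimizer = torch.optim.SGD([z], lr=lr)
    for _ in range(steps):
        x = [x0]
        for t in range(T):                          # unfold the predictor through time
            x.append(x[-1] + f(x[-1], z[t]) * dt)
        cost = cost_f(torch.stack(x), target)
        optimizer.zero_grad()
        cost.backward()                             # backprop through time...
        optimizer.step()                            # ...and gradient descent wrt z only
    return z.detach()
```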
132 | 133 | Backprop is implemented in both the RNN and optimal control. However, gradient descent is applied to the predictor's parameters in the RNN, whereas it is applied to the latent variable $\vz$ in optimal control. 134 | 135 | 136 | ### Unfolded version of optimal control 137 | In the unfolded version of optimal control, the cost can be attached either to the final step of the tri-cycle only, or to every step. Moreover, cost functions can take many forms, such as Average Distance, Softmin, etc. 138 | 139 | 140 | #### Set the cost to the final step 141 | From the figure below, we can see there is only one cost $c$, set at the final step (step 5), which measures the distance between our target $\vy$ and the state $\vx[5]$ reached with control $\vz[5]$. 142 |
143 |
144 | Figure 11: Cost to the final step 145 |
146 | 147 | $(1)$ If the cost function only involves the final position, with no restrictions on the final speed, we obtain the inference results shown below. 148 |
149 |
150 | Figure 12: Cost function involving only the final position 151 |
152 | From the figure above, it is seen that when $T=5$ or $T=6$, the final position meets the target position, but when $T$ is above 6 the final position does not. 153 | 154 | $(2)$ If the cost function involves the final position and zero final speed, we obtain the inference results shown below. 155 |
156 |
157 | Figure 13: Cost function involving the final position and zero final speed 158 |
159 | From the figure above, it is seen that when $T=5$ or $T=6$, the final position roughly meets the target position, but when $T$ is above 6 the final position does not. 160 | 161 | 162 | #### Set the cost to every step 163 | From the figure below, we can see there is a cost $c$ set at every step. 164 |
165 |
166 | Figure 14: Cost to every step 167 |
168 | 169 | $(1)$ Cost Example: Average Distance 170 |
171 |
172 | Figure 15: Cost Example: Average Distance 173 |
174 | 175 | $(2)$ Cost Example: Softmin 176 |
177 |
178 | Figure 16: Cost Example: Softmin 179 |
180 | 181 | Different forms of cost functions can be explored through experimentation. 182 | 183 | 184 | ## Optimization_Path_Planner-Notebook 185 | In this notebook, we use the tri-cycle as an example as well. 186 | 187 | 188 | ### Define the kinematic model of a tricycle $\dot{\vx}=f(\vx,\vu)$. 189 | * $\vx$ represents the state: ($x$, $y$, $θ$, $s$) 190 | * $\vu$ represents the control: ($ϕ$, $a$) 191 | * We feed $\vx[t-1]$ and $\vu[t]$ to obtain the next state $\vx[t]$ 192 | 193 | ```python 194 | def f(x, u, t=None): 195 | L = 1 # m, distance between the rear and front wheels 196 | x, y, θ, s = x # unpack the state 197 | 198 | ϕ, a = u # unpack the control: steering angle and acceleration 199 | f = torch.zeros(4) 200 | f[0] = s * torch.cos(θ) 201 | f[1] = s * torch.sin(θ) 202 | f[2] = s / L * torch.tan(ϕ) 203 | f[3] = a 204 | return f 205 | ``` 206 | 207 | 208 | ### Define several cost functions 209 | As mentioned above, cost functions can take various forms. In this notebook, we list 5 kinds, as follows: 210 | * `vanilla_cost`: Focuses on the final position. 211 | * `cost_with_target_s`: Focuses on the final position and a final zero speed. 212 | * `cost_sum_distances`: Focuses on the position at every step, and minimizes the mean of the distances. 213 | * `cost_sum_square_distances`: Focuses on the position at every step, and minimizes the mean of the squared distances. 214 | * `cost_logsumexp`: The distance of the closest position should be minimized. 215 | 216 | 217 | ```python 218 | def vanilla_cost(state, target): 219 | x_x, x_y = target 220 | return (state[-1][0] - x_x).pow(2) + (state[-1][1] - x_y).pow(2) 221 | 222 | def cost_with_target_s(state, target): 223 | x_x, x_y = target 224 | return (state[-1][0] - x_x).pow(2) + (state[-1][1] - x_y).pow(2) \ 225 | + (state[-1][-1]).pow(2) # penalise a nonzero final speed 226 | 227 | def cost_sum_distances(state, target): 228 | x_x, x_y = target 229 | dists = ((state[:, 0] - x_x).pow(2) + (state[:, 1] - x_y).pow(2)).pow(0.5) 230 | return dists.mean() 231 | 232 | def cost_sum_square_distances(state, target): 233 | x_x, x_y = target 234 | dists = ((state[:, 0] - x_x).pow(2) + (state[:, 1] - x_y).pow(2)) 235 | return dists.mean() 236 | 237 | def cost_logsumexp(state, target): 238 | x_x, x_y = target 239 | dists = ((state[:, 0] - x_x).pow(2) + (state[:, 1] - x_y).pow(2))#.pow(0.5) 240 | return -1 * torch.logsumexp(-1 * dists, dim=0) # softmin over the distances 241 | ``` 242 | 243 | 244 | ### Define path planning with cost 245 | * The optimizer is set to be SGD. 246 | * The time step `dt` is set to be 1 s; `T` is the number of steps. 247 | * We need to compute every state from the initial state with the following code: 248 | ```python 249 | x = [torch.tensor((0, 0, 0, s),dtype=torch.float32)] 250 | for t in range(1, T+1): 251 | x.append(x[-1] + f(x[-1], u[t-1]) * dt) 252 | x_t = torch.stack(x) 253 | ``` 254 | * Then compute the cost: 255 | ```python 256 | cost = cost_f(x_t, (x_x, x_y)) 257 | costs.append(cost.item()) 258 | ``` 259 | * Implement backprop and update $\vu$: 260 | ```python 261 | optimizer.zero_grad() 262 | cost.backward() 263 | optimizer.step() 264 | ``` 265 | * Now we can feed values to path_planning_with_cost to obtain inference results and plot trajectories.
**Example**: 266 | ```python 267 | path_planning_with_cost( 268 | x_x=5, x_y=1, s=1, T=5, epochs=5, 269 | stepsize=0.01, cost_f=vanilla_cost, debug=False 270 | ) 271 | ``` 272 | -------------------------------------------------------------------------------- /docs/en/week12/12.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.12 3 | title: Week 12 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | This lecture introduces the topic of Neural Machine Translation with the help of an example. We then discuss language modelling, model architecture, NMT inference. Further, we discuss the issues faced because of the languages and the need for Low Resource Machine Translation. Also, we examine a case study and the challenges faced in Low Resource MT, different stages in the cycle of research, how they can be used for Machine Translation. 9 | 10 | ## Lecture part B 11 | This week's lecture was a guest lecture by [Marc'Aurelio Ranzato](https://ai.facebook.com/people/marc-aurelio-ranzato/), who is a research scientist and manager at the Facebook AI Research (FAIR) lab, where he works to enable machines to learn with weaker supervision and to efficiently transfer knowledge across tasks. The first part of Lecture B focuses on understanding low resource machine translation, and the second half discusses potential domain mismatches in machine learning and machine translation. 12 | 13 | 14 | ## Practicum 15 | We introduced the state transition function and the way to model a physical system with state and control. We discussed how to achieve optimal control by inference using Kelley-Bryson algorithm, which utilizes backprop through time and gradient descent. Finally, we explained the notebook of Optimization_Path_Planner, in which various cost functions are defined and path planning is implemented to guide a tri-cycle to reach the desired position with the specified speed. 16 | -------------------------------------------------------------------------------- /docs/en/week13/13.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.13 3 | title: Week 13 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum -------------------------------------------------------------------------------- /docs/en/week14/14.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.14 3 | title: Week 14 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum -------------------------------------------------------------------------------- /docs/en/week15/15-1.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.15-1 3 | lecturer: Alfredo Canziani and Jiachen Zhu 4 | title: Joint Embedding Methods - Contrastive 5 | authors: Sai Charitha Akula 6 | date: 12 May 2022 7 | --- 8 | 9 | 10 | 11 | ## Visual Representation Learning 12 | 13 | Representation learning trains a system to produce the representations required for feature detection or classification from raw data. Visual representation learning is about the representations of images or videos in particular. 14 | 15 |
16 |
17 | Fig. 1: Visual Representation Learning 18 |
19 | 20 | This can be broadly classified as shown above, and the focus of the lecture is on self-supervised visual representation learning. 21 | 22 | ## Self-supervised Visual Representation Learning 23 | 24 | It is a two-stage process comprising pretraining and evaluation. 25 | 26 | ##### Step 1: Pretraining 27 | 28 | Pretraining uses a large amount of unlabeled data to train a backbone network. Different methods will produce the backbone network differently. 29 | 30 | ##### Step 2: Evaluation 31 | 32 | Evaluation can be performed in two ways: feature extraction and finetuning. Both methods generate a representation of the image and then use it to train a downstream task head (DsTH). The learning of the downstream task thus happens in the representation space instead of the image space. The only difference between the two methods is the stop gradient before the encoder: in finetuning, we can change the encoder, unlike in feature extraction. 33 | 34 |
35 |
36 | Fig. 2: Self-supervised Visual Representation Learning 37 |
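The practical difference between the two evaluation protocols is simply whether gradients are allowed to flow back into the backbone. A minimal sketch — the backbone, head, and dimensions here are placeholders, not the lecture's exact setup:

```python
import torch
from torch import nn

def downstream_setup(backbone: nn.Module, head: nn.Module, finetune: bool):
    """Feature extraction: freeze the backbone (stop-gradient); finetuning: train it too."""
    if finetune:
        params = list(backbone.parameters()) + list(head.parameters())
    else:
        for p in backbone.parameters():
            p.requires_grad_(False)            # stop gradient before the encoder
        params = list(head.parameters())
    return torch.optim.SGD(params, lr=1e-2)

# Placeholder modules for illustration only
backbone = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 512), nn.ReLU())
head = nn.Linear(512, 10)                      # downstream task head (DsTH)
opt = downstream_setup(backbone, head, finetune=False)   # feature extraction / linear evaluation
```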
38 | 39 | ### Generative Models 40 | 41 | The popular one is the denoising autoencoder. You train the model to reconstruct the original image from the noisy image. After the training, we retain the encoder for the downstream task. 42 | 43 | ##### Issues: 44 | 45 | The model tries to solve a problem that is too hard. For example: For a lot of downstream tasks, you don't have to reconstruct the image, which is a tougher problem than the downstream task itself. Also, sometimes the loss function is not good enough. For example: the Euclidean distance used as a reconstruction loss metric isn’t a good metric for comparing the similarity between two images. 46 | 47 |
48 |
49 | Fig. 3: Generative Models - Autoencoder 50 |
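A minimal sketch of this pretraining setup — the architecture, noise level, and image size are placeholders rather than the lecture's choices:

```python
import torch
from torch import nn

encoder = nn.Sequential(nn.Flatten(), nn.Linear(784, 128), nn.ReLU())   # retained for downstream tasks
decoder = nn.Sequential(nn.Linear(128, 784), nn.Sigmoid())              # discarded after pretraining
opt = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)

def dae_step(x):                                 # x: a batch of flattened images in [0, 1]
    x_noisy = (x + 0.3 * torch.randn_like(x)).clamp(0, 1)   # corrupt the input
    x_hat = decoder(encoder(x_noisy))            # reconstruct the clean image
    loss = nn.functional.mse_loss(x_hat, x)      # Euclidean reconstruction loss (see caveat above)
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()
```

After pretraining, only `encoder` is kept and evaluated as in Fig. 2.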
51 | 52 | ### Pretext Tasks 53 | 54 | It’s almost the same as above but you train the model to figure out a smart way to generate pseudo labels. For example: Given the image of a tiger, the shuffled image is the input x, and the output y would be the correct way of labeling the patches. The network successfully reinventing the patches indicates that it understands the image. 55 | 56 | ##### Issues: 57 | Designing the pretext task is tricky. if you design the task too easy, the network won’t learn good representation. But if you design the task hard, it can become harder than the downstream task and the network wouldn't be trained well. Also, the representations generated via this method will be tailored to the specific downstream task. 58 | 59 |
60 |
61 | Fig. 4: Pretext Tasks 62 |
63 | 64 | ## Joint Embedding Methods 65 | 66 | Joint Embedding methods try to make their backbone network robust to certain distortions and are invariant to data augmentation. 67 | 68 | As an example, as shown in the image below, for an image of a dog, you take two distorted versions of the image, then encode them with your backbone network to generate representations and you make them to be close to each other. Thus, ensuring the two images share some semantic information. 69 | 70 |
71 |
72 | Fig. 1: Data Augmentation in JEM 73 |
74 | 75 | They also prevent trivial solutions. The network could collapse with just the above condition, as the network can become invariant not only to distortions but to the input altogether i.e., irrespective of the input, it could generate the same output. JEMs try to prevent this trivial solution in different ways. 76 | 77 | Instead of considering only local energy ( between two pairs of distorted images ), these methods get a batch of the images and ensure that the collection of the representation, $\green{H}_{\vx}$, doesn’t have the same rows or columns. ( which is the trivial solution ) 78 | 79 |
80 |
81 | Fig. 2: Preventing Trivial Solutions in JEM 82 |
83 | 86 | 87 | ### Components: 88 | 89 | Every Joint Embedding Method has the following components: 90 | 91 | 1. Data augmentation ( $\vx$ and $\vy$ ): The way you generate the two distorted versions of the image. 92 | 2. Backbone Network ( $\lavender{BB}$ ) - The definition of the backbone 93 | 3. Energy function ( $\red{D}$ ) - The definition of the distance between the two representations. 94 | 4. Loss functionals ( $\green{A}$ and $\green{B}$ ) - The definition of the loss functionals calculated per batch of size N. 95 | 96 | ### Joint Embedding Loss Functions: 97 | 98 | Joint Embedding Loss Functions contain two components: 99 | 1. A term that pushes the positive pair closer 100 | 2. An (implicit) term that prevents the trivial solution (constant output) - implicit because a lot of "other methods" do not have an explicit term to prevent the trivial solution. 101 | 102 | To make the training stable, people usually normalize the embeddings or put a hinge on the loss function to prevent the norm of embeddings from becoming too large or too small 103 | 104 | ### Training Methods 105 | 106 | The training methods can be further classified into the following four types: 107 | 1. Contrastive methods 108 | 2. Non-Contrastive methods 109 | 3. Clustering methods 110 | 4. Other methods 111 | 112 | We now go into the details of each of these methods 113 | 114 | ### Contrastive methods 115 | 116 | Contrastive methods push positive pairs closer and negative pairs away. More details about the contrastive methods including MoCo, PIRL, and SimCLR have been discussed [here](https://atcold.github.io/NYU-DLSP20/en/week08/08-1/). 117 | 118 | 119 | #### The InfoNCE loss function: 120 | Both SimCLR and MoCO use the InfoNCE loss function. 121 | 122 | $$ 123 | \red{L}(\boldsymbol{w},\vx,\vy) = \\[0.5cm] 124 | = -\text{log} \frac{\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}} ) ) } 125 | { \sum_{\red{n}}^{N}\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vx}^\red{n}} )) + 126 | \sum_{\red{n}}^{N}\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}^\red{n}} )) } \\[0.5cm] 127 | 128 | = -\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}} ) + \text{log} \Big[ 129 | \sum_{\red{n}}^{N}\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vx}^\red{n}} )) + 130 | \sum_{\red{n}}^{N}\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}^\red{n}} )) ]\\[0.5cm] 131 | 132 | = -\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}} ) + \text{softmax}_\blue{\beta} [ 133 | \text{sim} ( \green{h_{\vx}}, \green{h_{\vx}^\red{n}} ), 134 | \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}^\red{n}} ) ] \\[0.5cm] 135 | 136 | \text{sim} (\green{h_{\vx}}, \green{h_{\vy}} ) = \frac{ \green{h_{\vx}}^\top \green{h_{\vy}} } { ||\green{h_{\vx}} || \, ||\green{h_{\vy}} || } 137 | 138 | $$ 139 | 140 | 141 | The first term indicates the similarity between positive pairs and the second term is the softmax between all the negative pairs. We would like to minimize this whole function. 142 | 143 | Notice that it gives different weights to different negative samples. The negative pair that has high similarity is pushed much harder than the negative pair with low similarity because there's a softmax. Also, the similarity measurement here is the inner product between the two representations, and to prevent the gradient explosion, the norm is normalized. Thus, even if the vector grew long, the term ensures that it is a unit vector. 
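A compact sketch of this loss for one batch, following the formula above directly (an illustration, not SimCLR's or MoCo's exact implementation; `beta` is the inverse temperature $\blue{\beta}$):

```python
import torch
import torch.nn.functional as F

def info_nce(h_x, h_y, beta=10.0):
    """h_x, h_y: (N, d) embeddings of two distorted views of the same N images."""
    h_x, h_y = F.normalize(h_x, dim=1), F.normalize(h_y, dim=1)  # unit vectors: dot product = cosine similarity
    pos = (h_x * h_y).sum(dim=1)                     # sim(h_x, h_y) for each positive pair
    sim_xx = h_x @ h_x.T                             # sim(h_x, h_x^n)
    sim_xy = h_x @ h_y.T                             # sim(h_x, h_y^n)
    negatives = torch.cat([sim_xx, sim_xy], dim=1)   # (for clarity, self and positive pairs are not masked out)
    denom = torch.logsumexp(beta * negatives, dim=1) # softmax over the candidate pairs, in the log domain
    return (-beta * pos + denom).mean()
```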
144 | 145 | #### Memory Bank: 146 | 147 | As already mentioned, these models require negative samples. However, finding negative pairs becomes difficult as the embedding spaces become large. 148 | 149 | To handle this, SimCLR and MoCO use large batch sizes to find the samples. The difference between SimCLR and MoCO is the way they deal with the large batch size. SimCLR uses 8192 as the batch size. However, MoCO tries to solve the requirement of a large batch size without actually using a large batch size by using a memory bank. It uses a small batch size but instead of using negative samples from only the current batch, it collects them even from previous batches. For example: with a 256 batch size, aggregating the previous 32 batches of negative samples results essentially in a batch size of 8192. This method saves memory and avoids the effort to generate the negative samples again and again. 150 | 151 |
152 |
153 | Fig. 4: Memory Bank 154 |
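The memory bank can be thought of as a first-in-first-out queue of embeddings from previous batches; a minimal sketch (the sizes are arbitrary):

```python
import torch
import torch.nn.functional as F

class FeatureQueue:
    """FIFO memory bank of negative embeddings produced by the momentum backbone."""
    def __init__(self, dim=128, size=8192):
        self.bank = F.normalize(torch.randn(size, dim), dim=1)
        self.ptr = 0

    @torch.no_grad()
    def enqueue(self, h):                          # h: (N, dim) embeddings of the current batch
        n = h.size(0)
        idx = (self.ptr + torch.arange(n)) % self.bank.size(0)
        self.bank[idx] = F.normalize(h, dim=1)     # overwrite the oldest entries, in circular order
        self.ptr = (self.ptr + n) % self.bank.size(0)

    def negatives(self):
        return self.bank                           # used as the h^n terms in the InfoNCE denominator
```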
155 | 156 | Issue: 157 | Because the backbone is updated at every step, the old negative samples in the bank were produced by earlier versions of it and, after a while, are not valid anymore, which can lead to a decrease in performance. To avoid this, MoCO uses a momentum backbone that slows down the training of the right backbone. In that case, the older momentum backbone and the new momentum backbone are not that different, retaining the validity of the negative samples even after a while. 158 |
160 |
161 | Fig. 5: Memory Bank with Momentum Backbone 162 |
162 | 163 | 164 | $\vartheta_{t+1}$ (the momentum backbone's parameters) is an exponential moving average of $\theta_{t}$. The learning rate of $\vartheta$ is $(1 - m)\,\eta$. High values of $m$ will make $\vartheta_{t}$ stable. $m=1$ will leave $\vartheta_{t}$ basically untrained. If $m$ is very small, like 0, then $\vartheta_{t+1}$ is $\theta_{t+1}$. 165 | 166 | $$ 167 | \theta_{t+1} = \theta_{t} - \eta\Delta\theta_{t} \\ 168 | \vartheta_{t+1} = m\vartheta_{t} + ( 1- m )\theta_{t+1} 169 | $$ 170 |
172 | $\theta:$ backbone parameters 173 |
174 | 175 |
176 | $\vartheta:$ momentum backbone parameters 177 |
178 | 179 | 180 | #### Disadvantages of Contrastive methods: 181 | 182 | In practice, people found out that contrastive methods need a lot of setup to make them work. They require techniques such as weight sharing between the branches, batch normalization, feature-wise normalization, output quantization, stop gradient, memory banks etc.,.This makes it hard to analyze. Also, they are not stable without the use of those techniques. 183 | -------------------------------------------------------------------------------- /docs/en/week15/15-2.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.15-2 3 | lecturer: Alfredo Canziani and Jiachen Zhu 4 | title: Joint Embedding Methods - Regularised 5 | authors: Sai Charitha Akula 6 | date: 12 May 2022 7 | --- 8 | 9 | ### Non-Contrastive methods 10 | 11 | #### Non-Contrastive methods and information theory: 12 | 13 | Most of the non-contrastive methods are based on information theory. For example: Redundancy reduction ( Barlow Twins ) and Information. They don't require special architectures or engineering techniques. 14 | 15 | #### VicReg: 16 | It tries to maximize the information content of the embeddings by producing embedding variables that are decorrelated to each other. If the variables are correlated to each other, they covariate together and the information content is reduced. Thus, it prevents an informational collapse in which the variables carry redundant information. Also, this method requires a comparatively small batch size. 17 | 18 | Two types of collapse can occur in these architectures: \\ 19 | $\textbf{Type 1}:$ Irrespective of the input, the network generates the same representation \\ 20 | $\textbf{Type 2}:$ Special collapse - Although different images have different representations, the information content is really low in each representation. 21 | 22 | ##### Loss function: 23 | The loss function is pushing: 24 | 1. Positive pairs closer - to be invariant to data augmentation 25 | 2. The variance of the embeddings large by pushing all of the diagonal terms of the covariance matrix large - to prevent the first kind of collapse 26 | 3. The covariance of the embeddings small by pushing all off the diagonal terms of the covariance matrix small- to prevent the second kind of collapse. 27 | 28 | 29 | $$ 30 | \\[0.5cm] 31 | \green{C} = \frac{1}{N} \green{H}^\top\green{H} \\[0.5 cm] 32 | 33 | \red{L}(\boldsymbol{w},\vx,\vy) = 34 | \Vert \green{h_{\vx}} - \green{h_{\vy}} \Vert^2 \\[0.2cm] 35 | 36 | + \frac{1}{d}[ \sum_{i}^{d} ( \gamma - \,_{\vx}\green{C}_{ii} )^+ + ( \gamma - \, _{\vy}\green{C}_{ii} )^+ ] \\ 37 | 38 | + \frac{1}{d}[ \sum_{i}^{d} \sum_{j \neq i}^{d} ( _{\vx}\green{C}_{ij} )^2 + (_{\vy}\green{C}_{ij} )^2 ] 39 | 40 | $$ 41 | 42 | 43 | ### Clustering methods 44 | 45 | #### SwAV 46 | 47 | This method prevents trivial solution by quantizing the embedding space. SwAV does the following: 48 | 49 | 1. Generates representations and stack the generated representations ( into $\green{H_{x}}$ and $\green{H_{y}}$ ). 50 | 2. Applies sinkhorn clustering method to each of the stacked representation to generate corresponding clustered $\green{\boldsymbol{Q}}$ matrices where each row ( $\violet{q_{\vx}}$ ) represents a one hot vector indicating the cluster the corresponding representation belongs to 51 | 3. Performs second clustering for the representations $\vh_{\vx}$ and $\vh_{\vy}$ with soft-kmeans. 
This step generates predictions $\violet{\tilde{q_{\vx}}}$ and $\violet{\tilde{q_{\vy}}}$ for $\green{q_{\vx}}$ and $\green{q_{\vy}}$, from $\vh_{\vy}$ and $\vh_{\vx}$ respectively (hence the name swapped prediction). 52 | 4. Minimizes the loss function, which is the sum of two cross-entropy functions: one between $\green{q_{\vx}}$ and $\violet{\tilde{q_{\vx}}}$, and one between $\green{q_{\vy}}$ and $\violet{\tilde{q_{\vy}}}$. 53 | 54 | 55 |
56 |
57 | Fig. 8: SWaV 58 |
59 | 60 | ##### The Loss function: 61 | 62 | Sinkhorn algorithm: 63 | Sinkhorn algorithm can distribute samples to not just one cluster but to every cluster. Thus, it can help us prevent all the data clustering into a single centroid or any such nonuniform distribution. It takes in hyperparameters that allow us to deploy different levels of uniform distribution across clusters degenerating to K-means algorithm on one extreme and to the perfectly uniform distribution on the other extreme 64 | 65 | Softargmax clustering: 66 | Each $\green{h_{\vy}}$ is normalized. $\boldsymbol{W}\green{h_{\vy}}$ indicates similarity between $\green{h_{\vy}}$ and all other centroids. Softargmax turns the cosine similarly ( positive or negative ) into a probability. 67 | 68 | Since this is predicting the $\green{q_{\vx}}$, we will compare the cross entropy of the prediction, $\violet{\tilde{q_{\vx}}}$, with the actual $\green{q_{\vx}}$ to measure the prediction. 69 | 70 | 73 | 74 | $$ 75 | 76 | \green{Q_{\vx}} = \text{sinkhorn}_{\boldsymbol{W}}(\green{H_{\vx}}) \in \mathbb{R}^{ N \times K } \\\\\\[0.2 cm] 77 | 78 | \green{Q_{\vx}} = [ \green{q_{\vx}}^1,...,\green{q_{\vx}}^N ]^\top \\\\[0.2 cm] 79 | 80 | \boldsymbol{W} \in \mathbb{R}^{ K \times d } : \text{dictionary} \\ \\[0.2 cm] 81 | 82 | \violet{\tilde{q_{\vx}}} = \text{softargmax}_{\blue{\beta}}(\boldsymbol{W}\green{h}_\vy) \in \mathbb{R}^{ K} \\ \\[0.2 cm] 83 | 84 | \red{F}(\vx, \vy) = \red{C}(\green{q_{\vx}}, \violet{\tilde{q_{\vx}}}) + \red{C}(\green{q_{\vy}}, \violet{\tilde{q_{\vy}}}) 85 | 86 | $$ 87 | 88 | 89 | ##### Interpretation of clusters: 90 | This method partitions latent space into a few clusters automatically without labels and the hope is that these clusters will be related to the actual classes. Thus, later, we would just need a few labeled data samples to assign each cluster to the corresponding label under supervised learning. 91 | 92 | ##### Invariance to data augmentation: 93 | Instead of pushing the pairs closer to each other, you push both the representations to be inside the same cluster. 94 | 95 | ##### Preventing trivial solution 96 | In a trivial solution, all the representations will be the same and thus belong to the same centroid. However, with sinkhorn, different clusters have an equal number of samples, thus the representations can’t be put into one centroid, preventing a trivial solution. 97 | 98 | ### Other methods 99 | 100 | The loss function for all the previous methods including contrasting methods needs a batch or pool of negative samples, thus creating problems with distributed training. However, the loss functions of these methods are local. These methods perform well but an understanding of why they don’t collapse is not yet available. Probably there's some implicit regularization happening in these networks to prevent them from converging to a trivial solution. 101 | 102 |
103 |
104 | Fig. 10: Other Methods 105 |
106 | 107 | #### BYOL: 108 | BYOL adds a predictor, predicting $\green{h_{\vy}}$ from $\green{h_{\vx}}$. The energy function ($\red{D}$) is a cosine similarity between $\green{h_{\vy}}$ and the predicted $\green{h_{\vy}}$. There is no term for negative samples, i.e., this method only pushes positive pairs closer and enforces nothing on negative pairs. It is thought that the asymmetrical architecture with extra layers is what makes this method work. 109 | 110 | SimSiam is a follow-up version that uses a regular backbone instead of the momentum backbone. 111 | 112 | #### Dino: 113 | The two softargmax components used have different coldness, or temperature. The energy function is the cross-entropy between these two, pushing them together. Even this method doesn't enforce anything on negative samples. 114 | 115 | #### Data2Vec: 116 | Data2Vec adds a layer norm at the end of the representation. 117 | 118 | ##### Initialization of the network: 119 | If you initialize the network with a trivial solution, then that network will never work. This is because, if the trivial solution is already achieved, the loss function produces a zero gradient, and the network can never escape from the trivial solution. However, in other cases, the training dynamics work out in such a way that these methods never converge to the trivial solution. 120 | 121 | 122 | ### Improvements for JEMs 123 | 124 | We can further improve these models by experimenting with data augmentation and network architecture. We don't have a good understanding of these, but they are very important. In fact, finding a good augmentation may boost performance more than changing the loss function. 125 | 126 | #### Data Augmentation 127 | 128 | The most dominant augmentations were proposed by SimCLR and improved a little bit by BYOL: 129 | 1. Random Crop (the most critical one) 130 | 2. Flip 131 | 3. Color Jitter 132 | 4. Gaussian Blur 133 | 134 | It has been found empirically that random crop is the most critical one. It might be because the random crop is the only way we can change the spatial information of the images. Flip does the same partly, but is weaker. Color jitter and Gaussian blur change the channels. 135 | 136 |
137 |
138 | Fig. 5: Data Augmentation 139 |
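These four transformations correspond to a standard torchvision pipeline. A sketch of a SimCLR-style augmentation — the crop size, probabilities, and jitter strengths vary between papers and are only indicative here:

```python
from torchvision import transforms

simclr_augment = transforms.Compose([
    transforms.RandomResizedCrop(224),                                              # random crop
    transforms.RandomHorizontalFlip(),                                              # flip
    transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),    # color jitter
    transforms.GaussianBlur(kernel_size=23, sigma=(0.1, 2.0)),                      # gaussian blur
    transforms.ToTensor(),
])
# Two independent draws give the two distorted views of the same image:
# x, y = simclr_augment(img), simclr_augment(img)
```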
140 | 141 | ##### Masking augmentation: 142 | Recently people are moving towards masking augmentation instead of traditional augmentation in which we mask out most ( ~75% in the below image ) of the patches. It can replace random crop since it’s another way to remove the redundancy of the spatial information 143 | 144 | **Issues:** 145 | This works well only with transformer type of architecture and not with convnet. This is because masking introduces too many random artificial edges. For any transformer, the first layer is the conv layer, with kernel size equal to the patch size and thus, this never experiences artificial edges. For convnets which have sliding windows, the artificial edges can't be ignored and will result in noise. 146 | 147 |
148 |
149 | Fig. 6: Masked Augmentation 150 |
151 | 152 | #### Network Architecture 153 | 154 | ##### Projector/Expander: 155 | It is a two/three-layer feed-forward neural network and empirical results show that it is always better to add this in the network architecture. 156 | 157 | The projector is used to project into a lower dimension and the expander is used to project into a higher dimension. A projector is used only during the pretraining and removed while performing the downstream task. This is because the projector removes a lot of information even if the output dimension of the projector and the backbone are the same. 158 | 159 | ##### Momentum Encoder: 160 | Even without a memory bank, a momentum encoder usually helps the performance of the downstream tasks, especially with weak data augmentation. 161 | 162 |
163 |
164 | Fig. 7: Projector/Expander 165 |
166 | -------------------------------------------------------------------------------- /docs/en/week15/15.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.15 3 | title: Week 15 4 | --- 5 | 6 | ## Lecture part A 7 | 8 | 9 | As pointed out already, we can broadly classify Energy Based Models into generative or joint-embedding based on architectures and into contrastive or regularised & architectural based on training methods. 10 | 11 | 23 | 24 | 30 | 31 | In this section, we discussed Visual Representation Learning, focused on self-supervised visual representation learning. This can be classified into Generative models, Pretext Tasks and Joint Embedding methods. In generative models, you train the model to reconstruct the original image from the noisy image. In pretext tasks, you train the model to figure out a smart way to generate pseudo labels. Joint Embedding methods try to make their backbone network robust to certain distortions and are invariant to data augmentation. JEM training methods can be classified into four types: contrastive methods, non-contrastive methods, clustering methods and Other methods. He concluded the lecture by discussing contrastive methods which push positive pairs closer and negative pairs away. 32 | 35 | 36 | 37 | ## Lecture part B 38 | 39 | In this section, we discussed non-contrastive methods which are based on information theory and don’t require special architectures or engineering techniques. Then, he went on to discuss clustering methods which prevent trivial solution by quantizing the embedding space. Finally, he discussed "Other" methods which are local and don't create problem with distributed training unlike previous methods. He concluded the lecture by suggesting various improvisations for JEMs w.r.t Data augmentation and network architecture. 40 | 41 | 44 | -------------------------------------------------------------------------------- /docs/fr/README-FR.md: -------------------------------------------------------------------------------- 1 | 4 | # Cours sur l'apprentissage profond de la NYU - printemps 2021 (NYU-DLSP21) 5 | 6 | 7 | [🇬🇧](https://github.com/Atcold/NYU-DLSP21/blob/master/README.md)   [🇫🇷](https://github.com/Atcold/NYU-DLSP21/blob/master/docs/fr/README-FR.md) 8 | 9 | 10 | 24 | 25 | ## Nouvelle organisation du contenu 26 | 27 | Ce semestre, nous avons réorganisé le matériel didactique. 28 | Au cours de la première moitié du semestre, nous avons couvert 3 sujets, s'étalant sur deux semaines, chacun étant suivi d'un devoir. 29 | De plus, à chaque cours magistral sont associés des travaux dirigés. 30 | 31 | 1. Historique, rétropropagation et descente de gradient. 32 | 2. Partage des paramètres : réseaux récurrents et convolutifs. 33 | 3. Modèles à base d'énergie (EBMs pour *energy based models*) à variable latente (LV pour *latent variable*). 34 | 35 | Notez que nous avons remanié le programme et le contenu des cours. 36 | Nous avons traité les LV-EBMs comme un module *de base*, sur lequel il faut s'appuyer. 37 | 38 | 39 | 51 | 52 | ## La seconde moitié du semestre 53 | 54 | Alfredo pensait reproduire les mêmes travaux pratiques utilisés pour l'édition de l'année dernière, [NYU-DLSP20](https://github.com/Atcold/NYU-DLSP20), mais dans un ordre différent. 55 | 56 | Cependant il n'a pas pu. 57 | 58 | Les étudiants de cette année ont vu les LV-EBMs et on leur a parlé du *gâteau*. 
59 | Alfredo ne pouvait donc pas prétendre qu'il n'existe pas et enseigner comme s'ils n'étaient pas conscients de l'éléphant dans la pièce. 60 | Cela aurait été intellectuellement malhonnête. 61 | Il a donc redessiné l'ensemble de ses diapositives. 62 | 63 | 64 | 73 | 74 | ## Dépôt de ce semestre 75 | 76 | C'est pourquoi ce dépôt a été créé. 77 | Il n'est **pas** prévu de faire le même travail insensé que l'année dernière, mais Alfredo a besoin d'un espace où poster des diapositives mises à jour, des notebooks et accueillir de nouvelles transcriptions/traductions. 78 | Le matériel de l'année dernière est toujours valable. 79 | Cette année, vous avez un point de vue différent. 80 | Un point de vue plus puissant. 81 | 82 | 83 | 96 | 97 | ## Contenus précédents 98 | 99 | Avant NYU-DLSP21, il y a eu : 100 | 101 | - [NYU-DLSP20](https://github.com/Atcold/NYU-DLSP20) (version la plus importante) 102 | - [NYU-DLSP19](https://github.com/Atcold/NYU-DLSP20/releases/tag/dlsp19) 103 | - [AIMS-DLFL19](https://github.com/Atcold/NYU-DLSP20/releases/tag/aims-fl18) 104 | - [CoDaS-HEP18](https://github.com/Atcold/NYU-DLSP20/releases/tag/v1.0.0) 105 | - [NYU-DLSP18](https://docs.google.com/document/d/1_p1Mw-NtMGN_vpas_pchLsQC2u0NM5mTnRapBrQ2ivk/) 106 | - [Purdue-DLFL16](https://docs.google.com/document/d/1ugJRMqQ_cCUQC1B8mSE0iro7sKrDT8-BnppTZv0rA08/) 107 | - [torch-Video-Tutorials](https://github.com/Atcold/torch-Video-Tutorials) 108 | 109 | 114 | 115 | ## Plus d'informations 116 | 117 | Consultez le [site web du cours](https://atcold.github.io/NYU-DLSP21/). 118 | -------------------------------------------------------------------------------- /docs/fr/faq.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Avant-propos, FAQ et éléments de traduction 3 | author: Loïck Bourdois 4 | date: 07 Jul 2021 5 | lang-ref: faq 6 | lang: fr 7 | --- 8 | 9 | # Avant-propos 10 | Ce cours porte sur les techniques de représentation et d'apprentissage profond les plus récentes. Il se concentre sur l'apprentissage supervisé, non supervisé et autosupervisté, mais aussi sur les méthodes d’enchâssement, l'apprentissage métrique et les réseaux convolutifs et récurrents. 11 | Il est illustré d’applications à la vision par ordinateur, la compréhension du langage naturel et la reconnaissance vocale. 12 | Pour suivre ce cours, il est fortement conseillé d’avoir des prérequis en algèbre et d’avoir déjà suivi un cours introductif d'apprentissage machine ou de *data science*. D’après Yann Le Cun, ces cours sont destinés à des personnes de niveau bac+4 ou bac+5. 13 | 14 | Nous vous invitons à privilégier les vidéos de la [chaine YouTube](https://www.youtube.com/watch?v=8L10w1KoOU8&list=PLLHTzKZzVU9e6xUfG10TkTWApKSZCzuBI&index=21) (contenu « officiel ») puisque le cours y est donné par le corps enseignant contrairement au site web où il s’agit des notes prises par les étudiants pendant le cours. 15 | Le site web étant des résumés des vidéos, celles-ci comprennent donc généralement des informations supplémentaires par rapport au site. 
Comme par exemple : 16 | - des anecdotes sur les différents concepts abordés, 17 | - des blagues, 18 | - la répétition d’un même concept mais sous la forme de différentes formulations permettant ainsi généralement de comprendre une idée si une première formulation n’est pas saisie, 19 | - les questions des étudiants qui peuvent être celles que vous ayez vous-même pendant le visionnage, 20 | Notez que si des concepts ne sont toujours pas compris à l’issue de la vidéo, vous avez la possibilité de poser une question en commentaire de la vidéo YouTube, ce que ne permet pas le site web. 21 | - les références des articles sur lesquels se basent le cours sont présentes sur les diapositives des vidéos alors qu’elles sont absentes du site. 22 | 23 | Le site web sert ainsi davantage de résumé des vidéos ou encore de base que vous pouvez réutiliser pour vos notes personnelles que vous prenez pendant le visionnage des vidéos. 24 | En cas de besoin vous pouvez facilement basculer du site à un moment d’une vidéo donnée en cliquant sur les titres des paragraphes des pages web. 25 | 26 | 27 | 28 | # FAQ 29 | Voici quelques réponses à des questions fréquemment posées : 30 | - **Est-ce que suivre ce cours permet d’obtenir une certification ?** 31 | > Non. Pour proposer une certification, il faudrait pouvoir vous évaluer or le contenu n’a pas été prévu pour (contrairement à un MOOC par exemple). 32 | > Cette demande étant fréquente, des réflexions sont menées pour essayer d’en proposer une pour des éditions futures du cours. 33 | - **Combien de temps consacrer à ce cours ?** 34 | > Pour chaque semaine, il y a environ 2h30/3h de contenu vidéo. Avec le temps consacré à la prise de notes et celui pour jouer avec les *notebooks*, une estimation totale de 5h par semaine semble raisonnable. Pour la suite, cela dépend du niveau d'immersion que vous voulez atteindre dans un sujet donné (lire les articles donnés en référence, appliquer ce qui a été vu en classe à vos propres projets, etc.). 35 | - **Où poser une question à l’issue du visionnage d’une vidéo ?** 36 | > Vous pouvez la poser directement (en anglais) dans la section commentaires sous la vidéo YouTube en question, Alfredo se fera un plaisir d’y répondre. Si cette question porte sur un point précis de la vidéo, pensez à indiquer l’horodatage. Vous pouvez le faire également sur le [Discord](https://discord.gg/CthuqsX8Pb) de la classe dédié expressément aux étudiants. Il sert également à coordonner des groupes de visionnage, discuter des devoirs, suggérer des améliorations ou plus généralement pour tout sujet lié au cours. 37 | - **Puis-je utiliser ce cours?** 38 | > Bien sûr, le cours est placé sous la [Licence internationale Creative Commons Attribution-NonCommercial-ShareAlike 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.fr). 39 | > Cela signifie que : 40 | > - Vous n'êtes pas autorisé à faire un usage commercial de cette œuvre. 41 | > - Vous devez créditer l'œuvre, intégrer un lien vers la licence et indiquer si des modifications ont été effectuées à l'œuvre. Vous devez indiquer ces informations par tous les moyens raisonnables, sans toutefois suggérer que l'offrant vous soutient ou soutient la façon dont vous avez utilisé son œuvre. 42 | > - Dans le cas où vous effectuez un remix, que vous transformez, ou créez à partir du matériel à partir de l'œuvre originale, vous devez diffuser l'œuvre modifiée dans les mêmes conditions, c'est à dire avec la même licence avec laquelle l'œuvre originale a été diffusée. 
43 | > 44 | > - Pour le crédit, vous pouvez utiliser le BibTeX suivant : 45 | > @misc{canziani2020nyudlsp21, 46 | author = {Canziani, Alfredo and LeCun, Yann}, 47 | title = {NYU Deep Learning, Spring 2021}, 48 | howpublished = "\url{https://github.com/Atcold/NYU-DLSP21}", 49 | year = {2021}, 50 | note = "[Online; accessed ]" 51 | } 52 | 53 | 54 | 55 | # Traduction 56 | Vous trouverez ici les informations concernant les choix de traduction adoptés. 57 | 58 | ### Informations de base : 59 | - Pour le site : 60 | Tous les textes présents sur ce site sont des notes de cours prises par les étudiants de la *New York University* lors des cours donnés par Yann Le Cun, Alfredo Canziani, Ishan Misra, Awni Hannun et Marc'Aurelio Ranzato. 61 | Ainsi les textes en anglais ont été rédigés par plusieurs personnes, ce qui a un impact sur l’homogénéité des textes (certains écrivent au passé, d’autres au présent ; les abréviations utilisées ne sont pas forcément toujours les mêmes ; certains écrivent des phrases courtes, quand d’autres écrivent des phrases pouvant aller jusqu’à 5 ou 6 lignes, etc.). 62 | La traduction en français qui vous est proposée a été effectuée par une seule personne dans le but d’atténuer les problèmes cités à l’instant et de proposer une traduction homogène. 63 | 64 | - Pour les vidéos : 65 | Afin de fluidifier la traduction et la compréhension, il a été décidé de ne pas retranscrire les mots « parasites » de remplissage et de transition (les « *you know* », « *sort of* », « *right* », « *so* », etc.). 66 | Quand le débit est élevé, une traduction ne reste qu'environ 4 secondes à l'écran. Pour pouvoir retranscrire le plus d'informations possibles dans cet intervalle de temps, nous utilisons des abréviations lorsque cela est possible (« RNNs » au lieu de « réseaux de neurones récurrents » par exemple). Nous privilégions également l'usage de mots courts (par exemple un « car » à la place d’un « parce que »). 67 | En raison du travail important nécessaire pour effectuer la traduction (1h de travail pour 10min de vidéo) il n'a pas été possible d'effectuer une relecture détaillée des traductions vidéos. Ainsi, si vous remarquez des fautes d'orthographe/de conjugaison, fautes de frappes, etc., nous vous invitons à soumettre une PR sur le [répertoire GitHub du site](https://github.com/Atcold/NYU-DLSP21/pulls) en précisant avec un `[FR]` qu’elle concerne la traduction française. 68 | 69 | 70 | ### Choix de traductions des termes techniques : 71 | 72 | - Choix de traduire les termes anglais en français : 73 | 74 | Terme | Traduction | Raisons / Explications 75 | --- | --- |--- | 76 | Chain rule | Règle de dérivation des fonctions composées | En pratique usage du terme « règle de la chaîne » dans les sous-titres des vidéos pour gagner de la place. 77 | CNN | ConvNet | Yann tient particulièrement au respect de cette traduction. Voir notamment la page 202 du livre [*Quand la machine apprend*](https://www.odilejacob.fr/catalogue/sciences/informatique/quand-la-machine-apprend_9782738149312.php). 78 | Downstream tasks | Tâches en aval | Les tâches de prétexte étant les tâches en amont. 79 | Energy-Based Models | Modèles à base d’énergie | Traduction pas forcément satisfaisante mais adoptée faute de mieux. 80 | Embedding | Enchâssement | Reprise de la traduction utilisée page 228 dans le livre *Quand la machine apprend*. Dans la littérature, il est possible de trouver également l'usage du terme « plongement » comme traduction. 
Parler tout simplement de vectorisation paraîtrait beaucoup plus simple pour faire le lien avec le concept mathématique (on vectorise un mot par exemple). 81 | Forward model | Modèle prédictif | 82 | Graph Neural Networks | Réseaux de neurones pour graphe | En pratique, pour les sous-titres des vidéos, l'abréviation GNN est privilégiée. 83 | Graph Convolution Networks | Réseaux convolutifs pour graphe | En pratique, pour les sous-titres des vidéos, l'abréviation GCN est privilégiée. 84 | Manifold | Variété | Voir [l'article Wikipédia](https://fr.wikipedia.org/wiki/Vari%C3%A9t%C3%A9_(g%C3%A9om%C3%A9trie)). 85 | Nonlinearity function | Fonction non linéaire | En français, on utilise également le terme de « fonction d’activation ». 86 | Overfitting | Surentraînement | Reprise de la traduction utilisée page 155 dans le livre *Quand la machine apprend*. 87 | Regularizer | Régulariseur | Néologisme préférable à régularisateur. 88 | Sparse | Epars | Pour l'expression « sparse matrix », nous traduisons « sparse » en « creuse » pour « matrice creuse ». Pour tous les autres cas nous utilisons « épars » ou « éparse » en fonction du genre du mot auquel l'adjectif se rapporte. 89 | Sparsity | Eparsité | Néologisme basé sur le mot « épars ». 90 | Template Matching | Template Matching | L'expression « appariement de patrons » comme traduction peut être trouvable sur le site ou dans les vidéos. 91 | Yann LeCun | Yann Le Cun ou Yann | L'explication de l'écriture du nom de famille est donnée page 193 du livre *Quand la machine apprend*. Dans les notes en anglais des étudiants, il est possible de trouver « Mr Yann LeCun », « Mr LeCun », « Doctor Yann LeCun », « Professor LeCun », etc. Nous utilisons simplement « Yann ». 92 | 93 | - Choix de ne pas traduire les termes anglais en français : 94 | Nous avons fait le choix de ne pas traduire certains termes anglais pour des raisons pratiques. Par exemple, certains concepts nécessitent 3 ou 4 mots en français là où 1 seul suffit en anglais. Cela pose notamment problème pour les vidéos où le temps d'affichage est limité, d'où la préférence à garder le terme en anglais. Il serait possible d'utiliser des néologismes mais nous avons préféré ne pas en imposer car ne pouvant peut-être pas faire consensus. Sur le site, les mots laissés en anglais sont indiqués en italique. 95 | 96 | Terme | Traduction | Raisons / Explications 97 | --- | --- |--- | 98 | Dropout | Dropout | Le mot « décimation » serait approprié mais il est déjà utilisé en traitement du signal pour signifier « sous-échantillonnage ». 99 | Finetuning | Finetuning | Le terme « affinage » peut être trouvable dans la littérature. 100 | One hot | One hot | La notion de « vecteurs de base canonique » pourrait être utilisée mais elle est un peu technique et l'expression est plutôt longue pour traduire à peine 2 mots. N.D.T : lorsque j'étais étudiant, dans mes cours d'algèbre linéaire, j'utilisais soit « v.b.c » pour « vecteurs de base canonique » ou bien « zérun » (pour un vecteur contenant des 0 et un 1) mais il s'agit d'une convention personnelle que je ne préfère pas imposer. 101 | Pooling | Pooling | Plusieurs traductions envisagées comme agrégation, agglomération, ou coalescence. Garder le terme en anglais est plus simple (un « max-agrégation » n'est pas très élégant par exemple). 102 | 103 | 104 | En vous souhaitant un bon visionnage ou une bonne lecture ! 
105 | -------------------------------------------------------------------------------- /docs/fr/week01/01.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.01 4 | title: Semaine 1 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 14 | ## Cours magistral 15 | Un peu d'histoire sur l'apprentissage supervisé peut être trouvée [ici](https://atcold.github.io/NYU-DLSP20/fr/week01/01-1/), tandis que la descente de gradient peut être trouvée [ici](https://atcold.github.io/NYU-DLSP20/fr/week02/02-1/). 16 | 17 | 22 | ## Travaux dirigés 23 | Le résumé de cette semaine et de la suivante peut être trouvé [ici](https://atcold.github.io/NYU-DLSP20/fr/week01/01-3/). 24 | -------------------------------------------------------------------------------- /docs/fr/week02/02-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.02-3 4 | title: Motivation des problèmes, algèbre linéaire et visualisation 5 | lecturer: Alfredo Canziani 6 | authors: Rajashekar Vasantha 7 | date: 04 Feb 2021 8 | typora-root-url: 02-3 9 | translation-date: 19 Jun 2021 10 | translator: Loïck Bourdois 11 | --- 12 | 13 | 14 | 19 | ## Ressources 20 | 21 | Nous vous invitons à suivre Alfredo Canziani [sur Twitter @alfcnz](https://twitter.com/alfcnz). Vous trouverez sur son compte des vidéos et des manuels contenant des détails pertinents sur l'algèbre linéaire et la décomposition en valeurs singulières (SVD). Ce contenu est trouvable en effectuant une recherche (en anglais) sur le Twitter d'Alfredo, en tapant par exemple `linear algebra (from:alfcnz)` dans la barre de recherche. 22 | 23 | 24 | 60 | 61 | 62 | ## [Réseaux neuronaux : rotation et écrasement](https://youtu.be/0TdAmZUMj2k) 63 | Un réseau de neurones traditionnel est une collection alternée de deux blocs : les blocs linéaires et les blocs non linéaires. 64 | Voici le schéma fonctionnel d'un réseau de neurones traditionnel. 65 |
66 |
67 |
68 | 69 | Figure 1 : Schéma d'un réseau de neurones traditionnel 70 |
71 |
72 | Les blocs linéaires (rotations pour simplifier) sont donnés par : 73 | 74 | $$ 75 | \vect{s}_{k+1} = \mW_k z_k 76 | $$ 77 | 78 | Et les blocs non linéaires (fonctions d'écrasement pour une compréhension intuitive) sont donnés par : 79 | 80 | 81 | $$ \vect{z}_k = h(\vect{s}_k) $$ 82 | 83 | Dans le schéma et les équations ci-dessus, $$\vx \in \mathbb{R}^n$$ représente le vecteur d'entrée. 84 | $$\mW_k \in \mathbb{R}^{n_{k} \times n_{k-1}}$$ représente la matrice d'une transformation affine correspondant au $$k^{\text{ème}}$$ bloc et est décrite plus en détail ci-dessous. 85 | La fonction $h$ est appelée fonction d'activation et cette fonction forme le bloc non linéaire du réseau neuronal. 86 | Sigmoïde, ReLU et tanh sont quelques-unes des fonctions d'activation les plus courantes et nous les examinerons dans les parties suivantes de cette section. 87 | Après des applications alternées des blocs linéaire et non linéaire, le réseau ci-dessus produit un vecteur de sortie $$\vect{s}_k \in \mathbb{R}^{n_{k-1}}$$. 88 | 89 | Examinons d'abord le bloc linéaire pour comprendre les transformations affines. Comme exemple considérons la classification d'images. 90 | Supposons que nous prenions une photo avec un appareil photo de $1$ mégapixel. 91 | Cette image aura environ $1 000$ pixels verticalement et $1 000$ pixels horizontalement, et chaque pixel aura trois dimensions de couleur pour le rouge, le vert et le bleu (RVB). 92 | Chaque image peut donc être considérée comme un point dans un espace à $3$ millions de dimensions. 93 | Avec une telle dimensionnalité, de nombreuses images intéressantes que nous pourrions vouloir classer, comme un chien *vs* un chat, se trouveront essentiellement dans la même région de l'espace. 94 | 95 | Afin de séparer efficacement ces images, nous envisageons des moyens de transformer les données afin de déplacer les points. 96 | Rappelons que dans l'espace bidimensionnel, une transformation linéaire équivaut à une multiplication de matrice. 97 | Par exemple, les transformations suivantes peuvent être obtenues en changeant les caractéristiques de la matrice : 98 | 99 | - Rotation : lorsque la matrice est orthonormée. 100 | - Mise à l'échelle (« scalabilité ») : lorsque la matrice est diagonale. 101 | - Réflexion : lorsque le déterminant est négatif. 102 | - *Shearing*. 103 | - Translation. 104 | 105 | A noter que la translation seule n'est pas linéaire puisque $0$ ne sera pas toujours mis en correspondance avec 0, mais c'est une transformation affine. 106 | Pour revenir à notre exemple d'image, nous pouvons transformer les points de données en les translatant de manière à ce qu'ils soient regroupés autour de 0 et en les mettant à l'échelle à l'aide d'une matrice diagonale de manière à effectuer un « zoom avant » sur cette région. 107 | Enfin, nous pouvons effectuer une classification en trouvant des lignes dans l'espace qui séparent les différents points dans leurs classes respectives. 108 | En d'autres termes, l'idée est d'utiliser des transformations linéaires et non linéaires pour représenter les points dans un espace tel qu'ils soient linéairement séparables. 109 | Cette idée sera rendue plus concrète dans les sections suivantes. 110 | 111 | Dans la suite, nous visualisons comment un réseau neuronal sépare des points et quelques transformations linéaires et non linéaires. 112 | Ce contenu est essentiellement le même que celui de l'année dernière, ainsi nous vous invitons à vous rendre [ici](https://atcold.github.io/NYU-DLSP20/fr/week01/01-3/) pour le consulter. 
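Pour fixer les idées, voici une esquisse minimale en PyTorch (purement illustrative et hypothétique, indépendante des *notebooks* du cours) d'un réseau alternant blocs linéaires et blocs non linéaires : chaque `nn.Linear` joue le rôle d'une matrice $\mW_k$ et `nn.Tanh` celui de la fonction d'écrasement $h$. Les dimensions choisies sont arbitraires.

```python
import torch
from torch import nn

# Esquisse hypothétique : alternance de blocs linéaires (transformations affines)
# et de blocs non linéaires (fonctions d'écrasement), comme décrit ci-dessus.
model = nn.Sequential(
    nn.Linear(2, 100),  # bloc linéaire : transformation affine W_1
    nn.Tanh(),          # bloc non linéaire : écrasement h
    nn.Linear(100, 2),  # bloc linéaire : transformation affine W_2
)

x = torch.randn(16, 2)  # 16 points d'entrée en dimension 2
y = model(x)            # sortie après rotations et écrasements successifs
print(y.shape)          # torch.Size([16, 2])
```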
113 | -------------------------------------------------------------------------------- /docs/fr/week02/02.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.02 4 | title: Semaine 2 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 14 | 15 | ## Cours magistral 16 | Similaire à [ceci](https://atcold.github.io/NYU-DLSP20/fr/week11/11-1/) et [ceci](https://atcold.github.io/NYU-DLSP20/fr/week11/11-2/) et peut-être plus. 17 | 18 | 19 | 24 | 25 | ## Travaux dirigés 26 | Nous discutons de la motivation d'appliquer des transformations à des points de données visualisés dans l'espace. Nous parlons d'algèbre linéaire et de l'application de transformations linéaires et non linéaires. Nous abordons l'utilisation de la visualisation pour comprendre la fonction et les effets de ces transformations et parcourons des exemples dans un *notebook* Jupyter. Nous concluons par une discussion sur les fonctions représentées par des réseaux neuronaux. 27 | -------------------------------------------------------------------------------- /docs/fr/week03/03-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.03-3 4 | title: Classification d'une spirale 5 | lecturer: Alfredo Canziani 6 | authors: Wenhao Li 7 | date: 6 May 2021 8 | typora-root-url: 03-3 9 | translation-date: 19 Jun 2021 10 | translator: Loïck Bourdois 11 | --- 12 | 13 | 17 | ## [Typora](https://typora.io/) 18 | *Typora* est un outil utile pour écrire en markdown et ajouter des formules en *LaTeX*. Il est pratique pour rédiger des articles, des devoirs et générer des fichiers pdf. 19 | 20 | 33 | ## [Notion](https://www.notion.so/) 34 |
35 |
36 |
37 | 38 | Avec *Notion* vous pouvez placer en un endroit toutes vos affaires préférées. Cela inclut, sans s'y limiter, les recettes, la musique, les livres, les notes. Tout en un seul endroit, simple et puissant. 39 | 40 | Lorsque vous trouvez un article utile sur l'apprentissage profond, vous pouvez l'y stocker pour le consulter ultérieurement. Vous pouvez trouver [plus d'informations](https://www.notion.so/Intro-to-databases-fd8cd2d212f74c50954c11086d85997e) sur la façon d'utiliser la base de données. 41 | 42 | Vous devez d'abord créer une base de données via *Workspace* => *Add a new page*. Dans cette page, choisissez */table* => *Table - Full Page*. 43 | En plus de remplir les informations relatives au document, nous voulons généralement couvrir le traditionnel « Qui ? Quoi ? Où ? Pourquoi ? Comment ? Quand ? » dans le résumé. 44 | 45 | Voici un [exemple](https://www.notion.so/When-to-use-parametric-models-in-reinforcement-learning-d4c5e586677e49338a41b663231c0633) (en anglais) de la façon d'organiser votre résumé. 46 | 47 | 48 | 77 | ## [Diagrams.net](https://app.diagrams.net/) 78 | 79 | Diagrams.net est un excellent outil pour dessiner des diagrammes de réseaux neuronaux. Nous allons introduire quelques règles pour rendre nos diagrammes plus cohérents avec ceux du cours. 80 | 81 |
82 |
83 |
84 | 85 | Le fond en niveaux de gris signifie qu'il s'agit d'une observation donc qu'il s'agit de points de données d'un jeu de données fourni. 86 | Vous pouvez vérifier l'entrée et les étiquettes en allant dans le répertoire du jeu de données si vous le souhaitez. 87 | 88 |
89 |
90 |
91 | 92 | Nous utilisons *Delay* pour désigner l'encodeur (par exemple, un réseau neuronal). 93 | 94 |
95 |
96 |
97 | 98 | Dans cet exemple, $\vx$ et $\vy$ sont des observations. 99 | 100 | Dans la moitié ci-dessus, nous donnons les $\vx$ à un encodeur pour obtenir une prédiction $\bar {\vy}$. C'est ce qu'on appelle la propagation vers l'avant. 101 | 102 | Dans la moitié inférieure, nous voulons obtenir la prédiction $\bar{\vx}$ étant donné l'observation $\vy$. 103 | Nous continuons à faire une descente de gradient pour que la sortie du réseau soit aussi proche que possible de $\vy$. C'est ce qu'on appelle « l'inférence amortissante ». 104 | 105 | Habituellement, nous utilisons la rétropropagation pour calculer le gradient, puis nous appliquons la descente de gradient avec ces valeurs calculées pour entraîner le modèle. 106 | Cet exemple montre que la rétropropagation n'est PAS uniquement utilisée pendant l'entraînement. La rétropropagation peut également être utilisée pour l'inférence. 107 | 108 | 109 | 113 | ## Classification d'une spirale 114 | Le contenu suivant est essentiellement le même que celui de l'année dernière, rendez-vous donc [ici](https://atcold.github.io/NYU-DLSP20/fr/week02/02-3/) pour le consulter. 115 | -------------------------------------------------------------------------------- /docs/fr/week03/03.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.03 4 | title: Semaine 3 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 15 | ## Cours magistral 16 | Les différentes parties peuvent être trouvées [ici](https://atcold.github.io/NYU-DLSP20/fr/week03/03-1/) et [ici](https://atcold.github.io/NYU-DLSP20/fr/week06/06-2/). 17 | 18 | 19 | 23 | ## Travaux dirigés 24 | Nous présentons comment dessiner des schémas de réseaux profonds de manière pratique en utilisant **diagrams.net**. Nous montrons ensuite les différents effets de l'utilisation de la seule transformation linéaire, et l'effet de la combinaison de la transformation linéaire et non linéaire sur la classification en spirale. Enfin nous voyons les principes mathématiques qui sous-tendent les réseaux neuronaux, notamment le théorème de dérivation des fonctions composées, la rétropropagation et la descente de gradient. 25 | -------------------------------------------------------------------------------- /docs/fr/week04/04.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.04 4 | title: Semaine 4 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 14 | ## Cours magistral 15 | Similaire à [l'édition de l'année dernière](https://atcold.github.io/NYU-DLSP20/fr/week06/06-1/). 16 | 17 | 18 | 23 | ## Travaux dirigés A & B 24 | Similaires à l'édition de l'année dernière : pour les [ConvNets](https://atcold.github.io/NYU-DLSP20/fr/week03/03-3/), pour les [RNNs](https://atcold.github.io/NYU-DLSP20/fr/week06/06-3/). 25 | -------------------------------------------------------------------------------- /docs/fr/week05/05.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.05 4 | title: Semaine 5 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 14 | ## Cours magistral 15 | Similaire à [celui de l'année dernière](https://atcold.github.io/NYU-DLSP20/fr/week07/07-1/) mais un peu différent. 16 | 17 | 18 | 23 | ## Travaux dirigés 24 | Comme [l'année dernière](https://atcold.github.io/NYU-DLSP20/fr/week15/15-1/). 
25 | -------------------------------------------------------------------------------- /docs/fr/week06/06.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.06 4 | title: Semaine 6 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 15 | ## Cours magistral 16 | Similaire à [ceci](https://atcold.github.io/NYU-DLSP20/fr/week14/14-1/) et [ceci](https://atcold.github.io/NYU-DLSP20/fr/week14/14-2/). 17 | 18 | 23 | ## Travaux dirigés 24 | Comme [l'année dernière](https://atcold.github.io/NYU-DLSP20/fr/week15/15-2/). 25 | -------------------------------------------------------------------------------- /docs/fr/week07/07.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.07 4 | title: Semaine 7 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 24 | ## Travaux dirigés 25 | Nous commençons par une application des auto-encodeurs : DALL-E. Nous discutons ensuite des auto-encodeurs (en termes de modèles à base d’énergie) et de leurs cas d'utilisation. Puis nous discutons des coûts de reconstruction et des fonctions de perte à utiliser. Enfin, nous abordons un type particulier d'auto-encodeur à savoir l'auto-encodeur débruiteur. 26 | -------------------------------------------------------------------------------- /docs/fr/week08/08.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.08 4 | title: Semaine 8 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 24 | ## Travaux dirigés 25 | Dans cette section, nous présentons quelques modèles génératifs, dont l'auto-encodeur débruiteur, l'auto-encodeur contractif et l'auto-encodeur variationnel. Nous avons comparé les fonctionnalités et les avantages des auto-encodeurs variationnels par rapport aux auto-encodeurs de base. Nous avons exploré en détail la fonction objective de l'auto-encodeur variationnel en comprenant comment il impose une certaine structure dans l'espace latent. 26 | -------------------------------------------------------------------------------- /docs/fr/week09/09.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.09 4 | title: Semaine 9 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 24 | ## Travaux dirigés 25 | Dans cette section nous couvrons l'implémentation de modèles génératifs, à savoir les auto-encodeurs sous-complets, les auto-encodeurs débruieurs, les auto-encodeurs variationnels et les réseaux antagonistes génératifs. Nous analysons ces modèles du point de vue du cadre des modèles à base d’énergie (EBMs). Ce faisant, nous nous rendons compte que ces modèles génératifs peuvent être considérés comme des extensions des EBMs et qu'ils diffèrent les uns des autres par de subtils ajustements architecturaux. 
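À titre d'illustration de la fonction objective de l'auto-encodeur variationnel évoquée dans ces résumés, voici une esquisse hypothétique (dimensions et détails arbitraires, non tirés des *notebooks* du cours) combinant un terme de reconstruction et une divergence KL qui structure l'espace latent :

```python
import torch
from torch import nn
import torch.nn.functional as F

# Esquisse hypothétique d'un auto-encodeur variationnel minimal : l'encodeur
# produit une moyenne et une log-variance, z est échantillonné par l'astuce de
# reparamétrisation, et la perte = reconstruction + divergence KL.
class TinyVAE(nn.Module):
    def __init__(self, d_in=784, d_z=16):
        super().__init__()
        self.enc = nn.Linear(d_in, 2 * d_z)
        self.dec = nn.Linear(d_z, d_in)

    def forward(self, x):
        mu, logvar = self.enc(x).chunk(2, dim=1)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()  # reparamétrisation
        return self.dec(z), mu, logvar

x = torch.rand(32, 784)                      # mini-batch factice
x_tilde, mu, logvar = TinyVAE()(x)
recon = F.mse_loss(x_tilde, x, reduction="sum") / x.size(0)
kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / x.size(0)
loss = recon + kl                            # le terme KL structure l'espace latent
print(loss.item())
```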
26 | -------------------------------------------------------------------------------- /docs/fr/week10/10-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.10-3 4 | lecturer: Alfredo Canziani 5 | title: Architecture encodeur-prédicteur-décodeur d'un transformer 6 | authors: Rahul Ahuja, jingshuai jiang 7 | date: 15 Apr 2021 8 | typora-root-url: 10-3 9 | translation-date: 20 Jun 2021 10 | translator: Loïck Bourdois 11 | --- 12 | 13 | 14 | 19 | ## Le *transformer* 20 | 21 | Avant d'élaborer l'architecture encodeur-prédicteur-décodeur, nous allons passer en revue deux modèles que nous avons déjà vus. 22 | 23 | 24 | 36 | 37 | ### Architecture d'un EBM conditionnel à variable latente 38 | 39 | Dans l'architecture d'un EBM conditionnel à variable latente, nous avons $\vx$ la variable conditionnelle qui va dans un prédicteur. 40 | Nous avons $\vy$ qui est la valeur cible. Les modules de décodage produisent $\vytilde$ lorsqu'on leur donne une variable latente $\vz$ et la sortie du prédicteur. 41 | $\red{E}$ est la fonction d'énergie qui mesure l'énergie entre $\vytilde$ et $\vy$, et que l'on cherche à minimiser. 42 | 43 | 44 |
45 |
46 | Figure 1 : Architecture d'un EBM conditionnel à variable latente 47 |
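Pour rendre ce schéma plus concret, voici une esquisse purement hypothétique (modules et dimensions arbitraires) où l'énergie est une distance quadratique entre $\vy$ et $\vytilde$, et où l'inférence la minimise par descente de gradient sur la variable latente :

```python
import torch

# Esquisse hypothétique : prédicteur et décodeur réduits à des couches linéaires,
# énergie = distance quadratique entre y et y_tilde, minimisée par rapport à z.
pred = torch.nn.Linear(3, 4)      # prédicteur (substitut arbitraire)
dec  = torch.nn.Linear(4 + 2, 4)  # décodeur : reçoit la sortie du prédicteur et z

x = torch.randn(1, 3)             # variable conditionnelle x
y = torch.randn(1, 4)             # valeur cible y
z = torch.zeros(1, 2, requires_grad=True)   # variable latente z

opt = torch.optim.SGD([z], lr=0.1)
for _ in range(50):               # inférence : descente de gradient sur z
    y_tilde = dec(torch.cat([pred(x), z], dim=1))
    energy = ((y_tilde - y) ** 2).sum()
    opt.zero_grad()
    energy.backward()
    opt.step()
print(energy.item())              # énergie après minimisation
```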
48 | 49 | 50 | 61 | 62 | ### Architecture d'un auto-encodeur 63 | 64 | Dans l'architecture d'un auto-encodeur, nous avons observé qu'il n'y a pas d'entrée conditionnelle mais seulement une variable cible. 65 | L'architecture entière essaie d'apprendre la structure de ces variables cibles. 66 | La valeur cible $\vy$ est introduite dans un module encodeur qui la transforme en un espace de représentation caché, ne laissant passer que les informations les plus importantes. 67 | Et le décodeur fera en sorte que ces variables reviennent à l'espace cible original avec une valeur $\vytilde$. 68 | La fonction de coût va essayer de minimiser la distance entre $\vytilde$ et $\vy$. 69 | 70 | 71 |
72 |
73 | Figure 2 : Architecture d'un autoencodeur de base composé de modules encodeur et décodeur 74 |
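Une esquisse minimale et hypothétique (dimensions arbitraires, indépendante des *notebooks* du cours) de ce schéma encodeur-décodeur :

```python
import torch
from torch import nn

# Esquisse hypothétique : l'encodeur comprime y vers une représentation cachée,
# le décodeur produit y_tilde, le coût mesure la distance entre y_tilde et y.
encoder = nn.Sequential(nn.Linear(784, 32), nn.ReLU())
decoder = nn.Sequential(nn.Linear(32, 784), nn.Sigmoid())

y = torch.rand(8, 784)              # variables cibles (par exemple des images aplaties)
y_tilde = decoder(encoder(y))       # reconstruction
loss = ((y_tilde - y) ** 2).mean()  # coût de reconstruction à minimiser
print(loss.item())
```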
75 | 76 | 77 | 96 | 97 | ### Architecture de l'encodeur-prédicteur-décodeur 98 | 99 |
100 |
101 | Figure 3 : L'architecture du transformer avec un module de retard unitaire 102 |
103 | 104 | 105 | Dans un *transformer*, $\vy$ (phrase cible) est un signal temporel discret : il possède une représentation discrète selon un indice temporel. 106 | Le $\vy$ est introduit dans un module de retard unitaire suivi d'un encodeur. Le retard unitaire transforme ici $\vy[j] \mapsto \vy[j-1]$. 107 | La seule différence avec l'auto-encodeur ici est cette variable retardée. 108 | Nous pouvons donc utiliser cette structure dans le modèle de langage pour produire le futur lorsqu'on nous donne le passé. 109 | 110 |
111 |
112 | Figure 4 : Un module de retard unitaire transforme $\vy[j] \mapsto \vy[j-1]$ 113 |
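Concrètement, le retard unitaire peut s'esquisser ainsi (exemple hypothétique, indépendant du code du cours ; le symbole de remplissage en tête est arbitraire) :

```python
import torch

def unit_delay(y, pad_idx=0):
    """Retard unitaire : transforme y[j] en y[j-1], un symbole de remplissage
    (hypothétique) occupant la première position."""
    pad = torch.full_like(y[:, :1], pad_idx)
    return torch.cat([pad, y[:, :-1]], dim=1)

y = torch.tensor([[11, 12, 13, 14]])   # indices des mots de la phrase cible
print(unit_delay(y))                   # tensor([[ 0, 11, 12, 13]])
```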
114 | 115 | Le signal observé, $\vx$ (phrase source) passe également par un encodeur. 116 | La sortie de l'encodeur et de l'encodeur retardé est introduite dans le prédicteur qui donne une représentation cachée $\vh$. 117 | Ceci est très similaire à l'auto-encodeur débruiteur car le module de retard agit comme un bruit dans ce cas. 118 | $\vx$ fait de cette architecture entière un auto-encodeur débruiteur conditionnel retardé. 119 | 120 | 121 | 125 | 126 | ### Module encodeur 127 | Vous pouvez voir l'explication détaillée de ce module dans les notes de l'année dernière disponibles [ici](https://atcold.github.io/NYU-DLSP20/fr/week12/12-3/). 128 | 129 | 130 | 141 | 142 | ### Module prédicteur 143 | 144 | Le module prédicteur du *transformer* suit une procédure similaire à celle de l'encodeur. 145 | Cependant, il y a un sous-bloc supplémentaire (c'est-à-dire l'attention croisée) à prendre en compte. 146 | De plus, la sortie des modules encodeurs agit comme les entrées de ce module. 147 | 148 | 149 |
150 |
151 | Figure 5 : Le module prédicteur composé d'un bloc d'attention croisée 152 |
153 | 154 | 155 | 159 | 160 | ### Attention croisée 161 | Vous pouvez consulter l'explication détaillée de l'attention croisée dans les notes de l'année dernière disponibles [ici](https://atcold.github.io/NYU-DLSP20/fr/week12/12-3/). 162 | 163 | 174 | 175 | 176 | ### Module décodeur 177 | 178 | Contrairement à ce que les auteurs du papier du *transformer* définissent, le module décodeur est composé de blocs `1D-convolution` et `Add, Norm`. 179 | La sortie du module prédicteur est introduite dans le module décodeur et la sortie du module décodeur est la phrase prédite. 180 | On peut l'entraîner en fournissant la séquence cible retardée. 181 | 182 | 183 |
184 |
185 | Figure 6 : La notation correcte des modules encodeur, prédicteur et décodeur dans un transformer 186 |
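Pour résumer le flux des données décrit dans cette section, voici une esquisse purement hypothétique : les vrais modules sont des blocs de type *transformer* (attention, attention croisée, `Add, Norm`), mais de simples couches linéaires suffisent ici pour montrer le câblage encodeur-prédicteur-décodeur avec la cible retardée.

```python
import torch
from torch import nn

d = 8                                  # dimension (arbitraire) des représentations

enc_x = nn.Linear(d, d)                # encodeur de la phrase source x
enc_y = nn.Linear(d, d)                # encodeur de la cible retardée y[j-1]
pred  = nn.Linear(2 * d, d)            # prédicteur : combine les deux encodages
dec   = nn.Linear(d, d)                # décodeur : renvoie vers l'espace cible

x = torch.randn(1, 5, d)               # phrase source (longueur 5, ici continue)
y = torch.randn(1, 5, d)               # phrase cible (même longueur pour simplifier)
y_delayed = torch.cat([torch.zeros(1, 1, d), y[:, :-1]], dim=1)  # retard unitaire

h = torch.tanh(pred(torch.cat([enc_x(x), enc_y(y_delayed)], dim=-1)))  # représentation cachée h
y_tilde = dec(h)                       # phrase prédite
print(y_tilde.shape)                   # torch.Size([1, 5, 8])
```

On retrouve ainsi l'auto-encodeur débruiteur conditionnel retardé décrit plus haut : l'entraînement cherche à rapprocher $\vytilde$ de $\vy$.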
187 | 188 | -------------------------------------------------------------------------------- /docs/fr/week10/10.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.10 4 | title: Semaine 10 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 14 | ## Cours magistral partie A 15 | 16 | Une brève introduction à l'apprentissage autosupervisé et aux tâches de prétexte ainsi qu'une discussion à propos des solutions triviales associées. Puis une catégorisation des méthodes autosupervisées récentes avec une introduction à l'apprentissage contrastif et à la fonction de perte utilisée. Nous poursuivons avec de brèves présentations de PIRL, SimCLR et MoCo suivies de SwAV qui est une méthode basée sur du *clustering*. Le pré-entraînement sur les données ImageNet et non-ImageNet est également discuté à la fin. 17 | 18 | 21 | ## Cours magistral partie B 22 | 23 | 27 | ## Travaux dirigés 28 | 29 | Nous présentons l'attention en nous concentrant sur l'auto-attention et ses représentations des entrées dans la couche cachée. Ensuite, nous introduisons le paradigme clé-valeur et discutons de la manière de représenter les requêtes, les clés et les valeurs comme des rotations d'une entrée. Enfin, nous utilisons l'attention pour interpréter l'architecture du *transformer*. Pour cela nous passons par le biais d'un *transformer* de base dans la perspective des EBMs et en comparant le paradigme encodeur-prédicteur-décodeur aux architectures séquentielles. 30 | -------------------------------------------------------------------------------- /docs/fr/week11/11.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.11 4 | title: Semaine 11 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 14 | ## Cours magistral partie A 15 | Nous présentons une introduction au problème de la reconnaissance de la parole à l'aide de modèles neuronaux en mettant l'accent sur la perte CTC (*Connectionist Temporal Classification*) pour l'entraînement et l'inférence lorsque les séquences d'entrée et de sortie sont de longueurs différentes. 16 | 17 | 21 | ## Cours magistral partie B 22 | Nous discutons de l'utilisation de la recherche en faisceau pendant l'inférence ainsi que de la façon dont cette procédure peut être modélisée au moment de l'entraînement d'un *Graph Transformer Network* (GTN). Les GTNs sont essentiellement des « accepteur d'état fini pondéré » (WFSA pour « Weighted Finite State Acceptor ») avec différenciation automatique permettant d'encoder des a priori dans un graphe. Il existe différents types d'états finis pondérés et opérations, notamment l'union, l'étoile de Kleene, l'intersection, la composition et le score *forward*. La fonction de perte est généralement la différence entre deux fonctions. Nous pouvons facilement implémenter ces réseaux en utilisant la bibliothèque *gtn*. 23 | 24 | 27 | ## Travaux dirigés 28 | -------------------------------------------------------------------------------- /docs/fr/week12/12.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.12 4 | title: Semaine 12 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 14 | ## Cours magistral partie A 15 | Cette conférence introduit le sujet de la traduction automatique neuronale à l'aide d'un exemple. 
Nous abordons la modélisation du langage, l'architecture du modèle et l'inférence de la traduction automatique neuronale. En outre, nous discutons des problèmes rencontrés en raison des langues et de la nécessité d'une traduction automatique à faibles ressources. Nous examinons également une étude de cas, les différentes étapes du cycle de recherche et la manière dont elles peuvent être utilisées pour la traduction automatique. 16 | 17 | 21 | ## Cours magistral partie B 22 | La première partie de cette partie B se concentre sur la compréhension de la traduction automatique à faibles ressources et la seconde partie discute des incompatibilités potentielles entre les domaines de l'apprentissage automatique et de la traduction automatique. 23 | 24 | 25 | 29 | ## Travaux dirigés 30 | Nous introduisons la fonction de transition d'état et la manière de modéliser un système physique avec état et contrôle. Nous avons discuté de la manière d'obtenir un contrôle optimal par inférence en utilisant l'algorithme de Kelley-Bryson qui utilise la rétropropagation dans le temps et la descente de gradient. Enfin, nous voyons dans un notebook diverses fonctions de coût et la planification d'une trajectoire pour guider un tricycle afin qu'il atteigne la position souhaitée avec la vitesse spécifiée. 31 | -------------------------------------------------------------------------------- /docs/fr/week13/13.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.13 4 | title: Semaine 13 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 23 | ## Travaux dirigés 24 | -------------------------------------------------------------------------------- /docs/fr/week14/14.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.14 4 | title: Semaine 14 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 23 | ## Travaux dirigés 24 | -------------------------------------------------------------------------------- /docs/fr/week15/15.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.15 4 | title: Semaine 15 5 | translation-date: 31 July 2022 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 17 | 18 | 19 | ## Cours magistral partie A 20 | 21 | Dans cette section, nous abordons l'apprentissage de représentations visuelles en nous concentrant sur l'apprentissage autosupervisé. Les méthodes applicables peuvent être classées en modèles génératifs, tâches de prétexte et méthodes d’enchâssements joints. Dans les modèles génératifs, on entraîne le modèle à reconstruire l'image originale à partir de l'image bruitée. Dans les tâches de prétextes, on entraîne le modèle à trouver un moyen intelligent de générer des pseudo-étiquettes. Les méthodes d’enchâssements joints tentent de rendre leur *backbone* robuste à certaines distorsions et invariant à l'augmentation des données. Les méthodes d'entraînement des JEMs peuvent être classées en quatre types : méthodes contrastives, méthodes non-contrastives, méthodes de *clustering* et les « autres méthodes ». Nous concluons en discutant des méthodes contrastives qui rapprochent les paires positives et éloignent les paires négatives. 
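Pour illustrer ce dernier point, voici une esquisse hypothétique (non tirée du cours) d'une perte contrastive de type InfoNCE : au sein d'un batch, la vue correspondante est la paire positive et les autres éléments servent de paires négatives.

```python
import torch
import torch.nn.functional as F

def info_nce(z1, z2, temperature=0.1):
    """Perte contrastive hypothétique : rapproche les paires positives (diagonale)
    et éloigne les paires négatives (le reste du batch)."""
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / temperature   # similarités cosinus, mises à l'échelle
    labels = torch.arange(z1.size(0))    # la positive de l'exemple i est z2[i]
    return F.cross_entropy(logits, labels)

z1 = torch.randn(8, 128)   # enchâssements d'une première vue augmentée
z2 = torch.randn(8, 128)   # enchâssements de la seconde vue des mêmes images
print(info_nce(z1, z2).item())
```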
22 | 23 | 24 | 25 | 30 | 31 | 32 | ## Cours magistral partie B 33 | 34 | Dans cette section, nous abordons les méthodes non-contrastives qui sont basées sur la théorie de l'information et ne nécessitent pas d'architectures ou de techniques d'ingénierie particulières. Ensuite, nous voyons les méthodes de *clustering* qui empêchent une solution triviale en quantifiant l'espace d’enchâssement. Enfin, nous parlons d’« autres méthodes » qui sont locales et ne créent pas de problème pour l’entraînement distribué contrairement aux méthodes précédentes. Nous concluons en suggérant diverses améliorations pour les JEMs par rapport à l’augmentation de données et l’architecture des réseaux. 35 | -------------------------------------------------------------------------------- /docs/images/week02/02-3/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week02/02-3/figure1.png -------------------------------------------------------------------------------- /docs/images/week03/03-3/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week03/03-3/figure1.png -------------------------------------------------------------------------------- /docs/images/week03/03-3/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week03/03-3/figure10.png -------------------------------------------------------------------------------- /docs/images/week03/03-3/figure7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week03/03-3/figure7.png -------------------------------------------------------------------------------- /docs/images/week03/03-3/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week03/03-3/figure9.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/Autoencoder_Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/Autoencoder_Arch.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/DAEOutput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/DAEOutput.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/DALL-E.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/DALL-E.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/DenoisingAutoEncoder.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/DenoisingAutoEncoder.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/def.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/def.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/AE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/AE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/DAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/DAE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/VAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/VAE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/VAE_DAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/VAE_DAE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/VAEloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/VAEloss.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/bubbles_z.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/bubbles_z.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/contractiveAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/contractiveAE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/target_prop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/target_prop.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/10_autoencoder_cell_12_output_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/10_autoencoder_cell_12_output_2.png 
-------------------------------------------------------------------------------- /docs/images/week09/09-3/10_autoencoder_cell_12_output_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/10_autoencoder_cell_12_output_3.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/dae_noise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/dae_noise.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/dae_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/dae_output.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_10_cluster_samples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_10_cluster_samples.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_11_gan_vs_dae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_11_gan_vs_dae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_12_gan_vs_vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_12_gan_vs_vae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_1_ae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_1_ae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_2_under_over.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_2_under_over.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_3_ae_outputs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_3_ae_outputs.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_4_autoencoder_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_4_autoencoder_kernel.png 
-------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_5_dae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_5_dae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_6_dae_kernels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_6_dae_kernels.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_7_dae_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_7_dae_comparison.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_8_merged_imgs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_8_merged_imgs.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_9_vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_9_vae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/noise_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/noise_input.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/ns_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/ns_output.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/telea_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/telea_output.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/CL_objective.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/CL_objective.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/cl_loss_fn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/cl_loss_fn.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/clustering.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/clustering.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/con_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/con_learning.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/contrastive-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/contrastive-learning.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/equipartition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/equipartition.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/moco.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/moco.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/non-imagenet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/non-imagenet.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/pirl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/pirl.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/semantic_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/semantic_features.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/soft-assignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/soft-assignment.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/ssl_trivial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/ssl_trivial.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/swav.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/swav.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/avid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/avid.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/byol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/byol.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/cma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/cma.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/figure_1.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/seer_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/seer_1.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/seer_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/seer_2.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/simsiam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/simsiam.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/autoencoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/autoencoder.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/decoder.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/ebm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/ebm.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/predictor.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/predictor.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/transformer.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/unit_delay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/unit_delay.png -------------------------------------------------------------------------------- /docs/images/week11/11-1/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-1/figure1.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/Screenshot (85).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/Screenshot (85).png -------------------------------------------------------------------------------- /docs/images/week11/11-2/bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/bs1.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/bs2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/bs2.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/bs3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/bs3.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure10.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure11.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure12.png 
-------------------------------------------------------------------------------- /docs/images/week11/11-2/figure13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure13.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure14.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure5.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure6.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure7.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure8.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure9.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/greedy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/greedy.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure1.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure10.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure11.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure11.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure12.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure13.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure14.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure15.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure16.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure17.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure2.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure3.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure4.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure5.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure6.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure6.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure7.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure8.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure9.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure10.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure10_1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure11.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure12.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure13.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure14.png 
-------------------------------------------------------------------------------- /docs/images/week12/12-2/figure15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure15.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure16.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure17.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure18.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure19.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure2_1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure2_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure2_2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure3.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure3_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure3_1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure3_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure3_2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure4.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure4_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure4_2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure4_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure4_3.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure5.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure6.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure7.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure8.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure8_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure8_1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure8_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure8_2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure9.png -------------------------------------------------------------------------------- 
/docs/images/week12/12-3/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure1.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure10.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure11.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure12.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure13.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure14.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure15.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure16.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure2.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure9.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig0.png 
-------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig1.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig3.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig4.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig7.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig9.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig1.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig2.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig3.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig4.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig5.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig6.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig7.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: DEEP LEARNING 4 | author: Alfredo Canziani 5 | lang-ref: home 6 | --- 7 | 8 | **DS-GA 1008 · SPRING 2021 · [NYU CENTER FOR DATA SCIENCE](http://cds.nyu.edu/)** 9 | 10 | | INSTRUCTORS | Yann LeCun & Alfredo Canziani | 11 | | LECTURES | Wednesdays 9:30 – 11:30, Zoom | 12 | | PRACTICA | Tuesdays 9:30 – 10:30, Zoom | 13 | | FORUM | [r/NYU_DeepLearning](https://www.reddit.com/r/NYU_DeepLearning/) | 14 | | DISCORD | [NYU DL](https://discord.gg/CthuqsX8Pb) | 15 | | MATERIAL | [2021 repo](https://github.com/Atcold/NYU-DLSP21) | 16 | 17 | 18 | ## 2021 edition disclaimer 19 | 20 | Check the repo's [`README.md`](https://github.com/Atcold/NYU-DLSP21/blob/master/README.md) and learn about: 21 | 22 | - The new organisation of the content 23 | - The intellectual dilemma of the semester's second half 24 | - This semester's repository 25 | - Previous releases 26 | 27 | 28 | ## Lectures 29 | 30 | Most of the lectures, labs, and notebooks are similar to those of the previous edition; nevertheless, some are brand new. 31 | I will try to make clear which is which. 32 | 33 | **Legend**: 🖥 slides, 📝 notes, 📓 Jupyter notebook, 🎥 YouTube video.
34 | 35 | 36 | ### Theme 1: Introduction 37 | 38 | * History and resources [🎥](https://youtu.be/mTtDfKgLm54) [🖥](https://drive.google.com/file/d/1vVNUye-1JNJnqP4A0704sjtF7gs_MpCI/) 39 | * Gradient descent and the backpropagation algorithm [🎥](https://youtu.be/nTlCqaL7fCY) [🖥](https://drive.google.com/file/d/1tYPYGYFDQw5IBs9wx4egCcBTTX2h9d9g/) 40 | * [Neural nets inference](https://atcold.github.io/NYU-DLSP21/en/week02/02-3/) [🎥](https://youtu.be/0TdAmZUMj2k) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/02-space_stretching.ipynb) 41 | * Modules and architectures [🎥](https://youtu.be/IYQN3i7dJIQ) [🖥](https://drive.google.com/file/d/1IaDI6BJ6g4SJbJLtNjVE_miWRzBH1-MX/) 42 | * [Neural nets training](https://atcold.github.io/NYU-DLSP21/en/week03/03-3/) [🎥](https://youtu.be/EyKiYVwrdjE) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/01%20-%20Spiral%20classification.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/04-spiral_classification.ipynb) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/05-regression.ipynb) 43 | * [Homework 1: backprop](https://drive.google.com/drive/folders/1g-uQNEi_NJyELGRMrJGXXxmARDabcXFd) 44 | 45 | 46 | ### Theme 2: Parameters sharing 47 | 48 | * Recurrent and convolutional nets [🎥](https://youtu.be/7dU3TFBJl-0) [🖥](https://drive.google.com/file/d/1GtI4ywzI84oamyr_W5k_wzgfRN139aFD/) [📝](https://drive.google.com/file/d/12jP4ssUIoGURAU8jGj6QwKXyZVdXW0o6/) 49 | * ConvNets in practice [🎥](https://youtu.be/-wz_vADGbtE) [🖥](https://drive.google.com/file/d/1WX3HoZhekL4MVvi_7VuLRYJtBGnF9JJY/) [📝](https://drive.google.com/file/d/1ToWP7e71diAeMtQ0D9pU-f0BXF4bAg46/) 50 | * Natural signals properties and the convolution [🎥](https://youtu.be/KvvNkE2vQVk) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/02%20-%20CNN.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/06-convnet.ipynb) 51 | * Recurrent neural networks, vanilla and gated (LSTM) [🎥](https://youtu.be/5KSGNomPJTE) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/04%20-%20RNN.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/08-seq_classification.ipynb) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/09-echo_data.ipynb) 52 | * [Homework 2: RNN & CNN](https://drive.google.com/drive/folders/1or1YiW0fFiZGEYy6b4EOEDgRPr0GQX0i) 53 | 54 | 55 | ### Theme 3: Energy based models, foundations 56 | 57 | * Energy based models (I) [🎥](https://youtu.be/xIn-Czj1g2Q) [🖥](https://drive.google.com/file/d/1kLUgZdRYFO5ksYHzbsRS8m8IocNiGu2J/) 58 | * Inference for LV-EBMs [🎥](https://youtu.be/xA_OPjRby5g) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/12%20-%20EBM.pdf) 59 | * What are EBMs good for? 
[🎥](https://youtu.be/eJeJWWEo7cE) 60 | * Energy based models (II) [🎥](https://youtu.be/8u2s64ZtmiA) [🖥](https://drive.google.com/file/d/1czfiEE6IPqE7q1fTm-SWOiC3VNEtpNrj/) [📝](https://drive.google.com/file/d/1IB5dkcAQ6GsHEz8Eg2hjaeQeVtT2i4Z5/) 61 | * Training LV-EBMs [🎥](https://youtu.be/XIMaWj5YjOQ) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/12%20-%20EBM.pdf) 62 | * [Homework 3: structured prediction](https://drive.google.com/drive/folders/1zGy_SnMBqaoS7_dHRmKiOFtqNV1jJJb6) 63 | 64 | 65 | ### Theme 4: Energy based models, advanced 66 | 67 | * Energy based models (III) [🎥](https://youtu.be/AOFUZZZ6KyU) [🖥](https://drive.google.com/file/d/19crFMCpJ5YCGbWv6myv7O4pGaJT6-u5p/) 68 | * [Unsup learning and autoencoders](https://atcold.github.io/NYU-DLSP21/en/week07/07-3/) [🎥](https://youtu.be/IuXsG3sN3zY) [🖥](https://drive.google.com/file/d/1aa1Hzq5KRekq32mlW4_pgIXMec18WgOg/) 69 | * Energy based models (VI) [🎥](https://youtu.be/bdebHVF__mo) [🖥](https://drive.google.com/file/d/1w6QO0a2_0Prz1U1mxa1n-YP9U8GW1_kq/) 70 | * [From LV-EBM to target prop to (any) autoencoder](https://atcold.github.io/NYU-DLSP21/en/week08/08-3/) [🎥](https://youtu.be/PpcN-F7ovK0) [🖥](https://drive.google.com/file/d/1aa1Hzq5KRekq32mlW4_pgIXMec18WgOg/) 71 | * Energy based models (V) [🎥](https://youtu.be/AQtPoDnauq4) [🖥](https://drive.google.com/file/d/1tKzrnJgptnyMcE_4zWJNP5INeVcVBWkr/) 72 | * [AEs with PyTorch and GANs](https://atcold.github.io/NYU-DLSP21/en/week09/09-3/) [🎥](https://youtu.be/bZF4N8HR1cc) [🖥](https://drive.google.com/file/d/1aa1Hzq5KRekq32mlW4_pgIXMec18WgOg/) [📓](https://github.com/Atcold/NYU-DLSP21/blob/master/10-autoencoder.ipynb) [📓](https://github.com/Atcold/NYU-DLSP21/blob/master/11-VAE.ipynb) 73 | * [Joint Embedding Methods (I)](en/week15/15-1/) [🎥](https://youtu.be/5VjEBHWuYs8) [🖥](https://drive.google.com/file/d/17NYsSagXF5wrprv7ISCEYLeEyRwCZg9r/) [🖥](https://drive.google.com/file/d/1fo5teinBim6GQX5QkzV5f5yzDjbSVraf/) 74 | * [Joint Embedding Methods (II)](en/week15/15-2/) [🎥](https://youtu.be/EBrbaD2zyuo) [🖥](https://drive.google.com/file/d/1I6kggxFK_x--UEhsKbuNHLwmqGSRFIpR/) 75 | 76 | ### Theme 5: Associative memories 77 | 78 | * Energy based models (V) [🎥](https://youtu.be/AQtPoDnauq4) [🖥](https://drive.google.com/file/d/1tKzrnJgptnyMcE_4zWJNP5INeVcVBWkr/) 79 | * [Attention & transformer](https://atcold.github.io/NYU-DLSP21/en/week10/10-3/) [🎥](https://youtu.be/fEVyfT-gLqQ) [🖥](https://drive.google.com/file/d/1MGfNPjg9YpxMcdfP2GcjluMQXlXud10C/) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/15-transformer.ipynb) 80 | 81 | 82 | ### Theme 6: Graphs 83 | 84 | * [Graph transformer nets](https://atcold.github.io/NYU-DLSP21/en/week11/11/) [[A](https://atcold.github.io/NYU-DLSP21/en/week11/11-1/)][[B](https://atcold.github.io/NYU-DLSP21/en/week11/11-2/)] [🎥](https://youtu.be/Of9s8epjflU) [🖥](https://drive.google.com/file/d/1-u2fSSICaWoFu91oiMsd2mAhg6ZGomMg/) 85 | * Graph convolutional nets (I) [from last year] [🎥](https://youtu.be/Iiv9R6BjxHM) [🖥](https://drive.google.com/file/d/1oq-nZE2bEiQjqBlmk5_N_rFC8LQY0jQr/) 86 | * Graph convolutional nets (II) [🎥](https://youtu.be/lWUh7jzhQ1Q) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/11%20-%20GCN.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/16-gated_GCN.ipynb) 87 | 88 | 89 | ### Theme 7: Control 90 | 91 | 1. [Planning and control](https://atcold.github.io/NYU-DLSP21/en/week12/12-3/) [🎥](https://youtu.be/wTg6qJlXkok) [🖥](https://drive.google.com/file/d/1JDssHbOxX_MZlmOopQaPZxuyCVoNExcM/) 92 | 2. 
The Truck Backer-Upper [🎥](https://youtu.be/C4iSZ3IJU-w) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/09%20-%20Controller%20learning.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/14-truck_backer-upper.ipynb) 93 | 3. Prediction and Planning Under Uncertainty [🎥](https://youtu.be/DJgloa244ZQ) [🖥](http://bit.ly/PPUU-slides) 94 | 95 | 96 | ### Theme 8: Optimisation 97 | * Optimisation (I) [from last year] [🎥](https://youtu.be/--NZb480zlg) [🖥](https://drive.google.com/open?id=1pwlGN6hDFfEYQqBqcMjWbe4yfBDTxsab) 98 | * Optimisation (II) [🎥](https://youtu.be/n1w5b5rTFv0) [🖥](https://drive.google.com/file/d/1ExKFOOdyUiLuk3zN5LAVwUyEoI1HJxag/) [📝](https://drive.google.com/file/d/1UJibhwdwJPZDwqlVVzeAHScPxK4TDCq5/) 99 | 100 | 101 | ### Miscellaneous 102 | 103 | * [SSL for vision](https://atcold.github.io/NYU-DLSP21/en/week10/10/) [[A](https://atcold.github.io/NYU-DLSP21/en/week10/10-1/)][[B](https://atcold.github.io/NYU-DLSP21/en/week10/10-2/)] [🎥](https://youtu.be/8L10w1KoOU8) [🖥](https://drive.google.com/file/d/1BQlWMVesOcioW69RCKWCjp6280Q42W9q/) 104 | * [Low resource machine translation](https://atcold.github.io/NYU-DLSP21/en/week12/12/) [[A](https://atcold.github.io/NYU-DLSP21/en/week12/12-1/)][[B](https://atcold.github.io/NYU-DLSP21/en/week12/12-2/)] [🎥](https://youtu.be/fR42OOy9ROo) [🖥](https://drive.google.com/file/d/1pm1fM1DFqCHrjGorCQCwg5SgMjwZBwGR/) 105 | * Lagrangian backprop, final project, and Q&A [🎥](https://youtu.be/MJfnamMFylo) [🖥](https://drive.google.com/file/d/1Z9tkkTpsHzcyoPN9yqq8Nv_Bnw5bghEK/) [📝](https://drive.google.com/file/d/1BMoaE7I-IwZF32YfASiTw1OnMblWAVGb/) 106 | -------------------------------------------------------------------------------- /docs/serve.sh: -------------------------------------------------------------------------------- 1 | /opt/homebrew/lib/ruby/gems/3.2.0/bin/jekyll serve --trace --baseurl '/NYU-DLSP21' 2 | -------------------------------------------------------------------------------- /docs/static: -------------------------------------------------------------------------------- 1 | jekyllbook/static/ -------------------------------------------------------------------------------- /res/plot_lib.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import numpy as np 3 | import torch 4 | from IPython.display import clear_output 5 | from matplotlib import pyplot as plt 6 | 7 | 8 | def set_default(figsize=(10, 10), dpi=100): 9 | plt.style.use(['dark_background', 'bmh']) 10 | plt.rc('axes', facecolor='k') 11 | plt.rc('figure', facecolor='k') 12 | plt.rc('figure', figsize=figsize, dpi=dpi) 13 | 14 | 15 | def plot_data(X, y, d=0, auto=False, zoom=1, title='Training data (x, y)'): 16 | X = X.cpu() 17 | y = y.cpu() 18 | s = plt.scatter(X.numpy()[:, 0], X.numpy()[:, 1], c=y, s=20, cmap=plt.cm.Spectral) 19 | plt.axis('square') 20 | plt.axis(np.array((-1.1, 1.1, -1.1, 1.1)) * zoom) 21 | if auto is True: plt.axis('equal') 22 | plt.axis('off') 23 | 24 | _m, _c = 0, '.35' 25 | plt.axvline(0, ymin=_m, color=_c, lw=1) 26 | plt.axhline(0, xmin=_m, color=_c, lw=1) 27 | plt.title(title) 28 | return s 29 | 30 | 31 | def plot_model(X, y, model): 32 | model.cpu() 33 | mesh = torch.arange(-1.1, 1.11, 0.01) 34 | xx, yy = torch.meshgrid(mesh, mesh, indexing='xy') 35 | with torch.no_grad(): 36 | data = torch.stack((xx.reshape(-1), yy.reshape(-1)), dim=1) 37 | Z = model(data) 38 | Z = Z.argmax(dim=1).reshape(xx.shape) 39 | plt.contourf(xx.numpy(), yy.numpy(), Z, cmap=plt.cm.Spectral, 
alpha=0.3) 40 | plot_data(X, y) 41 | plt.title('Model decision boundaries') 42 | 43 | 44 | def plot_embeddings(X, y, model, zoom=10): 45 | # Use forward hook to get internal embeddings of the second last layer 46 | layer_outputs = {} 47 | 48 | def get_layer_outputs(name): 49 | def hook(model, input, output): 50 | layer_outputs[name] = output 51 | 52 | return hook 53 | 54 | layer = model[-2] 55 | 56 | if layer.__class__ == torch.nn.modules.linear.Linear and layer.out_features == 2: 57 | layer.register_forward_hook(get_layer_outputs("low_dim_embeddings")) 58 | with torch.no_grad(): 59 | model(X) # pass data through model to populate layer_outputs 60 | plot_data( 61 | layer_outputs["low_dim_embeddings"], 62 | y, 63 | zoom=zoom, 64 | title="Low dim embeddings", 65 | ) 66 | last_layer = model[-1] 67 | mesh = torch.arange(-1.1, 1.1, 0.01) * zoom 68 | xx, yy = torch.meshgrid(mesh, mesh, indexing="ij") 69 | with torch.no_grad(): 70 | data = torch.stack((xx.reshape(-1), yy.reshape(-1)), dim=1) 71 | Z = last_layer(data) 72 | Z = Z.argmax(dim=1).reshape(xx.shape) 73 | plt.contourf(xx.numpy(), yy.numpy(), Z, cmap=plt.cm.Spectral, alpha=0.3, levels=y.max().item()) 74 | else: 75 | print( 76 | "Cannot plot: second-last layer is not a linear layer" 77 | f" with output in R^2 (it is {layer})" 78 | ) 79 | 80 | 81 | def acc(l, y): 82 | score, predicted = torch.max(l, 1) 83 | return (y == predicted).sum().float() / len(y) 84 | 85 | 86 | def overwrite(string): 87 | print(string) 88 | clear_output(wait=True) 89 | 90 | 91 | def plot_2d_energy_levels(X, y, energy, v=None, l=None): 92 | xx, yy, F, k, K = energy 93 | if not v: vmin = vmax = None 94 | else: vmin, vmax = v 95 | if not l: levels = None 96 | else: levels = torch.arange(l[0], l[1], l[2]) 97 | plt.figure(figsize=(12, 10)) 98 | plt.pcolormesh(xx.numpy(), yy.numpy(), F, vmin=vmin, vmax=vmax) 99 | plt.colorbar() 100 | cnt = plt.contour(xx.numpy(), yy.numpy(), F, colors='w', linewidths=1, levels=levels) 101 | plt.clabel(cnt, inline=True, fontsize=10, colors='w') 102 | s = plot_data(X, y) 103 | plt.legend(*s.legend_elements(), title='Classes', loc='lower right') 104 | plt.axvline(color='0.55', lw=1) 105 | plt.axhline(color='0.55', lw=1) 106 | plt.axis([-1.5, 1.5, -1.5, 1.5]) 107 | ȳ = torch.zeros(K).int(); ȳ[k] = 1 108 | plt.title(f'Free energy F(x, y = {ȳ.tolist()})') 109 | 110 | 111 | def plot_3d_energy_levels(X, y, energy, v=None, l=None, cbl=None): 112 | xx, yy, F, k, K = energy 113 | if not v: vmin = vmax = None 114 | else: vmin, vmax = v 115 | if not l: levels = None 116 | else: levels = torch.arange(l[0], l[1], l[2]) 117 | fig = plt.figure(figsize=(9.5, 6), facecolor='k') 118 | ax = fig.add_subplot(projection='3d') 119 | cnt = ax.contour(xx.numpy(), yy.numpy(), F, levels=levels, vmin=vmin, vmax=vmax) 120 | ax.scatter(X[:,0], X[:,1], zs=0, c=y, cmap=plt.cm.Spectral) 121 | ax.xaxis.set_pane_color(color=(0,0,0)) 122 | ax.yaxis.set_pane_color(color=(0,0,0)) 123 | ax.zaxis.set_pane_color(color=(0,0,0)) 124 | 125 | vmin, vmax = cnt.get_clim() 126 | ax.set_zlim3d(vmin, vmax) 127 | norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax) 128 | if not cbl: cbl = l 129 | else: cbl = torch.arange(cbl[0], cbl[1], cbl[2]) 130 | sm = plt.cm.ScalarMappable(norm=norm, cmap=cnt.cmap) 131 | sm.set_array([]) 132 | fig.colorbar(sm, ticks=cbl, ax=ax) 133 | ȳ = torch.zeros(K).int(); ȳ[k] = 1 134 | plt.title(f'Free energy F(x, y = {ȳ.tolist()})') 135 | plt.tight_layout() 136 | return fig, ax 137 | --------------------------------------------------------------------------------
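A minimal, hypothetical usage sketch of the `res/plot_lib.py` helpers above (not part of the repository): it assumes the repo root is on the Python path, substitutes a small two-blob dataset for the spiral data, and calls only functions defined above (`set_default`, `plot_model`, `acc`) with their stated signatures. The data, model, and training loop here are illustrative placeholders.

import torch
from torch import nn
from matplotlib import pyplot as plt
from res.plot_lib import set_default, plot_model, acc  # helpers defined above

set_default()                    # dark-background plotting defaults used by the notebooks

# Toy two-class data inside the [-1, 1] square (a stand-in for the spiral dataset)
torch.manual_seed(0)
n = 500
X = 0.2 * torch.randn(2 * n, 2)
X[:n] += 0.5                     # class-0 blob, upper right
X[n:] -= 0.5                     # class-1 blob, lower left
y = torch.cat((torch.zeros(n), torch.ones(n))).long()

# Small classifier; the penultimate layer outputs 2-D features, so plot_embeddings would also apply
model = nn.Sequential(nn.Linear(2, 100), nn.ReLU(), nn.Linear(100, 2), nn.Linear(2, 2))
optimiser = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

for _ in range(200):
    loss = criterion(model(X), y)
    optimiser.zero_grad(); loss.backward(); optimiser.step()

print(f'training accuracy: {acc(model(X), y).item():.2f}')
plot_model(X, y, model)          # scatters the data and shades the decision regions
plt.show()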