├── .gitignore ├── .gitmodules ├── 04-spiral_classification.ipynb ├── 10-autoencoder.ipynb ├── 11-VAE.ipynb ├── 17-optimal_control.ipynb ├── README.md ├── docs ├── 404.html ├── _config.yml ├── _layouts │ ├── custom.html │ └── default.html ├── en │ ├── faq.md │ ├── week01 │ │ └── 01.md │ ├── week02 │ │ ├── 02-3.md │ │ └── 02.md │ ├── week03 │ │ ├── 03-3.md │ │ └── 03.md │ ├── week04 │ │ └── 04.md │ ├── week05 │ │ └── 05.md │ ├── week06 │ │ └── 06.md │ ├── week07 │ │ ├── 07-3.md │ │ └── 07.md │ ├── week08 │ │ ├── 08-3.md │ │ └── 08.md │ ├── week09 │ │ ├── 09-3.md │ │ └── 09.md │ ├── week10 │ │ ├── 10-1.md │ │ ├── 10-2.md │ │ ├── 10-3.md │ │ ├── 10.md │ │ └── lecture10.sbv │ ├── week11 │ │ ├── 11-1.md │ │ ├── 11-2.md │ │ └── 11.md │ ├── week12 │ │ ├── 12-1.md │ │ ├── 12-2.md │ │ ├── 12-3.md │ │ └── 12.md │ ├── week13 │ │ └── 13.md │ ├── week14 │ │ └── 14.md │ └── week15 │ │ ├── 15-1.md │ │ ├── 15-2.md │ │ └── 15.md ├── fr │ ├── README-FR.md │ ├── faq.md │ ├── index.md │ ├── week01 │ │ └── 01.md │ ├── week02 │ │ ├── 02-3.md │ │ └── 02.md │ ├── week03 │ │ ├── 03-3.md │ │ └── 03.md │ ├── week04 │ │ └── 04.md │ ├── week05 │ │ └── 05.md │ ├── week06 │ │ └── 06.md │ ├── week07 │ │ ├── 07-3.md │ │ └── 07.md │ ├── week08 │ │ ├── 08-3.md │ │ └── 08.md │ ├── week09 │ │ ├── 09-3.md │ │ └── 09.md │ ├── week10 │ │ ├── 10-1.md │ │ ├── 10-2.md │ │ ├── 10-3.md │ │ ├── 10.md │ │ └── lecture10.sbv │ ├── week11 │ │ ├── 11-1.md │ │ ├── 11-2.md │ │ ├── 11.md │ │ └── lecture11.sbv │ ├── week12 │ │ ├── 12-1.md │ │ ├── 12-2.md │ │ ├── 12-3.md │ │ ├── 12.md │ │ └── lecture12.sbv │ ├── week13 │ │ └── 13.md │ ├── week14 │ │ └── 14.md │ └── week15 │ │ ├── 15-1.md │ │ ├── 15-2.md │ │ ├── 15.md │ │ ├── practicum09.sbv │ │ └── practicum10.sbv ├── images │ ├── week02 │ │ └── 02-3 │ │ │ └── figure1.png │ ├── week03 │ │ └── 03-3 │ │ │ ├── figure1.png │ │ │ ├── figure10.png │ │ │ ├── figure7.png │ │ │ └── figure9.png │ ├── week07 │ │ └── 07-3 │ │ │ ├── Autoencoder_Arch.png │ │ │ ├── DAEOutput.png │ │ │ ├── DALL-E.png │ │ │ ├── DenoisingAutoEncoder.png │ │ │ └── def.png │ ├── week08 │ │ └── 08-3 │ │ │ ├── AE.png │ │ │ ├── DAE.png │ │ │ ├── VAE.png │ │ │ ├── VAE_DAE.png │ │ │ ├── VAEloss.png │ │ │ ├── bubbles_z.png │ │ │ ├── contractiveAE.png │ │ │ └── target_prop.png │ ├── week09 │ │ └── 09-3 │ │ │ ├── 10_autoencoder_cell_12_output_2.png │ │ │ ├── 10_autoencoder_cell_12_output_3.png │ │ │ ├── dae_noise.png │ │ │ ├── dae_output.png │ │ │ ├── fig_10_cluster_samples.png │ │ │ ├── fig_11_gan_vs_dae.png │ │ │ ├── fig_12_gan_vs_vae.png │ │ │ ├── fig_1_ae.png │ │ │ ├── fig_2_under_over.png │ │ │ ├── fig_3_ae_outputs.png │ │ │ ├── fig_4_autoencoder_kernel.png │ │ │ ├── fig_5_dae.png │ │ │ ├── fig_6_dae_kernels.png │ │ │ ├── fig_7_dae_comparison.png │ │ │ ├── fig_8_merged_imgs.png │ │ │ ├── fig_9_vae.png │ │ │ ├── noise_input.png │ │ │ ├── ns_output.png │ │ │ └── telea_output.png │ ├── week10 │ │ ├── 10-1 │ │ │ ├── CL_objective.png │ │ │ ├── cl_loss_fn.png │ │ │ ├── clustering.png │ │ │ ├── con_learning.png │ │ │ ├── contrastive-learning.png │ │ │ ├── equipartition.png │ │ │ ├── moco.png │ │ │ ├── non-imagenet.png │ │ │ ├── pirl.png │ │ │ ├── semantic_features.png │ │ │ ├── soft-assignment.png │ │ │ ├── ssl_trivial.png │ │ │ └── swav.png │ │ ├── 10-2 │ │ │ ├── avid.png │ │ │ ├── byol.png │ │ │ ├── cma.png │ │ │ ├── figure_1.png │ │ │ ├── seer_1.png │ │ │ ├── seer_2.png │ │ │ └── simsiam.png │ │ └── 10-3 │ │ │ ├── autoencoder.png │ │ │ ├── decoder.png │ │ │ ├── ebm.png │ │ │ ├── predictor.png │ │ │ ├── transformer.png │ │ │ └── 
unit_delay.png │ ├── week11 │ │ ├── 11-1 │ │ │ └── figure1.png │ │ └── 11-2 │ │ │ ├── Screenshot (85).png │ │ │ ├── bs1.png │ │ │ ├── bs2.png │ │ │ ├── bs3.png │ │ │ ├── figure10.png │ │ │ ├── figure11.png │ │ │ ├── figure12.png │ │ │ ├── figure13.png │ │ │ ├── figure14.png │ │ │ ├── figure5.png │ │ │ ├── figure6.png │ │ │ ├── figure7.png │ │ │ ├── figure8.png │ │ │ ├── figure9.png │ │ │ └── greedy.png │ ├── week12 │ │ ├── 12-1 │ │ │ ├── figure1.png │ │ │ ├── figure10.png │ │ │ ├── figure11.png │ │ │ ├── figure12.png │ │ │ ├── figure13.png │ │ │ ├── figure14.png │ │ │ ├── figure15.png │ │ │ ├── figure16.png │ │ │ ├── figure17.png │ │ │ ├── figure2.png │ │ │ ├── figure3.png │ │ │ ├── figure4.png │ │ │ ├── figure5.png │ │ │ ├── figure6.png │ │ │ ├── figure7.png │ │ │ ├── figure8.png │ │ │ └── figure9.png │ │ ├── 12-2 │ │ │ ├── figure1.png │ │ │ ├── figure10.png │ │ │ ├── figure10_1.png │ │ │ ├── figure11.png │ │ │ ├── figure12.png │ │ │ ├── figure13.png │ │ │ ├── figure14.png │ │ │ ├── figure15.png │ │ │ ├── figure16.png │ │ │ ├── figure17.png │ │ │ ├── figure18.png │ │ │ ├── figure19.png │ │ │ ├── figure2.png │ │ │ ├── figure2_1.png │ │ │ ├── figure2_2.png │ │ │ ├── figure3.png │ │ │ ├── figure3_1.png │ │ │ ├── figure3_2.png │ │ │ ├── figure4.png │ │ │ ├── figure4_2.png │ │ │ ├── figure4_3.png │ │ │ ├── figure5.png │ │ │ ├── figure6.png │ │ │ ├── figure7.png │ │ │ ├── figure8.png │ │ │ ├── figure8_1.png │ │ │ ├── figure8_2.png │ │ │ └── figure9.png │ │ └── 12-3 │ │ │ ├── figure1.png │ │ │ ├── figure10.png │ │ │ ├── figure11.png │ │ │ ├── figure12.png │ │ │ ├── figure13.png │ │ │ ├── figure14.png │ │ │ ├── figure15.png │ │ │ ├── figure15.svg │ │ │ ├── figure16.png │ │ │ ├── figure16.svg │ │ │ ├── figure2.png │ │ │ ├── figure3.svg │ │ │ ├── figure4.svg │ │ │ ├── figure5.svg │ │ │ ├── figure6.svg │ │ │ ├── figure7.svg │ │ │ ├── figure8.svg │ │ │ └── figure9.png │ └── week15 │ │ ├── 15-1 │ │ ├── 1_fig0.png │ │ ├── 1_fig1.png │ │ ├── 1_fig3.png │ │ ├── 1_fig4.png │ │ ├── 1_fig7.png │ │ └── 1_fig9.png │ │ └── 15-2 │ │ ├── 2_fig1.png │ │ ├── 2_fig2.png │ │ ├── 2_fig3.png │ │ ├── 2_fig4.png │ │ ├── 2_fig5.png │ │ ├── 2_fig6.png │ │ └── 2_fig7.png ├── index.md ├── serve.sh └── static └── res └── plot_lib.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Remove [I]Python caching 2 | __pycache__ 3 | .ipynb_checkpoints 4 | 5 | # Remove Mac shit 6 | .DS_Store 7 | 8 | # Remove Vim temp files 9 | *sw* 10 | 11 | # Ignore Data files 12 | *.tar.gz 13 | *.feat 14 | *.txt 15 | *.data 16 | .idea/ 17 | *.pth 18 | *-ubyte 19 | *.pt 20 | *.png 21 | !docs/**/*.png 22 | imdb 23 | data 24 | .jekyll-cache 25 | _site 26 | .vscode 27 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "docs/jekyllbook"] 2 | path = docs/jekyllbook 3 | url = https://github.com/ebetica/jekyllbook 4 | -------------------------------------------------------------------------------- /04-spiral_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Spiral classification" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import torch\n", 17 | "from torch import nn, optim\n", 18 | "from math import pi as π" 19 | ] 20 | }, 
21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from res.plot_lib import *" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "set_default()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Create the data" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "seed = 12345\n", 62 | "torch.manual_seed(seed)\n", 63 | "N = 1000 # num_samples_per_class\n", 64 | "n = 2 # input dimensions\n", 65 | "K = 5 # num_classes\n", 66 | "d = 100 # num_hidden_units" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# Generate spirals\n", 76 | "\n", 77 | "t = torch.linspace(0, 1, N)\n", 78 | "a = 0.8 * t + 0.2 # amplitude 0.2 → 1.0\n", 79 | "X = list()\n", 80 | "y = list()\n", 81 | "for k in range(K):\n", 82 | " θ = (2 * t + k) * 2 * π / K + 0.2 * torch.randn(N)\n", 83 | " X.append(torch.stack((a * θ.sin(), a * θ.cos()), dim=1))\n", 84 | " y.append(torch.zeros(N, dtype=torch.long).fill_(k))\n", 85 | "X = torch.cat(X)\n", 86 | "y = torch.cat(y)\n", 87 | "\n", 88 | "print(\"Shapes:\")\n", 89 | "print(\"X:\", tuple(X.size()))\n", 90 | "print(\"y:\", tuple(y.size()))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# And visualise them\n", 100 | "plot_data(X, y)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Build and train a neural net" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "learning_rate = 1e-3\n", 117 | "lambda_l2 = 1e-5" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "# Model definition\n", 127 | "model = nn.Sequential(\n", 128 | " nn.Linear(n, d),\n", 129 | " # nn.ReLU(), # Comment this line for a linear model\n", 130 | " nn.Linear(d, K) # (Optional) Comment this line and uncomment the next one to display 2D embeddings below\n", 131 | " # nn.Linear(d, 2), nn.Linear(2, K)\n", 132 | ")\n", 133 | "model.to(device) # possibly send to CUDA\n", 134 | "\n", 135 | "# Cross entropy given the linear output\n", 136 | "C = torch.nn.CrossEntropyLoss(reduction='none')\n", 137 | "\n", 138 | "# Using Adam optimiser\n", 139 | "optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=lambda_l2) # built-in L2\n", 140 | "\n", 141 | "# Full-batch training loop\n", 142 | "for t in range(2_000):\n", 143 | " \n", 144 | " # Feed forward to get the linear sum s\n", 145 | " s = model(X)\n", 146 | " \n", 147 | " # Compute the free energy F and loss L\n", 148 | " F = C(s, y)\n", 149 | " L = F.mean()\n", 150 | " \n", 151 | " # Zero the gradients\n", 152 | " optimiser.zero_grad()\n", 153 | " \n", 154 | " # Backward pass to compute and accumulate the gradient\n", 155 | " # of the free energy w.r.t our learnable params\n", 156 | " 
L.backward()\n", 157 | " \n", 158 | " # Update params\n", 159 | " optimiser.step()\n", 160 | " \n", 161 | " # Display epoch, L, and accuracy\n", 162 | " overwrite(f'[EPOCH]: {t}, [LOSS]: {L.item():.6f}, [ACCURACY]: {acc(s, y):.3f}')" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# Plot trained model\n", 172 | "print(model)\n", 173 | "plot_model(X, y, model)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "# (Optional) Plot internal 2D embeddings if available\n", 183 | "plot_embeddings(X, y, model, zoom=10)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "# Compute linear output s for a fine grid over the input space\n", 193 | "\n", 194 | "mesh = torch.arange(-1.5, 1.5, 0.01)\n", 195 | "xx, yy = torch.meshgrid(mesh, mesh)\n", 196 | "grid = torch.stack((xx.reshape(-1), yy.reshape(-1)), dim=1)\n", 197 | "with torch.no_grad():\n", 198 | " s = model(grid)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# Choice of free energy\n", 208 | "\n", 209 | "fe = 'cross-entropy'\n", 210 | "fe = 'negative linear output'" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "# Switch to non-interactive matplotlib\n", 220 | "%matplotlib inline\n", 221 | "set_default()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "# ! mkdir {m}-levels" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "scrolled": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "# Plot 2d energy levels\n", 242 | "\n", 243 | "for k in range(K):\n", 244 | " if fe == 'cross-entropy':\n", 245 | " F = C(s, torch.LongTensor(1).fill_(k).expand(s.size(0)))\n", 246 | " F = F.reshape(xx.shape)\n", 247 | " plot_2d_energy_levels(X, y, (xx, yy, F, k, K), (0, 35), (1, 35, 4))\n", 248 | "\n", 249 | " elif fe == 'negative linear output':\n", 250 | " F = -s[:, k]\n", 251 | " F = F.reshape(xx.shape)\n", 252 | " plot_2d_energy_levels(X, y, (xx, yy, F, k, K), (-20, 20), (-20, 21, 2.5))\n", 253 | " \n", 254 | "# plt.savefig(f'{m}-levels/{k}.png', bbox_inches='tight')" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# ! 
ffmpeg -framerate 1 -i {m}-levels/%d.png -r 25 -vf \"crop=trunc(iw/2)*2:trunc(ih/2)*2\" -pix_fmt yuv420p {m}-levels.mp4" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "# Switch to interactive matplotlib\n", 273 | "%matplotlib widget" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "# Cross-entropy\n", 283 | "if fe == 'cross-entropy':\n", 284 | " fig, ax = plot_3d_energy_levels(X, y, (xx, yy, F, k, K), (0, 18), (0, 19, 1), (0, 19, 2))\n", 285 | "elif fe == 'negative linear output':\n", 286 | " fig, ax = plot_3d_energy_levels(X, y, (xx, yy, F, k, K), (-30, 20), (-30, 20, 1), (-30, 21, 5))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# ! mkdir {m}-3d-levels" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "# Spin it around (and maybe save to disk)\n", 305 | "δ = 10\n", 306 | "for angle in range(0, 360, δ):\n", 307 | " ax.view_init(30, -60 + angle)\n", 308 | " fig.canvas.draw()\n", 309 | "# plt.pause(.001)\n", 310 | "# plt.savefig(f'{m}-3d-levels/{angle:03d}.png', bbox_inches='tight')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "# ! ffmpeg -i {m}-3d-levels/%03d.png -vf \"crop=trunc(iw/2)*2:trunc(ih/2)*2\" -pix_fmt yuv420p {m}-3d-levels.mp4" 320 | ] 321 | } 322 | ], 323 | "metadata": { 324 | "kernelspec": { 325 | "display_name": "Python 3 (ipykernel)", 326 | "language": "python", 327 | "name": "python3" 328 | }, 329 | "language_info": { 330 | "codemirror_mode": { 331 | "name": "ipython", 332 | "version": 3 333 | }, 334 | "file_extension": ".py", 335 | "mimetype": "text/x-python", 336 | "name": "python", 337 | "nbconvert_exporter": "python", 338 | "pygments_lexer": "ipython3", 339 | "version": "3.10.13" 340 | } 341 | }, 342 | "nbformat": 4, 343 | "nbformat_minor": 4 344 | } 345 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NYU Deep Learning Spring 2021 (NYU-DLSP21) 2 | 3 | 4 | [🇬🇧](https://github.com/Atcold/NYU-DLSP21/blob/master/README.md)   [🇫🇷](https://github.com/Atcold/NYU-DLSP21/blob/master/docs/fr/README-FR.md) 5 | 6 | 7 | ## Content new organisation 8 | 9 | This semester we have reorganised the didactic material. 10 | In the first half of the semester we covered 3 topics, spanning two weeks, each followed by an assignment. 11 | Moreover, each lecture had a corresponding practicum. 12 | 13 | 1. History, backpropagation, and gradient descent 14 | 2. Parameter sharing: recurrent and convolutional networks 15 | 3. Latent variable (LV) energy based models (EBMs) 16 | 17 | Pay attention that we have redesigned the curriculum and lectures' content. 18 | We've treated LV-EBM as a *basic* module, which to build upon. 19 | 20 | 21 | ## Enters the semester's second half 22 | 23 | I thought I was going to repropose the same practica I've used during [NYU-DLSP20](https://github.com/Atcold/NYU-DLSP20), last year edition, just in different order. 24 | 25 | But I couldn't. 26 | 27 | This year's students have LV-EBMs on their side. 
28 | We told them about *the cake* and now I cannot pretend it doesn't exist and teach as if they were unaware of the elephant in the room. 29 | It would have been intellectually dishonest. 30 | Henceforth, I've redesigned my whole deck of slides. 31 | 32 | 33 | ## This semester repository 34 | 35 | That's why this repo has been created. 36 | I'm **not** going to try to do the same insane work I've put up with last year, but I need a space where to post updated slides, notebooks, and host new transcriptions. 37 | Last year material is still valid. 38 | This year you have a different take. 39 | A more powerful one. 40 | 41 | 42 | ## Previous releases 43 | 44 | Before NYU-DLSP21 there were… 45 | 46 | - [NYU-DLSP20](https://github.com/Atcold/NYU-DLSP20) (major release) 47 | - [NYU-DLSP19](https://github.com/Atcold/NYU-DLSP20/releases/tag/dlsp19) 48 | - [AIMS-DLFL19](https://github.com/Atcold/NYU-DLSP20/releases/tag/aims-fl18) 49 | - [CoDaS-HEP18](https://github.com/Atcold/NYU-DLSP20/releases/tag/v1.0.0) 50 | - [NYU-DLSP18](https://docs.google.com/document/d/1_p1Mw-NtMGN_vpas_pchLsQC2u0NM5mTnRapBrQ2ivk/) 51 | - [Purdue-DLFL16](https://docs.google.com/document/d/1ugJRMqQ_cCUQC1B8mSE0iro7sKrDT8-BnppTZv0rA08/) 52 | - [torch-Video-Tutorials](https://github.com/Atcold/torch-Video-Tutorials) 53 | 54 | ## More info 55 | 56 | Keep reading on the [class website](https://atcold.github.io/NYU-DLSP21/). 57 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | jekyllbook/404.html -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | permalink: pretty 2 | 3 | # Setup 4 | title: 'Deep Learning' 5 | url: https://atcold.github.io/NYU-DLSP21/ 6 | baseurl: '/NYU-DLSP21' 7 | homepage_title: Home 8 | default_lang: 'en' 9 | 10 | # About/contact 11 | author: 12 | name: atcold 13 | url: https://twitter.com/alfcnz 14 | github: 15 | repo: https://github.com/atcold/NYU-DLSP21 16 | 17 | # Custom vars 18 | version: dlsp21 19 | 20 | src: "." 
21 | default_theme: "ayu" 22 | 23 | defaults: 24 | - scope: 25 | path: "" # an empty string here means all files in the project 26 | values: 27 | layout: "custom" 28 | 29 | # For Maths 30 | markdown: kramdown 31 | 32 | # To use hljs, disable the default highlighter 33 | kramdown: 34 | syntax_highlighter_opts: 35 | disable: true 36 | math_engine: null 37 | 38 | exclude: 39 | - jekyllbook 40 | - en/index.md 41 | - vendor 42 | 43 | 44 | ################################### English #################################### 45 | prologues: 46 | - path: en/faq.md 47 | chapters: 48 | - path: en/week01/01.md 49 | - path: en/week02/02.md 50 | sections: 51 | - path: en/week02/02-3.md 52 | - path: en/week03/03.md 53 | sections: 54 | - path: en/week03/03-3.md 55 | - path: en/week04/04.md 56 | - path: en/week05/05.md 57 | - path: en/week06/06.md 58 | - path: en/week07/07.md 59 | sections: 60 | - path: en/week07/07-3.md 61 | - path: en/week08/08.md 62 | sections: 63 | - path: en/week08/08-3.md 64 | - path: en/week09/09.md 65 | sections: 66 | - path: en/week09/09-3.md 67 | - path: en/week10/10.md 68 | sections: 69 | - path: en/week10/10-1.md 70 | - path: en/week10/10-2.md 71 | - path: en/week10/10-3.md 72 | - path: en/week11/11.md 73 | sections: 74 | - path: en/week11/11-1.md 75 | - path: en/week11/11-2.md 76 | - path: en/week12/12.md 77 | sections: 78 | - path: en/week12/12-1.md 79 | - path: en/week12/12-2.md 80 | - path: en/week12/12-3.md 81 | - path: en/week13/13.md 82 | - path: en/week14/14.md 83 | - path: en/week15/15.md 84 | sections: 85 | - path: en/week15/15-1.md 86 | - path: en/week15/15-2.md 87 | 88 | 89 | 90 | ################################### French #################################### 91 | fr: 92 | title: 'Apprentissage Profond' 93 | prologues: 94 | - path: fr/faq.md 95 | chapters: 96 | - path: fr/week01/01.md 97 | - path: fr/week02/02.md 98 | sections: 99 | - path: fr/week02/02-3.md 100 | - path: fr/week03/03.md 101 | sections: 102 | - path: fr/week03/03-3.md 103 | - path: fr/week04/04.md 104 | - path: fr/week05/05.md 105 | - path: fr/week06/06.md 106 | - path: fr/week07/07.md 107 | sections: 108 | - path: fr/week07/07-3.md 109 | - path: fr/week08/08.md 110 | sections: 111 | - path: fr/week08/08-3.md 112 | - path: fr/week09/09.md 113 | sections: 114 | - path: fr/week09/09-3.md 115 | - path: fr/week10/10.md 116 | sections: 117 | - path: fr/week10/10-1.md 118 | - path: fr/week10/10-2.md 119 | - path: fr/week10/10-3.md 120 | - path: fr/week11/11.md 121 | sections: 122 | - path: fr/week11/11-1.md 123 | - path: fr/week11/11-2.md 124 | - path: fr/week12/12.md 125 | sections: 126 | - path: fr/week12/12-1.md 127 | - path: fr/week12/12-2.md 128 | - path: fr/week12/12-3.md 129 | - path: fr/week13/13.md 130 | - path: fr/week14/14.md 131 | - path: fr/week15/15.md 132 | sections: 133 | - path: fr/week15/15-1.md 134 | - path: fr/week15/15-2.md 135 | -------------------------------------------------------------------------------- /docs/_layouts/custom.html: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 |
6 | $$\gdef \sam #1 {\mathrm{softargmax}(#1)}$$ 7 | $$\gdef \vect #1 {\boldsymbol{#1}} $$ 8 | $$\gdef \matr #1 {\boldsymbol{#1}} $$ 9 | $$\gdef \E {\mathbb{E}} $$ 10 | $$\gdef \V {\mathbb{V}} $$ 11 | $$\gdef \R {\mathbb{R}} $$ 12 | $$\gdef \N {\mathbb{N}} $$ 13 | $$\gdef \relu #1 {\texttt{ReLU}(#1)} $$ 14 | $$\gdef \D {\,\mathrm{d}} $$ 15 | $$\gdef \deriv #1 #2 {\frac{\D #1}{\D #2}}$$ 16 | $$\gdef \pd #1 #2 {\frac{\partial #1}{\partial #2}}$$ 17 | $$\gdef \set #1 {\left\lbrace #1 \right\rbrace} $$ 18 | 19 | % My colours 20 | 21 | $$\gdef \aqua #1 {\textcolor{8dd3c7}{#1}} $$ 22 | $$\gdef \yellow #1 {\textcolor{ffffb3}{#1}} $$ 23 | $$\gdef \lavender #1 {\textcolor{bebada}{#1}} $$ 24 | $$\gdef \red #1 {\textcolor{fb8072}{#1}} $$ 25 | $$\gdef \blue #1 {\textcolor{80b1d3}{#1}} $$ 26 | $$\gdef \orange #1 {\textcolor{fdb462}{#1}} $$ 27 | $$\gdef \green #1 {\textcolor{b3de69}{#1}} $$ 28 | $$\gdef \pink #1 {\textcolor{fccde5}{#1}} $$ 29 | $$\gdef \vgrey #1 {\textcolor{d9d9d9}{#1}} $$ 30 | $$\gdef \violet #1 {\textcolor{bc80bd}{#1}} $$ 31 | $$\gdef \unka #1 {\textcolor{ccebc5}{#1}} $$ 32 | $$\gdef \unkb #1 {\textcolor{ffed6f}{#1}} $$ 33 | 34 | % Vectors 35 | $$\gdef \vx {\pink{\vect{x }}} $$ 36 | $$\gdef \vy {\blue{\vect{y }}} $$ 37 | $$\gdef \vb {\vect{b}} $$ 38 | $$\gdef \vz {\orange{\vect{z }}} $$ 39 | $$\gdef \vtheta {\vect{\theta }} $$ 40 | $$\gdef \vh {\green{\vect{h }}} $$ 41 | $$\gdef \vq {\aqua{\vect{q }}} $$ 42 | $$\gdef \vk {\yellow{\vect{k }}} $$ 43 | $$\gdef \vv {\green{\vect{v }}} $$ 44 | $$\gdef \vytilde {\violet{\tilde{\vect{y}}}} $$ 45 | $$\gdef \vyhat {\red{\hat{\vect{y}}}} $$ 46 | $$\gdef \vycheck {\blue{\check{\vect{y}}}} $$ 47 | $$\gdef \vzcheck {\blue{\check{\vect{z}}}} $$ 48 | $$\gdef \vztilde {\green{\tilde{\vect{z}}}} $$ 49 | $$\gdef \vmu {\green{\vect{\mu}}} $$ 50 | $$\gdef \vu {\orange{\vect{u}}} $$ 51 | 52 | % Matrices 53 | $$\gdef \mW {\matr{W}} $$ 54 | $$\gdef \mA {\matr{A}} $$ 55 | $$\gdef \mX {\pink{\matr{X}}} $$ 56 | $$\gdef \mY {\blue{\matr{Y}}} $$ 57 | $$\gdef \mQ {\aqua{\matr{Q }}} $$ 58 | $$\gdef \mK {\yellow{\matr{K }}} $$ 59 | $$\gdef \mV {\lavender{\matr{V }}} $$ 60 | $$\gdef \mH {\green{\matr{H }}} $$ 61 | 62 | % Coloured math 63 | $$\gdef \cx {\pink{x}} $$ 64 | $$\gdef \ctheta {\orange{\theta}} $$ 65 | $$\gdef \cz {\orange{z}} $$ 66 | $$\gdef \Enc {\lavender{\text{Enc}}} $$ 67 | $$\gdef \Dec {\aqua{\text{Dec}}}$$ 68 | 69 | 70 |
71 | 72 | {% if page.lecturer %} 73 | 🎙️ {{page.lecturer}} 74 | {% endif %} 75 | 76 | {{ content }} 77 | 78 | 79 | {% if page.authors or page.date or page.translator %} 80 |
81 | {% endif %} 82 | 83 | {% if page.authors %} 84 | 📝 {{ page.authors }} 85 | {% endif %} 86 | 87 | {% if page.translator %} 88 | {% if page.lang %} 89 | {% assign thislang = page.lang %} 90 | {% else %} 91 | {% assign thislang = "en" %} 92 | {% endif %} 93 |
94 | {{ page.translator }} 95 | {% endif %} 96 | 97 | {% if page.date %} 98 |
99 | {{ page.date }} 100 | {% endif %} 101 | -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | ../jekyllbook/_layouts/default.html -------------------------------------------------------------------------------- /docs/en/faq.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Foreword, FAQ and disclaimer 3 | author: Loïck Bourdois 4 | date: 07 Jul 2021 5 | lang-ref: faq 6 | --- 7 | 8 | 9 | # Foreword 10 | 11 | This course concerns the latest techniques in deep learning and representation learning, focusing on supervised and unsupervised deep learning, embedding methods, metric learning, convolutional and recurrent nets, with applications to computer vision, natural language understanding, and speech recognition. 12 | The prerequisites include: [DS-GA 1001 Intro to Data Science](https://cds.nyu.edu/academics/ms-curriculum/) or a graduate-level machine learning course. 13 | 14 | We invite you to prefer the videos on the [YouTube channel](https://www.youtube.com/playlist?list=PLLHTzKZzVU9e6xUfG10TkTWApKSZCzuBI) ("official" content) since the course is given by the teaching staff, unlike the website where it is the notes taken by the students during the course. 15 | The website is summaries of the videos, so the videos usually include additional information compared to the website. For example: 16 | - anecdotes about the different concepts discussed, 17 | - jokes, 18 | - the repetition of the same concept but in the form of different formulations, thus generally making it possible to understand an idea if a first formulation is not understood, 19 | - the students' questions, which can be the ones you have yourself during the viewing, 20 | If concepts are still not understood at the end of the video, you have the possibility to ask a question in the commentary of the YouTube video, which the website does not allow. 21 | - the references of the articles on which the course is based are present on the slides of the videos whereas they are absent from the website. 22 | 23 | The website thus serves more as a summary of the videos or as a basis for your personal notes that you take while watching the videos. 24 | Note that you can easily switch from the site to a moment of a given video by clicking on the paragraph titles of the web pages. 25 | 26 | 27 | # FAQ 28 | 29 | Here are some answers to frequently asked questions: 30 | - **Does taking this course lead to certification?** 31 | > No, it does not. In order to offer a certification, we would have to be able to evaluate you, but the content has not been designed for this (unlike a MOOC for example). As this is a frequent request, we are thinking about proposing a certification for future editions of the course. 32 | - **How much time should I spend on this course?** 33 | > For each week, there is approximately 2h30/3h of video content. With the time dedicated to note taking and playing with the notebooks, a total estimate of 5 hours per week seems reasonable. For the rest, it depends on the level of immersion you want to achieve in a given topic (reading the referenced articles, applying what was seen in class to your own projects, etc.). 34 | - **Where to ask a question after watching a video?** 35 | > You can ask it directly in the comments section under the YouTube video in question, and Alfredo will be happy to answer it. 
If the question is about a specific point in the video, please include the time stamp. 36 | > You can also do this on the class [Discord](https://discord.gg/CthuqsX8Pb) specifically for students. It is also used to coordinate viewing groups, discuss assignments, suggest improvements, or generally discuss any topic related to the course. 37 | - **Can I use this course?** 38 | > Of course, the course is under the [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](http://creativecommons.org/licenses/by-nc-sa/4.0/). 39 | > This means that: 40 | > - You may not use the material for commercial purposes. 41 | > - You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. 42 | > - If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original. 43 | > 44 | > For credit, you can use the following BibTeX: 45 | > ```bibtex 46 | > @misc{canziani2020nyudlsp21, 47 | > author = {Canziani, Alfredo and LeCun, Yann}, 48 | > title = {{NYU Deep Learning, Spring 2021}}, 49 | > howpublished = "\url{https://atcold.github.io/NYU-DLSP21}", 50 | > year = {2021}, 51 | > note = "[Online; accessed ]" 52 | > } 53 | > ``` 54 | 55 | 56 | 57 | 58 | # Disclaimer 59 | 60 | All other texts found on this site are lecture notes taken by students of the New York University during lectures given by Yann Le Cun, Alfredo Canziani, Ishan Misra, Awni Hannun and Marc'Aurelio Ranzato. 61 | Thus the texts in English were written by several people, which has an impact on the homogeneity of the texts (some write in the past tense, others in the present tense; the abbreviations used are not always the same; some write short sentences, while others write sentences of up to 5 or 6 lines, etc.). 62 | It is possible that there may be some omissions: typing errors, spelling mistakes, etc. 63 | If you notice any, we invite you to submit a PR on the [GitHub directory of the site](https://github.com/Atcold/NYU-DLSP21/pulls) specifying with an `[EN]` that it concerns the English translation. 64 | 65 | Wishing you a deep reading ! 66 | -------------------------------------------------------------------------------- /docs/en/week01/01.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.01 3 | title: Week 1 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Some history can be found [here](https://atcold.github.io/NYU-DLSP20/en/week01/01-1/), while gradient descent can be found [here](https://atcold.github.io/NYU-DLSP20/en/week02/02-1/). 10 | 11 | 12 | ## Practicum 13 | 14 | This plus the next practicum's summary can be found [here](https://atcold.github.io/NYU-DLSP20/en/week01/01-3/). 15 | -------------------------------------------------------------------------------- /docs/en/week02/02-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.02-3 3 | title: Problem Motivation, Linear Algebra, and Visualization 4 | lecturer: Alfredo Canziani 5 | authors: Rajashekar Vasantha 6 | date: 04 Feb 2021 7 | --- 8 | 9 | 10 | ## Resources 11 | 12 | Please follow Alfredo Canziani [on Twitter @alfcnz](https://twitter.com/alfcnz). 
Videos and textbooks with relevant details on linear algebra and singular value decomposition (SVD) can be found by searching Alfredo's Twitter, for example type `linear algebra (from:alfcnz)` in the search box. 13 | 14 | 15 | ## [Neural Nets: Rotation and Squashing](https://youtu.be/0TdAmZUMj2k) 16 | A traditional neural network is an alternating collection of two blocks - the linear blocks and the non-linear blocks. Given below is a block diagram of a traditional neural network. 17 |
18 |
19 |
20 | 21 | Figure 1: Block Diagram of a Traditional Neural Network 22 |
23 |
24 | The linear blocks (Rotations, for simplicity) are given by: 25 | 26 | $$ 27 | \vect{s}_{k+1} = \mW_k z_k 28 | $$ 29 | 30 | And the non-linear blocks (Squashing functions for intuitive understanding) are given by: 31 | 32 | $$ \vect{z}_k = h(\vect{s}_k) $$ 33 | 34 | In the above diagram and equations, $$\vx \in \mathbb{R}^n$$ represents the input vector. $$\mW_k \in \mathbb{R}^{n_{k} \times n_{k-1}}$$ represents the matrix of an affine transformation corresponding to the $$k^{\text{th}}$$ block and is described below in further detail. The function $h$ is called the activation function and this function forms the non-linear block of the neural network. Sigmoid, ReLu and tanh are some of the common activation functions and we will look at them in the later parts of this section. After alternate applications of linear and non-linear blocks, the above network produces an output vector $$\vect{s}_k \in \mathbb{R}^{n_{k-1}}$$. 35 | 36 | Let us first have a look at the linear block to gain some intuition on affine transformations. As a motivating example, let us consider image classification. Suppose we take a picture with a 1 megapixel camera. This image will have about 1,000 pixels vertically and 1,000 pixels horizontally, and each pixel will have three colour dimensions for red, green, and blue (RGB). Each particular image can then be considered as one point in a 3 million-dimensional space. With such massive dimensionality, many interesting images we might want to classify -- such as a dog *vs.* a cat -- will essentially be in the same region of the space. 37 | 38 | In order to effectively separate these images, we consider ways of transforming the data in order to move the points. Recall that in 2-D space, a linear transformation is the same as matrix multiplication. For example, the following are transformations, which can be obtained by changing matrix characteristics: 39 | 40 | - Rotation (when the matrix is orthonormal). 41 | - Scaling (when the matrix is diagonal). 42 | - Reflection (when the determinant is negative). 43 | - Shearing. 44 | - Translation. 45 | 46 | Note that translation alone is not linear since 0 will not always be mapped to 0, but it is an affine transformation. Returning to our image example, we can transform the data points by translating such that the points are clustered around 0 and scaling with a diagonal matrix such that we "zoom in" to that region. Finally, we can do classification by finding lines across the space which separate the different points into their respective classes. In other words, the idea is to use linear and nonlinear transformations to map the points into a space such that they are linearly separable. This idea will be made more concrete in the following sections. 47 | 48 | In the next part, we visualize how a neural network separates points and a few linear and non-linear transformations. This can be accessed [here](https://atcold.github.io/NYU-DLSP20/en/week01/01-3/). 49 | -------------------------------------------------------------------------------- /docs/en/week02/02.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.02 3 | title: Week 2 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Similar to [this](https://atcold.github.io/NYU-DLSP20/en/week11/11-1/), [this](https://atcold.github.io/NYU-DLSP20/en/week11/11-2/), and possibly more. 10 | 11 | 12 | ## Practicum 13 | 14 | We discuss the motivation for applying transformations to data points visualized in space. 
We talk about Linear Algebra and the application of linear and non-linear transformations. We discuss the use of visualization to understand the function and effects of these transformations. We walk through examples in a Jupyter Notebook and conclude with a discussion of functions represented by neural networks. 15 | -------------------------------------------------------------------------------- /docs/en/week03/03-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.03-3 3 | title: Spiral classification 4 | lecturer: Alfredo Canziani 5 | authors: Wenhao Li 6 | date: 6 May 2021 7 | --- 8 | 9 | ## [Typora](https://typora.io/) 10 | Typora is a useful tool to write markdown with the addition of formulae in LaTeX. It is convenient to write paper and homework, and generating pdf file with Typora. 11 | 12 | ## [Notion](https://www.notion.so/) 13 |
14 |
15 |
16 | Here you can place all your favorite stuff. This includes but is not limited to recipes, music, books, notes. Everything in one place, simple and powerful. 17 | 18 | When you find some useful article regarding Deep Learning, you may want to collect it for future review. The database is just all you need. You can find [more information](https://www.notion.so/Intro-to-databases-fd8cd2d212f74c50954c11086d85997e) about how to use the database. 19 | 20 | First you need to create a database by "Workspace" -> "Add a new page". Inside this page, choose "/table" -> "Table - Full Page". In addition to filling out the information related to the paper, we usually want to cover "The Golden Circle" aka "What? Why? How?" in our summary. 21 | 22 | This is an [example](https://www.notion.so/When-to-use-parametric-models-in-reinforcement-learning-d4c5e586677e49338a41b663231c0633) of how to organize your summary. 23 | 24 | 25 | 26 | 27 | ## [Diagram.net](https://app.diagrams.net/) 28 | 29 | Diagrams.net is a great tool to draw neural network diagrams. Next we will introduce a few rules to make our diagrams more consistent with the ones in lecture. 30 | 31 | 32 | 33 |
34 |
35 |
36 | 37 | The grayscale background means this is an observation, which means they are data points from a given dataset. You can check the input and labels by going to the directory of the dataset if you want. 38 | 39 |
40 |
41 |
42 | 43 | We use "Delay" to denote the encoder(e.g., neural network). 44 | 45 | 46 |
47 |
48 |
49 | 50 | In this example, $\vx$ and $\vy$ are observations. 51 | 52 | In the half above, we feed the $\vx$ to a given encoder to get a prediction $\bar {\vy}$. This is called forward propagation. 53 | 54 | In the half below, we want to get the prediction $\bar{\vx}$ given observation $\vy$. We keep doing gradient descent to make the network output as close as to $\vy$. This is called amortizing inference. 55 | 56 | Usually, we use backpropagation to compute the gradient, then we apply gradient descent with those computed values to train the model. This example shows that backpropagation is NOT only used during training. Backpropagation can also be used for inference. 57 | 58 | 59 | 60 | ## Spiral Classification 61 | The following content is mostly the same, so [here](https://atcold.github.io/NYU-DLSP20/en/week02/02-3/) you can find what you need. 62 | -------------------------------------------------------------------------------- /docs/en/week03/03.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.03 3 | title: Week 3 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Parts can be found [here](https://atcold.github.io/NYU-DLSP20/en/week03/03-1/) and part [here](https://atcold.github.io/NYU-DLSP20/en/week06/06-2/). 10 | 11 | 12 | ## Practicum 13 | 14 | We introduced how to draw deep network schematics conveniently using diagrams.net. Then we showed the different effect of using only linear transformation, and the effect of combining linear and non-linear transformation together on spiral classification. Finally, we showed the mathematical principles underlying neural networks, including chain rule derivation, back propagation, and gradient descent. 15 | -------------------------------------------------------------------------------- /docs/en/week04/04.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.04 3 | title: Week 4 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Similar to [last year's edition](https://atcold.github.io/NYU-DLSP20/en/week06/06-1/). 10 | 11 | ## Practicum A & B 12 | 13 | Similar to last year's edition of [CNN](https://atcold.github.io/NYU-DLSP20/en/week03/03-3/) and [RNN](https://atcold.github.io/NYU-DLSP20/en/week06/06-3/). 14 | -------------------------------------------------------------------------------- /docs/en/week05/05.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.05 3 | title: Week 5 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Similar to [last year's](https://atcold.github.io/NYU-DLSP20/en/week07/07-1/) but different. 10 | 11 | ## Practicum 12 | 13 | Same as [last year](https://atcold.github.io/NYU-DLSP20/en/week15/15-1/). 14 | -------------------------------------------------------------------------------- /docs/en/week06/06.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.06 3 | title: Week 6 4 | --- 5 | 6 | 7 | ## Lecture 8 | 9 | Similar to [this](https://atcold.github.io/NYU-DLSP20/en/week14/14-1/) and [this](https://atcold.github.io/NYU-DLSP20/en/week14/14-2/). 10 | 11 | ## Practicum 12 | 13 | Same as [last year](https://atcold.github.io/NYU-DLSP20/en/week15/15-2/). 
14 | -------------------------------------------------------------------------------- /docs/en/week07/07-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.07-3 3 | title: Introduction to Autoencoders 4 | lecturer: Alfredo Canziani 5 | authors: Vidit Bhargava, Monika Dagar 6 | date: 18 March 2021 7 | --- 8 | ## Applications of Autoencoder 9 | 10 | 11 | ### DALL-E: Creating Images from Text 12 | 13 | DALL-E (released by OpenAI) is a neural network based on the Transformers architecture, that creates images from text captions. It is a 12-billion parameter version of GPT-3, trained on a dataset of text-image pairs. 14 | 15 |
16 |
17 | Figure 1: DALL-E: Input-Output 18 |
19 | 20 | Go to the [website](https://openai.com/blog/dall-e/) and play with the captions! 21 | 22 | 23 | ## Autoencoder 24 | Let's start with some definitions: 25 | 26 | 27 | ### Definitions 28 | 29 | 30 | #### Input 31 | 32 | $\vx$: is observed during both training and testing 33 | 34 | $\vy$: is observed during training but not testing 35 | 36 | $\vz$: is not observed (neither during training nor during testing). 37 | 38 | 39 | #### Output 40 | 41 | $\vh$: is computed from the input (hidden/internal) 42 | 43 | $\vytilde$: is computed from the hidden (predicted $\vy$, ~ means *circa*) 44 | 45 | Confused? 46 | Refer to the below figure to understand the use of different variables in different machine learning techniques. 47 | 48 |
49 |
50 | Figure 2: Variable definitions in different machine learning techniques 51 |
52 | 53 | 54 | ### Introduction 55 | 56 | These kinds of networks are used to learn the internal structure of some input and encode it in a hidden internal representation $\vh$, which expresses the input. 57 | 58 | We already learned how to train energy-based models, let's look at the below network: 59 | 60 |
61 |
62 | Figure 3: Autoencoder Architecture 63 |
64 | 65 | Here, instead of computing the minimization of the energy $\red{E}$ for $\vz$, we use an encoder that approximates the minimization and provides a hidden representation $\vh$ for a given $\vy$. 66 | 67 | $$ 68 | \vh = \Enc(\vy) 69 | $$ 70 | 71 | Then the hidden representation is converted into $\vytilde$ (here we don't have a predictor, we have an encoder). 72 | 73 | $$ 74 | \vytilde= \Dec (\vh) 75 | $$ 76 | 77 | Basically, $\vh$ is the output of a squashing function $f$ applied to a rotation of our input/observation $\vy$, and $\vytilde$ is the output of a squashing function $g$ applied to a rotation of our hidden representation $\vh$. 78 | 79 | $$ 80 | \vh = f(\mW_h \vy + \vb_h) \\ 81 | \vytilde = g(\mW_y \vh + \vb_y) 82 | $$ 83 | 84 | Note that here $\vy$ and $\vytilde$ both belong to the same input space, while $\vh$ belongs to $\mathbb{R}^d$, the internal representation space. $\mW_h$ and $\mW_y$ are the rotation matrices. 85 | 86 | $$ 87 | \vy, \vytilde \in \mathbb{R}^n \\ 88 | \vh \in \mathbb{R}^d \\ 89 | \mW_h \in \mathbb{R}^{d \times n} \\ 90 | \mW_y \in \mathbb{R}^{n \times d} 91 | $$ 92 | 93 | This is called an autoencoder. The encoder performs amortized inference, so we no longer minimize the energy $\red{E}$ but rather $\red{F}$: 94 | 95 | $$ 96 | \red{F}(\vy) = \red{C}(\vy,\vytilde) + \red{R}(\vh) 97 | $$ 98 | 99 | 100 | ### Reconstruction Costs 101 | 102 | Below are two examples of reconstruction energies: 103 | 104 | 105 | #### Real-Valued Input: 106 | 107 | $$ 108 | \red{C}(\vy,\vytilde) = \Vert{\vy-\vytilde}\Vert^2 = \Vert \vy-\Dec[\Enc(\vy)] \Vert^2 109 | $$ 110 | 111 | This is the squared Euclidean distance between $\vy$ and $\vytilde$. 112 | 113 | 114 | #### Binary input 115 | 116 | In the case of binary input, we can simply use binary cross-entropy: 117 | 118 | $$ 119 | \red{C}(\vy,\vytilde) = - \sum_{i=1}^n{\vy{_i}\log(\vytilde{_i}) + (1-\vy{_i})\log(1-\vytilde{_i})} 120 | $$ 121 | 122 | 123 | ### Loss Functionals 124 | 125 | The loss functional is the average, across all training samples, of the per-sample loss function: 126 | 127 | $$ 128 | \mathcal{L}(\red{F}(\cdot),\mY) = \frac{1}{m}\sum_{j=1}^m{\ell(\red{F}(\cdot),\vy^{(j)})} \in \mathbb{R} 129 | $$ 130 | 131 | We take the energy loss and try to push the energy down on $\vytilde$: 132 | 133 | $$ 134 | \ell_{\text{energy}}(\red{F}(\cdot),\vy) = \red{F}(\vy) 135 | $$ 136 | 137 | 138 | ### Use-cases 139 | 140 | The size of the hidden representation $\vh$ obtained with these networks can be either smaller or larger than the input size. 141 | 142 | If we choose a smaller $\vh$, the network can be used for non-linear dimensionality reduction. 143 | 144 | In some situations it can be useful to have a $\vh$ that is larger than the input; however, in this scenario, a plain autoencoder would collapse. In other words, since we are trying to reconstruct the input, the model is prone to copying all the input features into the hidden layer and passing them along as the output, thus essentially behaving as an identity function. This needs to be avoided, as it would imply that our model fails to learn anything. 145 | 146 | To prevent the model from collapsing, we have to employ techniques that constrain the volume of the region that can take zero or low energy values. These techniques can be some form of regularization, such as sparsity constraints, adding noise, or sampling.
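For concreteness, below is a minimal PyTorch sketch of the autoencoder described above; the sizes $n$ and $d$, the choice of squashing functions, and the random batch are stand-ins chosen for illustration, not values prescribed by the lecture.

```python
import torch
from torch import nn

n, d = 784, 30                   # assumed input and hidden dimensions

# h = f(W_h y + b_h)  and  y_tilde = g(W_y h + b_y)
encoder = nn.Sequential(nn.Linear(n, d), nn.Tanh())
decoder = nn.Sequential(nn.Linear(d, n), nn.Sigmoid())

criterion = nn.MSELoss()         # squared Euclidean reconstruction cost C(y, y_tilde)
optimiser = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3
)

y = torch.rand(64, n)            # a random batch standing in for real observations

h = encoder(y)                   # hidden representation
y_tilde = decoder(h)             # reconstruction
loss = criterion(y_tilde, y)

optimiser.zero_grad()
loss.backward()
optimiser.step()
```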
147 | 148 | 149 | ### Denoising autoencoder 150 | 151 | We add some augmentation/corruption like Gaussian noise to an input sampled from the training manifold $\vyhat$ before feeding it into the model and expect the reconstructed input $\vytilde$ to be similar to the original input $\vy$. 152 | 153 |
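A minimal sketch of this corruption step is given below; the noise level $\sigma$ and the encoder/decoder sizes are assumptions made for the example.

```python
import torch
from torch import nn

n, d, sigma = 784, 30, 0.3                  # assumed sizes and noise level
encoder = nn.Sequential(nn.Linear(n, d), nn.Tanh())
decoder = nn.Sequential(nn.Linear(d, n), nn.Sigmoid())

y = torch.rand(16, n)                       # clean samples from the training manifold
y_hat = y + sigma * torch.randn_like(y)     # corrupted input fed to the model
y_tilde = decoder(encoder(y_hat))           # reconstruction
loss = ((y_tilde - y) ** 2).mean()          # compared against the *clean* target y
```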
154 |
155 | Figure 4: Denoising Autoencoder Network architecture. 156 |
157 | 158 | An important note: The noise added to the original input should be similar to what we expect in reality, so the model can easily recover from it. 159 | 160 |
161 |
162 | Figure 5: Measuring the traveling distance of the input data 163 |
164 | 165 | In the image above, the light colour points on the spiral represent the original data manifold. As we add noise, we go farther from the original points. These noise-added points are fed into the auto-encoder to generate this graph. 166 | The direction of each arrow points to the original datapoint the model pushes the noise-added point towards; whereas the size of the arrow shows by how much. 167 | We also see a dark purple spiral region which exists because the points in this region are equidistant from two points on the data manifold. 168 | 169 | -------------------------------------------------------------------------------- /docs/en/week07/07.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.07 3 | title: Week 7 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum 14 | We started with an application of autoencoders: DALL-E. We discussed Autoencoders (in terms of Energy-Based Models) and their use cases. Next, we discussed the reconstruction costs and the loss functions we should use. Finally, we discussed a particular type of autoencoder, i.e., denoising autoencoder. 15 | -------------------------------------------------------------------------------- /docs/en/week08/08.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.08 3 | title: Week 8 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum 14 | In this section, we introduced some Generative Models including Denoising AE, Contractive AE and Variational AE. We compared the functionalities and advantages of Variational AEs over Basic Autoencoders. We explored the objective function of VAE in detail, understanding how it enforced some structure in the latent space. -------------------------------------------------------------------------------- /docs/en/week09/09.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.09 3 | title: Week 9 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum 14 | In this section, we covered the implementation of *Generative models* viz. **Undercomplete Autoencoder**, **Denoising Autoencoders**, **Variational Autoencoders** and **Generative Adversarial Networks**. We analyze these models from the perspective of the framework of Energy Based Models (EBM). In doing so, we realize that these generative models can be considered as extensions of EBMs and differ from each other with subtle architectural adjustments. -------------------------------------------------------------------------------- /docs/en/week10/10-2.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.10-2 3 | title: SEER, AVID + CMA, Distillation, Barlow Twins 4 | lecturer: Ishan Misra 5 | authors: Duc Anh Phi, Krishna Karthik Reddy Jonnala 6 | date: 17 May 2021 7 | --- 8 | 9 | ## SEER: Learning from uncharted Images 10 | Compared to Imagenet dataset, real world images may have different distributions (cartoons, memes) and may or may not have a prominent object. In order to verify if the models work well on images outside of Imagenet dataset we decided to test *Swav* method on large scale data. SEER is *Swav* method tested on billions of unfiltered images. 11 | 12 | Following graph compares the fine tune performance of the four models when transferred to Imagenet. 
Using SEER method, a model can be trained with more than a billion parameters which are going to transfer really well to Imagenet. 13 |
14 |
15 | Figure 1 Comparing SEER to other methods on ImageNet data 16 |
17 | 18 | As shown in the following table, the performance of SEER is comparable to the networks trained on curated data with weak supervision. 19 |
20 |
21 | Figure 2 SEER performance vs weak supervision model 22 |
23 | 24 | ## AVID + CMA 25 | Audio Visual Instance Discrimination with Cross Modal Agreement is a method that combines *contrastive learning* and *clustering* techniques. 26 | 27 | For contrastive learning on an audio-video dataset, when the (audio, video) inputs are passed to the two encoders ($f_a, f_v$) we get two embeddings, one for the audio and one for the video. Embeddings from the same sample should be close in feature space compared to embeddings from different samples. 28 | 29 |
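A minimal sketch of such a cross-modal contrastive objective is given below: an InfoNCE-style loss applied in both directions, where the batch size, embedding dimension, and temperature are assumptions made for illustration.

```python
import torch
import torch.nn.functional as F

B, D, tau = 32, 128, 0.07                       # assumed batch size, embedding dim, temperature
z_a = F.normalize(torch.randn(B, D), dim=1)     # audio embeddings, standing in for f_a(audio)
z_v = F.normalize(torch.randn(B, D), dim=1)     # video embeddings, standing in for f_v(video)

logits = z_a @ z_v.t() / tau                    # similarity of every audio to every video clip
targets = torch.arange(B)                       # the matching pair sits on the diagonal
loss = (F.cross_entropy(logits, targets)        # audio -> video direction
        + F.cross_entropy(logits.t(), targets)) / 2  # video -> audio direction
```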
30 |
31 | Figure 3 AVID: Audio Video Instance Discrimination 32 |
33 | 34 | To introduce the *clustering*, the notion of the positives and negatives is expanded as shown in the following image. Computing the similarities in the video and audio embeddings from a reference point to all the other samples results in *Positive Set* and *Negative Set*. A sample falls into positive set when both its audio and video embeddings are similar to the reference embeddings. 35 |
36 |
37 | Figure 4 CMA: Cross-Modal Agreements 38 |
39 | 40 | ## Distillation 41 | Distillation methods are similarity-maximization-based methods. Like other SSL methods, distillation tries to prevent trivial solutions. It does so through asymmetry, in two different ways: 42 | * Asymmetric *learning rule* between student and teacher 43 | * Asymmetric *architecture* between student and teacher 44 | 45 | $$ f_{\vtheta}^{\text{student}}(I) = f_{\vtheta}^{\text{teacher}}(\text{augment}(I))$$ 46 | 47 | ### BYOL 48 | BYOL is a distillation technique whose architecture is shown below. 49 |
50 |
51 | Figure 5 BYOL architecture 52 |
53 | 54 | There is an asymmetry in architecture between student and teacher, as the student has an additional prediction head. Gradient backpropagation only happens through the student encoder, clearly creating an asymmetry in the learning rule. In BYOL there is an additional source of asymmetry in the weights of the student and teacher encoders: the teacher encoder is created as a moving average of the student encoder. These asymmetries prevent the model from collapsing to trivial solutions. 55 | 56 | ### SimSiam 57 | Recent studies showed that not all three sources of asymmetry discussed in BYOL are needed to prevent trivial solutions. In the *SimSiam* architecture the student and teacher share the same set of weights, and there are two sources of asymmetry: 58 | * In the architecture, the student encoder has an additional predictor head. 59 | * In the learning rule, the backpropagated gradients are passed only through the student encoder, not the teacher encoder. After each epoch, the weights of the student encoder are copied to the teacher encoder. 60 | 61 |
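Below is a minimal sketch of one SimSiam-style update; the tiny linear backbone and predictor are placeholders, and the essential ingredients are the shared weights and the stop-gradient on the branch that plays the teacher.

```python
import torch
from torch import nn
import torch.nn.functional as F

D = 128                                            # assumed embedding size
backbone  = nn.Sequential(nn.Linear(784, D))       # shared encoder (stand-in for a ConvNet)
predictor = nn.Sequential(nn.Linear(D, D))         # extra head on the student branch only

x1, x2 = torch.rand(32, 784), torch.rand(32, 784)  # two augmented views of the same images

def neg_cosine(p, z):
    # negative cosine similarity; z.detach() is the stop-gradient (teacher side)
    return -F.cosine_similarity(p, z.detach(), dim=1).mean()

z1, z2 = backbone(x1), backbone(x2)
p1, p2 = predictor(z1), predictor(z2)
loss = 0.5 * (neg_cosine(p1, z2) + neg_cosine(p2, z1))  # symmetrised loss
loss.backward()
```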
62 |
63 | Figure 6 SimSiam architecture 64 |
65 | 66 | ## Barlow Twins 67 | 68 | ### Hypothesis from information theory 69 | The efficient coding hypothesis was proposed by Horace Barlow in 1961 as a theoretical model of sensory coding in the brain. Within the brain, neurons communicate with each other by sending electrical impulses called spikes. Barlow hypothesised that the spikes in the sensory system form a neural code for efficiently representing sensory information. By efficient, Barlow meant that the code minimises the number of spikes needed to transmit a given signal. 70 | 71 | ### Implementation 72 | A successful approach to Self-Supervised-Learning (SSL) is to learn representations which are invariant to distortions of the input sample. However, a recurring problem with this approach is the existence of trivial constant solutions. 73 | 74 | The Barlow Twins method proposes an objective function that naturally avoids such collapse by measuring the cross-correlation matrix between the outputs of two identical networks fed with distorted versions of a sample and making them as close as possible to the identity matrix. 75 | 76 | Barlow's redundancy-reduction principle applied to a pair of identical networks. The objective function measures the cross-correlation matrix between the output features of two identical networks fed with distorted versions of a batch of samples and attempts to bring this matrix close to the identity. This causes the representation vectors of distorted versions of a sample to be similar, while minimizing the redundancy between the components of these vectors (Figure 7). 77 | 78 |
79 |
80 | Figure 7 Barlow-Twins Architecture 81 |
82 | 83 | More formally, it produces two distorted views for all images of a batch $X$. The distorted views are obtained via a distribution of data augmentations $\mathcal{T}$. The two batches of distorted views $Y^A$ and $Y^B$ are then fed to a function $f_{\vtheta}$, typically a deep network with trainable parameters $\vtheta$, producing batches of representations $Z^{A}$ and $Z^{B}$ respectively. 84 | 85 | The loss function $\mathcal{L_{BT}}$ contains a invariance and redundancy reduction: 86 | 87 | $$ 88 | \mathcal{L_{BT}} \triangleq \underbrace{\sum_i (1-\mathcal{C}_{ii})^2}_\text{invariance term} + ~~\lambda \underbrace{\sum_{i}\sum_{j \neq i} {\mathcal{C}_{ij}}^2}_\text{redundancy reduction term} 89 | $$ 90 | 91 | where $\lambda$ is a constant controlling the importance of the first and second terms of the loss, and where $\mathcal{C}$ is the cross-correlation matrix computed between the outputs of the two identical networks along the batch dimension: 92 | 93 | $$ 94 | \mathcal{C}_{ij} \triangleq \frac{ 95 | \sum_b z^A_{b,i} z^B_{b,j}} 96 | {\sqrt{\sum_b {(z^A_{b,i})}^2} \sqrt{\sum_b {(z^B_{b,j})}^2}} 97 | $$ 98 | 99 | where $b$ indexes batch samples and $i,j$ index the vector dimension of the networks' outputs. $\mathcal{C}$ is a square matrix with size the dimensionality of the network's output. In other words 100 | 101 | Intuitively, the invariance term of the objective, by trying to equate the diagonal elements of the cross-correlation matrix to 1, makes the representation invariant to the distortions applied. The redundancy reduction term, by trying to equate the off-diagonal elements of the cross-correlation matrix to 0, decorrelates the different vector components of the representation. This decorrelation reduces the redundancy between output units, so that the output units contain non-redundant information about the sample. 102 | -------------------------------------------------------------------------------- /docs/en/week10/10-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.10-3 3 | lecturer: Alfredo Canziani 4 | title: Transformer Encoder-predictor-decoder architecture 5 | authors: Rahul Ahuja, jingshuai jiang 6 | date: 15 Apr 2021 7 | --- 8 | 9 | 10 | ## The Transformer 11 | 12 | Before elaborating the encoder-predictor-decoder architecture, we are going to review two models we've seen before. 13 | 14 | 15 | ### Conditional EBM latent variable architecture 16 | 17 | 18 | We should be familiar with the terminology of these modules from the previous lectures. 19 | In the conditional EBM latent variable architecture, we have $x$ the conditional variable which goes into a predictor. We have $\vy$ which is the target value. The decoder modules will produce $\vytilde$ when fed with a latent variable $z$ and the output of the predictor. $\red{E}$ is the energy function which minimizes the energy between $\vytilde$ and $\vy$. 20 | 21 | 22 |
23 |
24 | Figure 1: (From the EBM lecture) Diagram above depicting the architecture of a conditional EBM latent variable model. 25 |
26 | 27 | ### Autoencoder architecture 28 | 29 | In the autoencoder architecture, we observed that there is no conditional input, but only a target variable. The entire architecture is trying to learn the structure in these target variables. The target value $\vy$ is fed through an encoder module, which transforms it into a hidden representation space, forcing only the most important information through. The decoder then maps these hidden variables back to the original target space, producing $\vytilde$, and the cost function tries to minimize the distance between $\vytilde$ and $\vy$. 30 | 31 | 32 | 33 |
34 |
35 | Figure 2: (From the autoencoder lecture) Architecture of a basic Autoencoder consisting of encoder and decoder modules. 36 |
37 | 38 | 39 | 40 | ### Encoder-predictor-decoder architecture 41 | 42 |
43 |
44 | Figure 3: The transformer architecture with a unit delay module. 45 |
46 | 47 | 48 | In a transformer, $\vy$ (the target sentence) is a discrete-time signal: it has a discrete representation over a time index. The $\vy$ is fed into a unit delay module followed by an encoder. The unit delay here transforms $\vy[j] \mapsto \vy[j-1]$. The only difference with the autoencoder is this delayed variable, and it is what lets us use this structure as a language model, producing the future when given the past. 49 | 50 | 51 | 52 |
53 |
54 | Figure 4: A unit delay module transforms $\vy[j] \mapsto \vy[j-1]$ 55 |
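In code, the unit delay is just a one-step shift of the target sequence, with a start-of-sequence token filling the first position. A minimal sketch, assuming `y` is a batch of token indices of shape `(batch, time)` and `sos` is a hypothetical start-of-sequence index:

```python
import torch

def unit_delay(y: torch.Tensor, sos: int = 0) -> torch.Tensor:
    """Map y[j] -> y[j-1]: prepend a start token and drop the last token."""
    start = torch.full((y.size(0), 1), sos, dtype=y.dtype, device=y.device)
    return torch.cat([start, y[:, :-1]], dim=1)
```

This delayed copy of $\vy$ is what the target-side encoder consumes, so position $j$ of the predictor only sees targets up to $j-1$.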
56 | 57 | The observed signal, $\vx$ (the source sentence), is also fed through an encoder. The outputs of both the encoder and the delayed encoder are fed into the predictor, which gives a hidden representation $\vh$. This is very similar to a denoising autoencoder, as the delay module acts as the noise in this case. And $\vx$ here makes this entire architecture a conditional delayed denoising autoencoder. 58 | 59 | ### Encoder module 60 | You can see the detailed explanation of these modules from last year's slides [here](https://atcold.github.io/NYU-DLSP20/en/week12/12-3/). 61 | 62 | 63 | ### Predictor Module 64 | 65 | The transformer predictor module follows a procedure similar to the encoder's. However, there is one additional sub-block (i.e. cross-attention) to take into account. Additionally, the output of the encoder modules acts as the input to this module. 66 | 67 | 68 |
69 |
70 | Figure 5: The predictor module consisting of a cross attention block 71 |
72 | 73 | ### Cross attention 74 | You can see the detailed explanation of cross attention from last year's slides [cross-attention](https://atcold.github.io/NYU-DLSP20/en/week12/12-3/). 75 | 76 | 77 | ### Decoder module 78 | 79 | Contrary to what the authors of the Transformer paper define, the decoder module here consists of `1D-convolution` and `Add, Norm` blocks. The output of the predictor module is fed to the decoder module, and the output of the decoder module is the predicted sentence. We can train this by providing the delayed target sequence. 80 | 81 | 82 |
83 |
84 | Figure 6: The correct notation of the encoder, predictor and decoder modules in a transformer 85 |
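Putting the pieces together, the forward pass described in this section can be sketched as follows. This is a schematic illustration rather than the lecture's reference code: the layer sizes, the `sos` index, the use of `nn.TransformerEncoder` for both encoders, and the reduction of the predictor to a single cross-attention block are simplifying assumptions.

```python
import torch
from torch import nn

class EncoderPredictorDecoder(nn.Module):
    def __init__(self, vocab, d=256, heads=8, sos=0):
        super().__init__()
        self.sos = sos
        self.emb_x = nn.Embedding(vocab, d)
        self.emb_y = nn.Embedding(vocab, d)
        make_layer = lambda: nn.TransformerEncoderLayer(d, heads, batch_first=True)
        self.enc_x = nn.TransformerEncoder(make_layer(), num_layers=3)  # source encoder
        self.enc_y = nn.TransformerEncoder(make_layer(), num_layers=3)  # delayed-target encoder
        self.cross = nn.MultiheadAttention(d, heads, batch_first=True)  # predictor's cross-attention
        # Simplified decoder: the lecture's decoder uses 1D-convolution and Add, Norm blocks
        self.dec = nn.Sequential(nn.Conv1d(d, d, 1), nn.ReLU(), nn.Conv1d(d, vocab, 1))

    def forward(self, x, y):
        # Unit delay: y[j] -> y[j-1], with a start token in front
        start = torch.full((y.size(0), 1), self.sos, dtype=y.dtype, device=y.device)
        y_delayed = torch.cat([start, y[:, :-1]], dim=1)
        hx = self.enc_x(self.emb_x(x))           # encode the source sentence
        hy = self.enc_y(self.emb_y(y_delayed))   # encode the delayed target (causal mask omitted for brevity)
        h, _ = self.cross(query=hy, key=hx, value=hx)        # predictor: cross-attention over the source
        return self.dec(h.transpose(1, 2)).transpose(1, 2)   # per-position logits over the vocabulary
```

Training then amounts to comparing these logits with the (non-delayed) target $\vy$, exactly as in the denoising-autoencoder picture above.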
86 | -------------------------------------------------------------------------------- /docs/en/week10/10.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.10 3 | title: Week 10 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | A brief introduction to self-supervised learning and pretext tasks, and a discussion of the associated trivial solutions. Categorization of recent self-supervised methods: introduction to contrastive learning and the loss function used. Brief overviews of PIRL, SimCLR and MoCo, followed by SwAV, which is a clustering-based method. Pretraining on ImageNet and non-ImageNet data is also discussed towards the end. 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum 14 | We introduce attention, focusing on self-attention and its hidden-layer representations of the inputs. Then, we introduce the key-value store paradigm and discuss how to represent queries, keys, and values as rotations of an input. Finally, we use attention to interpret the transformer architecture, taking a forward pass through a basic transformer from an EBM perspective, and comparing the encoder-predictor-decoder paradigm to sequential architectures. -------------------------------------------------------------------------------- /docs/en/week11/11-1.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.11-1 3 | lecturer: Awni Hannun 4 | title: Speech Recognition and Graph Transformer Network I 5 | authors: Cal Peyser, Kevin Chang 6 | date: 14 Apr 2021 7 | --- 8 | 9 | ## Modern Speech Recognition 10 | 11 | This section is a high-level introduction to modern speech recognition: specifically, why it has become so good, and which problems remain. 12 | 13 | * Automatic speech recognition has greatly improved since 2012 14 | * Machine performance can be as good as or better than human-level performance 15 | * Speech recognition still struggles with 16 | * conversational speech 17 | * multiple speakers 18 | * lots of background noise 19 | * the accent of the speakers 20 | * certain features not well represented in the training data 21 | * Pre-2012 speech recognition systems consisted of many hand-engineered components 22 | * larger datasets were not useful, so datasets remained small 23 | * combining modules only at inference time instead of learning them together allowed errors to cascade 24 | * it was hard for researchers to know how to improve such complex systems 25 | 26 | * Post-2012 improvements to speech recognition systems 27 | * replaced a lot of the traditional components 28 | * added more data 29 | * the two above work together in a virtuous cycle 30 | 31 | 32 | ## The CTC Loss 33 | 34 | Given some input speech utterance $\mX$, which consists of $T$ frames of audio, we desire to produce a transcription $\mY$, and we'll think of our transcription as consisting of the letters of a sentence, so $y_1$ is the first letter and $y_U$ is the last letter. 35 | 36 | $$ 37 | \mX=[x_1,...,x_T],\ \mY=[y_1,...,y_U] 38 | $$ 39 | 40 | We compute the conditional probability (the score) to evaluate a transcription, and we want to maximize this probability. 41 | 42 | $$\log{P(\mY \mid \mX;\theta)}$$ 43 | 44 | 45 | ### Example 1 46 | 47 | $$ 48 | \mX=[x_1, x_2, x_3],\ \mY=[c,a,t] 49 | $$ 50 | 51 | $\mX$ has three frames and $\mY$ has three letters; the number of inputs matches the number of outputs, so it's easy to compute the probability with a one-to-one mapping.
52 | 53 | $$\log{P(c \mid x_1)} + \log{P(a \mid x_2)} + \log{P(t \mid x_3)}$$ 54 | 55 | 56 | ### Example 2 57 | 58 | $$ 59 | \mX=[x_1, x_2, x_3, x_4],\ \mY=[c,a,t] 60 | $$ 61 | 62 | * Alignment: three possible ways 63 | * $A_1$: $x_1\rightarrow c$, $x_2\rightarrow a$, $x_3\rightarrow t$, $x_4\rightarrow t$ 64 | * $A_2$: $x_1\rightarrow c$, $x_2\rightarrow a$, $x_3\rightarrow a$, $x_4\rightarrow t$ 65 | * $A_3$: $x_1\rightarrow c$, $x_2\rightarrow c$, $x_3\rightarrow a$, $x_4\rightarrow t$ 66 | 67 | * Which alignment should we use to compute the score? 68 | * All of them. We're going to try to increase the score of all alignments and then hope the model sorts things out internally. The model can decide to optimize these different alignments, weight them accordingly, and learn which one is the best. 69 | 70 | $$\log{P(\mY \mid \mX)}=\log{[P(A_1 \mid \mX)+P(A_2 \mid \mX)+P(A_3 \mid \mX)]}$$ 71 | 72 | **Reminder**: use the actual-softmax (log-sum-exp) to sum probabilities in the log domain. 73 | 74 | We want $\log{(P_1+P_2)}$ given only $\log{P_1}$ and $\log{P_2}$: 75 | 76 | $$ 77 | \begin{aligned} 78 | \text{actual-softmax}(\log{P_1}, \log{P_2}) 79 | &= \log{(P_1+P_2)} \\ 80 | &= \log{(e^{\log{P_1}}+e^{\log{P_2}})} 81 | \end{aligned} 82 | $$ 83 | 84 | ### Alignment graph 85 | 86 | An alignment graph is a way to encode the set of possible alignments for an arbitrary-length input. 87 | 88 |
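Before moving on to the alignment graph, note that the "actual-softmax" trick above is just `torch.logsumexp`; a quick sketch, where the three alignment scores are made-up numbers for illustration only:

```python
import torch

# Hypothetical alignment scores log P(A_i | X) for Example 2 (not real model outputs)
scores = torch.tensor([-2.1, -2.7, -3.0])
# "actual-softmax": log(P(A_1) + P(A_2) + P(A_3)), computed stably in the log domain
log_p_y_given_x = torch.logsumexp(scores, dim=0)
# Equivalent (but underflow-prone for long utterances) naive computation
assert torch.isclose(log_p_y_given_x, torch.log(scores.exp().sum()))
```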
89 |
90 | Figure 1: Alignment graph
91 |
92 |
93 | 94 | This graph is sometimes called a weighted finite-state acceptor (WFSA). The bold state marked 0 at the beginning is the start state, and the concentric circle marked 3 is an accepting state. Each edge carries a label and a weight, one on each side of a slash. Any path in this graph is an encoding of an alignment. 95 | 96 | 97 | ### Problem: too many alignments 98 | 99 | There's a problem when using all of the alignments. The input audio $\mX$ can have lots of frames — in practice, thousands of them. The transcription $\mY$ can have lots of letters — in practice, hundreds or more. This gives an astronomically large number of alignments, so we can't compute each individual score and sum them all. 100 | 101 | 102 | ### Solution: the forward algorithm (dynamic programming) 103 | 104 | Define the forward variable $\alpha_t^u$, where the subscript $t$ is where we are in the input and the superscript $u$ is where we are in the output. This represents the score for all alignments of length $t$ which end in the output $y_u$. 105 | 106 | Suppose $\mX=[x_1,x_2,x_3,x_4]$ and $\mY=[c,a,t]$. The forward variable $\alpha_2^c$ represents the score of all possible alignments of length two, over the first two frames, that end in $c$, the first output of the transcription. There's only one possible alignment for that: $x_1\rightarrow c$, $x_2\rightarrow c$. This is simple to compute. 107 | 108 | $$\alpha_2^c=\log{P(c \mid x_1)}+\log{P(c \mid x_2)}$$ 109 | 110 | Similarly, $\alpha_2^a$ has only one possibility. 111 | 112 | $$\alpha_2^a=\log{P(c \mid x_1)}+\log{P(a \mid x_2)}$$ 113 | 114 | For $\alpha_3^a$, there are two possible alignments: 115 | 116 | * $A_1$: $x_1\rightarrow c$, $x_2\rightarrow c$, $x_3\rightarrow a$ 117 | * $A_2$: $x_1\rightarrow c$, $x_2\rightarrow a$, $x_3\rightarrow a$ 118 | 119 | $$ 120 | \alpha_3^a=\text{actual-softmax}[\log{P(A_1)}, \log{P(A_2)}] \\ 121 | \log{P(A_1)}=\log{P(c \mid x_1)}+\log{P(c \mid x_2)}+\log{P(a \mid x_3)} \\ 122 | \log{P(A_2)}=\log{P(c \mid x_1)}+\log{P(a \mid x_2)}+\log{P(a \mid x_3)} 123 | $$ 124 | 125 | This is the naive approach to computing $\alpha_3^a$. 126 | 127 | Using this forward variable, we seek to model the probability distribution $P(\mY \mid \mX) = \sum_{a \in A} P(a \mid \mX)$, where $A$ is the set of all possible alignments from $\mY$ to $\mX$. This decomposes as 128 | 129 | $$P(\mY \mid \mX) = \sum_{a \in A} \prod_{t=1}^T P(a_t \mid \mX)$$ 130 | 131 | where the $P(a_t \mid \mX)$ are the per-frame output probabilities (computed from the logits) of a system such as an RNN. That is, to compute the likelihood of the transcript $\mY$ we must marginalize over an intractably large number of alignments. We may do this with a recursive decomposition of the forward variable. The presentation below is inspired by https://distill.pub/2017/ctc/, which is an excellent introduction to the algorithm. 132 | 133 | First, we permit an alignment to contain the empty output $\epsilon$ in order to account for the fact that audio sequences are longer than their corresponding transcripts. We also collapse repetitions, so that $\{a, \epsilon, a, a, \epsilon, a\}$ corresponds to the sequence $aaa$. We will also define $\alpha$ using an alternative transcript $Z$, which is equal to $\mY$ but is interspersed with $\epsilon$. That is, $Z = \{\epsilon, y_1, \epsilon, y_2, ..., y_n, \epsilon \}$. 134 | 135 | Now, suppose $y_i = y_{i+1}$, so that $Z$ contains a subsequence $y_i, \epsilon, y_{i+1}$, and suppose $y_{i+1}$ occurs at position $s$ in $Z$.
Then the alignment for $\alpha_{s,t}$ can be arrived at in one of two ways: either the prediction at time $t-1$ is $y_{i+1}$ (in which case the repetition is collapsed) or else the prediction at time $t-1$ is $\epsilon$. So, we may decompose: 136 | 137 | $$\alpha_{s,t} = (\alpha_{s, t-1} + \alpha_{s-1, t-1}) P(z_s \mid \mX)$$ 138 | 139 | where the elements of the sum represent the two possible prefixes to the alignment. If, on the other hand, we have $y_i \ne y_{i+1}$, then there is the additional third possibility that the prediction at time $t-1$ is equal to $y_i$. So, we have the decomposition 140 | 141 | $$\alpha_{s,t} = (\alpha_{s, t-1} + \alpha_{s-1, t-1} + \alpha_{s-2, t-1}) P(z_s \mid \mX)$$ 142 | 143 | By computing $\alpha_{\vert Z\vert, T}$, we may effectively marginalize over all possible alignments between the transcript $\mY$ and the audio $\mX$, allowing efficient training and inference. This is called Connectionist Temporal Classification, or CTC. 144 | 145 | 146 | -------------------------------------------------------------------------------- /docs/en/week11/11.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.11 3 | title: Week 11 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | We provide an introduction to the problem of speech recognition using neural models, emphasizing the CTC loss for training and inference when input and output sequences are of different lengths. 9 | 10 | 11 | ## Lecture part B 12 | We discuss beam search for use during inference, and how that procedure may be modeled at training time using a Graph Transformer Network. Graph transformer networks are basically weighted finite-state automata with automatic differentiation, which allow us to encode priors into a graph. There are different types of weighted finite-state automata and different operations on them, including union, Kleene closure, intersection, composition, and forward score. The loss function is usually the difference between two functions. We can easily implement these networks using the GTN library. 13 | 14 | 15 | ## Practicum 16 | -------------------------------------------------------------------------------- /docs/en/week12/12-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.12-3 3 | title: MPC (EBM version) 4 | lecturer: Alfredo Canziani 5 | authors: Yang Zhou, Daniel Yao 6 | date: 28 Apr 2021 7 | --- 8 | 9 | 10 | ## Action plan 11 | - Model predictive control **[Here we are today]** 12 | - Backprop through kinematic equation 13 | - Minimisation of the latent 14 | - Truck backer-upper 15 | - Learning an emulator of the kinematics from observations 16 | - Training a policy 17 | - PPUU 18 | - Stochastic environment 19 | - Uncertainty minimisation 20 | - Latent decoupling 21 | 22 | 23 | ## State transition equations -- Evolution of the state 24 | 25 | Here we discuss a state transition equation where $\vx$ represents the state and $\vu$ represents the control. We can formulate the state transition function in a continuous-time system, where $\vx(t)$ is a function of the continuous variable $t$. 26 | 27 |
28 | $$ 29 | \begin{aligned} 30 | \dot{\vx} &= f(\vx,\vu)\\ 31 | \frac{\partial \vx(t)}{\partial t} &= f(\vx(t), \vu(t)) 32 | \end{aligned} 33 | $$ 34 |
35 | 36 |
37 |
38 | Figure 1: State and control illustration of a tri-cycle 39 |
40 | 41 | We use a tri-cycle as the example to study this. The orange wheel is the control $\vu$, and $(x_c,y_c)$ is the instantaneous center of rotation. You can also have two wheels in the front; for simplicity, we use one wheel as the example. 42 | 43 | In this example, $\vx=(x,y,\theta,s)$ is the state and $\vu=(\phi,a)$ is the control. 44 | 45 | $$ 46 | \left\{\begin{array}{l} 47 | \dot{x}=s \cos \theta \\ 48 | \dot{y}=s \sin \theta \\ 49 | \dot{\theta}=\frac{s}{L} \tan \phi \\ 50 | \dot{s}=a 51 | \end{array}\right. 52 | $$ 53 | 54 | 55 | We can reformulate the differential equation from the continuous-time system to a discrete-time system: 56 | 57 | $$ 58 | \vx[t]=\vx[t-1]+f(\vx[t-1], \vu[t]) \mathrm{d} t 59 | $$ 60 | 61 | To be clear, we show the units of $\vx$ and $\vu$. 62 | 63 | $$ 64 | \begin{array}{l} 65 | {[\vu]=\left(\mathrm{rad}\ \frac{\mathrm{m}}{\mathrm{s}^{2}}\right)} \\ 66 | {[\vx]=\left(\mathrm{m} \ \mathrm{m} \ \mathrm{rad} \ \frac{\mathrm{m}}{\mathrm{s}}\right)} 67 | \end{array} 68 | $$ 69 | 70 | Let's take a look at different examples. We use different colors for the variables we care about. 71 | 72 |
73 |
74 | Figure 2: State Formulation 75 |
76 | 77 | Example 1: Uniform Linear Motion: No acceleration, no steering 78 |
79 |
80 | Figure 3: Control of Uniform Linear Motion 81 |
82 |
83 |
84 | Figure 4: State of Uniform Linear Motion 85 |
86 | 87 | 88 | Example 2: Crush into itself: Negative acceleration, no steering 89 |
90 |
91 | Figure 5: Control of Crushing into itself 92 |
93 |
94 |
95 | Figure 6: State of Crushing into itself 96 |
97 | 98 | 99 | Example 3: Sine wave: Positive steering for the first part, negative steering for the second part 100 |
101 |
102 | Figure 7: Control of Sine Wave 103 |
104 |
105 |
106 | Figure 8: State of Sine Wave 107 |
108 | 109 | 110 | ## Kelley-Bryson algorithm 111 | What if we want the tri-cycle to reach a specified destination with a specified speed? 112 | - This can be achieved by inference using the **Kelley-Bryson algorithm**, which utilizes **backprop through time** and **gradient descent**. 113 | 114 | 115 | ### Recap of RNN 116 | We can compare the inference process here with the training process of an RNN. 117 | 118 | Below is an RNN schematic chart. We feed the variable $\vx[t]$ and the previous state $\vh[t-1]$ into the predictor, while $\vh[0]$ is set to zero. The predictor outputs the hidden representation $\vh[t]$. 119 |
120 |
121 | Figure 9: RNN schematic chart 122 |
123 | 124 | 125 | ### Optimal control (inference) 126 | In optimal control (inference), shown below, we feed the latent variable (the control) $\vz[t]$ and the previous state $\vx[t-1]$ into the predictor, while $\vx[0]$ is set to $\vx_0$. The predictor outputs the state $\vx[t]$. 127 | 128 |
129 |
130 | Figure 10: Optimal Control schematic chart 131 |
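Concretely, the inference loop of Figure 10 can be written in a few lines. This is a minimal sketch (the full notebook version appears later on this page): `f` is the kinematic model, `cost_f` is one of the cost functions discussed below, and the controls `z` are the only variables being optimised:

```python
import torch

def infer_controls(f, cost_f, x0, target, T=5, steps=100, lr=1e-2, dt=1.0):
    z = torch.zeros(T, 2, requires_grad=True)       # latent controls z[1], ..., z[T]
    optimizer = torch.optim.SGD([z], lr=lr)
    for _ in range(steps):
        x = [x0]
        for t in range(T):                          # unfold the predictor through time
            x.append(x[-1] + f(x[-1], z[t]) * dt)
        cost = cost_f(torch.stack(x), target)
        optimizer.zero_grad()
        cost.backward()                             # backprop through time...
        optimizer.step()                            # ...and gradient descent wrt z only
    return z.detach()
```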
132 | 133 | Backprop is implemented in both the RNN and optimal control. However, gradient descent is applied to the predictor's parameters in the RNN, whereas it is applied to the latent variable $\vz$ in optimal control. 134 | 135 | 136 | ### Unfolded version of optimal control 137 | In the unfolded version of optimal control, the cost can be attached either to the final step of the tri-cycle only, or to every step. Moreover, cost functions can take many forms, such as Average Distance, Softmin, etc. 138 | 139 | 140 | #### Set the cost to the final step 141 | From the figure below, we can see there is only one cost $c$, set at the final step (step 5), which measures the distance between our target $\vy$ and the state $\vx[5]$ reached with control $\vz[5]$. 142 |
143 |
144 | Figure 11: Cost to the final step 145 |
146 | 147 | $(1)$ If the cost function only involves the final position, with no restrictions on the final speed, we obtain the inference results shown below. 148 |
149 |
150 | Figure 12: Cost function involving only the final position 151 |
152 | From the figure above, it is seen that when $T=5$ or $T=6$, the final position meets the target position, but when $T$ is above 6 the final position does not. 153 | 154 | $(2)$ If the cost function involves the final position and zero final speed, we obtain the inference results shown below. 155 |
156 |
157 | Figure 13: Cost function involving the final position and zero final speed 158 |
159 | From the figure above, it is seen that when $T=5$ or $T=6$, the final position roughly meets the target position, but when $T$ is above 6 the final position does not. 160 | 161 | 162 | #### Set the cost to every step 163 | From the figure below, we can see there is a cost $c$ set at every step. 164 |
165 |
166 | Figure 14: Cost to every step 167 |
168 | 169 | $(1)$ Cost Example: Average Distance 170 |
171 |
172 | Figure 15: Cost Example: Average Distance 173 |
174 | 175 | $(2)$ Cost Example: Softmin 176 |
177 |
178 | Figure 16: Cost Example: Softmin 179 |
180 | 181 | Different forms of cost functions can be explored through experimentation. 182 | 183 | 184 | ## Optimization_Path_Planner-Notebook 185 | In this notebook, we use the tri-cycle as an example as well. 186 | 187 | 188 | ### Define the kinematic model of a tricycle $\dot{\vx}=f(\vx,\vu)$. 189 | * $\vx$ represents the state: ($x$, $y$, $θ$, $s$) 190 | * $\vu$ represents the control: ($ϕ$, $a$) 191 | * We feed $\vx[t-1]$ and $\vu[t]$ to obtain the next state $\vx[t]$ 192 | 193 | ```python 194 | def f(x, u, t=None): 195 | L = 1 # m, distance between the rear and front wheels 196 | x, y, θ, s = x # unpack the state 197 | 198 | ϕ, a = u # unpack the control: steering angle and acceleration 199 | f = torch.zeros(4) 200 | f[0] = s * torch.cos(θ) 201 | f[1] = s * torch.sin(θ) 202 | f[2] = s / L * torch.tan(ϕ) 203 | f[3] = a 204 | return f 205 | ``` 206 | 207 | 208 | ### Define several cost functions 209 | As mentioned above, cost functions can take various forms. In this notebook, we list 5 kinds, as follows: 210 | * `vanilla_cost`: Focuses on the final position. 211 | * `cost_with_target_s`: Focuses on the final position and a final zero speed. 212 | * `cost_sum_distances`: Focuses on the position at every step, and minimizes the mean of the distances. 213 | * `cost_sum_square_distances`: Focuses on the position at every step, and minimizes the mean of the squared distances. 214 | * `cost_logsumexp`: The distance of the closest position should be minimized. 215 | 216 | 217 | ```python 218 | def vanilla_cost(state, target): 219 | x_x, x_y = target 220 | return (state[-1][0] - x_x).pow(2) + (state[-1][1] - x_y).pow(2) 221 | 222 | def cost_with_target_s(state, target): 223 | x_x, x_y = target 224 | return (state[-1][0] - x_x).pow(2) + (state[-1][1] - x_y).pow(2) \ 225 | + (state[-1][-1]).pow(2) # penalise a nonzero final speed 226 | 227 | def cost_sum_distances(state, target): 228 | x_x, x_y = target 229 | dists = ((state[:, 0] - x_x).pow(2) + (state[:, 1] - x_y).pow(2)).pow(0.5) 230 | return dists.mean() 231 | 232 | def cost_sum_square_distances(state, target): 233 | x_x, x_y = target 234 | dists = ((state[:, 0] - x_x).pow(2) + (state[:, 1] - x_y).pow(2)) 235 | return dists.mean() 236 | 237 | def cost_logsumexp(state, target): 238 | x_x, x_y = target 239 | dists = ((state[:, 0] - x_x).pow(2) + (state[:, 1] - x_y).pow(2))#.pow(0.5) 240 | return -1 * torch.logsumexp(-1 * dists, dim=0) # softmin over the distances 241 | ``` 242 | 243 | 244 | ### Define path planning with cost 245 | * The optimizer is set to be SGD. 246 | * The time step `dt` is set to be 1 s; `T` is the number of steps. 247 | * We need to compute every state from the initial state with the following code: 248 | ```python 249 | x = [torch.tensor((0, 0, 0, s),dtype=torch.float32)] 250 | for t in range(1, T+1): 251 | x.append(x[-1] + f(x[-1], u[t-1]) * dt) 252 | x_t = torch.stack(x) 253 | ``` 254 | * Then compute the cost: 255 | ```python 256 | cost = cost_f(x_t, (x_x, x_y)) 257 | costs.append(cost.item()) 258 | ``` 259 | * Implement backprop and update $\vu$: 260 | ```python 261 | optimizer.zero_grad() 262 | cost.backward() 263 | optimizer.step() 264 | ``` 265 | * Now we can feed values to path_planning_with_cost to obtain inference results and plot trajectories.
**Example**: 266 | ```python 267 | path_planning_with_cost( 268 | x_x=5, x_y=1, s=1, T=5, epochs=5, 269 | stepsize=0.01, cost_f=vanilla_cost, debug=False 270 | ) 271 | ``` 272 | -------------------------------------------------------------------------------- /docs/en/week12/12.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.12 3 | title: Week 12 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | This lecture introduces the topic of Neural Machine Translation with the help of an example. We then discuss language modelling, model architecture, NMT inference. Further, we discuss the issues faced because of the languages and the need for Low Resource Machine Translation. Also, we examine a case study and the challenges faced in Low Resource MT, different stages in the cycle of research, how they can be used for Machine Translation. 9 | 10 | ## Lecture part B 11 | This week's lecture was a guest lecture by [Marc'Aurelio Ranzato](https://ai.facebook.com/people/marc-aurelio-ranzato/), who is a research scientist and manager at the Facebook AI Research (FAIR) lab, where he works to enable machines to learn with weaker supervision and to efficiently transfer knowledge across tasks. The first part of Lecture B focuses on understanding low resource machine translation, and the second half discusses potential domain mismatches in machine learning and machine translation. 12 | 13 | 14 | ## Practicum 15 | We introduced the state transition function and the way to model a physical system with state and control. We discussed how to achieve optimal control by inference using Kelley-Bryson algorithm, which utilizes backprop through time and gradient descent. Finally, we explained the notebook of Optimization_Path_Planner, in which various cost functions are defined and path planning is implemented to guide a tri-cycle to reach the desired position with the specified speed. 16 | -------------------------------------------------------------------------------- /docs/en/week13/13.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.13 3 | title: Week 13 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum -------------------------------------------------------------------------------- /docs/en/week14/14.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.14 3 | title: Week 14 4 | --- 5 | 6 | 7 | ## Lecture part A 8 | 9 | 10 | ## Lecture part B 11 | 12 | 13 | ## Practicum -------------------------------------------------------------------------------- /docs/en/week15/15-1.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.15-1 3 | lecturer: Alfredo Canziani and Jiachen Zhu 4 | title: Joint Embedding Methods - Contrastive 5 | authors: Sai Charitha Akula 6 | date: 12 May 2022 7 | --- 8 | 9 | 10 | 11 | ## Visual Representation Learning 12 | 13 | Representation learning trains a system to produce the representations required for feature detection or classification from raw data. Visual representation learning is about the representations of images or videos in particular. 14 | 15 |
16 |
17 | Fig. 1: Visual Representation Learning 18 |
19 | 20 | This can be broadly classified as shown above, and the focus of the lecture is on self-supervised visual representation learning. 21 | 22 | ## Self-supervised Visual Representation Learning 23 | 24 | It is a two-stage process comprising pretraining and evaluation. 25 | 26 | ##### Step 1: Pretraining 27 | 28 | Pretraining uses a large amount of unlabeled data to train a backbone network. Different methods will produce the backbone network differently. 29 | 30 | ##### Step 2: Evaluation 31 | 32 | Evaluation can be performed in two ways: feature extraction and finetuning. Both methods generate a representation of the image and then use it to train a downstream task head (DsTH). The learning of the downstream task thus happens in the representation space instead of the image space. The only difference between the two methods is the stop gradient before the encoder: in finetuning, we can change the encoder, unlike in feature extraction. 33 | 34 |
35 |
36 | Fig. 2: Self-supervised Visual Representation Learning 37 |
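The practical difference between the two evaluation protocols is simply whether gradients are allowed to flow back into the backbone. A minimal sketch — the backbone, head, and dimensions here are placeholders, not the lecture's exact setup:

```python
import torch
from torch import nn

def downstream_setup(backbone: nn.Module, head: nn.Module, finetune: bool):
    """Feature extraction: freeze the backbone (stop-gradient); finetuning: train it too."""
    if finetune:
        params = list(backbone.parameters()) + list(head.parameters())
    else:
        for p in backbone.parameters():
            p.requires_grad_(False)            # stop gradient before the encoder
        params = list(head.parameters())
    return torch.optim.SGD(params, lr=1e-2)

# Placeholder modules for illustration only
backbone = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 512), nn.ReLU())
head = nn.Linear(512, 10)                      # downstream task head (DsTH)
opt = downstream_setup(backbone, head, finetune=False)   # feature extraction / linear evaluation
```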
38 | 39 | ### Generative Models 40 | 41 | The popular one is the denoising autoencoder. You train the model to reconstruct the original image from the noisy image. After the training, we retain the encoder for the downstream task. 42 | 43 | ##### Issues: 44 | 45 | The model tries to solve a problem that is too hard. For example: For a lot of downstream tasks, you don't have to reconstruct the image, which is a tougher problem than the downstream task itself. Also, sometimes the loss function is not good enough. For example: the Euclidean distance used as a reconstruction loss metric isn’t a good metric for comparing the similarity between two images. 46 | 47 |
48 |
49 | Fig. 3: Generative Models - Autoencoder 50 |
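A minimal sketch of this pretraining setup — the architecture, noise level, and image size are placeholders rather than the lecture's choices:

```python
import torch
from torch import nn

encoder = nn.Sequential(nn.Flatten(), nn.Linear(784, 128), nn.ReLU())   # retained for downstream tasks
decoder = nn.Sequential(nn.Linear(128, 784), nn.Sigmoid())              # discarded after pretraining
opt = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)

def dae_step(x):                                 # x: a batch of flattened images in [0, 1]
    x_noisy = (x + 0.3 * torch.randn_like(x)).clamp(0, 1)   # corrupt the input
    x_hat = decoder(encoder(x_noisy))            # reconstruct the clean image
    loss = nn.functional.mse_loss(x_hat, x)      # Euclidean reconstruction loss (see caveat above)
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()
```

After pretraining, only `encoder` is kept and evaluated as in Fig. 2.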
51 | 52 | ### Pretext Tasks 53 | 54 | It’s almost the same as above but you train the model to figure out a smart way to generate pseudo labels. For example: Given the image of a tiger, the shuffled image is the input x, and the output y would be the correct way of labeling the patches. The network successfully reinventing the patches indicates that it understands the image. 55 | 56 | ##### Issues: 57 | Designing the pretext task is tricky. if you design the task too easy, the network won’t learn good representation. But if you design the task hard, it can become harder than the downstream task and the network wouldn't be trained well. Also, the representations generated via this method will be tailored to the specific downstream task. 58 | 59 |
60 |
61 | Fig. 4: Pretext Tasks 62 |
63 | 64 | ## Joint Embedding Methods 65 | 66 | Joint Embedding methods try to make their backbone network robust to certain distortions and are invariant to data augmentation. 67 | 68 | As an example, as shown in the image below, for an image of a dog, you take two distorted versions of the image, then encode them with your backbone network to generate representations and you make them to be close to each other. Thus, ensuring the two images share some semantic information. 69 | 70 |
71 |
72 | Fig. 1: Data Augmentation in JEM 73 |
74 | 75 | They also prevent trivial solutions. The network could collapse with just the above condition, as the network can become invariant not only to distortions but to the input altogether i.e., irrespective of the input, it could generate the same output. JEMs try to prevent this trivial solution in different ways. 76 | 77 | Instead of considering only local energy ( between two pairs of distorted images ), these methods get a batch of the images and ensure that the collection of the representation, $\green{H}_{\vx}$, doesn’t have the same rows or columns. ( which is the trivial solution ) 78 | 79 |
80 |
81 | Fig. 2: Preventing Trivial Solutions in JEM 82 |
83 | 86 | 87 | ### Components: 88 | 89 | Every Joint Embedding Method has the following components: 90 | 91 | 1. Data augmentation ( $\vx$ and $\vy$ ): The way you generate the two distorted versions of the image. 92 | 2. Backbone Network ( $\lavender{BB}$ ) - The definition of the backbone 93 | 3. Energy function ( $\red{D}$ ) - The definition of the distance between the two representations. 94 | 4. Loss functionals ( $\green{A}$ and $\green{B}$ ) - The definition of the loss functionals calculated per batch of size N. 95 | 96 | ### Joint Embedding Loss Functions: 97 | 98 | Joint Embedding Loss Functions contain two components: 99 | 1. A term that pushes the positive pair closer 100 | 2. An (implicit) term that prevents the trivial solution (constant output) - implicit because a lot of "other methods" do not have an explicit term to prevent the trivial solution. 101 | 102 | To make the training stable, people usually normalize the embeddings or put a hinge on the loss function to prevent the norm of embeddings from becoming too large or too small 103 | 104 | ### Training Methods 105 | 106 | The training methods can be further classified into the following four types: 107 | 1. Contrastive methods 108 | 2. Non-Contrastive methods 109 | 3. Clustering methods 110 | 4. Other methods 111 | 112 | We now go into the details of each of these methods 113 | 114 | ### Contrastive methods 115 | 116 | Contrastive methods push positive pairs closer and negative pairs away. More details about the contrastive methods including MoCo, PIRL, and SimCLR have been discussed [here](https://atcold.github.io/NYU-DLSP20/en/week08/08-1/). 117 | 118 | 119 | #### The InfoNCE loss function: 120 | Both SimCLR and MoCO use the InfoNCE loss function. 121 | 122 | $$ 123 | \red{L}(\boldsymbol{w},\vx,\vy) = \\[0.5cm] 124 | = -\text{log} \frac{\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}} ) ) } 125 | { \sum_{\red{n}}^{N}\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vx}^\red{n}} )) + 126 | \sum_{\red{n}}^{N}\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}^\red{n}} )) } \\[0.5cm] 127 | 128 | = -\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}} ) + \text{log} \Big[ 129 | \sum_{\red{n}}^{N}\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vx}^\red{n}} )) + 130 | \sum_{\red{n}}^{N}\exp(\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}^\red{n}} )) ]\\[0.5cm] 131 | 132 | = -\blue{\,\beta\,} \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}} ) + \text{softmax}_\blue{\beta} [ 133 | \text{sim} ( \green{h_{\vx}}, \green{h_{\vx}^\red{n}} ), 134 | \text{sim} ( \green{h_{\vx}}, \green{h_{\vy}^\red{n}} ) ] \\[0.5cm] 135 | 136 | \text{sim} (\green{h_{\vx}}, \green{h_{\vy}} ) = \frac{ \green{h_{\vx}}^\top \green{h_{\vy}} } { ||\green{h_{\vx}} || \, ||\green{h_{\vy}} || } 137 | 138 | $$ 139 | 140 | 141 | The first term indicates the similarity between positive pairs and the second term is the softmax between all the negative pairs. We would like to minimize this whole function. 142 | 143 | Notice that it gives different weights to different negative samples. The negative pair that has high similarity is pushed much harder than the negative pair with low similarity because there's a softmax. Also, the similarity measurement here is the inner product between the two representations, and to prevent the gradient explosion, the norm is normalized. Thus, even if the vector grew long, the term ensures that it is a unit vector. 
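A compact sketch of this loss for one batch, following the formula above directly (an illustration, not SimCLR's or MoCo's exact implementation; `beta` is the inverse temperature $\blue{\beta}$):

```python
import torch
import torch.nn.functional as F

def info_nce(h_x, h_y, beta=10.0):
    """h_x, h_y: (N, d) embeddings of two distorted views of the same N images."""
    h_x, h_y = F.normalize(h_x, dim=1), F.normalize(h_y, dim=1)  # unit vectors: dot product = cosine similarity
    pos = (h_x * h_y).sum(dim=1)                     # sim(h_x, h_y) for each positive pair
    sim_xx = h_x @ h_x.T                             # sim(h_x, h_x^n)
    sim_xy = h_x @ h_y.T                             # sim(h_x, h_y^n)
    negatives = torch.cat([sim_xx, sim_xy], dim=1)   # (for clarity, self and positive pairs are not masked out)
    denom = torch.logsumexp(beta * negatives, dim=1) # softmax over the candidate pairs, in the log domain
    return (-beta * pos + denom).mean()
```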
144 | 145 | #### Memory Bank: 146 | 147 | As already mentioned, these models require negative samples. However, finding negative pairs becomes difficult as the embedding spaces become large. 148 | 149 | To handle this, SimCLR and MoCO use large batch sizes to find the samples. The difference between SimCLR and MoCO is the way they deal with the large batch size. SimCLR uses 8192 as the batch size. However, MoCO tries to solve the requirement of a large batch size without actually using a large batch size by using a memory bank. It uses a small batch size but instead of using negative samples from only the current batch, it collects them even from previous batches. For example: with a 256 batch size, aggregating the previous 32 batches of negative samples results essentially in a batch size of 8192. This method saves memory and avoids the effort to generate the negative samples again and again. 150 | 151 |
152 |
153 | Fig. 4: Memory Bank 154 |
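The memory bank can be thought of as a first-in-first-out queue of embeddings from previous batches; a minimal sketch (the sizes are arbitrary):

```python
import torch
import torch.nn.functional as F

class FeatureQueue:
    """FIFO memory bank of negative embeddings produced by the momentum backbone."""
    def __init__(self, dim=128, size=8192):
        self.bank = F.normalize(torch.randn(size, dim), dim=1)
        self.ptr = 0

    @torch.no_grad()
    def enqueue(self, h):                          # h: (N, dim) embeddings of the current batch
        n = h.size(0)
        idx = (self.ptr + torch.arange(n)) % self.bank.size(0)
        self.bank[idx] = F.normalize(h, dim=1)     # overwrite the oldest entries, in circular order
        self.ptr = (self.ptr + n) % self.bank.size(0)

    def negatives(self):
        return self.bank                           # used as the h^n terms in the InfoNCE denominator
```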
155 | 156 | Issue: 157 | Because the backbone is updated at every step, the old negative samples in the bank were produced by earlier versions of it and, after a while, are not valid anymore, which can lead to a decrease in performance. To avoid this, MoCO uses a momentum backbone that slows down the training of the right backbone. In that case, the older momentum backbone and the new momentum backbone are not that different, retaining the validity of the negative samples even after a while. 158 |
160 |
161 | Fig. 5: Memory Bank with Momentum Backbone 162 |
162 | 163 | 164 | $\vartheta_{t+1}$ (the momentum backbone's parameters) is an exponential moving average of $\theta_{t}$. The learning rate of $\vartheta$ is $(1 - m)\,\eta$. High values of $m$ will make $\vartheta_{t}$ stable. $m=1$ will leave $\vartheta_{t}$ basically untrained. If $m$ is very small, like 0, then $\vartheta_{t+1}$ is $\theta_{t+1}$. 165 | 166 | $$ 167 | \theta_{t+1} = \theta_{t} - \eta\Delta\theta_{t} \\ 168 | \vartheta_{t+1} = m\vartheta_{t} + ( 1- m )\theta_{t+1} 169 | $$ 170 |
172 | $\theta:$ backbone parameters 173 |
174 | 175 |
176 | $\vartheta:$ momentum backbone parameters 177 |
178 | 179 | 180 | #### Disadvantages of Contrastive methods: 181 | 182 | In practice, people found out that contrastive methods need a lot of setup to make them work. They require techniques such as weight sharing between the branches, batch normalization, feature-wise normalization, output quantization, stop gradient, memory banks etc.,.This makes it hard to analyze. Also, they are not stable without the use of those techniques. 183 | -------------------------------------------------------------------------------- /docs/en/week15/15-2.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.15-2 3 | lecturer: Alfredo Canziani and Jiachen Zhu 4 | title: Joint Embedding Methods - Regularised 5 | authors: Sai Charitha Akula 6 | date: 12 May 2022 7 | --- 8 | 9 | ### Non-Contrastive methods 10 | 11 | #### Non-Contrastive methods and information theory: 12 | 13 | Most of the non-contrastive methods are based on information theory. For example: Redundancy reduction ( Barlow Twins ) and Information. They don't require special architectures or engineering techniques. 14 | 15 | #### VicReg: 16 | It tries to maximize the information content of the embeddings by producing embedding variables that are decorrelated to each other. If the variables are correlated to each other, they covariate together and the information content is reduced. Thus, it prevents an informational collapse in which the variables carry redundant information. Also, this method requires a comparatively small batch size. 17 | 18 | Two types of collapse can occur in these architectures: \\ 19 | $\textbf{Type 1}:$ Irrespective of the input, the network generates the same representation \\ 20 | $\textbf{Type 2}:$ Special collapse - Although different images have different representations, the information content is really low in each representation. 21 | 22 | ##### Loss function: 23 | The loss function is pushing: 24 | 1. Positive pairs closer - to be invariant to data augmentation 25 | 2. The variance of the embeddings large by pushing all of the diagonal terms of the covariance matrix large - to prevent the first kind of collapse 26 | 3. The covariance of the embeddings small by pushing all off the diagonal terms of the covariance matrix small- to prevent the second kind of collapse. 27 | 28 | 29 | $$ 30 | \\[0.5cm] 31 | \green{C} = \frac{1}{N} \green{H}^\top\green{H} \\[0.5 cm] 32 | 33 | \red{L}(\boldsymbol{w},\vx,\vy) = 34 | \Vert \green{h_{\vx}} - \green{h_{\vy}} \Vert^2 \\[0.2cm] 35 | 36 | + \frac{1}{d}[ \sum_{i}^{d} ( \gamma - \,_{\vx}\green{C}_{ii} )^+ + ( \gamma - \, _{\vy}\green{C}_{ii} )^+ ] \\ 37 | 38 | + \frac{1}{d}[ \sum_{i}^{d} \sum_{j \neq i}^{d} ( _{\vx}\green{C}_{ij} )^2 + (_{\vy}\green{C}_{ij} )^2 ] 39 | 40 | $$ 41 | 42 | 43 | ### Clustering methods 44 | 45 | #### SwAV 46 | 47 | This method prevents trivial solution by quantizing the embedding space. SwAV does the following: 48 | 49 | 1. Generates representations and stack the generated representations ( into $\green{H_{x}}$ and $\green{H_{y}}$ ). 50 | 2. Applies sinkhorn clustering method to each of the stacked representation to generate corresponding clustered $\green{\boldsymbol{Q}}$ matrices where each row ( $\violet{q_{\vx}}$ ) represents a one hot vector indicating the cluster the corresponding representation belongs to 51 | 3. Performs second clustering for the representations $\vh_{\vx}$ and $\vh_{\vy}$ with soft-kmeans. 
This step generates predictions $\violet{\tilde{q_{\vx}}}$ and $\violet{\tilde{q_{\vy}}}$ for $\green{q_{\vx}}$ and $\green{q_{\vy}}$, from $\vh_{\vy}$ and $\vh_{\vx}$ respectively (hence the name swapped prediction). 52 | 4. Minimizes the loss function, which is the sum of two cross-entropy functions: one between $\green{q_{\vx}}$ and $\violet{\tilde{q_{\vx}}}$, and one between $\green{q_{\vy}}$ and $\violet{\tilde{q_{\vy}}}$. 53 | 54 | 55 |
56 |
57 | Fig. 8: SWaV 58 |
59 | 60 | ##### The Loss function: 61 | 62 | Sinkhorn algorithm: 63 | Sinkhorn algorithm can distribute samples to not just one cluster but to every cluster. Thus, it can help us prevent all the data clustering into a single centroid or any such nonuniform distribution. It takes in hyperparameters that allow us to deploy different levels of uniform distribution across clusters degenerating to K-means algorithm on one extreme and to the perfectly uniform distribution on the other extreme 64 | 65 | Softargmax clustering: 66 | Each $\green{h_{\vy}}$ is normalized. $\boldsymbol{W}\green{h_{\vy}}$ indicates similarity between $\green{h_{\vy}}$ and all other centroids. Softargmax turns the cosine similarly ( positive or negative ) into a probability. 67 | 68 | Since this is predicting the $\green{q_{\vx}}$, we will compare the cross entropy of the prediction, $\violet{\tilde{q_{\vx}}}$, with the actual $\green{q_{\vx}}$ to measure the prediction. 69 | 70 | 73 | 74 | $$ 75 | 76 | \green{Q_{\vx}} = \text{sinkhorn}_{\boldsymbol{W}}(\green{H_{\vx}}) \in \mathbb{R}^{ N \times K } \\\\\\[0.2 cm] 77 | 78 | \green{Q_{\vx}} = [ \green{q_{\vx}}^1,...,\green{q_{\vx}}^N ]^\top \\\\[0.2 cm] 79 | 80 | \boldsymbol{W} \in \mathbb{R}^{ K \times d } : \text{dictionary} \\ \\[0.2 cm] 81 | 82 | \violet{\tilde{q_{\vx}}} = \text{softargmax}_{\blue{\beta}}(\boldsymbol{W}\green{h}_\vy) \in \mathbb{R}^{ K} \\ \\[0.2 cm] 83 | 84 | \red{F}(\vx, \vy) = \red{C}(\green{q_{\vx}}, \violet{\tilde{q_{\vx}}}) + \red{C}(\green{q_{\vy}}, \violet{\tilde{q_{\vy}}}) 85 | 86 | $$ 87 | 88 | 89 | ##### Interpretation of clusters: 90 | This method partitions latent space into a few clusters automatically without labels and the hope is that these clusters will be related to the actual classes. Thus, later, we would just need a few labeled data samples to assign each cluster to the corresponding label under supervised learning. 91 | 92 | ##### Invariance to data augmentation: 93 | Instead of pushing the pairs closer to each other, you push both the representations to be inside the same cluster. 94 | 95 | ##### Preventing trivial solution 96 | In a trivial solution, all the representations will be the same and thus belong to the same centroid. However, with sinkhorn, different clusters have an equal number of samples, thus the representations can’t be put into one centroid, preventing a trivial solution. 97 | 98 | ### Other methods 99 | 100 | The loss function for all the previous methods including contrasting methods needs a batch or pool of negative samples, thus creating problems with distributed training. However, the loss functions of these methods are local. These methods perform well but an understanding of why they don’t collapse is not yet available. Probably there's some implicit regularization happening in these networks to prevent them from converging to a trivial solution. 101 | 102 |
103 |
104 | Fig. 10: Other Methods 105 |
106 | 107 | #### BYOL: 108 | BYOL adds a predictor, predicting $\green{h_{\vy}}$ from $\green{h_{\vx}}$. The energy function ($\red{D}$) is a cosine similarity between $\green{h_{\vy}}$ and the predicted $\green{h_{\vy}}$. There is no term for negative samples, i.e., this method only pushes positive pairs closer and enforces nothing on negative pairs. It is thought that the asymmetrical architecture with extra layers is what makes this method work. 109 | 110 | SimSiam is a follow-up version that uses a regular backbone instead of the momentum backbone. 111 | 112 | #### Dino: 113 | The two softargmax components used have different coldness, or temperature. The energy function is the cross-entropy between these two, pushing them together. Even this method doesn't enforce anything on negative samples. 114 | 115 | #### Data2Vec: 116 | Data2Vec adds a layer norm at the end of the representation. 117 | 118 | ##### Initialization of the network: 119 | If you initialize the network with a trivial solution, then that network will never work. This is because, if the trivial solution is already achieved, the loss function produces a zero gradient, and the network can never escape from the trivial solution. However, in other cases, the training dynamics work out in such a way that these methods never converge to the trivial solution. 120 | 121 | 122 | ### Improvements for JEMs 123 | 124 | We can further improve these models by experimenting with data augmentation and network architecture. We don't have a good understanding of these, but they are very important. In fact, finding a good augmentation may boost performance more than changing the loss function. 125 | 126 | #### Data Augmentation 127 | 128 | The most dominant augmentations were proposed by SimCLR and improved a little bit by BYOL: 129 | 1. Random Crop (the most critical one) 130 | 2. Flip 131 | 3. Color Jitter 132 | 4. Gaussian Blur 133 | 134 | It has been found empirically that random crop is the most critical one. It might be because the random crop is the only way we can change the spatial information of the images. Flip does the same partly, but is weaker. Color jitter and Gaussian blur change the channels. 135 | 136 |
137 |
138 | Fig. 5: Data Augmentation 139 |
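These four transformations correspond to a standard torchvision pipeline. A sketch of a SimCLR-style augmentation — the crop size, probabilities, and jitter strengths vary between papers and are only indicative here:

```python
from torchvision import transforms

simclr_augment = transforms.Compose([
    transforms.RandomResizedCrop(224),                                              # random crop
    transforms.RandomHorizontalFlip(),                                              # flip
    transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),    # color jitter
    transforms.GaussianBlur(kernel_size=23, sigma=(0.1, 2.0)),                      # gaussian blur
    transforms.ToTensor(),
])
# Two independent draws give the two distorted views of the same image:
# x, y = simclr_augment(img), simclr_augment(img)
```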
140 | 141 | ##### Masking augmentation: 142 | Recently people are moving towards masking augmentation instead of traditional augmentation in which we mask out most ( ~75% in the below image ) of the patches. It can replace random crop since it’s another way to remove the redundancy of the spatial information 143 | 144 | **Issues:** 145 | This works well only with transformer type of architecture and not with convnet. This is because masking introduces too many random artificial edges. For any transformer, the first layer is the conv layer, with kernel size equal to the patch size and thus, this never experiences artificial edges. For convnets which have sliding windows, the artificial edges can't be ignored and will result in noise. 146 | 147 |
148 |
149 | Fig. 6: Masked Augmentation 150 |
151 | 152 | #### Network Architecture 153 | 154 | ##### Projector/Expander: 155 | It is a two/three-layer feed-forward neural network and empirical results show that it is always better to add this in the network architecture. 156 | 157 | The projector is used to project into a lower dimension and the expander is used to project into a higher dimension. A projector is used only during the pretraining and removed while performing the downstream task. This is because the projector removes a lot of information even if the output dimension of the projector and the backbone are the same. 158 | 159 | ##### Momentum Encoder: 160 | Even without a memory bank, a momentum encoder usually helps the performance of the downstream tasks, especially with weak data augmentation. 161 | 162 |
163 |
164 | Fig. 7: Projector/Expander 165 |
166 | -------------------------------------------------------------------------------- /docs/en/week15/15.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang-ref: ch.15 3 | title: Week 15 4 | --- 5 | 6 | ## Lecture part A 7 | 8 | 9 | As pointed out already, we can broadly classify Energy Based Models into generative or joint-embedding based on architectures and into contrastive or regularised & architectural based on training methods. 10 | 11 | 23 | 24 | 30 | 31 | In this section, we discussed Visual Representation Learning, focused on self-supervised visual representation learning. This can be classified into Generative models, Pretext Tasks and Joint Embedding methods. In generative models, you train the model to reconstruct the original image from the noisy image. In pretext tasks, you train the model to figure out a smart way to generate pseudo labels. Joint Embedding methods try to make their backbone network robust to certain distortions and are invariant to data augmentation. JEM training methods can be classified into four types: contrastive methods, non-contrastive methods, clustering methods and Other methods. He concluded the lecture by discussing contrastive methods which push positive pairs closer and negative pairs away. 32 | 35 | 36 | 37 | ## Lecture part B 38 | 39 | In this section, we discussed non-contrastive methods which are based on information theory and don’t require special architectures or engineering techniques. Then, he went on to discuss clustering methods which prevent trivial solution by quantizing the embedding space. Finally, he discussed "Other" methods which are local and don't create problem with distributed training unlike previous methods. He concluded the lecture by suggesting various improvisations for JEMs w.r.t Data augmentation and network architecture. 40 | 41 | 44 | -------------------------------------------------------------------------------- /docs/fr/README-FR.md: -------------------------------------------------------------------------------- 1 | 4 | # Cours sur l'apprentissage profond de la NYU - printemps 2021 (NYU-DLSP21) 5 | 6 | 7 | [🇬🇧](https://github.com/Atcold/NYU-DLSP21/blob/master/README.md)   [🇫🇷](https://github.com/Atcold/NYU-DLSP21/blob/master/docs/fr/README-FR.md) 8 | 9 | 10 | 24 | 25 | ## Nouvelle organisation du contenu 26 | 27 | Ce semestre, nous avons réorganisé le matériel didactique. 28 | Au cours de la première moitié du semestre, nous avons couvert 3 sujets, s'étalant sur deux semaines, chacun étant suivi d'un devoir. 29 | De plus, à chaque cours magistral sont associés des travaux dirigés. 30 | 31 | 1. Historique, rétropropagation et descente de gradient. 32 | 2. Partage des paramètres : réseaux récurrents et convolutifs. 33 | 3. Modèles à base d'énergie (EBMs pour *energy based models*) à variable latente (LV pour *latent variable*). 34 | 35 | Notez que nous avons remanié le programme et le contenu des cours. 36 | Nous avons traité les LV-EBMs comme un module *de base*, sur lequel il faut s'appuyer. 37 | 38 | 39 | 51 | 52 | ## La seconde moitié du semestre 53 | 54 | Alfredo pensait reproduire les mêmes travaux pratiques utilisés pour l'édition de l'année dernière, [NYU-DLSP20](https://github.com/Atcold/NYU-DLSP20), mais dans un ordre différent. 55 | 56 | Cependant il n'a pas pu. 57 | 58 | Les étudiants de cette année ont vu les LV-EBMs et on leur a parlé du *gâteau*. 
59 | Alfredo ne pouvait donc pas prétendre qu'il n'existe pas et enseigner comme s'ils n'étaient pas conscients de l'éléphant dans la pièce. 60 | Cela aurait été intellectuellement malhonnête. 61 | Il a donc redessiné l'ensemble de ses diapositives. 62 | 63 | 64 | 73 | 74 | ## Dépôt de ce semestre 75 | 76 | C'est pourquoi ce dépôt a été créé. 77 | Il n'est **pas** prévu de faire le même travail insensé que l'année dernière, mais Alfredo a besoin d'un espace où poster des diapositives mises à jour, des notebooks et accueillir de nouvelles transcriptions/traductions. 78 | Le matériel de l'année dernière est toujours valable. 79 | Cette année, vous avez un point de vue différent. 80 | Un point de vue plus puissant. 81 | 82 | 83 | 96 | 97 | ## Contenus précédents 98 | 99 | Avant NYU-DLSP21, il y a eu : 100 | 101 | - [NYU-DLSP20](https://github.com/Atcold/NYU-DLSP20) (version la plus importante) 102 | - [NYU-DLSP19](https://github.com/Atcold/NYU-DLSP20/releases/tag/dlsp19) 103 | - [AIMS-DLFL19](https://github.com/Atcold/NYU-DLSP20/releases/tag/aims-fl18) 104 | - [CoDaS-HEP18](https://github.com/Atcold/NYU-DLSP20/releases/tag/v1.0.0) 105 | - [NYU-DLSP18](https://docs.google.com/document/d/1_p1Mw-NtMGN_vpas_pchLsQC2u0NM5mTnRapBrQ2ivk/) 106 | - [Purdue-DLFL16](https://docs.google.com/document/d/1ugJRMqQ_cCUQC1B8mSE0iro7sKrDT8-BnppTZv0rA08/) 107 | - [torch-Video-Tutorials](https://github.com/Atcold/torch-Video-Tutorials) 108 | 109 | 114 | 115 | ## Plus d'informations 116 | 117 | Consultez le [site web du cours](https://atcold.github.io/NYU-DLSP21/). 118 | -------------------------------------------------------------------------------- /docs/fr/faq.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Avant-propos, FAQ et éléments de traduction 3 | author: Loïck Bourdois 4 | date: 07 Jul 2021 5 | lang-ref: faq 6 | lang: fr 7 | --- 8 | 9 | # Avant-propos 10 | Ce cours porte sur les techniques de représentation et d'apprentissage profond les plus récentes. Il se concentre sur l'apprentissage supervisé, non supervisé et autosupervisté, mais aussi sur les méthodes d’enchâssement, l'apprentissage métrique et les réseaux convolutifs et récurrents. 11 | Il est illustré d’applications à la vision par ordinateur, la compréhension du langage naturel et la reconnaissance vocale. 12 | Pour suivre ce cours, il est fortement conseillé d’avoir des prérequis en algèbre et d’avoir déjà suivi un cours introductif d'apprentissage machine ou de *data science*. D’après Yann Le Cun, ces cours sont destinés à des personnes de niveau bac+4 ou bac+5. 13 | 14 | Nous vous invitons à privilégier les vidéos de la [chaine YouTube](https://www.youtube.com/watch?v=8L10w1KoOU8&list=PLLHTzKZzVU9e6xUfG10TkTWApKSZCzuBI&index=21) (contenu « officiel ») puisque le cours y est donné par le corps enseignant contrairement au site web où il s’agit des notes prises par les étudiants pendant le cours. 15 | Le site web étant des résumés des vidéos, celles-ci comprennent donc généralement des informations supplémentaires par rapport au site. 
Comme par exemple : 16 | - des anecdotes sur les différents concepts abordés, 17 | - des blagues, 18 | - la répétition d’un même concept mais sous la forme de différentes formulations permettant ainsi généralement de comprendre une idée si une première formulation n’est pas saisie, 19 | - les questions des étudiants qui peuvent être celles que vous ayez vous-même pendant le visionnage, 20 | Notez que si des concepts ne sont toujours pas compris à l’issue de la vidéo, vous avez la possibilité de poser une question en commentaire de la vidéo YouTube, ce que ne permet pas le site web. 21 | - les références des articles sur lesquels se basent le cours sont présentes sur les diapositives des vidéos alors qu’elles sont absentes du site. 22 | 23 | Le site web sert ainsi davantage de résumé des vidéos ou encore de base que vous pouvez réutiliser pour vos notes personnelles que vous prenez pendant le visionnage des vidéos. 24 | En cas de besoin vous pouvez facilement basculer du site à un moment d’une vidéo donnée en cliquant sur les titres des paragraphes des pages web. 25 | 26 | 27 | 28 | # FAQ 29 | Voici quelques réponses à des questions fréquemment posées : 30 | - **Est-ce que suivre ce cours permet d’obtenir une certification ?** 31 | > Non. Pour proposer une certification, il faudrait pouvoir vous évaluer or le contenu n’a pas été prévu pour (contrairement à un MOOC par exemple). 32 | > Cette demande étant fréquente, des réflexions sont menées pour essayer d’en proposer une pour des éditions futures du cours. 33 | - **Combien de temps consacrer à ce cours ?** 34 | > Pour chaque semaine, il y a environ 2h30/3h de contenu vidéo. Avec le temps consacré à la prise de notes et celui pour jouer avec les *notebooks*, une estimation totale de 5h par semaine semble raisonnable. Pour la suite, cela dépend du niveau d'immersion que vous voulez atteindre dans un sujet donné (lire les articles donnés en référence, appliquer ce qui a été vu en classe à vos propres projets, etc.). 35 | - **Où poser une question à l’issue du visionnage d’une vidéo ?** 36 | > Vous pouvez la poser directement (en anglais) dans la section commentaires sous la vidéo YouTube en question, Alfredo se fera un plaisir d’y répondre. Si cette question porte sur un point précis de la vidéo, pensez à indiquer l’horodatage. Vous pouvez le faire également sur le [Discord](https://discord.gg/CthuqsX8Pb) de la classe dédié expressément aux étudiants. Il sert également à coordonner des groupes de visionnage, discuter des devoirs, suggérer des améliorations ou plus généralement pour tout sujet lié au cours. 37 | - **Puis-je utiliser ce cours?** 38 | > Bien sûr, le cours est placé sous la [Licence internationale Creative Commons Attribution-NonCommercial-ShareAlike 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.fr). 39 | > Cela signifie que : 40 | > - Vous n'êtes pas autorisé à faire un usage commercial de cette œuvre. 41 | > - Vous devez créditer l'œuvre, intégrer un lien vers la licence et indiquer si des modifications ont été effectuées à l'œuvre. Vous devez indiquer ces informations par tous les moyens raisonnables, sans toutefois suggérer que l'offrant vous soutient ou soutient la façon dont vous avez utilisé son œuvre. 42 | > - Dans le cas où vous effectuez un remix, que vous transformez, ou créez à partir du matériel à partir de l'œuvre originale, vous devez diffuser l'œuvre modifiée dans les mêmes conditions, c'est à dire avec la même licence avec laquelle l'œuvre originale a été diffusée. 
43 | > 44 | > - Pour le crédit, vous pouvez utiliser le BibTeX suivant : 45 | > @misc{canziani2020nyudlsp21, 46 | author = {Canziani, Alfredo and LeCun, Yann}, 47 | title = {NYU Deep Learning, Spring 2021}, 48 | howpublished = "\url{https://github.com/Atcold/NYU-DLSP21}", 49 | year = {2021}, 50 | note = "[Online; accessed ]" 51 | } 52 | 53 | 54 | 55 | # Traduction 56 | Vous trouverez ici les informations concernant les choix de traduction adoptés. 57 | 58 | ### Informations de base : 59 | - Pour le site : 60 | Tous les textes présents sur ce site sont des notes de cours prises par les étudiants de la *New York University* lors des cours donnés par Yann Le Cun, Alfredo Canziani, Ishan Misra, Awni Hannun et Marc'Aurelio Ranzato. 61 | Ainsi les textes en anglais ont été rédigés par plusieurs personnes, ce qui a un impact sur l’homogénéité des textes (certains écrivent au passé, d’autres au présent ; les abréviations utilisées ne sont pas forcément toujours les mêmes ; certains écrivent des phrases courtes, quand d’autres écrivent des phrases pouvant aller jusqu’à 5 ou 6 lignes, etc.). 62 | La traduction en français qui vous est proposée a été effectuée par une seule personne dans le but d’atténuer les problèmes cités à l’instant et de proposer une traduction homogène. 63 | 64 | - Pour les vidéos : 65 | Afin de fluidifier la traduction et la compréhension, il a été décidé de ne pas retranscrire les mots « parasites » de remplissage et de transition (les « *you know* », « *sort of* », « *right* », « *so* », etc.). 66 | Quand le débit est élevé, une traduction ne reste qu'environ 4 secondes à l'écran. Pour pouvoir retranscrire le plus d'informations possibles dans cet intervalle de temps, nous utilisons des abréviations lorsque cela est possible (« RNNs » au lieu de « réseaux de neurones récurrents » par exemple). Nous privilégions également l'usage de mots courts (par exemple un « car » à la place d’un « parce que »). 67 | En raison du travail important nécessaire pour effectuer la traduction (1h de travail pour 10min de vidéo) il n'a pas été possible d'effectuer une relecture détaillée des traductions vidéos. Ainsi, si vous remarquez des fautes d'orthographe/de conjugaison, fautes de frappes, etc., nous vous invitons à soumettre une PR sur le [répertoire GitHub du site](https://github.com/Atcold/NYU-DLSP21/pulls) en précisant avec un `[FR]` qu’elle concerne la traduction française. 68 | 69 | 70 | ### Choix de traductions des termes techniques : 71 | 72 | - Choix de traduire les termes anglais en français : 73 | 74 | Terme | Traduction | Raisons / Explications 75 | --- | --- |--- | 76 | Chain rule | Règle de dérivation des fonctions composées | En pratique usage du terme « règle de la chaîne » dans les sous-titres des vidéos pour gagner de la place. 77 | CNN | ConvNet | Yann tient particulièrement au respect de cette traduction. Voir notamment la page 202 du livre [*Quand la machine apprend*](https://www.odilejacob.fr/catalogue/sciences/informatique/quand-la-machine-apprend_9782738149312.php). 78 | Downstream tasks | Tâches en aval | Les tâches de prétexte étant les tâches en amont. 79 | Energy-Based Models | Modèles à base d’énergie | Traduction pas forcément satisfaisante mais adoptée faute de mieux. 80 | Embedding | Enchâssement | Reprise de la traduction utilisée page 228 dans le livre *Quand la machine apprend*. Dans la littérature, il est possible de trouver également l'usage du terme « plongement » comme traduction. 
Parler tout simplement de vectorisation paraîtrait beaucoup plus simple pour faire le lien avec le concept mathématique (on vectorise un mot par exemple). 81 | Forward model | Modèle prédictif | 82 | Graph Neural Networks | Réseaux de neurones pour graphe | En pratique, pour les sous-titres des vidéos, l'abréviation GNN est privilégiée. 83 | Graph Convolution Networks | Réseaux convolutifs pour graphe | En pratique, pour les sous-titres des vidéos, l'abréviation GCN est privilégiée. 84 | Manifold | Variété | Voir [l'article Wikipédia](https://fr.wikipedia.org/wiki/Vari%C3%A9t%C3%A9_(g%C3%A9om%C3%A9trie)). 85 | Nonlinearity function | Fonction non linéaire | En français, on utilise également le terme de « fonction d’activation ». 86 | Overfitting | Surentraînement | Reprise de la traduction utilisée page 155 dans le livre *Quand la machine apprend*. 87 | Regularizer | Régulariseur | Néologisme préférable à régularisateur. 88 | Sparse | Epars | Pour l'expression « sparse matrix », nous traduisons « sparse » en « creuse » pour « matrice creuse ». Pour tous les autres cas nous utilisons « épars » ou « éparse » en fonction du genre du mot auquel l'adjectif se rapporte. 89 | Sparsity | Eparsité | Néologisme basé sur le mot « épars ». 90 | Template Matching | Template Matching | L'expression « appariement de patrons » comme traduction peut être trouvable sur le site ou dans les vidéos. 91 | Yann LeCun | Yann Le Cun ou Yann | L'explication de l'écriture du nom de famille est donnée page 193 du livre *Quand la machine apprend*. Dans les notes en anglais des étudiants, il est possible de trouver « Mr Yann LeCun », « Mr LeCun », « Doctor Yann LeCun », « Professor LeCun », etc. Nous utilisons simplement « Yann ». 92 | 93 | - Choix de ne pas traduire les termes anglais en français : 94 | Nous avons fait le choix de ne pas traduire certains termes anglais pour des raisons pratiques. Par exemple, certains concepts nécessitent 3 ou 4 mots en français là où 1 seul suffit en anglais. Cela pose notamment problème pour les vidéos où le temps d'affichage est limité, d'où la préférence à garder le terme en anglais. Il serait possible d'utiliser des néologismes mais nous avons préféré ne pas en imposer car ne pouvant peut-être pas faire consensus. Sur le site, les mots laissés en anglais sont indiqués en italique. 95 | 96 | Terme | Traduction | Raisons / Explications 97 | --- | --- |--- | 98 | Dropout | Dropout | Le mot « décimation » serait approprié mais il est déjà utilisé en traitement du signal pour signifier « sous-échantillonnage ». 99 | Finetuning | Finetuning | Le terme « affinage » peut être trouvable dans la littérature. 100 | One hot | One hot | La notion de « vecteurs de base canonique » pourrait être utilisée mais elle est un peu technique et l'expression est plutôt longue pour traduire à peine 2 mots. N.D.T : lorsque j'étais étudiant, dans mes cours d'algèbre linéaire, j'utilisais soit « v.b.c » pour « vecteurs de base canonique » ou bien « zérun » (pour un vecteur contenant des 0 et un 1) mais il s'agit d'une convention personnelle que je ne préfère pas imposer. 101 | Pooling | Pooling | Plusieurs traductions envisagées comme agrégation, agglomération, ou coalescence. Garder le terme en anglais est plus simple (un « max-agrégation » n'est pas très élégant par exemple). 102 | 103 | 104 | En vous souhaitant un bon visionnage ou une bonne lecture ! 
105 | -------------------------------------------------------------------------------- /docs/fr/week01/01.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.01 4 | title: Semaine 1 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 14 | ## Cours magistral 15 | Un peu d'histoire sur l'apprentissage supervisé peut être trouvée [ici](https://atcold.github.io/NYU-DLSP20/fr/week01/01-1/), tandis que la descente de gradient peut être trouvée [ici](https://atcold.github.io/NYU-DLSP20/fr/week02/02-1/). 16 | 17 | 22 | ## Travaux dirigés 23 | Le résumé de cette semaine et de la suivante peut être trouvé [ici](https://atcold.github.io/NYU-DLSP20/fr/week01/01-3/). 24 | -------------------------------------------------------------------------------- /docs/fr/week02/02-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.02-3 4 | title: Motivation des problèmes, algèbre linéaire et visualisation 5 | lecturer: Alfredo Canziani 6 | authors: Rajashekar Vasantha 7 | date: 04 Feb 2021 8 | typora-root-url: 02-3 9 | translation-date: 19 Jun 2021 10 | translator: Loïck Bourdois 11 | --- 12 | 13 | 14 | 19 | ## Ressources 20 | 21 | Nous vous invitons à suivre Alfredo Canziani [sur Twitter @alfcnz](https://twitter.com/alfcnz). Vous trouverez sur son compte des vidéos et des manuels contenant des détails pertinents sur l'algèbre linéaire et la décomposition en valeurs singulières (SVD). Ce contenu est trouvable en effectuant une recherche (en anglais) sur le Twitter d'Alfredo, en tapant par exemple `linear algebra (from:alfcnz)` dans la barre de recherche. 22 | 23 | 24 | 60 | 61 | 62 | ## [Réseaux neuronaux : rotation et écrasement](https://youtu.be/0TdAmZUMj2k) 63 | Un réseau de neurones traditionnel est une collection alternée de deux blocs : les blocs linéaires et les blocs non linéaires. 64 | Voici le schéma fonctionnel d'un réseau de neurones traditionnel. 65 |
66 |
67 |
68 | 69 | Figure 1 : Schéma d'un réseau de neurones traditionnel 70 |
71 |
72 | Les blocs linéaires (rotations pour simplifier) sont donnés par : 73 | 74 | $$ 75 | \vect{s}_{k+1} = \mW_k z_k 76 | $$ 77 | 78 | Et les blocs non linéaires (fonctions d'écrasement pour une compréhension intuitive) sont donnés par : 79 | 80 | 81 | $$ \vect{z}_k = h(\vect{s}_k) $$ 82 | 83 | Dans le schéma et les équations ci-dessus, $$\vx \in \mathbb{R}^n$$ représente le vecteur d'entrée. 84 | $$\mW_k \in \mathbb{R}^{n_{k} \times n_{k-1}}$$ représente la matrice d'une transformation affine correspondant au $$k^{\text{ème}}$$ bloc et est décrite plus en détail ci-dessous. 85 | La fonction $h$ est appelée fonction d'activation et cette fonction forme le bloc non linéaire du réseau neuronal. 86 | Sigmoïde, ReLU et tanh sont quelques-unes des fonctions d'activation les plus courantes et nous les examinerons dans les parties suivantes de cette section. 87 | Après des applications alternées des blocs linéaire et non linéaire, le réseau ci-dessus produit un vecteur de sortie $$\vect{s}_k \in \mathbb{R}^{n_{k-1}}$$. 88 | 89 | Examinons d'abord le bloc linéaire pour comprendre les transformations affines. Comme exemple considérons la classification d'images. 90 | Supposons que nous prenions une photo avec un appareil photo de $1$ mégapixel. 91 | Cette image aura environ $1 000$ pixels verticalement et $1 000$ pixels horizontalement, et chaque pixel aura trois dimensions de couleur pour le rouge, le vert et le bleu (RVB). 92 | Chaque image peut donc être considérée comme un point dans un espace à $3$ millions de dimensions. 93 | Avec une telle dimensionnalité, de nombreuses images intéressantes que nous pourrions vouloir classer, comme un chien *vs* un chat, se trouveront essentiellement dans la même région de l'espace. 94 | 95 | Afin de séparer efficacement ces images, nous envisageons des moyens de transformer les données afin de déplacer les points. 96 | Rappelons que dans l'espace bidimensionnel, une transformation linéaire équivaut à une multiplication de matrice. 97 | Par exemple, les transformations suivantes peuvent être obtenues en changeant les caractéristiques de la matrice : 98 | 99 | - Rotation : lorsque la matrice est orthonormée. 100 | - Mise à l'échelle (« scalabilité ») : lorsque la matrice est diagonale. 101 | - Réflexion : lorsque le déterminant est négatif. 102 | - *Shearing*. 103 | - Translation. 104 | 105 | A noter que la translation seule n'est pas linéaire puisque $0$ ne sera pas toujours mis en correspondance avec 0, mais c'est une transformation affine. 106 | Pour revenir à notre exemple d'image, nous pouvons transformer les points de données en les translatant de manière à ce qu'ils soient regroupés autour de 0 et en les mettant à l'échelle à l'aide d'une matrice diagonale de manière à effectuer un « zoom avant » sur cette région. 107 | Enfin, nous pouvons effectuer une classification en trouvant des lignes dans l'espace qui séparent les différents points dans leurs classes respectives. 108 | En d'autres termes, l'idée est d'utiliser des transformations linéaires et non linéaires pour représenter les points dans un espace tel qu'ils soient linéairement séparables. 109 | Cette idée sera rendue plus concrète dans les sections suivantes. 110 | 111 | Dans la suite, nous visualisons comment un réseau neuronal sépare des points et quelques transformations linéaires et non linéaires. 112 | Ce contenu est essentiellement le même que celui de l'année dernière, ainsi nous vous invitons à vous rendre [ici](https://atcold.github.io/NYU-DLSP20/fr/week01/01-3/) pour le consulter. 
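Pour fixer les idées, voici une esquisse minimale en PyTorch (purement illustrative et hypothétique, indépendante des *notebooks* du cours) d'un réseau alternant blocs linéaires et blocs non linéaires : chaque `nn.Linear` joue le rôle d'une matrice $\mW_k$ et `nn.Tanh` celui de la fonction d'écrasement $h$. Les dimensions choisies sont arbitraires.

```python
import torch
from torch import nn

# Esquisse hypothétique : alternance de blocs linéaires (transformations affines)
# et de blocs non linéaires (fonctions d'écrasement), comme décrit ci-dessus.
model = nn.Sequential(
    nn.Linear(2, 100),  # bloc linéaire : transformation affine W_1
    nn.Tanh(),          # bloc non linéaire : écrasement h
    nn.Linear(100, 2),  # bloc linéaire : transformation affine W_2
)

x = torch.randn(16, 2)  # 16 points d'entrée en dimension 2
y = model(x)            # sortie après rotations et écrasements successifs
print(y.shape)          # torch.Size([16, 2])
```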
113 | -------------------------------------------------------------------------------- /docs/fr/week02/02.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.02 4 | title: Semaine 2 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 14 | 15 | ## Cours magistral 16 | Similaire à [ceci](https://atcold.github.io/NYU-DLSP20/fr/week11/11-1/) et [ceci](https://atcold.github.io/NYU-DLSP20/fr/week11/11-2/) et peut-être plus. 17 | 18 | 19 | 24 | 25 | ## Travaux dirigés 26 | Nous discutons de la motivation d'appliquer des transformations à des points de données visualisés dans l'espace. Nous parlons d'algèbre linéaire et de l'application de transformations linéaires et non linéaires. Nous abordons l'utilisation de la visualisation pour comprendre la fonction et les effets de ces transformations et parcourons des exemples dans un *notebook* Jupyter. Nous concluons par une discussion sur les fonctions représentées par des réseaux neuronaux. 27 | -------------------------------------------------------------------------------- /docs/fr/week03/03-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.03-3 4 | title: Classification d'une spirale 5 | lecturer: Alfredo Canziani 6 | authors: Wenhao Li 7 | date: 6 May 2021 8 | typora-root-url: 03-3 9 | translation-date: 19 Jun 2021 10 | translator: Loïck Bourdois 11 | --- 12 | 13 | 17 | ## [Typora](https://typora.io/) 18 | *Typora* est un outil utile pour écrire en markdown et ajouter des formules en *LaTeX*. Il est pratique pour rédiger des articles, des devoirs et générer des fichiers pdf. 19 | 20 | 33 | ## [Notion](https://www.notion.so/) 34 |
35 |
36 |
37 | 38 | Avec *Notion* vous pouvez placer en un endroit toutes vos affaires préférées. Cela inclut, sans s'y limiter, les recettes, la musique, les livres, les notes. Tout en un seul endroit, simple et puissant. 39 | 40 | Lorsque vous trouvez un article utile sur l'apprentissage profond, vous pouvez l'y stocker pour le consulter ultérieurement. Vous pouvez trouver [plus d'informations](https://www.notion.so/Intro-to-databases-fd8cd2d212f74c50954c11086d85997e) sur la façon d'utiliser la base de données. 41 | 42 | Vous devez d'abord créer une base de données via *Workspace* => *Add a new page*. Dans cette page, choisissez */table* => *Table - Full Page*. 43 | En plus de remplir les informations relatives au document, nous voulons généralement couvrir le traditionnel « Qui ? Quoi ? Où ? Pourquoi ? Comment ? Quand ? » dans le résumé. 44 | 45 | Voici un [exemple](https://www.notion.so/When-to-use-parametric-models-in-reinforcement-learning-d4c5e586677e49338a41b663231c0633) (en anglais) de la façon d'organiser votre résumé. 46 | 47 | 48 | 77 | ## [Diagrams.net](https://app.diagrams.net/) 78 | 79 | Diagrams.net est un excellent outil pour dessiner des diagrammes de réseaux neuronaux. Nous allons introduire quelques règles pour rendre nos diagrammes plus cohérents avec ceux du cours. 80 | 81 |
82 |
83 |
84 | 85 | Le fond en niveaux de gris signifie qu'il s'agit d'une observation donc qu'il s'agit de points de données d'un jeu de données fourni. 86 | Vous pouvez vérifier l'entrée et les étiquettes en allant dans le répertoire du jeu de données si vous le souhaitez. 87 | 88 |
89 |
90 |
91 | 92 | Nous utilisons *Delay* pour désigner l'encodeur (par exemple, un réseau neuronal). 93 | 94 |
95 |
96 |
97 | 98 | Dans cet exemple, $\vx$ et $\vy$ sont des observations. 99 | 100 | Dans la moitié ci-dessus, nous donnons les $\vx$ à un encodeur pour obtenir une prédiction $\bar {\vy}$. C'est ce qu'on appelle la propagation vers l'avant. 101 | 102 | Dans la moitié inférieure, nous voulons obtenir la prédiction $\bar{\vx}$ étant donné l'observation $\vy$. 103 | Nous continuons à faire une descente de gradient pour que la sortie du réseau soit aussi proche que possible de $\vy$. C'est ce qu'on appelle « l'inférence amortissante ». 104 | 105 | Habituellement, nous utilisons la rétropropagation pour calculer le gradient, puis nous appliquons la descente de gradient avec ces valeurs calculées pour entraîner le modèle. 106 | Cet exemple montre que la rétropropagation n'est PAS uniquement utilisée pendant l'entraînement. La rétropropagation peut également être utilisée pour l'inférence. 107 | 108 | 109 | 113 | ## Classification d'une spirale 114 | Le contenu suivant est essentiellement le même que celui de l'année dernière, rendez-vous donc [ici](https://atcold.github.io/NYU-DLSP20/fr/week02/02-3/) pour le consulter. 115 | -------------------------------------------------------------------------------- /docs/fr/week03/03.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.03 4 | title: Semaine 3 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 15 | ## Cours magistral 16 | Les différentes parties peuvent être trouvées [ici](https://atcold.github.io/NYU-DLSP20/fr/week03/03-1/) et [ici](https://atcold.github.io/NYU-DLSP20/fr/week06/06-2/). 17 | 18 | 19 | 23 | ## Travaux dirigés 24 | Nous présentons comment dessiner des schémas de réseaux profonds de manière pratique en utilisant **diagrams.net**. Nous montrons ensuite les différents effets de l'utilisation de la seule transformation linéaire, et l'effet de la combinaison de la transformation linéaire et non linéaire sur la classification en spirale. Enfin nous voyons les principes mathématiques qui sous-tendent les réseaux neuronaux, notamment le théorème de dérivation des fonctions composées, la rétropropagation et la descente de gradient. 25 | -------------------------------------------------------------------------------- /docs/fr/week04/04.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.04 4 | title: Semaine 4 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 14 | ## Cours magistral 15 | Similaire à [l'édition de l'année dernière](https://atcold.github.io/NYU-DLSP20/fr/week06/06-1/). 16 | 17 | 18 | 23 | ## Travaux dirigés A & B 24 | Similaires à l'édition de l'année dernière : pour les [ConvNets](https://atcold.github.io/NYU-DLSP20/fr/week03/03-3/), pour les [RNNs](https://atcold.github.io/NYU-DLSP20/fr/week06/06-3/). 25 | -------------------------------------------------------------------------------- /docs/fr/week05/05.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.05 4 | title: Semaine 5 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 14 | ## Cours magistral 15 | Similaire à [celui de l'année dernière](https://atcold.github.io/NYU-DLSP20/fr/week07/07-1/) mais un peu différent. 16 | 17 | 18 | 23 | ## Travaux dirigés 24 | Comme [l'année dernière](https://atcold.github.io/NYU-DLSP20/fr/week15/15-1/). 
25 | -------------------------------------------------------------------------------- /docs/fr/week06/06.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.06 4 | title: Semaine 6 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 15 | ## Cours magistral 16 | Similaire à [ceci](https://atcold.github.io/NYU-DLSP20/fr/week14/14-1/) et [ceci](https://atcold.github.io/NYU-DLSP20/fr/week14/14-2/). 17 | 18 | 23 | ## Travaux dirigés 24 | Comme [l'année dernière](https://atcold.github.io/NYU-DLSP20/fr/week15/15-2/). 25 | -------------------------------------------------------------------------------- /docs/fr/week07/07.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.07 4 | title: Semaine 7 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 24 | ## Travaux dirigés 25 | Nous commençons par une application des auto-encodeurs : DALL-E. Nous discutons ensuite des auto-encodeurs (en termes de modèles à base d’énergie) et de leurs cas d'utilisation. Puis nous discutons des coûts de reconstruction et des fonctions de perte à utiliser. Enfin, nous abordons un type particulier d'auto-encodeur à savoir l'auto-encodeur débruiteur. 26 | -------------------------------------------------------------------------------- /docs/fr/week08/08.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.08 4 | title: Semaine 8 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 24 | ## Travaux dirigés 25 | Dans cette section, nous présentons quelques modèles génératifs, dont l'auto-encodeur débruiteur, l'auto-encodeur contractif et l'auto-encodeur variationnel. Nous avons comparé les fonctionnalités et les avantages des auto-encodeurs variationnels par rapport aux auto-encodeurs de base. Nous avons exploré en détail la fonction objective de l'auto-encodeur variationnel en comprenant comment il impose une certaine structure dans l'espace latent. 26 | -------------------------------------------------------------------------------- /docs/fr/week09/09.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.09 4 | title: Semaine 9 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 24 | ## Travaux dirigés 25 | Dans cette section nous couvrons l'implémentation de modèles génératifs, à savoir les auto-encodeurs sous-complets, les auto-encodeurs débruieurs, les auto-encodeurs variationnels et les réseaux antagonistes génératifs. Nous analysons ces modèles du point de vue du cadre des modèles à base d’énergie (EBMs). Ce faisant, nous nous rendons compte que ces modèles génératifs peuvent être considérés comme des extensions des EBMs et qu'ils diffèrent les uns des autres par de subtils ajustements architecturaux. 
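À titre d'illustration de la fonction objective de l'auto-encodeur variationnel évoquée dans ces résumés, voici une esquisse hypothétique (dimensions et détails arbitraires, non tirés des *notebooks* du cours) combinant un terme de reconstruction et une divergence KL qui structure l'espace latent :

```python
import torch
from torch import nn
import torch.nn.functional as F

# Esquisse hypothétique d'un auto-encodeur variationnel minimal : l'encodeur
# produit une moyenne et une log-variance, z est échantillonné par l'astuce de
# reparamétrisation, et la perte = reconstruction + divergence KL.
class TinyVAE(nn.Module):
    def __init__(self, d_in=784, d_z=16):
        super().__init__()
        self.enc = nn.Linear(d_in, 2 * d_z)
        self.dec = nn.Linear(d_z, d_in)

    def forward(self, x):
        mu, logvar = self.enc(x).chunk(2, dim=1)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()  # reparamétrisation
        return self.dec(z), mu, logvar

x = torch.rand(32, 784)                      # mini-batch factice
x_tilde, mu, logvar = TinyVAE()(x)
recon = F.mse_loss(x_tilde, x, reduction="sum") / x.size(0)
kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / x.size(0)
loss = recon + kl                            # le terme KL structure l'espace latent
print(loss.item())
```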
26 | -------------------------------------------------------------------------------- /docs/fr/week10/10-3.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.10-3 4 | lecturer: Alfredo Canziani 5 | title: Architecture encodeur-prédicteur-décodeur d'un transformer 6 | authors: Rahul Ahuja, jingshuai jiang 7 | date: 15 Apr 2021 8 | typora-root-url: 10-3 9 | translation-date: 20 Jun 2021 10 | translator: Loïck Bourdois 11 | --- 12 | 13 | 14 | 19 | ## Le *transformer* 20 | 21 | Avant d'élaborer l'architecture encodeur-prédicteur-décodeur, nous allons passer en revue deux modèles que nous avons déjà vus. 22 | 23 | 24 | 36 | 37 | ### Architecture d'un EBM conditionnel à variable latente 38 | 39 | Dans l'architecture d'un EBM conditionnel à variable latente, nous avons $\vx$ la variable conditionnelle qui va dans un prédicteur. 40 | Nous avons $\vy$ qui est la valeur cible. Les modules de décodage produisent $\vytilde$ lorsqu'on leur donne une variable latente $\vz$ et la sortie du prédicteur. 41 | $\red{E}$ est la fonction d'énergie qui mesure l'énergie entre $\vytilde$ et $\vy$, et que l'on cherche à minimiser. 42 | 43 | 44 |
45 |
46 | Figure 1 : Architecture d'un EBM conditionnel à variable latente 47 |
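Pour rendre ce schéma plus concret, voici une esquisse purement hypothétique (modules et dimensions arbitraires) où l'énergie est une distance quadratique entre $\vy$ et $\vytilde$, et où l'inférence la minimise par descente de gradient sur la variable latente :

```python
import torch

# Esquisse hypothétique : prédicteur et décodeur réduits à des couches linéaires,
# énergie = distance quadratique entre y et y_tilde, minimisée par rapport à z.
pred = torch.nn.Linear(3, 4)      # prédicteur (substitut arbitraire)
dec  = torch.nn.Linear(4 + 2, 4)  # décodeur : reçoit la sortie du prédicteur et z

x = torch.randn(1, 3)             # variable conditionnelle x
y = torch.randn(1, 4)             # valeur cible y
z = torch.zeros(1, 2, requires_grad=True)   # variable latente z

opt = torch.optim.SGD([z], lr=0.1)
for _ in range(50):               # inférence : descente de gradient sur z
    y_tilde = dec(torch.cat([pred(x), z], dim=1))
    energy = ((y_tilde - y) ** 2).sum()
    opt.zero_grad()
    energy.backward()
    opt.step()
print(energy.item())              # énergie après minimisation
```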
48 | 49 | 50 | 61 | 62 | ### Architecture d'un auto-encodeur 63 | 64 | Dans l'architecture d'un auto-encodeur, nous avons observé qu'il n'y a pas d'entrée conditionnelle mais seulement une variable cible. 65 | L'architecture entière essaie d'apprendre la structure de ces variables cibles. 66 | La valeur cible $\vy$ est introduite dans un module encodeur qui la transforme en un espace de représentation caché, ne laissant passer que les informations les plus importantes. 67 | Et le décodeur fera en sorte que ces variables reviennent à l'espace cible original avec une valeur $\vytilde$. 68 | La fonction de coût va essayer de minimiser la distance entre $\vytilde$ et $\vy$. 69 | 70 | 71 |
72 |
73 | Figure 2 : Architecture d'un autoencodeur de base composé de modules encodeur et décodeur 74 |
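Une esquisse minimale et hypothétique (dimensions arbitraires, indépendante des *notebooks* du cours) de ce schéma encodeur-décodeur :

```python
import torch
from torch import nn

# Esquisse hypothétique : l'encodeur comprime y vers une représentation cachée,
# le décodeur produit y_tilde, le coût mesure la distance entre y_tilde et y.
encoder = nn.Sequential(nn.Linear(784, 32), nn.ReLU())
decoder = nn.Sequential(nn.Linear(32, 784), nn.Sigmoid())

y = torch.rand(8, 784)              # variables cibles (par exemple des images aplaties)
y_tilde = decoder(encoder(y))       # reconstruction
loss = ((y_tilde - y) ** 2).mean()  # coût de reconstruction à minimiser
print(loss.item())
```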
75 | 76 | 77 | 96 | 97 | ### Architecture de l'encodeur-prédicteur-décodeur 98 | 99 |
100 |
101 | Figure 3 : L'architecture du transformer avec un module de retard unitaire 102 |
103 | 104 | 105 | Dans un *transformer*, $\vy$ (phrase cible) est un signal temporel discret : il possède une représentation discrète selon un indice temporel. 106 | Le $\vy$ est introduit dans un module de retard unitaire suivi d'un encodeur. Le retard unitaire transforme ici $\vy[j] \mapsto \vy[j-1]$. 107 | La seule différence avec l'auto-encodeur ici est cette variable retardée. 108 | Nous pouvons donc utiliser cette structure dans le modèle de langage pour produire le futur lorsqu'on nous donne le passé. 109 | 110 |
111 |
112 | Figure 4 : Un module de retard unitaire transforme $\vy[j] \mapsto \vy[j-1]$ 113 |
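Concrètement, le retard unitaire peut s'esquisser ainsi (exemple hypothétique, indépendant du code du cours ; le symbole de remplissage en tête est arbitraire) :

```python
import torch

def unit_delay(y, pad_idx=0):
    """Retard unitaire : transforme y[j] en y[j-1], un symbole de remplissage
    (hypothétique) occupant la première position."""
    pad = torch.full_like(y[:, :1], pad_idx)
    return torch.cat([pad, y[:, :-1]], dim=1)

y = torch.tensor([[11, 12, 13, 14]])   # indices des mots de la phrase cible
print(unit_delay(y))                   # tensor([[ 0, 11, 12, 13]])
```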
114 | 115 | Le signal observé, $\vx$ (phrase source) passe également par un encodeur. 116 | La sortie de l'encodeur et de l'encodeur retardé est introduite dans le prédicteur qui donne une représentation cachée $\vh$. 117 | Ceci est très similaire à l'auto-encodeur débruiteur car le module de retard agit comme un bruit dans ce cas. 118 | $\vx$ fait de cette architecture entière un auto-encodeur débruiteur conditionnel retardé. 119 | 120 | 121 | 125 | 126 | ### Module encodeur 127 | Vous pouvez voir l'explication détaillée de ce module dans les notes de l'année dernière disponibles [ici](https://atcold.github.io/NYU-DLSP20/fr/week12/12-3/). 128 | 129 | 130 | 141 | 142 | ### Module prédicteur 143 | 144 | Le module prédicteur du *transformer* suit une procédure similaire à celle de l'encodeur. 145 | Cependant, il y a un sous-bloc supplémentaire (c'est-à-dire l'attention croisée) à prendre en compte. 146 | De plus, la sortie des modules encodeurs agit comme les entrées de ce module. 147 | 148 | 149 |
150 |
151 | Figure 5 : Le module prédicteur composé d'un bloc d'attention croisée 152 |
153 | 154 | 155 | 159 | 160 | ### Attention croisée 161 | Vous pouvez consulter l'explication détaillée de l'attention croisée dans les notes de l'année dernière disponibles [ici](https://atcold.github.io/NYU-DLSP20/fr/week12/12-3/). 162 | 163 | 174 | 175 | 176 | ### Module décodeur 177 | 178 | Contrairement à ce que les auteurs du papier du *transformer* définissent, le module décodeur est composé de blocs `1D-convolution` et `Add, Norm`. 179 | La sortie du module prédicteur est introduite dans le module décodeur et la sortie du module décodeur est la phrase prédite. 180 | On peut l'entraîner en fournissant la séquence cible retardée. 181 | 182 | 183 |
184 |
185 | Figure 6 : La notation correcte des modules encodeur, prédicteur et décodeur dans un transformer 186 |
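Pour résumer le flux des données décrit dans cette section, voici une esquisse purement hypothétique : les vrais modules sont des blocs de type *transformer* (attention, attention croisée, `Add, Norm`), mais de simples couches linéaires suffisent ici pour montrer le câblage encodeur-prédicteur-décodeur avec la cible retardée.

```python
import torch
from torch import nn

d = 8                                  # dimension (arbitraire) des représentations

enc_x = nn.Linear(d, d)                # encodeur de la phrase source x
enc_y = nn.Linear(d, d)                # encodeur de la cible retardée y[j-1]
pred  = nn.Linear(2 * d, d)            # prédicteur : combine les deux encodages
dec   = nn.Linear(d, d)                # décodeur : renvoie vers l'espace cible

x = torch.randn(1, 5, d)               # phrase source (longueur 5, ici continue)
y = torch.randn(1, 5, d)               # phrase cible (même longueur pour simplifier)
y_delayed = torch.cat([torch.zeros(1, 1, d), y[:, :-1]], dim=1)  # retard unitaire

h = torch.tanh(pred(torch.cat([enc_x(x), enc_y(y_delayed)], dim=-1)))  # représentation cachée h
y_tilde = dec(h)                       # phrase prédite
print(y_tilde.shape)                   # torch.Size([1, 5, 8])
```

On retrouve ainsi l'auto-encodeur débruiteur conditionnel retardé décrit plus haut : l'entraînement cherche à rapprocher $\vytilde$ de $\vy$.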
187 | 188 | -------------------------------------------------------------------------------- /docs/fr/week10/10.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.10 4 | title: Semaine 10 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 14 | ## Cours magistral partie A 15 | 16 | Une brève introduction à l'apprentissage autosupervisé et aux tâches de prétexte ainsi qu'une discussion à propos des solutions triviales associées. Puis une catégorisation des méthodes autosupervisées récentes avec une introduction à l'apprentissage contrastif et à la fonction de perte utilisée. Nous poursuivons avec de brèves présentations de PIRL, SimCLR et MoCo suivies de SwAV qui est une méthode basée sur du *clustering*. Le pré-entraînement sur les données ImageNet et non-ImageNet est également discuté à la fin. 17 | 18 | 21 | ## Cours magistral partie B 22 | 23 | 27 | ## Travaux dirigés 28 | 29 | Nous présentons l'attention en nous concentrant sur l'auto-attention et ses représentations des entrées dans la couche cachée. Ensuite, nous introduisons le paradigme clé-valeur et discutons de la manière de représenter les requêtes, les clés et les valeurs comme des rotations d'une entrée. Enfin, nous utilisons l'attention pour interpréter l'architecture du *transformer*. Pour cela nous passons par le biais d'un *transformer* de base dans la perspective des EBMs et en comparant le paradigme encodeur-prédicteur-décodeur aux architectures séquentielles. 30 | -------------------------------------------------------------------------------- /docs/fr/week11/11.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.11 4 | title: Semaine 11 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 14 | ## Cours magistral partie A 15 | Nous présentons une introduction au problème de la reconnaissance de la parole à l'aide de modèles neuronaux en mettant l'accent sur la perte CTC (*Connectionist Temporal Classification*) pour l'entraînement et l'inférence lorsque les séquences d'entrée et de sortie sont de longueurs différentes. 16 | 17 | 21 | ## Cours magistral partie B 22 | Nous discutons de l'utilisation de la recherche en faisceau pendant l'inférence ainsi que de la façon dont cette procédure peut être modélisée au moment de l'entraînement d'un *Graph Transformer Network* (GTN). Les GTNs sont essentiellement des « accepteur d'état fini pondéré » (WFSA pour « Weighted Finite State Acceptor ») avec différenciation automatique permettant d'encoder des a priori dans un graphe. Il existe différents types d'états finis pondérés et opérations, notamment l'union, l'étoile de Kleene, l'intersection, la composition et le score *forward*. La fonction de perte est généralement la différence entre deux fonctions. Nous pouvons facilement implémenter ces réseaux en utilisant la bibliothèque *gtn*. 23 | 24 | 27 | ## Travaux dirigés 28 | -------------------------------------------------------------------------------- /docs/fr/week12/12.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.12 4 | title: Semaine 12 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 14 | ## Cours magistral partie A 15 | Cette conférence introduit le sujet de la traduction automatique neuronale à l'aide d'un exemple. 
Nous abordons la modélisation du langage, l'architecture du modèle et l'inférence de la traduction automatique neuronale. En outre, nous discutons des problèmes rencontrés en raison des langues et de la nécessité d'une traduction automatique à faibles ressources. Nous examinons également une étude de cas, les différentes étapes du cycle de recherche et la manière dont elles peuvent être utilisées pour la traduction automatique. 16 | 17 | 21 | ## Cours magistral partie B 22 | La première partie de cette partie B se concentre sur la compréhension de la traduction automatique à faibles ressources et la seconde partie discute des incompatibilités potentielles entre les domaines de l'apprentissage automatique et de la traduction automatique. 23 | 24 | 25 | 29 | ## Travaux dirigés 30 | Nous introduisons la fonction de transition d'état et la manière de modéliser un système physique avec état et contrôle. Nous avons discuté de la manière d'obtenir un contrôle optimal par inférence en utilisant l'algorithme de Kelley-Bryson qui utilise la rétropropagation dans le temps et la descente de gradient. Enfin, nous voyons dans un notebook diverses fonctions de coût et la planification d'une trajectoire pour guider un tricycle afin qu'il atteigne la position souhaitée avec la vitesse spécifiée. 31 | -------------------------------------------------------------------------------- /docs/fr/week13/13.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.13 4 | title: Semaine 13 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 23 | ## Travaux dirigés 24 | -------------------------------------------------------------------------------- /docs/fr/week14/14.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.14 4 | title: Semaine 14 5 | translation-date: 19 June 2021 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 13 | ## Cours magistral partie A 14 | 15 | 18 | ## Cours magistral partie B 19 | 20 | 23 | ## Travaux dirigés 24 | -------------------------------------------------------------------------------- /docs/fr/week15/15.md: -------------------------------------------------------------------------------- 1 | --- 2 | lang: fr 3 | lang-ref: ch.15 4 | title: Semaine 15 5 | translation-date: 31 July 2022 6 | translator: Loïck Bourdois 7 | --- 8 | 9 | 10 | 17 | 18 | 19 | ## Cours magistral partie A 20 | 21 | Dans cette section, nous abordons l'apprentissage de représentations visuelles en nous concentrant sur l'apprentissage autosupervisé. Les méthodes applicables peuvent être classées en modèles génératifs, tâches de prétexte et méthodes d’enchâssements joints. Dans les modèles génératifs, on entraîne le modèle à reconstruire l'image originale à partir de l'image bruitée. Dans les tâches de prétextes, on entraîne le modèle à trouver un moyen intelligent de générer des pseudo-étiquettes. Les méthodes d’enchâssements joints tentent de rendre leur *backbone* robuste à certaines distorsions et invariant à l'augmentation des données. Les méthodes d'entraînement des JEMs peuvent être classées en quatre types : méthodes contrastives, méthodes non-contrastives, méthodes de *clustering* et les « autres méthodes ». Nous concluons en discutant des méthodes contrastives qui rapprochent les paires positives et éloignent les paires négatives. 
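Pour illustrer ce dernier point, voici une esquisse hypothétique (non tirée du cours) d'une perte contrastive de type InfoNCE : au sein d'un batch, la vue correspondante est la paire positive et les autres éléments servent de paires négatives.

```python
import torch
import torch.nn.functional as F

def info_nce(z1, z2, temperature=0.1):
    """Perte contrastive hypothétique : rapproche les paires positives (diagonale)
    et éloigne les paires négatives (le reste du batch)."""
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / temperature   # similarités cosinus, mises à l'échelle
    labels = torch.arange(z1.size(0))    # la positive de l'exemple i est z2[i]
    return F.cross_entropy(logits, labels)

z1 = torch.randn(8, 128)   # enchâssements d'une première vue augmentée
z2 = torch.randn(8, 128)   # enchâssements de la seconde vue des mêmes images
print(info_nce(z1, z2).item())
```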
22 | 23 | 24 | 25 | 30 | 31 | 32 | ## Cours magistral partie B 33 | 34 | Dans cette section, nous abordons les méthodes non-contrastives qui sont basées sur la théorie de l'information et ne nécessitent pas d'architectures ou de techniques d'ingénierie particulières. Ensuite, nous voyons les méthodes de *clustering* qui empêchent une solution triviale en quantifiant l'espace d’enchâssement. Enfin, nous parlons d’« autres méthodes » qui sont locales et ne créent pas de problème pour l’entraînement distribué contrairement aux méthodes précédentes. Nous concluons en suggérant diverses améliorations pour les JEMs par rapport à l’augmentation de données et l’architecture des réseaux. 35 | -------------------------------------------------------------------------------- /docs/images/week02/02-3/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week02/02-3/figure1.png -------------------------------------------------------------------------------- /docs/images/week03/03-3/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week03/03-3/figure1.png -------------------------------------------------------------------------------- /docs/images/week03/03-3/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week03/03-3/figure10.png -------------------------------------------------------------------------------- /docs/images/week03/03-3/figure7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week03/03-3/figure7.png -------------------------------------------------------------------------------- /docs/images/week03/03-3/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week03/03-3/figure9.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/Autoencoder_Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/Autoencoder_Arch.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/DAEOutput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/DAEOutput.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/DALL-E.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/DALL-E.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/DenoisingAutoEncoder.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/DenoisingAutoEncoder.png -------------------------------------------------------------------------------- /docs/images/week07/07-3/def.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week07/07-3/def.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/AE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/AE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/DAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/DAE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/VAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/VAE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/VAE_DAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/VAE_DAE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/VAEloss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/VAEloss.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/bubbles_z.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/bubbles_z.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/contractiveAE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/contractiveAE.png -------------------------------------------------------------------------------- /docs/images/week08/08-3/target_prop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week08/08-3/target_prop.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/10_autoencoder_cell_12_output_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/10_autoencoder_cell_12_output_2.png 
-------------------------------------------------------------------------------- /docs/images/week09/09-3/10_autoencoder_cell_12_output_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/10_autoencoder_cell_12_output_3.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/dae_noise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/dae_noise.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/dae_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/dae_output.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_10_cluster_samples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_10_cluster_samples.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_11_gan_vs_dae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_11_gan_vs_dae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_12_gan_vs_vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_12_gan_vs_vae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_1_ae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_1_ae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_2_under_over.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_2_under_over.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_3_ae_outputs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_3_ae_outputs.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_4_autoencoder_kernel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_4_autoencoder_kernel.png 
-------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_5_dae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_5_dae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_6_dae_kernels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_6_dae_kernels.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_7_dae_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_7_dae_comparison.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_8_merged_imgs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_8_merged_imgs.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/fig_9_vae.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/fig_9_vae.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/noise_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/noise_input.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/ns_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/ns_output.png -------------------------------------------------------------------------------- /docs/images/week09/09-3/telea_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week09/09-3/telea_output.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/CL_objective.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/CL_objective.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/cl_loss_fn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/cl_loss_fn.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/clustering.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/clustering.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/con_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/con_learning.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/contrastive-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/contrastive-learning.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/equipartition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/equipartition.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/moco.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/moco.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/non-imagenet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/non-imagenet.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/pirl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/pirl.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/semantic_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/semantic_features.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/soft-assignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/soft-assignment.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/ssl_trivial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/ssl_trivial.png -------------------------------------------------------------------------------- /docs/images/week10/10-1/swav.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-1/swav.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/avid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/avid.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/byol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/byol.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/cma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/cma.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/figure_1.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/seer_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/seer_1.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/seer_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/seer_2.png -------------------------------------------------------------------------------- /docs/images/week10/10-2/simsiam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-2/simsiam.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/autoencoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/autoencoder.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/decoder.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/ebm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/ebm.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/predictor.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/predictor.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/transformer.png -------------------------------------------------------------------------------- /docs/images/week10/10-3/unit_delay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week10/10-3/unit_delay.png -------------------------------------------------------------------------------- /docs/images/week11/11-1/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-1/figure1.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/Screenshot (85).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/Screenshot (85).png -------------------------------------------------------------------------------- /docs/images/week11/11-2/bs1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/bs1.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/bs2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/bs2.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/bs3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/bs3.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure10.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure11.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure12.png 
-------------------------------------------------------------------------------- /docs/images/week11/11-2/figure13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure13.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure14.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure5.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure6.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure7.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure8.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/figure9.png -------------------------------------------------------------------------------- /docs/images/week11/11-2/greedy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week11/11-2/greedy.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure1.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure10.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure11.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure11.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure12.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure13.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure14.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure15.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure16.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure17.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure2.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure3.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure4.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure5.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure6.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure6.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure7.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure8.png -------------------------------------------------------------------------------- /docs/images/week12/12-1/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-1/figure9.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure10.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure10_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure10_1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure11.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure12.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure13.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure14.png 
-------------------------------------------------------------------------------- /docs/images/week12/12-2/figure15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure15.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure16.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure17.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure18.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure19.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure2_1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure2_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure2_2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure3.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure3_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure3_1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure3_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure3_2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure4.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure4_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure4_2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure4_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure4_3.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure5.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure6.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure7.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure8.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure8_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure8_1.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure8_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure8_2.png -------------------------------------------------------------------------------- /docs/images/week12/12-2/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-2/figure9.png -------------------------------------------------------------------------------- 
/docs/images/week12/12-3/figure1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure1.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure10.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure11.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure12.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure13.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure14.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure15.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure16.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure2.png -------------------------------------------------------------------------------- /docs/images/week12/12-3/figure9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week12/12-3/figure9.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig0.png 
-------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig1.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig3.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig4.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig7.png -------------------------------------------------------------------------------- /docs/images/week15/15-1/1_fig9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-1/1_fig9.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig1.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig2.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig3.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig4.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig5.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig6.png -------------------------------------------------------------------------------- /docs/images/week15/15-2/2_fig7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/NYU-DLSP21/1d98a3ad92f9b01553306caaa6f68a003b5aedf0/docs/images/week15/15-2/2_fig7.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | title: DEEP LEARNING 4 | author: Alfredo Canziani 5 | lang-ref: home 6 | --- 7 | 8 | **DS-GA 1008 · SPRING 2021 · [NYU CENTER FOR DATA SCIENCE](http://cds.nyu.edu/)** 9 | 10 | | INSTRUCTORS | Yann LeCun & Alfredo Canziani | 11 | | LECTURES | Wednesdays 9:30 – 11:30, Zoom | 12 | | PRACTICA | Tuesdays 9:30 – 10:30, Zoom | 13 | | FORUM | [r/NYU_DeepLearning](https://www.reddit.com/r/NYU_DeepLearning/) | 14 | | DISCORD | [NYU DL](https://discord.gg/CthuqsX8Pb) | 15 | | MATERIAL | [2021 repo](https://github.com/Atcold/NYU-DLSP21) | 16 | 17 | 18 | ## 2021 edition disclaimer 19 | 20 | Check the repo's [`README.md`](https://github.com/Atcold/NYU-DLSP21/blob/master/README.md) and learn about: 21 | 22 | - The new organisation of the content 23 | - The intellectual dilemma of the semester's second half 24 | - This semester's repository 25 | - Previous releases 26 | 27 | 28 | ## Lectures 29 | 30 | Most of the lectures, labs, and notebooks are similar to those of the previous edition; nevertheless, some are brand new. 31 | I will try to make clear which is which. 32 | 33 | **Legend**: 🖥 slides, 📝 notes, 📓 Jupyter notebook, 🎥 YouTube video.
34 | 35 | 36 | ### Theme 1: Introduction 37 | 38 | * History and resources [🎥](https://youtu.be/mTtDfKgLm54) [🖥](https://drive.google.com/file/d/1vVNUye-1JNJnqP4A0704sjtF7gs_MpCI/) 39 | * Gradient descent and the backpropagation algorithm [🎥](https://youtu.be/nTlCqaL7fCY) [🖥](https://drive.google.com/file/d/1tYPYGYFDQw5IBs9wx4egCcBTTX2h9d9g/) 40 | * [Neural nets inference](https://atcold.github.io/NYU-DLSP21/en/week02/02-3/) [🎥](https://youtu.be/0TdAmZUMj2k) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/02-space_stretching.ipynb) 41 | * Modules and architectures [🎥](https://youtu.be/IYQN3i7dJIQ) [🖥](https://drive.google.com/file/d/1IaDI6BJ6g4SJbJLtNjVE_miWRzBH1-MX/) 42 | * [Neural nets training](https://atcold.github.io/NYU-DLSP21/en/week03/03-3/) [🎥](https://youtu.be/EyKiYVwrdjE) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/01%20-%20Spiral%20classification.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/04-spiral_classification.ipynb) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/05-regression.ipynb) 43 | * [Homework 1: backprop](https://drive.google.com/drive/folders/1g-uQNEi_NJyELGRMrJGXXxmARDabcXFd) 44 | 45 | 46 | ### Theme 2: Parameters sharing 47 | 48 | * Recurrent and convolutional nets [🎥](https://youtu.be/7dU3TFBJl-0) [🖥](https://drive.google.com/file/d/1GtI4ywzI84oamyr_W5k_wzgfRN139aFD/) [📝](https://drive.google.com/file/d/12jP4ssUIoGURAU8jGj6QwKXyZVdXW0o6/) 49 | * ConvNets in practice [🎥](https://youtu.be/-wz_vADGbtE) [🖥](https://drive.google.com/file/d/1WX3HoZhekL4MVvi_7VuLRYJtBGnF9JJY/) [📝](https://drive.google.com/file/d/1ToWP7e71diAeMtQ0D9pU-f0BXF4bAg46/) 50 | * Natural signals properties and the convolution [🎥](https://youtu.be/KvvNkE2vQVk) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/02%20-%20CNN.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/06-convnet.ipynb) 51 | * Recurrent neural networks, vanilla and gated (LSTM) [🎥](https://youtu.be/5KSGNomPJTE) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/04%20-%20RNN.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/08-seq_classification.ipynb) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/09-echo_data.ipynb) 52 | * [Homework 2: RNN & CNN](https://drive.google.com/drive/folders/1or1YiW0fFiZGEYy6b4EOEDgRPr0GQX0i) 53 | 54 | 55 | ### Theme 3: Energy based models, foundations 56 | 57 | * Energy based models (I) [🎥](https://youtu.be/xIn-Czj1g2Q) [🖥](https://drive.google.com/file/d/1kLUgZdRYFO5ksYHzbsRS8m8IocNiGu2J/) 58 | * Inference for LV-EBMs [🎥](https://youtu.be/xA_OPjRby5g) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/12%20-%20EBM.pdf) 59 | * What are EBMs good for? 
[🎥](https://youtu.be/eJeJWWEo7cE) 60 | * Energy based models (II) [🎥](https://youtu.be/8u2s64ZtmiA) [🖥](https://drive.google.com/file/d/1czfiEE6IPqE7q1fTm-SWOiC3VNEtpNrj/) [📝](https://drive.google.com/file/d/1IB5dkcAQ6GsHEz8Eg2hjaeQeVtT2i4Z5/) 61 | * Training LV-EBMs [🎥](https://youtu.be/XIMaWj5YjOQ) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/12%20-%20EBM.pdf) 62 | * [Homework 3: structured prediction](https://drive.google.com/drive/folders/1zGy_SnMBqaoS7_dHRmKiOFtqNV1jJJb6) 63 | 64 | 65 | ### Theme 4: Energy based models, advanced 66 | 67 | * Energy based models (III) [🎥](https://youtu.be/AOFUZZZ6KyU) [🖥](https://drive.google.com/file/d/19crFMCpJ5YCGbWv6myv7O4pGaJT6-u5p/) 68 | * [Unsup learning and autoencoders](https://atcold.github.io/NYU-DLSP21/en/week07/07-3/) [🎥](https://youtu.be/IuXsG3sN3zY) [🖥](https://drive.google.com/file/d/1aa1Hzq5KRekq32mlW4_pgIXMec18WgOg/) 69 | * Energy based models (VI) [🎥](https://youtu.be/bdebHVF__mo) [🖥](https://drive.google.com/file/d/1w6QO0a2_0Prz1U1mxa1n-YP9U8GW1_kq/) 70 | * [From LV-EBM to target prop to (any) autoencoder](https://atcold.github.io/NYU-DLSP21/en/week08/08-3/) [🎥](https://youtu.be/PpcN-F7ovK0) [🖥](https://drive.google.com/file/d/1aa1Hzq5KRekq32mlW4_pgIXMec18WgOg/) 71 | * Energy based models (V) [🎥](https://youtu.be/AQtPoDnauq4) [🖥](https://drive.google.com/file/d/1tKzrnJgptnyMcE_4zWJNP5INeVcVBWkr/) 72 | * [AEs with PyTorch and GANs](https://atcold.github.io/NYU-DLSP21/en/week09/09-3/) [🎥](https://youtu.be/bZF4N8HR1cc) [🖥](https://drive.google.com/file/d/1aa1Hzq5KRekq32mlW4_pgIXMec18WgOg/) [📓](https://github.com/Atcold/NYU-DLSP21/blob/master/10-autoencoder.ipynb) [📓](https://github.com/Atcold/NYU-DLSP21/blob/master/11-VAE.ipynb) 73 | * [Joint Embedding Methods (I)](en/week15/15-1/) [🎥](https://youtu.be/5VjEBHWuYs8) [🖥](https://drive.google.com/file/d/17NYsSagXF5wrprv7ISCEYLeEyRwCZg9r/) [🖥](https://drive.google.com/file/d/1fo5teinBim6GQX5QkzV5f5yzDjbSVraf/) 74 | * [Joint Embedding Methods (II)](en/week15/15-2/) [🎥](https://youtu.be/EBrbaD2zyuo) [🖥](https://drive.google.com/file/d/1I6kggxFK_x--UEhsKbuNHLwmqGSRFIpR/) 75 | 76 | ### Theme 5: Associative memories 77 | 78 | * Energy based models (V) [🎥](https://youtu.be/AQtPoDnauq4) [🖥](https://drive.google.com/file/d/1tKzrnJgptnyMcE_4zWJNP5INeVcVBWkr/) 79 | * [Attention & transformer](https://atcold.github.io/NYU-DLSP21/en/week10/10-3/) [🎥](https://youtu.be/fEVyfT-gLqQ) [🖥](https://drive.google.com/file/d/1MGfNPjg9YpxMcdfP2GcjluMQXlXud10C/) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/15-transformer.ipynb) 80 | 81 | 82 | ### Theme 6: Graphs 83 | 84 | * [Graph transformer nets](https://atcold.github.io/NYU-DLSP21/en/week11/11/) [[A](https://atcold.github.io/NYU-DLSP21/en/week11/11-1/)][[B](https://atcold.github.io/NYU-DLSP21/en/week11/11-2/)] [🎥](https://youtu.be/Of9s8epjflU) [🖥](https://drive.google.com/file/d/1-u2fSSICaWoFu91oiMsd2mAhg6ZGomMg/) 85 | * Graph convolutional nets (I) [from last year] [🎥](https://youtu.be/Iiv9R6BjxHM) [🖥](https://drive.google.com/file/d/1oq-nZE2bEiQjqBlmk5_N_rFC8LQY0jQr/) 86 | * Graph convolutional nets (II) [🎥](https://youtu.be/lWUh7jzhQ1Q) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/11%20-%20GCN.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/16-gated_GCN.ipynb) 87 | 88 | 89 | ### Theme 7: Control 90 | 91 | 1. [Planning and control](https://atcold.github.io/NYU-DLSP21/en/week12/12-3/) [🎥](https://youtu.be/wTg6qJlXkok) [🖥](https://drive.google.com/file/d/1JDssHbOxX_MZlmOopQaPZxuyCVoNExcM/) 92 | 2. 
The Truck Backer-Upper [🎥](https://youtu.be/C4iSZ3IJU-w) [🖥](https://github.com/Atcold/NYU-DLSP20/blob/master/slides/09%20-%20Controller%20learning.pdf) [📓](https://github.com/Atcold/NYU-DLSP20/blob/master/14-truck_backer-upper.ipynb) 93 | 3. Prediction and Planning Under Uncertainty [🎥](https://youtu.be/DJgloa244ZQ) [🖥](http://bit.ly/PPUU-slides) 94 | 95 | 96 | ### Theme 8: Optimisation 97 | * Optimisation (I) [from last year] [🎥](https://youtu.be/--NZb480zlg) [🖥](https://drive.google.com/open?id=1pwlGN6hDFfEYQqBqcMjWbe4yfBDTxsab) 98 | * Optimisation (II) [🎥](https://youtu.be/n1w5b5rTFv0) [🖥](https://drive.google.com/file/d/1ExKFOOdyUiLuk3zN5LAVwUyEoI1HJxag/) [📝](https://drive.google.com/file/d/1UJibhwdwJPZDwqlVVzeAHScPxK4TDCq5/) 99 | 100 | 101 | ### Miscellaneous 102 | 103 | * [SSL for vision](https://atcold.github.io/NYU-DLSP21/en/week10/10/) [[A](https://atcold.github.io/NYU-DLSP21/en/week10/10-1/)][[B](https://atcold.github.io/NYU-DLSP21/en/week10/10-2/)] [🎥](https://youtu.be/8L10w1KoOU8) [🖥](https://drive.google.com/file/d/1BQlWMVesOcioW69RCKWCjp6280Q42W9q/) 104 | * [Low resource machine translation](https://atcold.github.io/NYU-DLSP21/en/week12/12/) [[A](https://atcold.github.io/NYU-DLSP21/en/week12/12-1/)][[B](https://atcold.github.io/NYU-DLSP21/en/week12/12-2/)] [🎥](https://youtu.be/fR42OOy9ROo) [🖥](https://drive.google.com/file/d/1pm1fM1DFqCHrjGorCQCwg5SgMjwZBwGR/) 105 | * Lagrangian backprop, final project, and Q&A [🎥](https://youtu.be/MJfnamMFylo) [🖥](https://drive.google.com/file/d/1Z9tkkTpsHzcyoPN9yqq8Nv_Bnw5bghEK/) [📝](https://drive.google.com/file/d/1BMoaE7I-IwZF32YfASiTw1OnMblWAVGb/) 106 | -------------------------------------------------------------------------------- /docs/serve.sh: -------------------------------------------------------------------------------- 1 | /opt/homebrew/lib/ruby/gems/3.2.0/bin/jekyll serve --trace --baseurl '/NYU-DLSP21' 2 | -------------------------------------------------------------------------------- /docs/static: -------------------------------------------------------------------------------- 1 | jekyllbook/static/ -------------------------------------------------------------------------------- /res/plot_lib.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | import numpy as np 3 | import torch 4 | from IPython.display import clear_output 5 | from matplotlib import pyplot as plt 6 | 7 | 8 | def set_default(figsize=(10, 10), dpi=100): 9 | plt.style.use(['dark_background', 'bmh']) 10 | plt.rc('axes', facecolor='k') 11 | plt.rc('figure', facecolor='k') 12 | plt.rc('figure', figsize=figsize, dpi=dpi) 13 | 14 | 15 | def plot_data(X, y, d=0, auto=False, zoom=1, title='Training data (x, y)'): 16 | X = X.cpu() 17 | y = y.cpu() 18 | s = plt.scatter(X.numpy()[:, 0], X.numpy()[:, 1], c=y, s=20, cmap=plt.cm.Spectral) 19 | plt.axis('square') 20 | plt.axis(np.array((-1.1, 1.1, -1.1, 1.1)) * zoom) 21 | if auto is True: plt.axis('equal') 22 | plt.axis('off') 23 | 24 | _m, _c = 0, '.35' 25 | plt.axvline(0, ymin=_m, color=_c, lw=1) 26 | plt.axhline(0, xmin=_m, color=_c, lw=1) 27 | plt.title(title) 28 | return s 29 | 30 | 31 | def plot_model(X, y, model): 32 | model.cpu() 33 | mesh = torch.arange(-1.1, 1.11, 0.01) 34 | xx, yy = torch.meshgrid(mesh, mesh, indexing='xy') 35 | with torch.no_grad(): 36 | data = torch.stack((xx.reshape(-1), yy.reshape(-1)), dim=1) 37 | Z = model(data) 38 | Z = Z.argmax(dim=1).reshape(xx.shape) 39 | plt.contourf(xx.numpy(), yy.numpy(), Z, cmap=plt.cm.Spectral, 
alpha=0.3) 40 | plot_data(X, y) 41 | plt.title('Model decision boundaries') 42 | 43 | 44 | def plot_embeddings(X, y, model, zoom=10): 45 | # Use forward hook to get internal embeddings of the second last layer 46 | layer_outputs = {} 47 | 48 | def get_layer_outputs(name): 49 | def hook(model, input, output): 50 | layer_outputs[name] = output 51 | 52 | return hook 53 | 54 | layer = model[-2] 55 | 56 | if layer.__class__ == torch.nn.modules.linear.Linear and layer.out_features == 2: 57 | layer.register_forward_hook(get_layer_outputs("low_dim_embeddings")) 58 | with torch.no_grad(): 59 | model(X) # pass data through model to populate layer_outputs 60 | plot_data( 61 | layer_outputs["low_dim_embeddings"], 62 | y, 63 | zoom=zoom, 64 | title="Low dim embeddings", 65 | ) 66 | last_layer = model[-1] 67 | mesh = torch.arange(-1.1, 1.1, 0.01) * zoom 68 | xx, yy = torch.meshgrid(mesh, mesh, indexing="ij") 69 | with torch.no_grad(): 70 | data = torch.stack((xx.reshape(-1), yy.reshape(-1)), dim=1) 71 | Z = last_layer(data) 72 | Z = Z.argmax(dim=1).reshape(xx.shape) 73 | plt.contourf(xx.numpy(), yy.numpy(), Z, cmap=plt.cm.Spectral, alpha=0.3, levels=y.max().item()) 74 | else: 75 | print( 76 | "Cannot plot: second-last layer is not a linear layer" 77 | f" with output in R^2 (it is {layer})" 78 | ) 79 | 80 | 81 | def acc(l, y): 82 | score, predicted = torch.max(l, 1) 83 | return (y == predicted).sum().float() / len(y) 84 | 85 | 86 | def overwrite(string): 87 | print(string) 88 | clear_output(wait=True) 89 | 90 | 91 | def plot_2d_energy_levels(X, y, energy, v=None, l=None): 92 | xx, yy, F, k, K = energy 93 | if not v: vmin = vmax = None 94 | else: vmin, vmax = v 95 | if not l: levels = None 96 | else: levels = torch.arange(l[0], l[1], l[2]) 97 | plt.figure(figsize=(12, 10)) 98 | plt.pcolormesh(xx.numpy(), yy.numpy(), F, vmin=vmin, vmax=vmax) 99 | plt.colorbar() 100 | cnt = plt.contour(xx.numpy(), yy.numpy(), F, colors='w', linewidths=1, levels=levels) 101 | plt.clabel(cnt, inline=True, fontsize=10, colors='w') 102 | s = plot_data(X, y) 103 | plt.legend(*s.legend_elements(), title='Classes', loc='lower right') 104 | plt.axvline(color='0.55', lw=1) 105 | plt.axhline(color='0.55', lw=1) 106 | plt.axis([-1.5, 1.5, -1.5, 1.5]) 107 | ȳ = torch.zeros(K).int(); ȳ[k] = 1 108 | plt.title(f'Free energy F(x, y = {ȳ.tolist()})') 109 | 110 | 111 | def plot_3d_energy_levels(X, y, energy, v=None, l=None, cbl=None): 112 | xx, yy, F, k, K = energy 113 | if not v: vmin = vmax = None 114 | else: vmin, vmax = v 115 | if not l: levels = None 116 | else: levels = torch.arange(l[0], l[1], l[2]) 117 | fig = plt.figure(figsize=(9.5, 6), facecolor='k') 118 | ax = fig.add_subplot(projection='3d') 119 | cnt = ax.contour(xx.numpy(), yy.numpy(), F, levels=levels, vmin=vmin, vmax=vmax) 120 | ax.scatter(X[:,0], X[:,1], zs=0, c=y, cmap=plt.cm.Spectral) 121 | ax.xaxis.set_pane_color(color=(0,0,0)) 122 | ax.yaxis.set_pane_color(color=(0,0,0)) 123 | ax.zaxis.set_pane_color(color=(0,0,0)) 124 | 125 | vmin, vmax = cnt.get_clim() 126 | ax.set_zlim3d(vmin, vmax) 127 | norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax) 128 | if not cbl: cbl = l 129 | else: cbl = torch.arange(cbl[0], cbl[1], cbl[2]) 130 | sm = plt.cm.ScalarMappable(norm=norm, cmap=cnt.cmap) 131 | sm.set_array([]) 132 | fig.colorbar(sm, ticks=cbl, ax=ax) 133 | ȳ = torch.zeros(K).int(); ȳ[k] = 1 134 | plt.title(f'Free energy F(x, y = {ȳ.tolist()})') 135 | plt.tight_layout() 136 | return fig, ax 137 | --------------------------------------------------------------------------------
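A minimal, hypothetical usage sketch of the `res/plot_lib.py` helpers above (not part of the repository): it assumes the repo root is on the Python path, substitutes a small two-blob dataset for the spiral data, and calls only functions defined above (`set_default`, `plot_model`, `acc`) with their stated signatures. The data, model, and training loop here are illustrative placeholders.

import torch
from torch import nn
from matplotlib import pyplot as plt
from res.plot_lib import set_default, plot_model, acc  # helpers defined above

set_default()                    # dark-background plotting defaults used by the notebooks

# Toy two-class data inside the [-1, 1] square (a stand-in for the spiral dataset)
torch.manual_seed(0)
n = 500
X = 0.2 * torch.randn(2 * n, 2)
X[:n] += 0.5                     # class-0 blob, upper right
X[n:] -= 0.5                     # class-1 blob, lower left
y = torch.cat((torch.zeros(n), torch.ones(n))).long()

# Small classifier; the penultimate layer outputs 2-D features, so plot_embeddings would also apply
model = nn.Sequential(nn.Linear(2, 100), nn.ReLU(), nn.Linear(100, 2), nn.Linear(2, 2))
optimiser = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

for _ in range(200):
    loss = criterion(model(X), y)
    optimiser.zero_grad(); loss.backward(); optimiser.step()

print(f'training accuracy: {acc(model(X), y).item():.2f}')
plot_model(X, y, model)          # scatters the data and shades the decision regions
plt.show()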