├── README.md ├── apps └── mlp_resnet.py ├── data ├── t10k-images-idx3-ubyte.gz ├── t10k-labels-idx1-ubyte.gz ├── train-images-idx3-ubyte.gz └── train-labels-idx1-ubyte.gz ├── figures ├── mlp_resnet.png └── residualblock.png ├── hw2.ipynb ├── python └── needle │ ├── __init__.py │ ├── autograd.py │ ├── backend_numpy.py │ ├── data │ ├── __init__.py │ ├── data_basic.py │ ├── data_transforms.py │ └── datasets │ │ ├── __init__.py │ │ ├── mnist_dataset.py │ │ └── ndarray_dataset.py │ ├── init │ ├── __init__.py │ ├── init_basic.py │ └── init_initializers.py │ ├── nn │ ├── __init__.py │ └── nn_basic.py │ ├── ops │ ├── __init__.py │ ├── ops_logarithmic.py │ ├── ops_mathematic.py │ └── ops_tuple.py │ └── optim.py └── tests └── hw2 ├── test_data.py └── test_nn_and_optim.py /README.md: -------------------------------------------------------------------------------- 1 | # Homework 2 2 | 3 | Public repository and stub/testing code for Homework 2 of 10-714. 4 | 5 | -------------------------------------------------------------------------------- /apps/mlp_resnet.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../python") 4 | import needle as ndl 5 | import needle.nn as nn 6 | import numpy as np 7 | import time 8 | import os 9 | 10 | np.random.seed(0) 11 | # MY_DEVICE = ndl.backend_selection.cuda() 12 | 13 | 14 | def ResidualBlock(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1): 15 | ### BEGIN YOUR SOLUTION 16 | raise NotImplementedError() 17 | ### END YOUR SOLUTION 18 | 19 | 20 | def MLPResNet( 21 | dim, 22 | hidden_dim=100, 23 | num_blocks=3, 24 | num_classes=10, 25 | norm=nn.BatchNorm1d, 26 | drop_prob=0.1, 27 | ): 28 | ### BEGIN YOUR SOLUTION 29 | raise NotImplementedError() 30 | ### END YOUR SOLUTION 31 | 32 | 33 | def epoch(dataloader, model, opt=None): 34 | np.random.seed(4) 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION 38 | 39 | 40 | def train_mnist( 41 | batch_size=100, 42 | epochs=10, 43 | optimizer=ndl.optim.Adam, 44 | lr=0.001, 45 | weight_decay=0.001, 46 | hidden_dim=100, 47 | data_dir="data", 48 | ): 49 | np.random.seed(4) 50 | ### BEGIN YOUR SOLUTION 51 | raise NotImplementedError() 52 | ### END YOUR SOLUTION 53 | 54 | 55 | if __name__ == "__main__": 56 | train_mnist(data_dir="../data") 57 | -------------------------------------------------------------------------------- /data/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /data/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /figures/mlp_resnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/figures/mlp_resnet.png -------------------------------------------------------------------------------- /figures/residualblock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/figures/residualblock.png -------------------------------------------------------------------------------- /hw2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 10-714 Homework 2\n", 8 | "\n", 9 | "In this homework, you will be implementing a neural network library in the needle framework. Reminder: __you must save a copy in drive__." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Code to set up the assignment\n", 19 | "from google.colab import drive\n", 20 | "drive.mount('/content/drive')\n", 21 | "%cd /content/drive/MyDrive/\n", 22 | "!mkdir -p 10714\n", 23 | "%cd /content/drive/MyDrive/10714\n", 24 | "!git clone https://github.com/dlsys10714/hw2.git\n", 25 | "%cd /content/drive/MyDrive/10714/hw2\n", 26 | "\n", 27 | "!pip3 install --upgrade --no-deps git+https://github.com/dlsys10714/mugrade.git" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Question 0\n", 35 | "\n", 36 | "This homework builds off of Homework 1. First, in your Homework 2 directory, copy the files `python/needle/autograd.py`, `python/needle/ops/ops_mathematic.py` from your Homework 1.\n", 37 | "\n", 38 | "***NOTE***: The default data type for the tensor is `float32`. If you want to change the data type, you can do so by setting the `dtype` parameter in the `Tensor` constructor. For example, `Tensor([1, 2, 3], dtype='float64')` will create a tensor with `float64` data type. \n", 39 | "In this homework, **make sure any tensor you create has `float32` data type to avoid any issues with the autograder**." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import sys\n", 49 | "sys.path.append('./python')\n", 50 | "sys.path.append('./apps')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "tags": [] 57 | }, 58 | "source": [ 59 | "## Question 1\n", 60 | "\n", 61 | "In this first question, you will implement a few different methods for weight initialization. This will be done in the `python/needle/init/init_initializers.py` file, which contains a number of routines for initializing needle Tensors using various random and constant initializations. Following the same methodology of the existing initializers (you will want to call e.g. `init.rand` or `init.randn` implemented in `python/needle/init/init_basic.py` from your functions below, implement the following common initialization methods. 
In all cases, the functions should return `fan_in` by `fan_out` 2D tensors (extensions to other sizes can be done via e.g., reshaping).\n", 62 | "\n", 63 | "\n", 64 | "### Xavier uniform\n", 65 | "`xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs)`\n", 66 | "\n", 67 | "Fills the input Tensor with values according to the method described in [Understanding the difficulty of training deep feedforward neural networks](https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf), using a uniform distribution. The resulting Tensor will have values sampled from $\\mathcal{U}(-a, a)$ where \n", 68 | "\\begin{equation}\n", 69 | "a = \\text{gain} \\times \\sqrt{\\frac{6}{\\text{fan_in} + \\text{fan_out}}}\n", 70 | "\\end{equation}\n", 71 | "\n", 72 | "Pass remaining `**kwargs` parameters to the corresponding `init` random call.\n", 73 | "\n", 74 | "##### Parameters\n", 75 | "- `fan_in` - dimensionality of input\n", 76 | "- `fan_out` - dimensionality of output\n", 77 | "- `gain` - optional scaling factor\n", 78 | "___\n", 79 | "\n", 80 | "### Xavier normal\n", 81 | "`xavier_normal(fan_in, fan_out, gain=1.0, **kwargs)`\n", 82 | "\n", 83 | "Fills the input Tensor with values according to the method described in [Understanding the difficulty of training deep feedforward neural networks](https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf), using a normal distribution. The resulting Tensor will have values sampled from $\\mathcal{N}(0, \\text{std}^2)$ where \n", 84 | "\\begin{equation}\n", 85 | "\\text{std} = \\text{gain} \\times \\sqrt{\\frac{2}{\\text{fan_in} + \\text{fan_out}}}\n", 86 | "\\end{equation}\n", 87 | "\n", 88 | "##### Parameters\n", 89 | "- `fan_in` - dimensionality of input\n", 90 | "- `fan_out` - dimensionality of output\n", 91 | "- `gain` - optional scaling factor\n", 92 | "___\n", 93 | "\n", 94 | "### Kaiming uniform\n", 95 | "`kaiming_uniform(fan_in, fan_out, nonlinearity=\"relu\", **kwargs)`\n", 96 | "\n", 97 | "Fills the input Tensor with values according to the method described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf), using a uniform distribution. The resulting Tensor will have values sampled from $\\mathcal{U}(-\\text{bound}, \\text{bound})$ where \n", 98 | "\\begin{equation}\n", 99 | "\\text{bound} = \\text{gain} \\times \\sqrt{\\frac{3}{\\text{fan_in}}}\n", 100 | "\\end{equation}\n", 101 | "\n", 102 | "Use the recommended gain value for ReLU: $\\text{gain}=\\sqrt{2}$.\n", 103 | "\n", 104 | "##### Parameters\n", 105 | "- `fan_in` - dimensionality of input\n", 106 | "- `fan_out` - dimensionality of output\n", 107 | "- `nonlinearity` - the non-linear function\n", 108 | "___\n", 109 | "\n", 110 | "### Kaiming normal\n", 111 | "`kaiming_normal(fan_in, fan_out, nonlinearity=\"relu\", **kwargs)`\n", 112 | "\n", 113 | "Fills the input Tensor with values according to the method described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf), using a uniform distribution. 
The resulting Tensor will have values sampled from $\\mathcal{N}(0, \\text{std}^2)$ where \n", 114 | "\\begin{equation}\n", 115 | "\\text{std} = \\frac{\\text{gain}}{\\sqrt{\\text{fan_in}}}\n", 116 | "\\end{equation}\n", 117 | "\n", 118 | "Use the recommended gain value for ReLU: $\\text{gain}=\\sqrt{2}$.\n", 119 | "\n", 120 | "##### Parameters\n", 121 | "- `fan_in` - dimensionality of input\n", 122 | "- `fan_out` - dimensionality of output\n", 123 | "- `nonlinearity` - the non-linear function" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "!python3 -m pytest -v -k \"test_init\"" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"init\" -s" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "## Question 2\n", 149 | "\n", 150 | "In this question, you will implement additional modules in `python/needle/nn/nn_basic.py`. Specifically, for the following modules described below, initialize any variables of the module in the constructor, and fill out the `forward` method. **Note:** Be sure that you are using the `init` functions that you just implemented to initialize the parameters, and don't forget to pass the `dtype` argument.\n", 151 | "___\n", 152 | "\n", 153 | "### Linear\n", 154 | "`needle.nn.Linear(in_features, out_features, bias=True, device=None, dtype=\"float32\")`\n", 155 | "\n", 156 | "Applies a linear transformation to the incoming data: $y = xA^T + b$. The input shape is $(N, H_{in})$ where $H_{in}=\\text{in_features}$. The output shape is $(N, H_{out})$ where $H_{out}=\\text{out_features}$.\n", 157 | "\n", 158 | "**Be careful to explicitly broadcast the bias term to the correct shape -- Needle does not support implicit broadcasting.**\n", 159 | "\n", 160 | "**Note: for all layers including this one, you should initialize the weight Tensor before the bias Tensor, and should initialize all Parameters using only functions from `init`**. This does not affect the algorithm's correctness. It is only necessary to ensure the value matches the expected results in the mugrade tests for this assignment's implementation scope. \n", 161 | "\n", 162 | "##### Parameters\n", 163 | "- `in_features` - size of each input sample\n", 164 | "- `out_features` - size of each output sample\n", 165 | "- `bias` - If set to `False`, the layer will not learn an additive bias.\n", 166 | "\n", 167 | "##### Variables\n", 168 | "- `weight` - the learnable weights of shape (`in_features`, `out_features`). The values should be initialized with the Kaiming Uniform initialization with `fan_in = in_features`\n", 169 | "- `bias` - the learnable bias of shape (`out_features`). The values should be initialized with the Kaiming Uniform initialize with `fan_in = out_features`. **Note the difference in fan_in choice, due to their relative sizes**. \n", 170 | "\n", 171 | "Make sure to enclose all necessary variables e.g. (`weight`, `bias`) in the `Parameter` class so that they are visible to the optimizers which would be implemented next." 
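To make the intended wiring concrete, here is a minimal sketch (not the reference solution) of how an initializer from Question 1 and the `Linear` module could fit together. It assumes `Module` and `Parameter` are exposed by `needle.nn` as in the provided stubs, and that forwarding `device`/`dtype` through `**kwargs` to `init.rand` satisfies the `float32` requirement:

```python
import math
import needle.init as init
from needle.nn import Module, Parameter


def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs):
    # bound = gain * sqrt(3 / fan_in), with gain = sqrt(2) for ReLU
    gain = math.sqrt(2)
    bound = gain * math.sqrt(3 / fan_in)
    return init.rand(fan_in, fan_out, low=-bound, high=bound, **kwargs)


class Linear(Module):
    def __init__(self, in_features, out_features, bias=True, device=None, dtype="float32"):
        super().__init__()
        # weight is created before bias, both via init functions, both wrapped in Parameter
        self.weight = Parameter(
            kaiming_uniform(in_features, out_features, device=device, dtype=dtype)
        )
        self.bias = None
        if bias:
            self.bias = Parameter(
                kaiming_uniform(out_features, 1, device=device, dtype=dtype)
                .reshape((1, out_features))
            )

    def forward(self, X):
        out = X @ self.weight
        if self.bias is not None:
            # needle has no implicit broadcasting, so broadcast the bias explicitly
            out = out + self.bias.broadcast_to(out.shape)
        return out
```

Storing `weight` with shape `(in_features, out_features)` means the forward pass is simply `X @ weight`, matching $y = xA^T + b$ without an explicit transpose.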
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "!python3 -m pytest -v -k \"test_nn_linear\"" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_linear\"" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "### ReLU\n", 197 | "`needle.nn.ReLU()`\n", 198 | "\n", 199 | "Applies the rectified linear unit function element-wise:\n", 200 | "$ReLU(x) = max(0, x)$.\n", 201 | "\n", 202 | "If you have previously implemented ReLU's backwards pass in terms of itself, note that this is numerically unstable and will likely cause problems\n", 203 | "down the line.\n", 204 | "Instead, consider that we could write the derivative of ReLU as $I\\{x>0\\}$, where we arbitrarily decide that the derivative at $x=0$ is 0.\n", 205 | "(This is a _subdifferentiable_ function.)\n", 206 | "\n", 207 | "___" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "!python3 -m pytest -v -k \"test_nn_relu\"" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_relu\"" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "tags": [] 232 | }, 233 | "source": [ 234 | "### Sequential\n", 235 | "`needle.nn.Sequential(*modules)`\n", 236 | "\n", 237 | "Applies a sequence of modules to the input (in the order that they were passed to the constructor) and returns the output of the last module.\n", 238 | "These should be kept in a `.module` property: you should _not_ redefine any magic methods like `__getitem__`, as this may not be compatible with our tests.\n", 239 | "\n", 240 | "##### Parameters\n", 241 | "- `*modules` - any number of modules of type `needle.nn.Module`\n", 242 | "\n", 243 | "___" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "!python3 -m pytest -v -k \"test_nn_sequential\"" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_sequential\"" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "tags": [] 268 | }, 269 | "source": [ 270 | "### LogSumExp\n", 271 | "\n", 272 | "`needle.ops.LogSumExp(axes)`\n", 273 | "\n", 274 | "Applies a numerically stable log-sum-exp function to the input by subtracting off the maximum elements. You will need to implement this and the next operation in file `python/needle/ops/ops_logarithmic.py`.\n", 275 | "\n", 276 | "\\begin{equation}\n", 277 | "\\text{LogSumExp}(z) = \\log (\\sum_{i} \\exp (z_i - \\max{z})) + \\max{z}\n", 278 | "\\end{equation}\n", 279 | "\n", 280 | "#### Parameters\n", 281 | "- `axes` - Tuple of axes to sum and take the maximum element over. 
This uses the same conventions as `needle.ops.Summation()`\n", 282 | "\n", 283 | "___" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "!python3 -m pytest -v -k \"test_op_logsumexp\"" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"op_logsumexp\"" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "### LogSoftmax\n", 309 | "\n", 310 | "`needle.ops.LogSoftmax(axes)`\n", 311 | "\n", 312 | "Applies a numerically stable logsoftmax function to the input by subtracting off the maximum elements. Assume the input NDArray is 2 dimensional and we are doing softmax over `axis=1`.\n", 313 | "\n", 314 | "\\begin{equation}\n", 315 | "\\text{LogSoftmax}(z) = \\log \\left(\\frac{\\exp(z_i - \\max z)}{\\sum_{i}\\exp(z_i - \\max z)}\\right) = z - \\text{LogSumExp}(z)\n", 316 | "\\end{equation}\n", 317 | "___" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "!python3 -m pytest -v -k \"test_op_logsoftmax\"" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"op_logsoftmax\"" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "tags": [] 342 | }, 343 | "source": [ 344 | "### SoftmaxLoss\n", 345 | "\n", 346 | "`needle.nn.SoftmaxLoss()`\n", 347 | "\n", 348 | "Applies the softmax loss as defined below (and as implemented in Homework 1), taking in as input a Tensor of logits and a Tensor of the true labels (expressed as a list of numbers, *not* one-hot encoded).\n", 349 | "\n", 350 | "Note that you can use the `init.one_hot` function now instead of writing this yourself. 
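For instance, a small illustration of `init.one_hot` (the `(n, i)` argument order mirrors the device helper in `backend_numpy.py`; the exact keyword arguments are not shown here, so check `init_basic.py`):

```python
import needle as ndl
import needle.init as init

logits = ndl.Tensor([[1.0, 2.0, 0.5],
                     [0.1, 0.2, 3.0]])          # (batch, k) scores
y = ndl.Tensor([1, 2], dtype="float32")         # true class indices
y_one_hot = init.one_hot(3, y)                  # (batch, k) matrix of 0s and 1s
z_y = (logits * y_one_hot).sum(axes=(1,))       # selects z_y for each sample
```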
Note: You will need to use the numerically stable logsumexp operator you just implemented for this purpose.\n", 351 | "\n", 352 | "\\begin{equation}\n", 353 | "\\ell_\\text{softmax}(z,y) = \\log \\sum_{i=1}^k \\exp z_i - z_y\n", 354 | "\\end{equation}\n", 355 | "\n", 356 | "___" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "!python3 -m pytest -v -k \"test_nn_softmax_loss\"" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_softmax_loss\"" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": { 380 | "tags": [] 381 | }, 382 | "source": [ 383 | "### LayerNorm1d\n", 384 | "`needle.nn.LayerNorm1d(dim, eps=1e-5, device=None, dtype=\"float32\")`\n", 385 | "\n", 386 | "Applies layer normalization over a mini-batch of inputs as described in the paper [Layer Normalization](https://arxiv.org/abs/1607.06450).\n", 387 | "\n", 388 | "\\begin{equation}\n", 389 | "y = w \\circ \\frac{x_i - \\textbf{E}[x]}{((\\textbf{Var}[x]+\\epsilon)^{1/2})} + b\n", 390 | "\\end{equation}\n", 391 | "\n", 392 | "where $\\textbf{E}[x]$ denotes the empirical mean of the inputs, $\\textbf{Var}[x]$ denotes their empirical variance (note that here we are using the \"biased\" estimate of the variance, i.e., dividing by $N$ rather than by $N-1$), and $w$ and $b$ denote learnable scalar weights and biases respectively. Note you can assume the input to this layer is a 2D tensor, with batches in the first dimension and features in the second. You might need to broadcast the weight and bias before applying them.\n", 393 | "\n", 394 | "##### Parameters\n", 395 | "- `dim` - number of channels\n", 396 | "- `eps` - a value added to the denominator for numerical stability.\n", 397 | "\n", 398 | "##### Variables\n", 399 | "- `weight` - the learnable weights of size `dim`, elements initialized to 1.\n", 400 | "- `bias` - the learnable bias of shape `dim`, elements initialized to 0.\n", 401 | "___" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "!python3 -m pytest -v -k \"test_nn_layernorm\"" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_layernorm\"" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "\n", 427 | "### Flatten\n", 428 | "`needle.nn.Flatten()`\n", 429 | "\n", 430 | "Takes in a tensor of shape `(B,X_0,X_1,...)`, and flattens all non-batch dimensions so that the output is of shape `(B, X_0 * X_1 * ...)`" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "!python3 -m pytest -v -k \"test_nn_flatten\"" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_flatten\"" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "### BatchNorm1d\n", 456 | "`needle.nn.BatchNorm1d(dim, eps=1e-5, momentum=0.1, device=None, dtype=\"float32\")`\n", 457 | "\n", 
458 | "Applies batch normalization over a mini-batch of inputs as described in the paper [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167).\n", 459 | "\n", 460 | "\\begin{equation}\n", 461 | "y = w \\circ \\frac{z_i - \\textbf{E}[x]}{((\\textbf{Var}[x]+\\epsilon)^{1/2})} + b\n", 462 | "\\end{equation}\n", 463 | "\n", 464 | "but where here the mean and variance refer to to the mean and variance over the _batch_dimensions. The function also computes a running average of mean/variance for all features at each layer $\\hat{\\mu}, \\hat{\\sigma}^2$, and at test time normalizes by these quantities:\n", 465 | "\n", 466 | "\\begin{equation}\n", 467 | "y = \\frac{(x - \\hat{mu})}{((\\hat{\\sigma}^2_{i+1})_j+\\epsilon)^{1/2}}\n", 468 | "\\end{equation}\n", 469 | "\n", 470 | "\n", 471 | "BatchNorm uses the running estimates of mean and variance instead of batch statistics at test time, i.e.,\n", 472 | "after `model.eval()` has been called on the BatchNorm layer's `training` flag is false.\n", 473 | "\n", 474 | "To compute the running estimates, you can use the equation $$\\hat{x_{new}} = (1 - m) \\hat{x_{old}} + mx_{observed},$$\n", 475 | "where $m$ is momentum.\n", 476 | "\n", 477 | "##### Parameters\n", 478 | "- `dim` - input dimension\n", 479 | "- `eps` - a value added to the denominator for numerical stability.\n", 480 | "- `momentum` - the value used for the running mean and running variance computation.\n", 481 | "\n", 482 | "##### Variables\n", 483 | "- `weight` - the learnable weights of size `dim`, elements initialized to 1.\n", 484 | "- `bias` - the learnable bias of size `dim`, elements initialized to 0.\n", 485 | "- `running_mean` - the running mean used at evaluation time, elements initialized to 0.\n", 486 | "- `running_var` - the running (unbiased) variance used at evaluation time, elements initialized to 1. \n", 487 | "\n", 488 | "___" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "!python3 -m pytest -v -k \"test_nn_batchnorm\"" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_batchnorm\"" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "### Dropout\n", 514 | "`needle.nn.Dropout(p = 0.5)`\n", 515 | "\n", 516 | "During training, randomly zeroes some of the elements of the input tensor with probability `p` using samples from a Bernoulli distribution. This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons as described in the paper [Improving neural networks by preventing co-adaption of feature detectors](https://arxiv.org/abs/1207.0580). During evaluation the module simply computes an identity function. \n", 517 | "\n", 518 | "\\begin{equation}\n", 519 | "\\hat{z}_{i+1} = \\sigma_i (W_i^T z_i + b_i) \\\\\n", 520 | "(z_{i+1})_j = \n", 521 | " \\begin{cases}\n", 522 | " (\\hat{z}_{i+1})_j /(1-p) & \\text{with probability } 1-p \\\\\n", 523 | " 0 & \\text{with probability } p \\\\\n", 524 | " \\end{cases}\n", 525 | "\\end{equation}\n", 526 | "\n", 527 | "**Important**: If the Dropout module the flag `training=False`, you shouldn't \"dropout\" any weights. That is, dropout applies during training only, not during evaluation. 
Note that `training` is a flag in `nn.Module`.\n", 528 | "\n", 529 | "##### Parameters\n", 530 | "- `p` - the probability of an element to be zeroed.\n", 531 | "\n", 532 | "___" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "!python3 -m pytest -v -k \"test_nn_dropout\"" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_dropout\"" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": { 556 | "tags": [] 557 | }, 558 | "source": [ 559 | "### Residual\n", 560 | "`needle.nn.Residual(fn: Module)`\n", 561 | "\n", 562 | "Applies a residual or skip connection given module $\\mathcal{F}$ and input Tensor $x$, returning $\\mathcal{F}(x) + x$.\n", 563 | "##### Parameters\n", 564 | "- `fn` - module of type `needle.nn.Module`" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "!python3 -m pytest -v -k \"test_nn_residual\"" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_residual\"" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": { 588 | "tags": [] 589 | }, 590 | "source": [ 591 | "## Question 3\n", 592 | "\n", 593 | "Implement the `step` function of the following optimizers in `python/needle/optim.py`.\n", 594 | "Make sure that your optimizers _don't_ modify the gradients of tensors in-place.\n", 595 | "\n", 596 | "We have included some tests to ensure that you are not consuming excessive memory, which can happen if you are\n", 597 | "not using `.data` or `.detach()` in the right places, thus building an increasingly large computational graph\n", 598 | "(not just in the optimizers, but in the previous modules as well).\n", 599 | "You can ignore these tests, which include the string `memory_check` at your own discretion.\n", 600 | "\n", 601 | "___\n", 602 | "\n", 603 | "### SGD\n", 604 | "`needle.optim.SGD(params, lr=0.01, momentum=0.0, weight_decay=0.0)`\n", 605 | "\n", 606 | "Implements stochastic gradient descent (optionally with momentum, shown as $\\beta$ below). 
\n", 607 | "\n", 608 | "\\begin{equation}\n", 609 | "\\begin{split}\n", 610 | " u_{t+1} &= \\beta u_t + (1-\\beta) \\nabla_\\theta f(\\theta_t) \\\\\n", 611 | " \\theta_{t+1} &= \\theta_t - \\alpha u_{t+1}\n", 612 | "\\end{split}\n", 613 | "\\end{equation}\n", 614 | "\n", 615 | "##### Parameters\n", 616 | "- `params` - iterable of parameters of type `needle.nn.Parameter` to optimize\n", 617 | "- `lr` (*float*) - learning rate\n", 618 | "- `momentum` (*float*) - momentum factor\n", 619 | "- `weight_decay` (*float*) - weight decay (L2 penalty)\n", 620 | "___" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "!python3 -m pytest -v -k \"test_optim_sgd\"" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"optim_sgd\"" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": { 644 | "tags": [] 645 | }, 646 | "source": [ 647 | "### Adam\n", 648 | "`needle.optim.Adam(params, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.0)`\n", 649 | "\n", 650 | "Implements Adam algorithm, proposed in [Adam: A Method for Stochastic Optimization](https://arxiv.org/abs/1412.6980). \n", 651 | "\n", 652 | "\\begin{equation}\n", 653 | "\\begin{split}\n", 654 | "u_{t+1} &= \\beta_1 u_t + (1-\\beta_1) \\nabla_\\theta f(\\theta_t) \\\\\n", 655 | "v_{t+1} &= \\beta_2 v_t + (1-\\beta_2) (\\nabla_\\theta f(\\theta_t))^2 \\\\\n", 656 | "\\hat{u}_{t+1} &= u_{t+1} / (1 - \\beta_1^t) \\quad \\text{(bias correction)} \\\\\n", 657 | "\\hat{v}_{t+1} &= v_{t+1} / (1 - \\beta_2^t) \\quad \\text{(bias correction)}\\\\\n", 658 | "\\theta_{t+1} &= \\theta_t - \\alpha \\hat{u_{t+1}}/(\\hat{v}_{t+1}^{1/2}+\\epsilon)\n", 659 | "\\end{split}\n", 660 | " \\end{equation}\n", 661 | "\n", 662 | "**Important:** Pay attention to whether or not you are applying bias correction.\n", 663 | "\n", 664 | "##### Parameters\n", 665 | "- `params` - iterable of parameters of type `needle.nn.Parameter` to optimize\n", 666 | "- `lr` (*float*) - learning rate\n", 667 | "- `beta1` (*float*) - coefficient used for computing running average of gradient\n", 668 | "- `beta2` (*float*) - coefficient used for computing running average of square of gradient\n", 669 | "- `eps` (*float*) - term added to the denominator to improve numerical stability\n", 670 | "- `weight_decay` (*float*) - weight decay (L2 penalty)\n", 671 | "\n", 672 | "**Hint**: To help deal with memory issues, try to understand how to use `.data` or `.detach()`" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "!python3 -m pytest -v -k \"test_optim_adam\"" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"optim_adam\"" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "## Question 4\n", 698 | "\n", 699 | "In this question, you will implement two data primitives: `needle.data.DataLoader` and `needle.data.Dataset`. `Dataset` stores the samples and their corresponding labels, and `DataLoader` wraps an iterable around the `Dataset` to enable easy access to the samples. 
\n", 700 | "\n", 701 | "For this question, you will be working in the `python/needle/data` directory. \n", 702 | "\n", 703 | "### Transformations\n", 704 | "\n", 705 | "First we will implement a few transformations that are helpful when working with images. We will stick with a horizontal flip and a random crop for now. Fill out the following functions in `needle/data/data_transforms.py`.\n", 706 | "___ \n", 707 | "\n", 708 | "#### RandomFlipHorizontal\n", 709 | "`needle.data.RandomFlipHorizontal(p = 0.5)`\n", 710 | "\n", 711 | "Flips the image horizontally, with probability `p`.\n", 712 | "\n", 713 | "##### Parameters\n", 714 | "- `p` (*float*) - The probability of flipping the input image.\n", 715 | "___\n", 716 | "\n", 717 | "#### RandomCrop\n", 718 | "`needle.data.RandomCrop(padding=3)`\n", 719 | "\n", 720 | "Padding is added to all sides of the image, and then the image is cropped back to it's original size at a random location. Returns an image the same size as the original image.\n", 721 | "\n", 722 | "##### Parameters\n", 723 | "- `padding` (*int*) - The padding on each border of the image." 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "!python3 -m pytest -v -k \"flip_horizontal\"\n", 733 | "!python3 -m pytest -v -k \"random_crop\"" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "metadata": {}, 740 | "outputs": [], 741 | "source": [ 742 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"flip_horizontal\"\n", 743 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"random_crop\"" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "metadata": {}, 749 | "source": [ 750 | "### Dataset\n", 751 | "\n", 752 | "Each `Dataset` subclass must implement three functions: `__init__`, `__len__`, and `__getitem__`. The `__init__` function initializes the images, labels, and transforms. The `__len__` function returns the number of samples in the dataset. The `__getitem__` function retrieves a sample from the dataset at a given index `idx`, calls the transform functions on the image (if applicable), converts the image and label to a numpy array (the data will be converted to Tensors elsewhere). The output of `__getitem__` and `__next__` should be NDArrays, and you should follow the shapes such that you're accessing an array of size (Datapoint Number, Feature Dim 1, Feature Dim 2, ...). \n", 753 | "\n", 754 | "Fill out these functions in the `MNISTDataset` class in `needle/data/datasets/mnist_dataset.py`. 
You can use your solution to `parse_mnist` from the previous homework for the `__init__` function.\n", 755 | "\n", 756 | "### MNISTDataset\n", 757 | "`needle.data.MNISTDataset(image_filesname, label_filesname, transforms)`\n", 758 | "\n", 759 | "##### Parameters\n", 760 | "- `image_filesname` - path of file containing images\n", 761 | "- `label_filesname` - path of file containing labels\n", 762 | "- `transforms` - an optional list of transforms to apply to data\n" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "!python3 -m pytest -v -k \"test_mnist_dataset\"" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": {}, 778 | "outputs": [], 779 | "source": [ 780 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"mnist_dataset\"" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "### Dataloader\n", 788 | "\n", 789 | "In `needle/data/data_basic.py`, the Dataloader class provides an interface for assembling mini-batches of examples suitable for training using SGD-based approaches, backed by a Dataset object. In order to build the typical Dataloader interface (allowing users to iterate over all the mini-batches in the dataset), you will need to implement the `__iter__()` and `__next__()` calls in the class: `__iter__()` is called at the start of iteration, while `__next__()` is called to grab the next mini-batch. Please note that subsequent calls to next will require you to return the following batches, so next is not a pure function.\n", 790 | "\n", 791 | "### Dataloader\n", 792 | "`needle.data.Dataloader(dataset: Dataset, batch_size: Optional[int] = 1, shuffle: bool = False)`\n", 793 | "\n", 794 | "Combines a dataset and a sampler, and provides an iterable over the given dataset. \n", 795 | "\n", 796 | "##### Parameters\n", 797 | "- `dataset` - `needle.data.Dataset` - a dataset \n", 798 | "- `batch_size` - `int` - what batch size to serve the data in \n", 799 | "- `shuffle` - `bool` - set to ``True`` to have the data reshuffle at every epoch, default ``False``.\n", 800 | "___ \n", 801 | "\n", 802 | "\n", 803 | "\n" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "!python3 -m pytest -v -k \"test_dataloader\"" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [ 821 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"dataloader\"" 822 | ] 823 | }, 824 | { 825 | "cell_type": "markdown", 826 | "metadata": {}, 827 | "source": [ 828 | "## Question 5\n", 829 | "\n", 830 | "Given you have now implemented all the necessary components for our neural network library, let's build and train an MLP ResNet. For this question, you will be working in `apps/mlp_resnet.py`. First, fill out the functions `ResidualBlock` and `MLPResNet` as described below:\n", 831 | "\n", 832 | "### ResidualBlock\n", 833 | "`ResidualBlock(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1)`\n", 834 | "\n", 835 | "Implements a residual block as follows:\n", 836 | "\n", 837 | "

\n", 838 | " \"Residual\n", 839 | "

\n", 840 | "\n", 841 | "**NOTE**: if the figure does not render, please see the figure in the `figures` directory.\n", 842 | "\n", 843 | "where the first linear layer has `in_features=dim` and `out_features=hidden_dim`, and the last linear layer has `out_features=dim`. Returns the block as type `nn.Module`. \n", 844 | "\n", 845 | "##### Parameters\n", 846 | "- `dim` (*int*) - input dim\n", 847 | "- `hidden_dim` (*int*) - hidden dim\n", 848 | "- `norm` (*nn.Module*) - normalization method\n", 849 | "- `drop_prob` (*float*) - dropout probability\n", 850 | "\n", 851 | "___\n", 852 | "\n", 853 | "### MLPResNet\n", 854 | "`MLPResNet(dim, hidden_dim=100, num_blocks=3, num_classes=10, norm=nn.BatchNorm1d, drop_prob=0.1)`\n", 855 | "\n", 856 | "Implements an MLP ResNet as follows:\n", 857 | "\n", 858 | "

\n", 859 | " \"MLP\n", 860 | "

\n", 861 | "\n", 862 | "where the first linear layer has `in_features=dim` and `out_features=hidden_dim`, and each ResidualBlock has `dim=hidden_dim` and `hidden_dim=hidden_dim//2`. Returns a network of type `nn.Module`.\n", 863 | "\n", 864 | "##### Parameters\n", 865 | "- `dim` (*int*) - input dim\n", 866 | "- `hidden_dim` (*int*) - hidden dim\n", 867 | "- `num_blocks` (*int*) - number of ResidualBlocks\n", 868 | "- `num_classes` (*int*) - number of classes\n", 869 | "- `norm` (*nn.Module*) - normalization method\n", 870 | "- `drop_prob` (*float*) - dropout probability (0.1)\n", 871 | "\n", 872 | "**Note**: Modules should be initialized to match the order of execution in the Resnet.\n", 873 | "___ \n", 874 | "\n", 875 | "Once you have the deep learning model architecture correct, let's train the network using our new neural network library components. Specifically, implement the functions `epoch` and `train_mnist`.\n", 876 | "\n", 877 | "### Epoch\n", 878 | "\n", 879 | "`epoch(dataloader, model, opt=None)`\n", 880 | "\n", 881 | "Executes one epoch of training or evaluation, iterating over the entire training dataset once (just like `nn_epoch` from previous homeworks). Returns the average error rate (as a *float*) and the average loss over all samples (as a *float*). Set the model to `training` mode at the beginning of the function if `opt` is given; set the model to `eval` if `opt` is not given (i.e. `None`). When setting the modes, use `.train()` and `.eval()` instead of modifying the training attribute.\n", 882 | "\n", 883 | "##### Parameters\n", 884 | "- `dataloader` (*`needle.data.DataLoader`*) - dataloader returning samples from the training dataset\n", 885 | "- `model` (*`needle.nn.Module`*) - neural network\n", 886 | "- `opt` (*`needle.optim.Optimizer`*) - optimizer instance, or `None`\n", 887 | "\n", 888 | "___\n", 889 | "\n", 890 | "### Train Mnist\n", 891 | "\n", 892 | "`train_mnist(batch_size=100, epochs=10, optimizer=ndl.optim.Adam, lr=0.001, weight_decay=0.001, hidden_dim=100, data_dir=\"data\")`\n", 893 | " \n", 894 | "Initializes a training dataloader (with `shuffle` set to `True`) and a test dataloader for MNIST data, and trains an `MLPResNet` using the given optimizer (if `opt` is not None) and the softmax loss for a given number of epochs. Returns a tuple of the training error, training loss, test error, test loss computed in the last epoch of training. 
If any parameters are not specified, use the default parameters.\n", 895 | "\n", 896 | "##### Parameters\n", 897 | "- `batch_size` (*int*) - batch size to use for train and test dataloader\n", 898 | "- `epochs` (*int*) - number of epochs to train for\n", 899 | "- `optimizer` (*`needle.optim.Optimizer` type*) - optimizer type to use\n", 900 | "- `lr` (*float*) - learning rate \n", 901 | "- `weight_decay` (*float*) - weight decay\n", 902 | "- `hidden_dim` (*int*) - hidden dim for `MLPResNet`\n", 903 | "- `data_dir` (*int*) - directory containing MNIST image/label files\n" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": null, 909 | "metadata": {}, 910 | "outputs": [], 911 | "source": [ 912 | "!python3 -m pytest -v -k \"test_mlp\"" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": null, 918 | "metadata": {}, 919 | "outputs": [], 920 | "source": [ 921 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"mlp_resnet\"" 922 | ] 923 | }, 924 | { 925 | "cell_type": "markdown", 926 | "metadata": {}, 927 | "source": [ 928 | "We encourage to experiment with the `mlp_resnet.py` training script.\n", 929 | "You can investigate the effect of using different initializers on the Linear layers,\n", 930 | "increasing the dropout probability,\n", 931 | "or adding transforms (via a list to the `transforms=` keyword argument of Dataset)\n", 932 | "such as random cropping." 933 | ] 934 | } 935 | ], 936 | "metadata": { 937 | "kernelspec": { 938 | "display_name": "Python 3.8.10 64-bit", 939 | "language": "python", 940 | "name": "python3" 941 | }, 942 | "language_info": { 943 | "codemirror_mode": { 944 | "name": "ipython", 945 | "version": 3 946 | }, 947 | "file_extension": ".py", 948 | "mimetype": "text/x-python", 949 | "name": "python", 950 | "nbconvert_exporter": "python", 951 | "pygments_lexer": "ipython3", 952 | "version": "3.8.10" 953 | }, 954 | "vscode": { 955 | "interpreter": { 956 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" 957 | } 958 | } 959 | }, 960 | "nbformat": 4, 961 | "nbformat_minor": 4 962 | } 963 | -------------------------------------------------------------------------------- /python/needle/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ops 2 | from .ops import * 3 | from .autograd import Tensor, cpu, all_devices 4 | 5 | from . import init 6 | from .init import ones, zeros, zeros_like, ones_like 7 | 8 | from . import data 9 | from . import nn 10 | from . import optim 11 | -------------------------------------------------------------------------------- /python/needle/autograd.py: -------------------------------------------------------------------------------- 1 | """Core data structures.""" 2 | import needle 3 | from .backend_numpy import Device, cpu, all_devices 4 | from typing import List, Optional, NamedTuple, Tuple, Union 5 | from collections import namedtuple 6 | import numpy 7 | 8 | from needle import init 9 | 10 | # needle version 11 | LAZY_MODE = False 12 | TENSOR_COUNTER = 0 13 | 14 | # NOTE: we will import numpy as the array_api 15 | # as the backend for our computations, this line will change in later homeworks 16 | 17 | import numpy as array_api 18 | NDArray = numpy.ndarray 19 | 20 | 21 | class Op: 22 | """Operator definition.""" 23 | 24 | def __call__(self, *args): 25 | raise NotImplementedError() 26 | 27 | def compute(self, *args: Tuple[NDArray]): 28 | """Calculate forward pass of operator. 
29 | 30 | Parameters 31 | ---------- 32 | input: np.ndarray 33 | A list of input arrays to the function 34 | 35 | Returns 36 | ------- 37 | output: nd.array 38 | Array output of the operation 39 | 40 | """ 41 | raise NotImplementedError() 42 | 43 | def gradient( 44 | self, out_grad: "Value", node: "Value" 45 | ) -> Union["Value", Tuple["Value"]]: 46 | """Compute partial adjoint for each input value for a given output adjoint. 47 | 48 | Parameters 49 | ---------- 50 | out_grad: Value 51 | The adjoint wrt to the output value. 52 | 53 | node: Value 54 | The value node of forward evaluation. 55 | 56 | Returns 57 | ------- 58 | input_grads: Value or Tuple[Value] 59 | A list containing partial gradient adjoints to be propagated to 60 | each of the input node. 61 | """ 62 | raise NotImplementedError() 63 | 64 | def gradient_as_tuple(self, out_grad: "Value", node: "Value") -> Tuple["Value"]: 65 | """Convenience method to always return a tuple from gradient call""" 66 | output = self.gradient(out_grad, node) 67 | if isinstance(output, tuple): 68 | return output 69 | elif isinstance(output, list): 70 | return tuple(output) 71 | else: 72 | return (output,) 73 | 74 | 75 | class TensorOp(Op): 76 | """Op class specialized to output tensors, will be alternate subclasses for other structures""" 77 | 78 | def __call__(self, *args): 79 | return Tensor.make_from_op(self, args) 80 | 81 | 82 | class TensorTupleOp(Op): 83 | """Op class specialized to output TensorTuple""" 84 | 85 | def __call__(self, *args): 86 | return TensorTuple.make_from_op(self, args) 87 | 88 | 89 | class Value: 90 | """A value in the computational graph.""" 91 | 92 | # trace of computational graph 93 | op: Optional[Op] 94 | inputs: List["Value"] 95 | # The following fields are cached fields for 96 | # dynamic computation 97 | cached_data: NDArray 98 | requires_grad: bool 99 | 100 | def realize_cached_data(self): 101 | """Run compute to realize the cached data""" 102 | # avoid recomputation 103 | if self.cached_data is not None: 104 | return self.cached_data 105 | # note: data implicitly calls realized cached data 106 | self.cached_data = self.op.compute( 107 | *[x.realize_cached_data() for x in self.inputs] 108 | ) 109 | return self.cached_data 110 | 111 | def is_leaf(self): 112 | return self.op is None 113 | 114 | def __del__(self): 115 | global TENSOR_COUNTER 116 | TENSOR_COUNTER -= 1 117 | 118 | def _init( 119 | self, 120 | op: Optional[Op], 121 | inputs: List["Tensor"], 122 | *, 123 | num_outputs: int = 1, 124 | cached_data: List[object] = None, 125 | requires_grad: Optional[bool] = None 126 | ): 127 | global TENSOR_COUNTER 128 | TENSOR_COUNTER += 1 129 | if requires_grad is None: 130 | requires_grad = any(x.requires_grad for x in inputs) 131 | self.op = op 132 | self.inputs = inputs 133 | self.num_outputs = num_outputs 134 | self.cached_data = cached_data 135 | self.requires_grad = requires_grad 136 | 137 | @classmethod 138 | def make_const(cls, data, *, requires_grad=False): 139 | value = cls.__new__(cls) 140 | value._init( 141 | None, 142 | [], 143 | cached_data=data, 144 | requires_grad=requires_grad, 145 | ) 146 | return value 147 | 148 | @classmethod 149 | def make_from_op(cls, op: Op, inputs: List["Value"]): 150 | value = cls.__new__(cls) 151 | value._init(op, inputs) 152 | 153 | if not LAZY_MODE: 154 | if not value.requires_grad: 155 | return value.detach() 156 | value.realize_cached_data() 157 | return value 158 | 159 | 160 | ### Not needed in HW1 161 | class TensorTuple(Value): 162 | """Represent a tuple of tensors. 
163 | 164 | To keep things simple, we do not support nested tuples. 165 | """ 166 | 167 | def __len__(self): 168 | cdata = self.realize_cached_data() 169 | return len(cdata) 170 | 171 | def __getitem__(self, index: int): 172 | return needle.ops.tuple_get_item(self, index) 173 | 174 | def tuple(self): 175 | return tuple([x for x in self]) 176 | 177 | def __repr__(self): 178 | return "needle.TensorTuple" + str(self.tuple()) 179 | 180 | def __str__(self): 181 | return self.__repr__() 182 | 183 | def __add__(self, other): 184 | assert isinstance(other, TensorTuple) 185 | assert len(self) == len(other) 186 | return needle.ops.make_tuple(*[self[i] + other[i] for i in range(len(self))]) 187 | 188 | def detach(self): 189 | """Create a new tensor that shares the data but detaches from the graph.""" 190 | return TensorTuple.make_const(self.realize_cached_data()) 191 | 192 | 193 | class Tensor(Value): 194 | grad: "Tensor" 195 | 196 | def __init__( 197 | self, 198 | array, 199 | *, 200 | device: Optional[Device] = None, 201 | dtype=None, 202 | requires_grad=True, 203 | **kwargs 204 | ): 205 | if isinstance(array, Tensor): 206 | if device is None: 207 | device = array.device 208 | if dtype is None: 209 | dtype = array.dtype 210 | if device == array.device and dtype == array.dtype: 211 | cached_data = array.realize_cached_data() 212 | else: 213 | # fall back, copy through numpy conversion 214 | cached_data = Tensor._array_from_numpy( 215 | array.numpy(), device=device, dtype=dtype 216 | ) 217 | else: 218 | device = device if device else cpu() 219 | cached_data = Tensor._array_from_numpy(array, device=device, dtype=dtype) 220 | 221 | self._init( 222 | None, 223 | [], 224 | cached_data=cached_data, 225 | requires_grad=requires_grad, 226 | ) 227 | 228 | @staticmethod 229 | def _array_from_numpy(numpy_array, device, dtype): 230 | if array_api is numpy: 231 | return numpy.array(numpy_array, dtype=dtype) 232 | return array_api.array(numpy_array, device=device, dtype=dtype) 233 | 234 | @staticmethod 235 | def make_from_op(op: Op, inputs: List["Value"]): 236 | tensor = Tensor.__new__(Tensor) 237 | tensor._init(op, inputs) 238 | if not LAZY_MODE: 239 | if not tensor.requires_grad: 240 | return tensor.detach() 241 | tensor.realize_cached_data() 242 | return tensor 243 | 244 | @staticmethod 245 | def make_const(data, requires_grad=False): 246 | tensor = Tensor.__new__(Tensor) 247 | tensor._init( 248 | None, 249 | [], 250 | cached_data=data 251 | if not isinstance(data, Tensor) 252 | else data.realize_cached_data(), 253 | requires_grad=requires_grad, 254 | ) 255 | return tensor 256 | 257 | @property 258 | def data(self): 259 | return self.detach() 260 | 261 | @data.setter 262 | def data(self, value): 263 | assert isinstance(value, Tensor) 264 | assert value.dtype == self.dtype, "%s %s" % ( 265 | value.dtype, 266 | self.dtype, 267 | ) 268 | self.cached_data = value.realize_cached_data() 269 | 270 | def detach(self): 271 | """Create a new tensor that shares the data but detaches from the graph.""" 272 | return Tensor.make_const(self.realize_cached_data()) 273 | 274 | @property 275 | def shape(self): 276 | return self.realize_cached_data().shape 277 | 278 | @property 279 | def dtype(self): 280 | return self.realize_cached_data().dtype 281 | 282 | @property 283 | def device(self): 284 | data = self.realize_cached_data() 285 | # numpy array always sits on cpu 286 | if array_api is numpy: 287 | return cpu() 288 | return data.device 289 | 290 | def backward(self, out_grad=None): 291 | out_grad = ( 292 | out_grad 293 | if 
out_grad 294 | else init.ones(*self.shape, dtype=self.dtype, device=self.device) 295 | ) 296 | compute_gradient_of_variables(self, out_grad) 297 | 298 | def __repr__(self): 299 | return "needle.Tensor(" + str(self.realize_cached_data()) + ")" 300 | 301 | def __str__(self): 302 | return self.realize_cached_data().__str__() 303 | 304 | def numpy(self): 305 | data = self.realize_cached_data() 306 | if array_api is numpy: 307 | return data 308 | return data.numpy() 309 | 310 | def __add__(self, other): 311 | if isinstance(other, Tensor): 312 | return needle.ops.EWiseAdd()(self, other) 313 | else: 314 | return needle.ops.AddScalar(other)(self) 315 | 316 | def __mul__(self, other): 317 | if isinstance(other, Tensor): 318 | return needle.ops.EWiseMul()(self, other) 319 | else: 320 | return needle.ops.MulScalar(other)(self) 321 | 322 | def __pow__(self, other): 323 | if isinstance(other, Tensor): 324 | return needle.ops.EWisePow()(self, other) 325 | else: 326 | return needle.ops.PowerScalar(other)(self) 327 | 328 | def __sub__(self, other): 329 | if isinstance(other, Tensor): 330 | return needle.ops.EWiseAdd()(self, needle.ops.Negate()(other)) 331 | else: 332 | return needle.ops.AddScalar(-other)(self) 333 | 334 | def __truediv__(self, other): 335 | if isinstance(other, Tensor): 336 | return needle.ops.EWiseDiv()(self, other) 337 | else: 338 | return needle.ops.DivScalar(other)(self) 339 | 340 | def __matmul__(self, other): 341 | return needle.ops.MatMul()(self, other) 342 | 343 | def matmul(self, other): 344 | return needle.ops.MatMul()(self, other) 345 | 346 | def sum(self, axes=None): 347 | return needle.ops.Summation(axes)(self) 348 | 349 | def broadcast_to(self, shape): 350 | return needle.ops.BroadcastTo(shape)(self) 351 | 352 | def reshape(self, shape): 353 | return needle.ops.Reshape(shape)(self) 354 | 355 | def __neg__(self): 356 | return needle.ops.Negate()(self) 357 | 358 | def transpose(self, axes=None): 359 | return needle.ops.Transpose(axes)(self) 360 | 361 | 362 | __radd__ = __add__ 363 | __rmul__ = __mul__ 364 | 365 | 366 | 367 | def compute_gradient_of_variables(output_tensor, out_grad): 368 | """Take gradient of output node with respect to each node in node_list. 369 | 370 | Store the computed result in the grad field of each Variable. 371 | """ 372 | # a map from node to a list of gradient contributions from each output node 373 | node_to_output_grads_list: Dict[Tensor, List[Tensor]] = {} 374 | # Special note on initializing gradient of 375 | # We are really taking a derivative of the scalar reduce_sum(output_node) 376 | # instead of the vector output_node. But this is the common case for loss function. 377 | node_to_output_grads_list[output_tensor] = [out_grad] 378 | 379 | # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt. 380 | reverse_topo_order = list(reversed(find_topo_sort([output_tensor]))) 381 | 382 | ### BEGIN YOUR SOLUTION 383 | raise NotImplementedError() 384 | ### END YOUR SOLUTION 385 | 386 | 387 | def find_topo_sort(node_list: List[Value]) -> List[Value]: 388 | """Given a list of nodes, return a topological sort list of nodes ending in them. 389 | 390 | A simple algorithm is to do a post-order DFS traversal on the given nodes, 391 | going backwards based on input edges. Since a node is added to the ordering 392 | after all its predecessors are traversed due to post-order DFS, we get a topological 393 | sort. 
394 | """ 395 | ### BEGIN YOUR SOLUTION 396 | raise NotImplementedError() 397 | ### END YOUR SOLUTION 398 | 399 | 400 | def topo_sort_dfs(node, visited, topo_order): 401 | """Post-order DFS""" 402 | ### BEGIN YOUR SOLUTION 403 | raise NotImplementedError() 404 | ### END YOUR SOLUTION 405 | 406 | 407 | ############################## 408 | ####### Helper Methods ####### 409 | ############################## 410 | 411 | 412 | def sum_node_list(node_list): 413 | """Custom sum function in order to avoid create redundant nodes in Python sum implementation.""" 414 | from operator import add 415 | from functools import reduce 416 | 417 | return reduce(add, node_list) 418 | -------------------------------------------------------------------------------- /python/needle/backend_numpy.py: -------------------------------------------------------------------------------- 1 | """This file defies specific implementations of devices when using numpy as NDArray backend. 2 | """ 3 | import numpy 4 | 5 | 6 | class Device: 7 | """Baseclass of all device""" 8 | 9 | 10 | class CPUDevice(Device): 11 | """Represents data that sits in CPU""" 12 | 13 | def __repr__(self): 14 | return "needle.cpu()" 15 | 16 | def __hash__(self): 17 | return self.__repr__().__hash__() 18 | 19 | def __eq__(self, other): 20 | return isinstance(other, CPUDevice) 21 | 22 | def enabled(self): 23 | return True 24 | 25 | def zeros(self, *shape, dtype="float32"): 26 | return numpy.zeros(shape, dtype=dtype) 27 | 28 | def ones(self, *shape, dtype="float32"): 29 | return numpy.ones(shape, dtype=dtype) 30 | 31 | def randn(self, *shape): 32 | # note: numpy doesn't support types within standard random routines, and 33 | # .astype("float32") does work if we're generating a singleton 34 | return numpy.random.randn(*shape) 35 | 36 | def rand(self, *shape): 37 | # note: numpy doesn't support types within standard random routines, and 38 | # .astype("float32") does work if we're generating a singleton 39 | return numpy.random.rand(*shape) 40 | 41 | def one_hot(self, n, i, dtype="float32"): 42 | return numpy.eye(n, dtype=dtype)[i] 43 | 44 | def empty(self, shape, dtype="float32"): 45 | return numpy.empty(shape, dtype=dtype) 46 | 47 | def full(self, shape, fill_value, dtype="float32"): 48 | return numpy.full(shape, fill_value, dtype=dtype) 49 | 50 | 51 | def cpu(): 52 | """Return cpu device""" 53 | return CPUDevice() 54 | 55 | 56 | def default_device(): 57 | return cpu() 58 | 59 | 60 | def all_devices(): 61 | """return a list of all available devices""" 62 | return [cpu()] 63 | -------------------------------------------------------------------------------- /python/needle/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_basic import * 2 | from .data_transforms import * 3 | from .datasets import * 4 | -------------------------------------------------------------------------------- /python/needle/data/data_basic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..autograd import Tensor 3 | 4 | from typing import Iterator, Optional, List, Sized, Union, Iterable, Any 5 | 6 | 7 | 8 | class Dataset: 9 | r"""An abstract class representing a `Dataset`. 10 | 11 | All subclasses should overwrite :meth:`__getitem__`, supporting fetching a 12 | data sample for a given key. Subclasses must also overwrite 13 | :meth:`__len__`, which is expected to return the size of the dataset. 
14 | """ 15 | 16 | def __init__(self, transforms: Optional[List] = None): 17 | self.transforms = transforms 18 | 19 | def __getitem__(self, index) -> object: 20 | raise NotImplementedError 21 | 22 | def __len__(self) -> int: 23 | raise NotImplementedError 24 | 25 | def apply_transforms(self, x): 26 | if self.transforms is not None: 27 | # apply the transforms 28 | for tform in self.transforms: 29 | x = tform(x) 30 | return x 31 | 32 | 33 | class DataLoader: 34 | r""" 35 | Data loader. Combines a dataset and a sampler, and provides an iterable over 36 | the given dataset. 37 | Args: 38 | dataset (Dataset): dataset from which to load the data. 39 | batch_size (int, optional): how many samples per batch to load 40 | (default: ``1``). 41 | shuffle (bool, optional): set to ``True`` to have the data reshuffled 42 | at every epoch (default: ``False``). 43 | """ 44 | dataset: Dataset 45 | batch_size: Optional[int] 46 | 47 | def __init__( 48 | self, 49 | dataset: Dataset, 50 | batch_size: Optional[int] = 1, 51 | shuffle: bool = False, 52 | ): 53 | 54 | self.dataset = dataset 55 | self.shuffle = shuffle 56 | self.batch_size = batch_size 57 | if not self.shuffle: 58 | self.ordering = np.array_split(np.arange(len(dataset)), 59 | range(batch_size, len(dataset), batch_size)) 60 | 61 | def __iter__(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | return self 66 | 67 | def __next__(self): 68 | ### BEGIN YOUR SOLUTION 69 | raise NotImplementedError() 70 | ### END YOUR SOLUTION 71 | 72 | -------------------------------------------------------------------------------- /python/needle/data/data_transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Transform: 4 | def __call__(self, x): 5 | raise NotImplementedError 6 | 7 | 8 | class RandomFlipHorizontal(Transform): 9 | def __init__(self, p = 0.5): 10 | self.p = p 11 | 12 | def __call__(self, img): 13 | """ 14 | Horizonally flip an image, specified as an H x W x C NDArray. 15 | Args: 16 | img: H x W x C NDArray of an image 17 | Returns: 18 | H x W x C ndarray corresponding to image flipped with probability self.p 19 | Note: use the provided code to provide randomness, for easier testing 20 | """ 21 | flip_img = np.random.rand() < self.p 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION 25 | 26 | 27 | class RandomCrop(Transform): 28 | def __init__(self, padding=3): 29 | self.padding = padding 30 | 31 | def __call__(self, img): 32 | """ Zero pad and then randomly crop an image. 
33 | Args: 34 | img: H x W x C NDArray of an image 35 | Return 36 | H x W x C NAArray of cliped image 37 | Note: generate the image shifted by shift_x, shift_y specified below 38 | """ 39 | shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding+1, size=2) 40 | ### BEGIN YOUR SOLUTION 41 | raise NotImplementedError() 42 | ### END YOUR SOLUTION 43 | -------------------------------------------------------------------------------- /python/needle/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .mnist_dataset import * 2 | from .ndarray_dataset import * 3 | -------------------------------------------------------------------------------- /python/needle/data/datasets/mnist_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from ..data_basic import Dataset 3 | import numpy as np 4 | 5 | class MNISTDataset(Dataset): 6 | def __init__( 7 | self, 8 | image_filename: str, 9 | label_filename: str, 10 | transforms: Optional[List] = None, 11 | ): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def __getitem__(self, index) -> object: 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | def __len__(self) -> int: 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/data/datasets/ndarray_dataset.py: -------------------------------------------------------------------------------- 1 | from ..data_basic import Dataset 2 | 3 | class NDArrayDataset(Dataset): 4 | def __init__(self, *arrays): 5 | self.arrays = arrays 6 | 7 | def __len__(self) -> int: 8 | return self.arrays[0].shape[0] 9 | 10 | def __getitem__(self, i) -> object: 11 | return tuple([a[i] for a in self.arrays]) -------------------------------------------------------------------------------- /python/needle/init/__init__.py: -------------------------------------------------------------------------------- 1 | from .init_basic import * 2 | 3 | from .init_initializers import * 4 | -------------------------------------------------------------------------------- /python/needle/init/init_basic.py: -------------------------------------------------------------------------------- 1 | import math 2 | import needle as ndl 3 | 4 | 5 | def rand(*shape, low=0.0, high=1.0, device=None, dtype="float32", requires_grad=False): 6 | """Generate random numbers uniform between low and high""" 7 | device = ndl.cpu() if device is None else device 8 | array = device.rand(*shape) * (high - low) + low 9 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 10 | 11 | 12 | def randn(*shape, mean=0.0, std=1.0, device=None, dtype="float32", requires_grad=False): 13 | """Generate random normal with specified mean and std deviation""" 14 | device = ndl.cpu() if device is None else device 15 | array = device.randn(*shape) * std + mean 16 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 17 | 18 | 19 | 20 | 21 | def constant(*shape, c=1.0, device=None, dtype="float32", requires_grad=False): 22 | """Generate constant Tensor""" 23 | device = ndl.cpu() if device is None else device 24 | array = device.ones(*shape, dtype=dtype) * c # note: can change dtype 25 | return ndl.Tensor(array, device=device, dtype=dtype, 
requires_grad=requires_grad) 26 | 27 | 28 | 29 | def ones(*shape, device=None, dtype="float32", requires_grad=False): 30 | """Generate all-ones Tensor""" 31 | return constant( 32 | *shape, c=1.0, device=device, dtype=dtype, requires_grad=requires_grad 33 | ) 34 | 35 | 36 | def zeros(*shape, device=None, dtype="float32", requires_grad=False): 37 | """Generate all-zeros Tensor""" 38 | return constant( 39 | *shape, c=0.0, device=device, dtype=dtype, requires_grad=requires_grad 40 | ) 41 | 42 | 43 | def randb(*shape, p=0.5, device=None, dtype="bool", requires_grad=False): 44 | """Generate binary random Tensor""" 45 | device = ndl.cpu() if device is None else device 46 | array = device.rand(*shape) <= p 47 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 48 | 49 | 50 | def one_hot(n, i, device=None, dtype="float32", requires_grad=False): 51 | """Generate one-hot encoding Tensor""" 52 | device = ndl.cpu() if device is None else device 53 | return ndl.Tensor( 54 | device.one_hot(n, i.numpy().astype("int32"), dtype=dtype), 55 | device=device, 56 | requires_grad=requires_grad, 57 | ) 58 | 59 | 60 | def zeros_like(array, *, device=None, requires_grad=False): 61 | device = device if device else array.device 62 | return zeros( 63 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad 64 | ) 65 | 66 | 67 | def ones_like(array, *, device=None, requires_grad=False): 68 | device = device if device else array.device 69 | return ones( 70 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad 71 | ) 72 | -------------------------------------------------------------------------------- /python/needle/init/init_initializers.py: -------------------------------------------------------------------------------- 1 | import math 2 | from .init_basic import * 3 | 4 | 5 | def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs): 6 | ### BEGIN YOUR SOLUTION 7 | raise NotImplementedError() 8 | ### END YOUR SOLUTION 9 | 10 | 11 | def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs): 17 | assert nonlinearity == "relu", "Only relu supported currently" 18 | ### BEGIN YOUR SOLUTION 19 | raise NotImplementedError() 20 | ### END YOUR SOLUTION 21 | 22 | 23 | 24 | def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs): 25 | assert nonlinearity == "relu", "Only relu supported currently" 26 | ### BEGIN YOUR SOLUTION 27 | raise NotImplementedError() 28 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .nn_basic import * 2 | -------------------------------------------------------------------------------- /python/needle/nn/nn_basic.py: -------------------------------------------------------------------------------- 1 | """The module. 
2 | """ 3 | from typing import List, Callable, Any 4 | from needle.autograd import Tensor 5 | from needle import ops 6 | import needle.init as init 7 | import numpy as np 8 | 9 | 10 | class Parameter(Tensor): 11 | """A special kind of tensor that represents parameters.""" 12 | 13 | 14 | def _unpack_params(value: object) -> List[Tensor]: 15 | if isinstance(value, Parameter): 16 | return [value] 17 | elif isinstance(value, Module): 18 | return value.parameters() 19 | elif isinstance(value, dict): 20 | params = [] 21 | for k, v in value.items(): 22 | params += _unpack_params(v) 23 | return params 24 | elif isinstance(value, (list, tuple)): 25 | params = [] 26 | for v in value: 27 | params += _unpack_params(v) 28 | return params 29 | else: 30 | return [] 31 | 32 | 33 | def _child_modules(value: object) -> List["Module"]: 34 | if isinstance(value, Module): 35 | modules = [value] 36 | modules.extend(_child_modules(value.__dict__)) 37 | return modules 38 | if isinstance(value, dict): 39 | modules = [] 40 | for k, v in value.items(): 41 | modules += _child_modules(v) 42 | return modules 43 | elif isinstance(value, (list, tuple)): 44 | modules = [] 45 | for v in value: 46 | modules += _child_modules(v) 47 | return modules 48 | else: 49 | return [] 50 | 51 | 52 | class Module: 53 | def __init__(self): 54 | self.training = True 55 | 56 | def parameters(self) -> List[Tensor]: 57 | """Return the list of parameters in the module.""" 58 | return _unpack_params(self.__dict__) 59 | 60 | def _children(self) -> List["Module"]: 61 | return _child_modules(self.__dict__) 62 | 63 | def eval(self): 64 | self.training = False 65 | for m in self._children(): 66 | m.training = False 67 | 68 | def train(self): 69 | self.training = True 70 | for m in self._children(): 71 | m.training = True 72 | 73 | def __call__(self, *args, **kwargs): 74 | return self.forward(*args, **kwargs) 75 | 76 | 77 | class Identity(Module): 78 | def forward(self, x): 79 | return x 80 | 81 | 82 | class Linear(Module): 83 | def __init__( 84 | self, in_features, out_features, bias=True, device=None, dtype="float32" 85 | ): 86 | super().__init__() 87 | self.in_features = in_features 88 | self.out_features = out_features 89 | 90 | ### BEGIN YOUR SOLUTION 91 | raise NotImplementedError() 92 | ### END YOUR SOLUTION 93 | 94 | def forward(self, X: Tensor) -> Tensor: 95 | ### BEGIN YOUR SOLUTION 96 | raise NotImplementedError() 97 | ### END YOUR SOLUTION 98 | 99 | 100 | class Flatten(Module): 101 | def forward(self, X): 102 | ### BEGIN YOUR SOLUTION 103 | raise NotImplementedError() 104 | ### END YOUR SOLUTION 105 | 106 | 107 | class ReLU(Module): 108 | def forward(self, x: Tensor) -> Tensor: 109 | ### BEGIN YOUR SOLUTION 110 | raise NotImplementedError() 111 | ### END YOUR SOLUTION 112 | 113 | class Sequential(Module): 114 | def __init__(self, *modules): 115 | super().__init__() 116 | self.modules = modules 117 | 118 | def forward(self, x: Tensor) -> Tensor: 119 | ### BEGIN YOUR SOLUTION 120 | raise NotImplementedError() 121 | ### END YOUR SOLUTION 122 | 123 | 124 | class SoftmaxLoss(Module): 125 | def forward(self, logits: Tensor, y: Tensor): 126 | ### BEGIN YOUR SOLUTION 127 | raise NotImplementedError() 128 | ### END YOUR SOLUTION 129 | 130 | 131 | class BatchNorm1d(Module): 132 | def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"): 133 | super().__init__() 134 | self.dim = dim 135 | self.eps = eps 136 | self.momentum = momentum 137 | ### BEGIN YOUR SOLUTION 138 | raise NotImplementedError() 139 | ### END YOUR SOLUTION 140 
| 141 | def forward(self, x: Tensor) -> Tensor: 142 | ### BEGIN YOUR SOLUTION 143 | raise NotImplementedError() 144 | ### END YOUR SOLUTION 145 | 146 | 147 | 148 | class LayerNorm1d(Module): 149 | def __init__(self, dim, eps=1e-5, device=None, dtype="float32"): 150 | super().__init__() 151 | self.dim = dim 152 | self.eps = eps 153 | ### BEGIN YOUR SOLUTION 154 | raise NotImplementedError() 155 | ### END YOUR SOLUTION 156 | 157 | def forward(self, x: Tensor) -> Tensor: 158 | ### BEGIN YOUR SOLUTION 159 | raise NotImplementedError() 160 | ### END YOUR SOLUTION 161 | 162 | 163 | class Dropout(Module): 164 | def __init__(self, p=0.5): 165 | super().__init__() 166 | self.p = p 167 | 168 | def forward(self, x: Tensor) -> Tensor: 169 | ### BEGIN YOUR SOLUTION 170 | raise NotImplementedError() 171 | ### END YOUR SOLUTION 172 | 173 | 174 | class Residual(Module): 175 | def __init__(self, fn: Module): 176 | super().__init__() 177 | self.fn = fn 178 | 179 | def forward(self, x: Tensor) -> Tensor: 180 | ### BEGIN YOUR SOLUTION 181 | raise NotImplementedError() 182 | ### END YOUR SOLUTION 183 | -------------------------------------------------------------------------------- /python/needle/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .ops_mathematic import * 2 | 3 | from .ops_logarithmic import * 4 | from .ops_tuple import * 5 | -------------------------------------------------------------------------------- /python/needle/ops/ops_logarithmic.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from ..autograd import NDArray 3 | from ..autograd import Op, Tensor, Value, TensorOp 4 | from ..autograd import TensorTuple, TensorTupleOp 5 | 6 | from .ops_mathematic import * 7 | 8 | import numpy as array_api 9 | 10 | class LogSoftmax(TensorOp): 11 | def compute(self, Z): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def gradient(self, out_grad, node): 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | 22 | def logsoftmax(a): 23 | return LogSoftmax()(a) 24 | 25 | 26 | class LogSumExp(TensorOp): 27 | def __init__(self, axes: Optional[tuple] = None): 28 | self.axes = axes 29 | 30 | def compute(self, Z): 31 | ### BEGIN YOUR SOLUTION 32 | raise NotImplementedError() 33 | ### END YOUR SOLUTION 34 | 35 | def gradient(self, out_grad, node): 36 | ### BEGIN YOUR SOLUTION 37 | raise NotImplementedError() 38 | ### END YOUR SOLUTION 39 | 40 | 41 | def logsumexp(a, axes=None): 42 | return LogSumExp(axes=axes)(a) 43 | 44 | -------------------------------------------------------------------------------- /python/needle/ops/ops_mathematic.py: -------------------------------------------------------------------------------- 1 | """Operator implementations.""" 2 | 3 | from numbers import Number 4 | from typing import Optional, List, Tuple, Union 5 | 6 | from ..autograd import NDArray 7 | from ..autograd import Op, Tensor, Value, TensorOp 8 | from ..autograd import TensorTuple, TensorTupleOp 9 | import numpy 10 | 11 | # NOTE: we will import numpy as the array_api 12 | # as the backend for our computations, this line will change in later homeworks 13 | 14 | BACKEND = "np" 15 | import numpy as array_api 16 | 17 | 18 | class EWiseAdd(TensorOp): 19 | def compute(self, a: NDArray, b: NDArray): 20 | return a + b 21 | 22 | def gradient(self, out_grad: Tensor, node: Tensor): 23 | return out_grad, out_grad 24 | 
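# (Editor's note) The LogSumExp op stubbed out in ops_logarithmic.py above is normally written
# with the max-subtraction trick, so that exp() cannot overflow on inputs like the 1e10 values
# exercised in tests/hw2/test_nn_and_optim.py. A hedged, numpy-only sketch (the helper name is
# hypothetical and not part of the needle API; `numpy` is the module imported at the top of
# this file):

def _stable_logsumexp_sketch(Z, axes=None):
    # subtract a broadcastable max before exponentiating, then add the reduced max back
    max_keep = numpy.max(Z, axis=axes, keepdims=True)
    max_reduced = numpy.max(Z, axis=axes)
    return numpy.log(numpy.sum(numpy.exp(Z - max_keep), axis=axes)) + max_reduced

# Its gradient is essentially softmax(Z - max), broadcast back to Z's shape and scaled by
# out_grad; the reshape/broadcast bookkeeping is left to the LogSumExp.gradient stub above.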
25 | 26 | def add(a, b): 27 | return EWiseAdd()(a, b) 28 | 29 | 30 | class AddScalar(TensorOp): 31 | def __init__(self, scalar): 32 | self.scalar = scalar 33 | 34 | def compute(self, a: NDArray): 35 | return a + self.scalar 36 | 37 | def gradient(self, out_grad: Tensor, node: Tensor): 38 | return out_grad 39 | 40 | 41 | def add_scalar(a, scalar): 42 | return AddScalar(scalar)(a) 43 | 44 | 45 | class EWiseMul(TensorOp): 46 | def compute(self, a: NDArray, b: NDArray): 47 | return a * b 48 | 49 | def gradient(self, out_grad: Tensor, node: Tensor): 50 | lhs, rhs = node.inputs 51 | return out_grad * rhs, out_grad * lhs 52 | 53 | 54 | def multiply(a, b): 55 | return EWiseMul()(a, b) 56 | 57 | 58 | class MulScalar(TensorOp): 59 | def __init__(self, scalar): 60 | self.scalar = scalar 61 | 62 | def compute(self, a: NDArray): 63 | return a * self.scalar 64 | 65 | def gradient(self, out_grad: Tensor, node: Tensor): 66 | return (out_grad * self.scalar,) 67 | 68 | 69 | def mul_scalar(a, scalar): 70 | return MulScalar(scalar)(a) 71 | 72 | 73 | class EWisePow(TensorOp): 74 | """Op to element-wise raise a tensor to a power.""" 75 | 76 | def compute(self, a: NDArray, b: NDArray) -> NDArray: 77 | ### BEGIN YOUR SOLUTION 78 | raise NotImplementedError() 79 | ### END YOUR SOLUTION 80 | 81 | def gradient(self, out_grad, node): 82 | ### BEGIN YOUR SOLUTION 83 | raise NotImplementedError() 84 | ### END YOUR SOLUTION 85 | 86 | 87 | def power(a, b): 88 | return EWisePow()(a, b) 89 | 90 | 91 | class PowerScalar(TensorOp): 92 | """Op raise a tensor to an (integer) power.""" 93 | 94 | def __init__(self, scalar: int): 95 | self.scalar = scalar 96 | 97 | def compute(self, a: NDArray) -> NDArray: 98 | ### BEGIN YOUR SOLUTION 99 | raise NotImplementedError() 100 | ### END YOUR SOLUTION 101 | 102 | def gradient(self, out_grad, node): 103 | ### BEGIN YOUR SOLUTION 104 | raise NotImplementedError() 105 | ### END YOUR SOLUTION 106 | 107 | 108 | def power_scalar(a, scalar): 109 | return PowerScalar(scalar)(a) 110 | 111 | 112 | class EWiseDiv(TensorOp): 113 | """Op to element-wise divide two nodes.""" 114 | 115 | def compute(self, a, b): 116 | ### BEGIN YOUR SOLUTION 117 | raise NotImplementedError() 118 | ### END YOUR SOLUTION 119 | 120 | def gradient(self, out_grad, node): 121 | ### BEGIN YOUR SOLUTION 122 | raise NotImplementedError() 123 | ### END YOUR SOLUTION 124 | 125 | 126 | def divide(a, b): 127 | return EWiseDiv()(a, b) 128 | 129 | 130 | class DivScalar(TensorOp): 131 | def __init__(self, scalar): 132 | self.scalar = scalar 133 | 134 | def compute(self, a): 135 | ### BEGIN YOUR SOLUTION 136 | raise NotImplementedError() 137 | ### END YOUR SOLUTION 138 | 139 | def gradient(self, out_grad, node): 140 | ### BEGIN YOUR SOLUTION 141 | raise NotImplementedError() 142 | ### END YOUR SOLUTION 143 | 144 | 145 | def divide_scalar(a, scalar): 146 | return DivScalar(scalar)(a) 147 | 148 | 149 | class Transpose(TensorOp): 150 | def __init__(self, axes: Optional[tuple] = None): 151 | self.axes = axes 152 | 153 | def compute(self, a): 154 | ### BEGIN YOUR SOLUTION 155 | raise NotImplementedError() 156 | ### END YOUR SOLUTION 157 | 158 | def gradient(self, out_grad, node): 159 | ### BEGIN YOUR SOLUTION 160 | raise NotImplementedError() 161 | ### END YOUR SOLUTION 162 | 163 | 164 | def transpose(a, axes=None): 165 | return Transpose(axes)(a) 166 | 167 | 168 | class Reshape(TensorOp): 169 | def __init__(self, shape): 170 | self.shape = shape 171 | 172 | def compute(self, a): 173 | ### BEGIN YOUR SOLUTION 174 | raise 
NotImplementedError() 175 | ### END YOUR SOLUTION 176 | 177 | def gradient(self, out_grad, node): 178 | ### BEGIN YOUR SOLUTION 179 | raise NotImplementedError() 180 | ### END YOUR SOLUTION 181 | 182 | 183 | def reshape(a, shape): 184 | return Reshape(shape)(a) 185 | 186 | 187 | class BroadcastTo(TensorOp): 188 | def __init__(self, shape): 189 | self.shape = shape 190 | 191 | def compute(self, a): 192 | ### BEGIN YOUR SOLUTION 193 | raise NotImplementedError() 194 | ### END YOUR SOLUTION 195 | 196 | def gradient(self, out_grad, node): 197 | ### BEGIN YOUR SOLUTION 198 | raise NotImplementedError() 199 | ### END YOUR SOLUTION 200 | 201 | 202 | def broadcast_to(a, shape): 203 | return BroadcastTo(shape)(a) 204 | 205 | 206 | class Summation(TensorOp): 207 | def __init__(self, axes: Optional[tuple] = None): 208 | self.axes = axes 209 | 210 | def compute(self, a): 211 | ### BEGIN YOUR SOLUTION 212 | raise NotImplementedError() 213 | ### END YOUR SOLUTION 214 | 215 | def gradient(self, out_grad, node): 216 | ### BEGIN YOUR SOLUTION 217 | raise NotImplementedError() 218 | ### END YOUR SOLUTION 219 | 220 | 221 | def summation(a, axes=None): 222 | return Summation(axes)(a) 223 | 224 | 225 | class MatMul(TensorOp): 226 | def compute(self, a, b): 227 | ### BEGIN YOUR SOLUTION 228 | raise NotImplementedError() 229 | ### END YOUR SOLUTION 230 | 231 | def gradient(self, out_grad, node): 232 | ### BEGIN YOUR SOLUTION 233 | raise NotImplementedError() 234 | ### END YOUR SOLUTION 235 | 236 | 237 | def matmul(a, b): 238 | return MatMul()(a, b) 239 | 240 | 241 | class Negate(TensorOp): 242 | def compute(self, a): 243 | ### BEGIN YOUR SOLUTION 244 | raise NotImplementedError() 245 | ### END YOUR SOLUTION 246 | 247 | def gradient(self, out_grad, node): 248 | ### BEGIN YOUR SOLUTION 249 | raise NotImplementedError() 250 | ### END YOUR SOLUTION 251 | 252 | 253 | def negate(a): 254 | return Negate()(a) 255 | 256 | 257 | class Log(TensorOp): 258 | def compute(self, a): 259 | ### BEGIN YOUR SOLUTION 260 | raise NotImplementedError() 261 | ### END YOUR SOLUTION 262 | 263 | def gradient(self, out_grad, node): 264 | ### BEGIN YOUR SOLUTION 265 | raise NotImplementedError() 266 | ### END YOUR SOLUTION 267 | 268 | 269 | def log(a): 270 | return Log()(a) 271 | 272 | 273 | class Exp(TensorOp): 274 | def compute(self, a): 275 | ### BEGIN YOUR SOLUTION 276 | raise NotImplementedError() 277 | ### END YOUR SOLUTION 278 | 279 | def gradient(self, out_grad, node): 280 | ### BEGIN YOUR SOLUTION 281 | raise NotImplementedError() 282 | ### END YOUR SOLUTION 283 | 284 | 285 | def exp(a): 286 | return Exp()(a) 287 | 288 | 289 | class ReLU(TensorOp): 290 | def compute(self, a): 291 | ### BEGIN YOUR SOLUTION 292 | raise NotImplementedError() 293 | ### END YOUR SOLUTION 294 | 295 | def gradient(self, out_grad, node): 296 | ### BEGIN YOUR SOLUTION 297 | raise NotImplementedError() 298 | ### END YOUR SOLUTION 299 | 300 | 301 | def relu(a): 302 | return ReLU()(a) 303 | 304 | 305 | -------------------------------------------------------------------------------- /python/needle/ops/ops_tuple.py: -------------------------------------------------------------------------------- 1 | from ..autograd import Op, Tensor, TensorTuple, Value, TensorOp, TensorTupleOp 2 | import needle.init as init 3 | 4 | class MakeTensorTuple(TensorTupleOp): 5 | def compute(self, *args) -> tuple: 6 | return tuple(args) 7 | 8 | def gradient(self, out_grad, node): 9 | assert isinstance(out_grad, TensorTuple) 10 | return tuple([out_grad[i] for i in range(len(out_grad))]) 
11 | 12 | 13 | def make_tuple(*args): 14 | return MakeTensorTuple()(*args) 15 | 16 | 17 | class TupleGetItem(TensorOp): 18 | def __init__(self, index): 19 | self.index = index 20 | 21 | def __call__(self, a: TensorTuple, fold_const=True) -> Value: 22 | assert isinstance(a, TensorTuple) 23 | # constant folding 24 | if fold_const and isinstance(a.op, MakeTensorTuple): 25 | return a.inputs[self.index] 26 | return Tensor.make_from_op(self, [a]) 27 | 28 | def compute(self, a): 29 | return a[self.index] 30 | 31 | def gradient(self, out_grad, node): 32 | index = self.index 33 | in_grad = [] 34 | for i, value in enumerate(node.inputs[0]): 35 | if i != index: 36 | in_grad.append(init.zeros_like(value)) 37 | else: 38 | in_grad.append(out_grad) 39 | return MakeTensorTuple()(*in_grad) 40 | 41 | 42 | def tuple_get_item(value, index): 43 | return TupleGetItem(index)(value) 44 | 45 | 46 | class FusedAddScalars(TensorTupleOp): 47 | def __init__(self, c0: float, c1: float): 48 | self.c0 = c0 49 | self.c1 = c1 50 | 51 | def compute(self, a): 52 | return a + self.c0, a + self.c1 53 | 54 | def gradient(self, out_grad, node): 55 | return out_grad[0] + out_grad[1] 56 | 57 | 58 | def fused_add_scalars(x, c0, c1): 59 | return FusedAddScalars(c0, c1)(x) 60 | -------------------------------------------------------------------------------- /python/needle/optim.py: -------------------------------------------------------------------------------- 1 | """Optimization module""" 2 | import needle as ndl 3 | import numpy as np 4 | 5 | 6 | class Optimizer: 7 | def __init__(self, params): 8 | self.params = params 9 | 10 | def step(self): 11 | raise NotImplementedError() 12 | 13 | def reset_grad(self): 14 | for p in self.params: 15 | p.grad = None 16 | 17 | 18 | class SGD(Optimizer): 19 | def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0): 20 | super().__init__(params) 21 | self.lr = lr 22 | self.momentum = momentum 23 | self.u = {} 24 | self.weight_decay = weight_decay 25 | 26 | def step(self): 27 | ### BEGIN YOUR SOLUTION 28 | raise NotImplementedError() 29 | ### END YOUR SOLUTION 30 | 31 | def clip_grad_norm(self, max_norm=0.25): 32 | """ 33 | Clips gradient norm of parameters. 
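        (Editor's note -- hedged sketch of the usual "global norm" recipe, which may or may not be
        exactly what the autograder expects: compute
        total_norm = sqrt(sum over all params p of ||p.grad||^2); if total_norm exceeds max_norm,
        scale every gradient by max_norm / (total_norm + eps) for a small eps such as 1e-6, and
        otherwise leave the gradients unchanged.)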
34 | """ 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION 38 | 39 | 40 | class Adam(Optimizer): 41 | def __init__( 42 | self, 43 | params, 44 | lr=0.01, 45 | beta1=0.9, 46 | beta2=0.999, 47 | eps=1e-8, 48 | weight_decay=0.0, 49 | ): 50 | super().__init__(params) 51 | self.lr = lr 52 | self.beta1 = beta1 53 | self.beta2 = beta2 54 | self.eps = eps 55 | self.weight_decay = weight_decay 56 | self.t = 0 57 | 58 | self.m = {} 59 | self.v = {} 60 | 61 | def step(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | -------------------------------------------------------------------------------- /tests/hw2/test_nn_and_optim.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("./python") 4 | import numpy as np 5 | import needle as ndl 6 | import needle.nn as nn 7 | 8 | sys.path.append("./apps") 9 | from mlp_resnet import * 10 | 11 | import mugrade 12 | 13 | """Deterministically generate a matrix""" 14 | 15 | 16 | def get_tensor(*shape, entropy=1): 17 | np.random.seed(np.prod(shape) * len(shape) * entropy) 18 | return ndl.Tensor(np.random.randint(0, 100, size=shape) / 20, dtype="float32") 19 | 20 | 21 | def get_int_tensor(*shape, low=0, high=10, entropy=1): 22 | np.random.seed(np.prod(shape) * len(shape) * entropy) 23 | return ndl.Tensor(np.random.randint(low, high, size=shape)) 24 | 25 | 26 | def check_prng(*shape): 27 | """We want to ensure that numpy generates random matrices on your machine/colab 28 | Such that our tests will make sense 29 | So this matrix should match our to full precision 30 | """ 31 | return get_tensor(*shape).cached_data 32 | 33 | 34 | def batchnorm_forward(*shape, affine=False): 35 | x = get_tensor(*shape) 36 | bn = ndl.nn.BatchNorm1d(shape[1]) 37 | if affine: 38 | bn.weight.data = get_tensor(shape[1], entropy=42) 39 | bn.bias.data = get_tensor(shape[1], entropy=1337) 40 | return bn(x).cached_data 41 | 42 | 43 | def batchnorm_backward(*shape, affine=False): 44 | x = get_tensor(*shape) 45 | bn = ndl.nn.BatchNorm1d(shape[1]) 46 | if affine: 47 | bn.weight.data = get_tensor(shape[1], entropy=42) 48 | bn.bias.data = get_tensor(shape[1], entropy=1337) 49 | y = (bn(x) ** 2).sum().backward() 50 | return x.grad.cached_data 51 | 52 | 53 | def flatten_forward(*shape): 54 | x = get_tensor(*shape) 55 | tform = ndl.nn.Flatten() 56 | return tform(x).cached_data 57 | 58 | 59 | def flatten_backward(*shape): 60 | x = get_tensor(*shape) 61 | tform = ndl.nn.Flatten() 62 | (tform(x) ** 2).sum().backward() 63 | return x.grad.cached_data 64 | 65 | 66 | def batchnorm_running_mean(*shape, iters=10): 67 | bn = ndl.nn.BatchNorm1d(shape[1]) 68 | for i in range(iters): 69 | x = get_tensor(*shape, entropy=i) 70 | y = bn(x) 71 | return bn.running_mean.cached_data 72 | 73 | 74 | def batchnorm_running_var(*shape, iters=10): 75 | bn = ndl.nn.BatchNorm1d(shape[1]) 76 | for i in range(iters): 77 | x = get_tensor(*shape, entropy=i) 78 | y = bn(x) 79 | return bn.running_var.cached_data 80 | 81 | 82 | def batchnorm_running_grad(*shape, iters=10): 83 | bn = ndl.nn.BatchNorm1d(shape[1]) 84 | for i in range(iters): 85 | x = get_tensor(*shape, entropy=i) 86 | y = bn(x) 87 | bn.eval() 88 | (y**2).sum().backward() 89 | return x.grad.cached_data 90 | 91 | 92 | def relu_forward(*shape): 93 | f = ndl.nn.ReLU() 94 | x = get_tensor(*shape) 95 | return f(x).cached_data 96 | 97 | 98 | def relu_backward(*shape): 99 | f = ndl.nn.ReLU() 100 | x = get_tensor(*shape) 101 
| (f(x) ** 2).sum().backward() 102 | return x.grad.cached_data 103 | 104 | 105 | def layernorm_forward(shape, dim): 106 | f = ndl.nn.LayerNorm1d(dim) 107 | x = get_tensor(*shape) 108 | return f(x).cached_data 109 | 110 | 111 | def layernorm_backward(shape, dims): 112 | f = ndl.nn.LayerNorm1d(dims) 113 | x = get_tensor(*shape) 114 | (f(x) ** 4).sum().backward() 115 | return x.grad.cached_data 116 | 117 | def logsoftmax_forward(shape, mult=1.0): 118 | x = get_tensor(*shape) * mult 119 | return ndl.ops.logsoftmax(x).cached_data 120 | 121 | def logsoftmax_backward(shape, mult=1.0): 122 | x = get_tensor(*shape) 123 | y = ndl.ops.logsoftmax(x * mult) 124 | z = (y**2).sum() 125 | z.backward() 126 | return x.grad.cached_data 127 | 128 | def softmax_loss_forward(rows, classes): 129 | x = get_tensor(rows, classes) 130 | y = get_int_tensor(rows, low=0, high=classes) 131 | f = ndl.nn.SoftmaxLoss() 132 | return np.array(f(x, y).cached_data) 133 | 134 | 135 | def softmax_loss_backward(rows, classes): 136 | x = get_tensor(rows, classes) 137 | y = get_int_tensor(rows, low=0, high=classes) 138 | f = ndl.nn.SoftmaxLoss() 139 | loss = f(x, y) 140 | loss.backward() 141 | return x.grad.cached_data 142 | 143 | 144 | def linear_forward(lhs_shape, rhs_shape): 145 | np.random.seed(199) 146 | f = ndl.nn.Linear(*lhs_shape) 147 | f.bias.data = get_tensor(lhs_shape[-1]) 148 | x = get_tensor(*rhs_shape) 149 | return f(x).cached_data 150 | 151 | 152 | def linear_backward(lhs_shape, rhs_shape): 153 | np.random.seed(199) 154 | f = ndl.nn.Linear(*lhs_shape) 155 | f.bias.data = get_tensor(lhs_shape[-1]) 156 | x = get_tensor(*rhs_shape) 157 | (f(x) ** 2).sum().backward() 158 | return x.grad.cached_data 159 | 160 | 161 | def sequential_forward(batches=3): 162 | np.random.seed(42) 163 | f = nn.Sequential(nn.Linear(5, 8), nn.ReLU(), nn.Linear(8, 5)) 164 | x = get_tensor(batches, 5) 165 | return f(x).cached_data 166 | 167 | 168 | def sequential_backward(batches=3): 169 | np.random.seed(42) 170 | f = nn.Sequential(nn.Linear(5, 8), nn.ReLU(), nn.Linear(8, 5)) 171 | x = get_tensor(batches, 5) 172 | f(x).sum().backward() 173 | return x.grad.cached_data 174 | 175 | 176 | def residual_forward(shape=(5, 5)): 177 | np.random.seed(42) 178 | f = nn.Residual( 179 | nn.Sequential(nn.Linear(*shape), nn.ReLU(), nn.Linear(*shape[::-1])) 180 | ) 181 | x = get_tensor(*shape[::-1]) 182 | return f(x).cached_data 183 | 184 | 185 | def residual_backward(shape=(5, 5)): 186 | np.random.seed(42) 187 | f = nn.Residual( 188 | nn.Sequential(nn.Linear(*shape), nn.ReLU(), nn.Linear(*shape[::-1])) 189 | ) 190 | x = get_tensor(*shape[::-1]) 191 | f(x).sum().backward() 192 | return x.grad.cached_data 193 | 194 | 195 | def learn_model_1d(feature_size, nclasses, _model, optimizer, epochs=1, **kwargs): 196 | np.random.seed(42) 197 | model = _model([]) 198 | X = get_tensor(1024, feature_size).cached_data 199 | y = get_int_tensor(1024, low=0, high=nclasses).cached_data.astype(np.uint8) 200 | m = X.shape[0] 201 | batch = 32 202 | 203 | loss_func = nn.SoftmaxLoss() 204 | opt = optimizer(model.parameters(), **kwargs) 205 | 206 | for _ in range(epochs): 207 | for i, (X0, y0) in enumerate( 208 | zip(np.array_split(X, m // batch), np.array_split(y, m // batch)) 209 | ): 210 | opt.reset_grad() 211 | X0, y0 = ndl.Tensor(X0, dtype="float32"), ndl.Tensor(y0) 212 | out = model(X0) 213 | loss = loss_func(out, y0) 214 | loss.backward() 215 | # Opt should not change gradients. 
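            # (Editor's note) i.e. Optimizer.step() should build new tensors for the update --
            # roughly the pattern  p.data = p.data - lr * <update computed from p.grad>  --
            # rather than mutating p.grad (or its cached_data) in place; the check below compares
            # the gradient before and after step() to enforce exactly that. (Hedged sketch of the
            # pattern only, not the required implementation.)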
216 | grad_before = model.parameters()[0].grad.detach().cached_data 217 | opt.step() 218 | grad_after = model.parameters()[0].grad.detach().cached_data 219 | np.testing.assert_allclose( 220 | grad_before, 221 | grad_after, 222 | rtol=1e-5, 223 | atol=1e-5, 224 | err_msg="Optim should not modify gradients in place", 225 | ) 226 | 227 | return np.array(loss.cached_data) 228 | 229 | 230 | def learn_model_1d_eval(feature_size, nclasses, _model, optimizer, epochs=1, **kwargs): 231 | np.random.seed(42) 232 | model = _model([]) 233 | X = get_tensor(1024, feature_size).cached_data 234 | y = get_int_tensor(1024, low=0, high=nclasses).cached_data.astype(np.uint8) 235 | m = X.shape[0] 236 | batch = 32 237 | 238 | loss_func = nn.SoftmaxLoss() 239 | opt = optimizer(model.parameters(), **kwargs) 240 | 241 | for i, (X0, y0) in enumerate( 242 | zip(np.array_split(X, m // batch), np.array_split(y, m // batch)) 243 | ): 244 | opt.reset_grad() 245 | X0, y0 = ndl.Tensor(X0, dtype="float32"), ndl.Tensor(y0) 246 | out = model(X0) 247 | loss = loss_func(out, y0) 248 | loss.backward() 249 | opt.step() 250 | 251 | X_test = ndl.Tensor(get_tensor(batch, feature_size).cached_data) 252 | y_test = ndl.Tensor( 253 | get_int_tensor(batch, low=0, high=nclasses).cached_data.astype(np.uint8) 254 | ) 255 | 256 | model.eval() 257 | 258 | return np.array(loss_func(model(X_test), y_test).cached_data) 259 | 260 | 261 | def init_a_tensor_of_shape(shape, init_fn): 262 | x = get_tensor(*shape) 263 | np.random.seed(42) 264 | init_fn(x) 265 | return x.cached_data 266 | 267 | 268 | def global_tensor_count(): 269 | return np.array(ndl.autograd.TENSOR_COUNTER) 270 | 271 | 272 | def nn_linear_weight_init(): 273 | np.random.seed(1337) 274 | f = ndl.nn.Linear(7, 4) 275 | f.weight.cached_data 276 | return f.weight.cached_data 277 | 278 | 279 | def nn_linear_bias_init(): 280 | np.random.seed(1337) 281 | f = ndl.nn.Linear(7, 4) 282 | return f.bias.cached_data 283 | 284 | 285 | class UselessModule(ndl.nn.Module): 286 | def __init__(self): 287 | super().__init__() 288 | self.stuff = { 289 | "layer1": nn.Linear(4, 4), 290 | "layer2": [nn.Dropout(0.1), nn.Sequential(nn.Linear(4, 4))], 291 | } 292 | 293 | def forward(self, x): 294 | raise NotImplementedError() 295 | 296 | 297 | def check_training_mode(): 298 | model = nn.Sequential( 299 | nn.BatchNorm1d(4), 300 | nn.Sequential( 301 | nn.LayerNorm1d(4), 302 | nn.Linear(4, 4), 303 | nn.Dropout(0.1), 304 | ), 305 | nn.Linear(4, 4), 306 | UselessModule(), 307 | ) 308 | 309 | model_refs = [ 310 | model.modules[0], 311 | model.modules[1].modules[0], 312 | model.modules[1].modules[1], 313 | model.modules[1].modules[2], 314 | model.modules[2], 315 | model.modules[3], 316 | model.modules[3].stuff["layer1"], 317 | model.modules[3].stuff["layer2"][0], 318 | model.modules[3].stuff["layer2"][1].modules[0], 319 | ] 320 | 321 | eval_mode = [1 if not x.training else 0 for x in model_refs] 322 | model.eval() 323 | eval_mode.extend([1 if not x.training else 0 for x in model_refs]) 324 | model.train() 325 | eval_mode.extend([1 if not x.training else 0 for x in model_refs]) 326 | 327 | return np.array(eval_mode) 328 | 329 | 330 | def power_scalar_forward(shape, power=2): 331 | x = get_tensor(*shape) 332 | return (x**power).cached_data 333 | 334 | 335 | def power_scalar_backward(shape, power=2): 336 | x = get_tensor(*shape) 337 | y = (x**power).sum() 338 | y.backward() 339 | return x.grad.cached_data 340 | 341 | 342 | def logsumexp_forward(shape, axes): 343 | x = get_tensor(*shape) 344 | return (ndl.ops.logsumexp(x, 
axes=axes)).cached_data 345 | 346 | 347 | def logsumexp_backward(shape, axes): 348 | x = get_tensor(*shape) 349 | y = (ndl.ops.logsumexp(x, axes=axes) ** 2).sum() 350 | y.backward() 351 | return x.grad.cached_data 352 | 353 | 354 | def dropout_forward(shape, prob=0.5): 355 | np.random.seed(3) 356 | x = get_tensor(*shape) 357 | f = nn.Dropout(prob) 358 | return f(x).cached_data 359 | 360 | 361 | def dropout_backward(shape, prob=0.5): 362 | np.random.seed(3) 363 | x = get_tensor(*shape) 364 | f = nn.Dropout(prob) 365 | y = f(x).sum() 366 | y.backward() 367 | return x.grad.cached_data 368 | 369 | 370 | def num_params(model): 371 | return np.sum([np.prod(x.shape) for x in model.parameters()]) 372 | 373 | 374 | def residual_block_num_params(dim, hidden_dim, norm): 375 | model = ResidualBlock(dim, hidden_dim, norm) 376 | return np.array(num_params(model)) 377 | 378 | 379 | def residual_block_forward(dim, hidden_dim, norm, drop_prob): 380 | np.random.seed(2) 381 | input_tensor = ndl.Tensor(np.random.randn(1, dim)) 382 | output_tensor = ResidualBlock(dim, hidden_dim, norm, drop_prob)(input_tensor) 383 | return output_tensor.numpy() 384 | 385 | 386 | def mlp_resnet_num_params(dim, hidden_dim, num_blocks, num_classes, norm): 387 | model = MLPResNet(dim, hidden_dim, num_blocks, num_classes, norm) 388 | return np.array(num_params(model)) 389 | 390 | 391 | def mlp_resnet_forward(dim, hidden_dim, num_blocks, num_classes, norm, drop_prob): 392 | np.random.seed(4) 393 | input_tensor = ndl.Tensor(np.random.randn(2, dim), dtype=np.float32) 394 | output_tensor = MLPResNet( 395 | dim, hidden_dim, num_blocks, num_classes, norm, drop_prob 396 | )(input_tensor) 397 | return output_tensor.numpy() 398 | 399 | 400 | def train_epoch_1(hidden_dim, batch_size, optimizer, **kwargs): 401 | np.random.seed(1) 402 | train_dataset = ndl.data.MNISTDataset( 403 | "./data/train-images-idx3-ubyte.gz", "./data/train-labels-idx1-ubyte.gz" 404 | ) 405 | train_dataloader = ndl.data.DataLoader(dataset=train_dataset, batch_size=batch_size) 406 | 407 | model = MLPResNet(784, hidden_dim) 408 | opt = optimizer(model.parameters(), **kwargs) 409 | model.eval() 410 | return np.array(epoch(train_dataloader, model, opt)) 411 | 412 | 413 | def eval_epoch_1(hidden_dim, batch_size): 414 | np.random.seed(1) 415 | test_dataset = ndl.data.MNISTDataset( 416 | "./data/t10k-images-idx3-ubyte.gz", "./data/t10k-labels-idx1-ubyte.gz" 417 | ) 418 | test_dataloader = ndl.data.DataLoader( 419 | dataset=test_dataset, batch_size=batch_size, shuffle=False 420 | ) 421 | 422 | model = MLPResNet(784, hidden_dim) 423 | model.train() 424 | return np.array(epoch(test_dataloader, model)) 425 | 426 | 427 | def train_mnist_1(batch_size, epochs, optimizer, lr, weight_decay, hidden_dim): 428 | np.random.seed(1) 429 | out = train_mnist( 430 | batch_size, epochs, optimizer, lr, weight_decay, hidden_dim, data_dir="./data" 431 | ) 432 | return np.array(out) 433 | 434 | 435 | def test_check_prng_contact_us_if_this_fails_1(): 436 | np.testing.assert_allclose( 437 | check_prng(3, 3), 438 | np.array( 439 | [[2.1, 0.95, 3.45], [3.1, 2.45, 2.3], [3.3, 0.4, 1.2]], dtype=np.float32 440 | ), 441 | rtol=1e-08, 442 | atol=1e-08, 443 | ) 444 | 445 | 446 | def test_op_power_scalar_forward_1(): 447 | np.testing.assert_allclose( 448 | power_scalar_forward((2, 2), power=2), 449 | np.array([[11.222499, 17.639997], [0.0625, 20.25]], dtype=np.float32), 450 | rtol=1e-5, 451 | atol=1e-5, 452 | ) 453 | 454 | 455 | def test_op_power_scalar_forward_2(): 456 | np.testing.assert_allclose( 457 | 
power_scalar_forward((2, 2), power=-1.5), 458 | np.array([[0.16309206, 0.11617859], [8.0, 0.10475656]], dtype=np.float32), 459 | rtol=1e-5, 460 | atol=1e-5, 461 | ) 462 | 463 | 464 | def test_op_power_scalar_backward_1(): 465 | np.testing.assert_allclose( 466 | power_scalar_backward((2, 2), power=2), 467 | np.array([[6.7, 8.4], [0.5, 9.0]], dtype=np.float32), 468 | rtol=1e-5, 469 | atol=1e-5, 470 | ) 471 | 472 | 473 | def test_op_logsoftmax_forward_1(): 474 | np.testing.assert_allclose(logsoftmax_forward((3, 3)), 475 | np.array([[-1.6436583 , -2.7936583 , -0.29365814], 476 | [-0.6787312 , -1.3287311 , -1.4787312 ], 477 | [-0.16337626, -3.0633762 , -2.2633762 ]], dtype=np.float32), rtol=1e-5, atol=1e-5) 478 | 479 | def test_op_logsoftmax_stable_forward_1(): 480 | np.testing.assert_allclose(logsoftmax_forward((3, 3), mult=1e5), 481 | np.array([[-135000.02, -250000. , 0. ], 482 | [ 0. , -65000. , -80000. ], 483 | [ 0. , -290000. , -210000. ]], dtype=np.float32), rtol=1e-5, atol=1e-5) 484 | 485 | def test_op_logsoftmax_backward_1(): 486 | np.testing.assert_allclose(logsoftmax_backward((3, 3)), 487 | np.array([[-1.4585897 , -5.008274 , 6.4668627 ], 488 | [ 2.1793516 , -0.81108296, -1.3682691 ], 489 | [ 8.998467 , -5.613649 , -3.3848193 ]], dtype=np.float32), rtol=1e-5, atol=1e-5) 490 | 491 | def submit_op_logsoftmax(): 492 | mugrade.submit(logsoftmax_forward((3, 4))) 493 | mugrade.submit(logsoftmax_forward((3, 5), mult=1e5)) 494 | mugrade.submit(logsoftmax_forward((3, 6), mult=1e5)) 495 | mugrade.submit(logsoftmax_backward((1, 3))) 496 | mugrade.submit(logsoftmax_backward((3, 6), mult=1e5)) 497 | 498 | 499 | def test_op_logsumexp_forward_1(): 500 | np.testing.assert_allclose( 501 | logsumexp_forward((3, 3, 3), (1, 2)), 502 | np.array([5.366029, 4.9753823, 6.208126], dtype=np.float32), 503 | rtol=1e-5, 504 | atol=1e-5, 505 | ) 506 | 507 | 508 | def test_op_logsumexp_forward_2(): 509 | np.testing.assert_allclose( 510 | logsumexp_forward((3, 3, 3), None), 511 | np.array([6.7517853], dtype=np.float32), 512 | rtol=1e-5, 513 | atol=1e-5, 514 | ) 515 | 516 | 517 | def test_op_logsumexp_forward_3(): 518 | np.testing.assert_allclose( 519 | logsumexp_forward((1, 2, 3, 4), (0, 2)), 520 | np.array( 521 | [ 522 | [5.276974, 5.047317, 3.778802, 5.0103745], 523 | [5.087831, 4.391712, 5.025037, 2.0214698], 524 | ], 525 | dtype=np.float32, 526 | ), 527 | rtol=1e-5, 528 | atol=1e-5, 529 | ) 530 | 531 | 532 | def test_op_logsumexp_forward_4(): 533 | np.testing.assert_allclose( 534 | logsumexp_forward((3, 10), (1,)), 535 | np.array([5.705309, 5.976375, 5.696459], dtype=np.float32), 536 | rtol=1e-5, 537 | atol=1e-5, 538 | ) 539 | 540 | 541 | def test_op_logsumexp_forward_5(): 542 | test_data = ndl.ops.logsumexp( 543 | ndl.Tensor(np.array([[1e10, 1e9, 1e8, -10], [1e-10, 1e9, 1e8, -10]])), (0,) 544 | ).numpy() 545 | np.testing.assert_allclose( 546 | test_data, 547 | np.array([1.00000000e10, 1.00000000e09, 1.00000001e08, -9.30685282e00]), 548 | rtol=1e-5, 549 | atol=1e-5, 550 | ) 551 | 552 | 553 | def test_op_logsumexp_backward_1(): 554 | np.testing.assert_allclose( 555 | logsumexp_backward((3, 1), (1,)), 556 | np.array([[1.0], [7.3], [9.9]], dtype=np.float32), 557 | rtol=1e-5, 558 | atol=1e-5, 559 | ) 560 | 561 | 562 | def test_op_logsumexp_backward_2(): 563 | np.testing.assert_allclose( 564 | logsumexp_backward((3, 3, 3), (1, 2)), 565 | np.array( 566 | [ 567 | [ 568 | [1.4293308, 1.2933122, 0.82465225], 569 | [0.50017685, 2.1323113, 2.1323113], 570 | [1.4293308, 0.58112264, 0.40951014], 571 | ], 572 | [ 573 | 
[0.3578173, 0.07983983, 4.359107], 574 | [1.1300558, 0.561169, 0.1132981], 575 | [0.9252113, 0.65198547, 1.7722803], 576 | ], 577 | [ 578 | [0.2755132, 2.365242, 2.888913], 579 | [0.05291228, 1.1745441, 0.02627547], 580 | [2.748018, 0.13681579, 2.748018], 581 | ], 582 | ], 583 | dtype=np.float32, 584 | ), 585 | rtol=1e-5, 586 | atol=1e-5, 587 | ) 588 | 589 | 590 | def test_op_logsumexp_backward_3(): 591 | np.testing.assert_allclose( 592 | logsumexp_backward((3, 3, 3), (0, 2)), 593 | np.array( 594 | [ 595 | [ 596 | [0.92824626, 0.839912, 0.5355515], 597 | [0.59857905, 2.551811, 2.551811], 598 | [1.0213376, 0.41524494, 0.29261813], 599 | ], 600 | [ 601 | [0.16957533, 0.03783737, 2.0658503], 602 | [0.98689, 0.49007502, 0.09894446], 603 | [0.48244575, 0.3399738, 0.9241446], 604 | ], 605 | [ 606 | [0.358991, 3.081887, 3.764224], 607 | [0.12704718, 2.820187, 0.06308978], 608 | [3.9397335, 0.19614778, 3.9397335], 609 | ], 610 | ], 611 | dtype=np.float32, 612 | ), 613 | rtol=1e-5, 614 | atol=1e-5, 615 | ) 616 | 617 | 618 | def test_op_logsumexp_backward_5(): 619 | grad_compare = ndl.Tensor(np.array([[1e10, 1e9, 1e8, -10], [1e-10, 1e9, 1e8, -10]])) 620 | test_data = (ndl.ops.logsumexp(grad_compare, (0,)) ** 2).sum().backward() 621 | np.testing.assert_allclose( 622 | grad_compare.grad.cached_data, 623 | np.array( 624 | [ 625 | [2.00000000e10, 9.99999999e08, 1.00000001e08, -9.30685282e00], 626 | [0.00000000e00, 9.99999999e08, 1.00000001e08, -9.30685282e00], 627 | ] 628 | ), 629 | rtol=1e-5, 630 | atol=1e-5, 631 | ) 632 | 633 | 634 | def submit_op_logsumexp(): 635 | mugrade.submit(logsumexp_forward((2, 2, 2), None)) 636 | mugrade.submit(logsumexp_forward((1, 2, 3), (0,))) 637 | mugrade.submit(logsumexp_forward((2, 3, 3), (1, 2))) 638 | mugrade.submit(logsumexp_forward((1, 2, 2, 2, 2), (1, 2, 3, 4))) 639 | mugrade.submit(logsumexp_forward((1, 2, 2, 2, 2), (0, 1, 3))) 640 | mugrade.submit(logsumexp_backward((2, 2, 2), None)) 641 | mugrade.submit(logsumexp_backward((1, 2, 3), (0,))) 642 | mugrade.submit(logsumexp_backward((2, 3, 3), (1, 2))) 643 | mugrade.submit(logsumexp_backward((1, 2, 2, 2, 2), (1, 2, 3, 4))) 644 | mugrade.submit(logsumexp_backward((1, 2, 2, 2, 2), (0, 1, 3))) 645 | 646 | 647 | def test_op_logsumexp_backward_4(): 648 | np.testing.assert_allclose( 649 | logsumexp_backward((1, 2, 3, 4), None), 650 | np.array( 651 | [ 652 | [ 653 | [ 654 | [0.96463485, 1.30212122, 0.09671321, 1.84779774], 655 | [1.84779774, 0.39219132, 0.21523925, 0.30543892], 656 | [0.01952606, 0.55654611, 0.32109909, 0.01598658], 657 | ], 658 | [ 659 | [1.30212122, 0.83026929, 0.30543892, 0.01680623], 660 | [0.29054249, 0.07532032, 1.84779774, 0.05307731], 661 | [0.75125862, 0.26289377, 0.04802637, 0.03932065], 662 | ], 663 | ] 664 | ], 665 | dtype=np.float32, 666 | ), 667 | rtol=1e-5, 668 | atol=1e-5, 669 | ) 670 | 671 | 672 | def test_init_kaiming_uniform(): 673 | np.random.seed(42) 674 | np.testing.assert_allclose( 675 | ndl.init.kaiming_uniform(3, 5).numpy(), 676 | np.array( 677 | [ 678 | [-0.35485414, 1.2748126, 0.65617794, 0.27904832, -0.9729262], 679 | [-0.97299445, -1.2499284, 1.0357026, 0.28599644, 0.58851814], 680 | [-1.3559918, 1.3291057, 0.9402898, -0.81362784, -0.8999349], 681 | ], 682 | dtype=np.float32, 683 | ), 684 | rtol=1e-4, 685 | atol=1e-4, 686 | ) 687 | 688 | 689 | def test_init_kaiming_normal(): 690 | np.random.seed(42) 691 | np.testing.assert_allclose( 692 | ndl.init.kaiming_normal(3, 5).numpy(), 693 | np.array( 694 | [ 695 | [0.4055654, -0.11289233, 0.5288355, 1.2435486, -0.19118543], 696 | 
[-0.19117202, 1.2894219, 0.62660784, -0.38332424, 0.4429984], 697 | [-0.37837896, -0.38026676, 0.19756137, -1.5621868, -1.4083896], 698 | ], 699 | dtype=np.float32, 700 | ), 701 | rtol=1e-4, 702 | atol=1e-4, 703 | ) 704 | 705 | 706 | def test_init_xavier_uniform(): 707 | np.random.seed(42) 708 | np.testing.assert_allclose( 709 | ndl.init.xavier_uniform(3, 5, gain=1.5).numpy(), 710 | np.array( 711 | [ 712 | [-0.32595432, 1.1709901, 0.60273796, 0.25632226, -0.8936898], 713 | [-0.89375246, -1.1481324, 0.95135355, 0.26270452, 0.54058844], 714 | [-1.245558, 1.2208616, 0.8637113, -0.74736494, -0.826643], 715 | ], 716 | dtype=np.float32, 717 | ), 718 | rtol=1e-4, 719 | atol=1e-4, 720 | ) 721 | 722 | 723 | def test_init_xavier_normal(): 724 | np.random.seed(42) 725 | np.testing.assert_allclose( 726 | ndl.init.xavier_normal(3, 5, gain=0.33).numpy(), 727 | np.array( 728 | [ 729 | [0.08195783, -0.022813609, 0.10686861, 0.25129992, -0.038635306], 730 | [-0.038632598, 0.2605701, 0.12662673, -0.07746328, 0.08952241], 731 | [-0.07646392, -0.07684541, 0.039923776, -0.31569123, -0.28461143], 732 | ], 733 | dtype=np.float32, 734 | ), 735 | rtol=1e-4, 736 | atol=1e-4, 737 | ) 738 | 739 | 740 | def submit_init(): 741 | np.random.seed(0) 742 | mugrade.submit(ndl.init.kaiming_normal(2, 5).numpy()) 743 | mugrade.submit(ndl.init.kaiming_uniform(2, 5).numpy()) 744 | mugrade.submit(ndl.init.xavier_uniform(2, 5, gain=0.33).numpy()) 745 | mugrade.submit(ndl.init.xavier_normal(2, 5, gain=1.3).numpy()) 746 | 747 | 748 | def test_nn_linear_weight_init_1(): 749 | np.testing.assert_allclose( 750 | nn_linear_weight_init(), 751 | np.array( 752 | [ 753 | [-4.4064468e-01, -6.3199449e-01, -4.1082984e-01, -7.5330488e-02], 754 | [-3.3144259e-01, 3.4056887e-02, -4.4079605e-01, 8.8153863e-01], 755 | [4.3108878e-01, -7.1237373e-01, -2.1057765e-01, 2.3793796e-01], 756 | [-6.9425780e-01, 8.9535803e-01, -1.0512712e-01, 5.3615785e-01], 757 | [5.4460180e-01, -2.5689366e-01, -1.5534532e-01, 1.5601574e-01], 758 | [4.8174453e-01, -5.7806653e-01, -3.9223823e-01, 3.1518409e-01], 759 | [-6.5129338e-04, -5.9517515e-01, -1.6083106e-01, -5.5698222e-01], 760 | ], 761 | dtype=np.float32, 762 | ), 763 | rtol=1e-5, 764 | atol=1e-5, 765 | ) 766 | 767 | 768 | def test_nn_linear_bias_init_1(): 769 | np.testing.assert_allclose( 770 | nn_linear_bias_init(), 771 | np.array([[0.077647, 0.814139, -0.770975, 1.120297]], dtype=np.float32), 772 | rtol=1e-5, 773 | atol=1e-5, 774 | ) 775 | 776 | 777 | def test_nn_linear_forward_1(): 778 | np.testing.assert_allclose( 779 | linear_forward((10, 5), (1, 10)), 780 | np.array([[3.849948, 9.50499, 2.38029, 5.572587, 5.668391]], dtype=np.float32), 781 | rtol=1e-5, 782 | atol=1e-5, 783 | ) 784 | 785 | 786 | def test_nn_linear_forward_2(): 787 | np.testing.assert_allclose( 788 | linear_forward((10, 5), (3, 10)), 789 | np.array( 790 | [ 791 | [7.763089, 10.086785, 0.380316, 6.242502, 6.944664], 792 | [2.548275, 7.747925, 5.343155, 2.065694, 9.871243], 793 | [2.871696, 7.466332, 4.236925, 2.461897, 8.209476], 794 | ], 795 | dtype=np.float32, 796 | ), 797 | rtol=1e-5, 798 | atol=1e-5, 799 | ) 800 | 801 | 802 | def test_nn_linear_forward_3(): 803 | np.testing.assert_allclose( 804 | linear_forward((10, 5), (1, 3, 10)), 805 | np.array( 806 | [ 807 | [ 808 | [4.351459, 8.782808, 3.935711, 3.03171, 8.014219], 809 | [5.214458, 8.728788, 2.376814, 5.672185, 4.974319], 810 | [1.343204, 8.639378, 2.604359, -0.282955, 9.864498], 811 | ] 812 | ], 813 | dtype=np.float32, 814 | ), 815 | rtol=1e-5, 816 | atol=1e-5, 817 | ) 818 | 819 | 820 
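# (Editor's note) In the three forward tests above, linear_forward((10, 5), shape) builds
# nn.Linear(10, 5) (10 input features, 5 output features); the (1, 10), (3, 10) and (1, 3, 10)
# inputs therefore produce outputs of shape (1, 5), (3, 5) and (1, 3, 5), i.e. X @ W plus a bias
# broadcast over all leading batch dimensions. The backward tests below check the gradient with
# respect to X for the same shapes.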
| def test_nn_linear_backward_1(): 821 | np.testing.assert_allclose( 822 | linear_backward((10, 5), (1, 10)), 823 | np.array( 824 | [ 825 | [ 826 | 20.61148, 827 | 6.920893, 828 | -1.625556, 829 | -13.497676, 830 | -6.672813, 831 | 18.762121, 832 | 7.286628, 833 | 8.18535, 834 | 2.741301, 835 | 5.723689, 836 | ] 837 | ], 838 | dtype=np.float32, 839 | ), 840 | rtol=1e-5, 841 | atol=1e-5, 842 | ) 843 | 844 | 845 | def test_nn_linear_backward_2(): 846 | print(linear_backward((10, 5), (3, 10))) 847 | np.testing.assert_allclose( 848 | linear_backward((10, 5), (3, 10)), 849 | np.array( 850 | [ 851 | [ 852 | 24.548800, 853 | 8.775347, 854 | 4.387898, 855 | -21.248514, 856 | -3.9669373, 857 | 24.256767, 858 | 6.3171115, 859 | 6.029777, 860 | 0.8809935, 861 | 3.5995162, 862 | ], 863 | [ 864 | 12.233745, 865 | -3.792646, 866 | -4.1903896, 867 | -5.106719, 868 | -12.004269, 869 | 11.967942, 870 | 11.939469, 871 | 19.314493, 872 | 10.631226, 873 | 14.510731, 874 | ], 875 | [ 876 | 12.920014, 877 | -1.4545978, 878 | -3.0892954, 879 | -6.762379, 880 | -9.713004, 881 | 12.523148, 882 | 9.904757, 883 | 15.442993, 884 | 8.044141, 885 | 11.4106865, 886 | ], 887 | ], 888 | dtype=np.float32, 889 | ), 890 | rtol=1e-5, 891 | atol=1e-5, 892 | ) 893 | 894 | 895 | def test_nn_linear_backward_3(): 896 | print(linear_backward((10, 5), (1, 3, 10))) 897 | np.testing.assert_allclose( 898 | linear_backward((10, 5), (1, 3, 10)), 899 | np.array( 900 | [ 901 | [ 902 | [ 903 | 16.318823, 904 | 0.3890714, 905 | -2.3196607, 906 | -10.607947, 907 | -8.891977, 908 | 16.04581, 909 | 9.475689, 910 | 14.571134, 911 | 6.581477, 912 | 10.204643, 913 | ], 914 | [ 915 | 20.291656, 916 | 7.48733, 917 | 1.2581345, 918 | -14.285493, 919 | -6.0252004, 920 | 19.621624, 921 | 4.343303, 922 | 6.973201, 923 | -0.8103489, 924 | 4.037069, 925 | ], 926 | [ 927 | 11.332953, 928 | -5.698288, 929 | -8.815561, 930 | -7.673438, 931 | -7.6161675, 932 | 9.361553, 933 | 17.341637, 934 | 17.269142, 935 | 18.1076, 936 | 14.261493, 937 | ], 938 | ] 939 | ], 940 | dtype=np.float32, 941 | ), 942 | rtol=1e-5, 943 | atol=1e-5, 944 | ) 945 | 946 | 947 | def submit_nn_linear(): 948 | mugrade.submit(linear_forward((3, 5), (1, 3))) 949 | mugrade.submit(linear_forward((3, 5), (3, 3))) 950 | mugrade.submit(linear_forward((3, 5), (1, 3, 3))) 951 | mugrade.submit(linear_backward((4, 5), (1, 4))) 952 | mugrade.submit(linear_backward((4, 5), (3, 4))) 953 | mugrade.submit(linear_backward((4, 5), (1, 3, 4))) 954 | 955 | 956 | def test_nn_relu_forward_1(): 957 | np.testing.assert_allclose( 958 | relu_forward(2, 2), 959 | np.array([[3.35, 4.2], [0.25, 4.5]], dtype=np.float32), 960 | rtol=1e-5, 961 | atol=1e-5, 962 | ) 963 | 964 | 965 | def test_nn_relu_backward_1(): 966 | np.testing.assert_allclose( 967 | relu_backward(3, 2), 968 | np.array([[7.5, 2.7], [0.6, 0.2], [0.3, 6.7]], dtype=np.float32), 969 | rtol=1e-5, 970 | atol=1e-5, 971 | ) 972 | 973 | 974 | def submit_nn_relu(): 975 | mugrade.submit(relu_forward(2, 3)) 976 | mugrade.submit(relu_backward(3, 4)) 977 | 978 | 979 | def test_nn_sequential_forward_1(): 980 | print(sequential_forward(batches=3)) 981 | np.testing.assert_allclose( 982 | sequential_forward(batches=3), 983 | np.array( 984 | [ 985 | [3.296263, 0.057031, 2.97568, -4.618432, -0.902491], 986 | [2.465332, -0.228394, 2.069803, -3.772378, -0.238334], 987 | [3.04427, -0.25623, 3.848721, -6.586399, -0.576819], 988 | ], 989 | dtype=np.float32, 990 | ), 991 | rtol=1e-5, 992 | atol=1e-5, 993 | ) 994 | 995 | 996 | def test_nn_sequential_backward_1(): 997 | 
np.testing.assert_allclose( 998 | sequential_backward(batches=3), 999 | np.array( 1000 | [ 1001 | [0.802697, -1.0971, 0.120842, 0.033051, 0.241105], 1002 | [-0.364489, 0.651385, 0.482428, 0.925252, -1.233545], 1003 | [0.802697, -1.0971, 0.120842, 0.033051, 0.241105], 1004 | ], 1005 | dtype=np.float32, 1006 | ), 1007 | rtol=1e-5, 1008 | atol=1e-5, 1009 | ) 1010 | 1011 | 1012 | def submit_nn_sequential(): 1013 | mugrade.submit(sequential_forward(batches=2)) 1014 | mugrade.submit(sequential_backward(batches=2)) 1015 | 1016 | 1017 | def test_nn_softmax_loss_forward_1(): 1018 | np.testing.assert_allclose( 1019 | softmax_loss_forward(5, 10), 1020 | np.array(4.041218, dtype=np.float32), 1021 | rtol=1e-5, 1022 | atol=1e-5, 1023 | ) 1024 | 1025 | 1026 | def test_nn_softmax_loss_forward_2(): 1027 | np.testing.assert_allclose( 1028 | softmax_loss_forward(3, 11), 1029 | np.array(3.3196716, dtype=np.float32), 1030 | rtol=1e-5, 1031 | atol=1e-5, 1032 | ) 1033 | 1034 | 1035 | def test_nn_softmax_loss_backward_1(): 1036 | np.testing.assert_allclose( 1037 | softmax_loss_backward(5, 10), 1038 | np.array( 1039 | [ 1040 | [ 1041 | 0.00068890385, 1042 | 0.0015331834, 1043 | 0.013162163, 1044 | -0.16422154, 1045 | 0.023983022, 1046 | 0.0050903494, 1047 | 0.00076135644, 1048 | 0.050772052, 1049 | 0.0062173656, 1050 | 0.062013146, 1051 | ], 1052 | [ 1053 | 0.012363418, 1054 | 0.02368262, 1055 | 0.11730081, 1056 | 0.001758993, 1057 | 0.004781439, 1058 | 0.0029000894, 1059 | -0.19815083, 1060 | 0.017544521, 1061 | 0.015874943, 1062 | 0.0019439887, 1063 | ], 1064 | [ 1065 | 0.001219767, 1066 | 0.08134181, 1067 | 0.057320606, 1068 | 0.0008595553, 1069 | 0.0030001428, 1070 | 0.0009499555, 1071 | -0.19633561, 1072 | 0.0008176346, 1073 | 0.0014898272, 1074 | 0.0493363, 1075 | ], 1076 | [ 1077 | -0.19886842, 1078 | 0.08767337, 1079 | 0.017700946, 1080 | 0.026406704, 1081 | 0.0013147127, 1082 | 0.0107361665, 1083 | 0.009714483, 1084 | 0.023893777, 1085 | 0.019562569, 1086 | 0.0018656658, 1087 | ], 1088 | [ 1089 | 0.007933789, 1090 | 0.017656967, 1091 | 0.027691642, 1092 | 0.0005605318, 1093 | 0.05576411, 1094 | 0.0013114461, 1095 | 0.06811045, 1096 | 0.011835824, 1097 | 0.0071787895, 1098 | -0.19804356, 1099 | ], 1100 | ], 1101 | dtype=np.float32, 1102 | ), 1103 | rtol=1e-5, 1104 | atol=1e-5, 1105 | ) 1106 | 1107 | 1108 | def test_nn_softmax_loss_backward_2(): 1109 | np.testing.assert_allclose( 1110 | softmax_loss_backward(3, 11), 1111 | np.array( 1112 | [ 1113 | [ 1114 | 0.0027466794, 1115 | 0.020295369, 1116 | 0.012940894, 1117 | 0.04748398, 1118 | 0.052477922, 1119 | 0.090957515, 1120 | 0.0028875037, 1121 | 0.012940894, 1122 | 0.040869843, 1123 | 0.04748398, 1124 | -0.33108455, 1125 | ], 1126 | [ 1127 | 0.0063174255, 1128 | 0.001721699, 1129 | 0.09400159, 1130 | 0.0034670753, 1131 | 0.038218185, 1132 | 0.009424488, 1133 | 0.0042346967, 1134 | 0.08090791, 1135 | -0.29697907, 1136 | 0.0044518122, 1137 | 0.054234188, 1138 | ], 1139 | [ 1140 | 0.14326698, 1141 | 0.002624026, 1142 | 0.0032049934, 1143 | 0.01176007, 1144 | 0.045363605, 1145 | 0.0043262867, 1146 | 0.039044812, 1147 | 0.017543964, 1148 | 0.0037236712, 1149 | -0.3119051, 1150 | 0.04104668, 1151 | ], 1152 | ], 1153 | dtype=np.float32, 1154 | ), 1155 | rtol=1e-5, 1156 | atol=1e-5, 1157 | ) 1158 | 1159 | 1160 | def submit_nn_softmax_loss(): 1161 | mugrade.submit(softmax_loss_forward(4, 9)) 1162 | mugrade.submit(softmax_loss_forward(2, 7)) 1163 | mugrade.submit(softmax_loss_backward(4, 9)) 1164 | mugrade.submit(softmax_loss_backward(2, 7)) 1165 | 1166 | 1167 | def 
test_nn_layernorm_forward_1(): 1168 | np.testing.assert_allclose( 1169 | layernorm_forward((3, 3), 3), 1170 | np.array( 1171 | [ 1172 | [-0.06525002, -1.1908097, 1.2560595], 1173 | [1.3919864, -0.47999576, -0.911992], 1174 | [1.3628436, -1.0085043, -0.3543393], 1175 | ], 1176 | dtype=np.float32, 1177 | ), 1178 | rtol=1e-5, 1179 | atol=1e-5, 1180 | ) 1181 | 1182 | 1183 | def test_nn_layernorm_forward_2(): 1184 | np.testing.assert_allclose( 1185 | layernorm_forward((2, 10), 10), 1186 | np.array( 1187 | [ 1188 | [ 1189 | 0.8297899, 1190 | 1.6147263, 1191 | -1.525019, 1192 | -0.4036814, 1193 | 0.306499, 1194 | 0.08223152, 1195 | 0.6429003, 1196 | -1.3381294, 1197 | 0.8671678, 1198 | -1.0764838, 1199 | ], 1200 | [ 1201 | -1.8211555, 1202 | 0.39098236, 1203 | -0.5864739, 1204 | 0.853988, 1205 | -0.3806936, 1206 | 1.2655486, 1207 | 0.33953735, 1208 | 1.522774, 1209 | -0.8951442, 1210 | -0.68936396, 1211 | ], 1212 | ], 1213 | dtype=np.float32, 1214 | ), 1215 | rtol=1e-5, 1216 | atol=1e-5, 1217 | ) 1218 | 1219 | 1220 | def test_nn_layernorm_forward_3(): 1221 | np.testing.assert_allclose( 1222 | layernorm_forward((1, 5), 5), 1223 | np.array( 1224 | [[-1.0435007, -0.8478443, 0.7500162, -0.42392215, 1.565251]], 1225 | dtype=np.float32, 1226 | ), 1227 | rtol=1e-5, 1228 | atol=1e-5, 1229 | ) 1230 | 1231 | 1232 | def test_nn_layernorm_backward_1(): 1233 | np.testing.assert_allclose( 1234 | layernorm_backward((3, 3), 3), 1235 | np.array( 1236 | [ 1237 | [-2.8312206e-06, -6.6757202e-05, 6.9618225e-05], 1238 | [1.9950867e-03, -6.8092346e-04, -1.3141632e-03], 1239 | [4.4703484e-05, -3.2544136e-05, -1.1801720e-05], 1240 | ], 1241 | dtype=np.float32, 1242 | ), 1243 | rtol=1e-5, 1244 | atol=1e-5, 1245 | ) 1246 | 1247 | 1248 | def test_nn_layernorm_backward_2(): 1249 | np.testing.assert_allclose( 1250 | layernorm_backward((2, 10), 10), 1251 | np.array( 1252 | [ 1253 | [ 1254 | -2.301574, 1255 | 4.353944, 1256 | -1.9396116, 1257 | 2.4330146, 1258 | -1.1070801, 1259 | 0.01571643, 1260 | -2.209449, 1261 | 0.49513134, 1262 | -2.261348, 1263 | 2.5212562, 1264 | ], 1265 | [ 1266 | -9.042961, 1267 | -2.6184766, 1268 | 4.5592957, 1269 | -4.2109876, 1270 | 3.4247458, 1271 | -1.9075732, 1272 | -2.2689414, 1273 | 2.110825, 1274 | 5.044025, 1275 | 4.910048, 1276 | ], 1277 | ], 1278 | dtype=np.float32, 1279 | ), 1280 | rtol=1e-5, 1281 | atol=1e-5, 1282 | ) 1283 | 1284 | 1285 | def test_nn_layernorm_backward_3(): 1286 | np.testing.assert_allclose( 1287 | layernorm_backward((1, 5), 5), 1288 | np.array( 1289 | [[0.150192, 0.702322, -3.321343, 0.31219, 2.156639]], dtype=np.float32 1290 | ), 1291 | rtol=1e-5, 1292 | atol=1e-5, 1293 | ) 1294 | 1295 | 1296 | def test_nn_layernorm_backward_4(): 1297 | np.testing.assert_allclose( 1298 | layernorm_backward((5, 1), 1), 1299 | np.array([[0], [0], [0], [0], [0]], dtype=np.float32), 1300 | rtol=1e-5, 1301 | atol=1e-5, 1302 | ) 1303 | 1304 | 1305 | def submit_nn_layernorm(): 1306 | mugrade.submit(layernorm_forward((1, 1), 1)) 1307 | mugrade.submit(layernorm_forward((10, 10), 10)) 1308 | mugrade.submit(layernorm_forward((10, 30), 30)) 1309 | mugrade.submit(layernorm_forward((1, 3), 3)) 1310 | mugrade.submit(layernorm_backward((1, 1), 1)) 1311 | mugrade.submit(layernorm_backward((10, 10), 10)) 1312 | mugrade.submit(layernorm_backward((10, 30), 30)) 1313 | mugrade.submit(layernorm_backward((1, 3), 3)) 1314 | 1315 | 1316 | def test_nn_batchnorm_check_model_eval_switches_training_flag_1(): 1317 | np.testing.assert_allclose( 1318 | check_training_mode(), 1319 | np.array( 1320 | [ 1321 | 0, 1322 
| 0, 1323 | 0, 1324 | 0, 1325 | 0, 1326 | 0, 1327 | 0, 1328 | 0, 1329 | 0, 1330 | 1, 1331 | 1, 1332 | 1, 1333 | 1, 1334 | 1, 1335 | 1, 1336 | 1, 1337 | 1, 1338 | 1, 1339 | 0, 1340 | 0, 1341 | 0, 1342 | 0, 1343 | 0, 1344 | 0, 1345 | 0, 1346 | 0, 1347 | 0, 1348 | ] 1349 | ), 1350 | rtol=1e-5, 1351 | atol=1e-5, 1352 | ) 1353 | 1354 | 1355 | def test_nn_batchnorm_forward_1(): 1356 | np.testing.assert_allclose( 1357 | batchnorm_forward(4, 4), 1358 | np.array( 1359 | [ 1360 | [7.8712696e-01, -3.1676728e-01, -6.4885163e-01, 2.0828949e-01], 1361 | [-7.9508079e-03, 1.0092355e00, 1.6221288e00, 8.5209310e-01], 1362 | [8.5073310e-01, -1.4954363e00, -9.6686421e-08, -1.6852506e00], 1363 | [-1.6299094e00, 8.0296844e-01, -9.7327745e-01, 6.2486827e-01], 1364 | ], 1365 | dtype=np.float32, 1366 | ), 1367 | rtol=1e-5, 1368 | atol=1e-5, 1369 | ) 1370 | 1371 | 1372 | def test_nn_batchnorm_forward_affine_1(): 1373 | np.testing.assert_allclose( 1374 | batchnorm_forward(4, 4, affine=True), 1375 | np.array( 1376 | [ 1377 | [7.49529, 0.047213316, 2.690084, 5.5227957], 1378 | [4.116209, 3.8263211, 7.79979, 7.293256], 1379 | [7.765616, -3.3119934, 4.15, 0.31556034], 1380 | [-2.7771149, 3.23846, 1.9601259, 6.6683874], 1381 | ], 1382 | dtype=np.float32, 1383 | ), 1384 | rtol=1e-5, 1385 | atol=1e-5, 1386 | ) 1387 | 1388 | 1389 | def test_nn_batchnorm_backward_1(): 1390 | np.testing.assert_allclose( 1391 | batchnorm_backward(5, 4), 1392 | np.array( 1393 | [ 1394 | [2.1338463e-04, 5.2094460e-06, -2.8359889e-05, -4.4368207e-06], 1395 | [-3.8480759e-04, -4.0292739e-06, 1.8370152e-05, -1.1172146e-05], 1396 | [2.5629997e-04, -1.1003018e-05, -9.0479853e-06, 5.5171549e-06], 1397 | [-4.2676926e-04, 3.4213067e-06, 1.3601780e-05, 1.0166317e-05], 1398 | [3.4189224e-04, 6.4015389e-06, 5.4359434e-06, -7.4505806e-08], 1399 | ], 1400 | dtype=np.float32, 1401 | ), 1402 | rtol=1e-5, 1403 | atol=1e-5, 1404 | ) 1405 | 1406 | 1407 | def test_nn_batchnorm_backward_affine_1(): 1408 | np.testing.assert_allclose( 1409 | batchnorm_backward(5, 4, affine=True), 1410 | np.array( 1411 | [ 1412 | [3.8604736e-03, 4.2676926e-05, -1.4114380e-04, -3.2424927e-05], 1413 | [-6.9427490e-03, -3.3140182e-05, 9.1552734e-05, -8.5830688e-05], 1414 | [4.6386719e-03, -8.9883804e-05, -4.5776367e-05, 4.3869019e-05], 1415 | [-7.7133179e-03, 2.7418137e-05, 6.6757202e-05, 7.4386597e-05], 1416 | [6.1874390e-03, 5.2213669e-05, 2.8610229e-05, -1.9073486e-06], 1417 | ], 1418 | dtype=np.float32, 1419 | ), 1420 | rtol=1e-5, 1421 | atol=1e-4, 1422 | ) 1423 | 1424 | 1425 | def test_nn_batchnorm_running_mean_1(): 1426 | np.testing.assert_allclose( 1427 | batchnorm_running_mean(4, 3), 1428 | np.array([2.020656, 1.69489, 1.498846], dtype=np.float32), 1429 | rtol=1e-5, 1430 | atol=1e-5, 1431 | ) 1432 | 1433 | 1434 | def test_nn_batchnorm_running_var_1(): 1435 | np.testing.assert_allclose( 1436 | batchnorm_running_var(4, 3), 1437 | np.array([1.412775, 1.386191, 1.096604], dtype=np.float32), 1438 | rtol=1e-5, 1439 | atol=1e-5, 1440 | ) 1441 | 1442 | 1443 | def test_nn_batchnorm_running_grad_1(): 1444 | np.testing.assert_allclose( 1445 | batchnorm_running_grad(4, 3), 1446 | np.array( 1447 | [ 1448 | [8.7022781e-06, -4.9751252e-06, 9.5367432e-05], 1449 | [6.5565109e-06, -7.2401017e-06, -2.3484230e-05], 1450 | [-3.5762787e-06, -4.5262277e-07, 1.6093254e-05], 1451 | [-1.1682510e-05, 1.2667850e-05, -8.7976456e-05], 1452 | ], 1453 | dtype=np.float32, 1454 | ), 1455 | rtol=1e-5, 1456 | atol=1e-5, 1457 | ) 1458 | 1459 | 1460 | def submit_nn_batchnorm(): 1461 | 
mugrade.submit(batchnorm_forward(2, 3)) 1462 | mugrade.submit(batchnorm_forward(3, 4, affine=True)) 1463 | mugrade.submit(batchnorm_backward(5, 3)) 1464 | 1465 | # todo(Zico) : these need to be added to mugrade 1466 | mugrade.submit(batchnorm_backward(4, 2, affine=True)) 1467 | mugrade.submit(batchnorm_running_mean(3, 3)) 1468 | mugrade.submit(batchnorm_running_mean(3, 3)) 1469 | mugrade.submit(batchnorm_running_var(4, 3)) 1470 | mugrade.submit(batchnorm_running_var(4, 4)) 1471 | mugrade.submit(batchnorm_running_grad(4, 3)) 1472 | 1473 | 1474 | def test_nn_dropout_forward_1(): 1475 | np.testing.assert_allclose( 1476 | dropout_forward((2, 3), prob=0.45), 1477 | np.array([[6.818182, 0.0, 0.0], [0.18181819, 0.0, 6.090909]], dtype=np.float32), 1478 | rtol=1e-5, 1479 | atol=1e-5, 1480 | ) 1481 | 1482 | 1483 | def test_nn_dropout_backward_1(): 1484 | np.testing.assert_allclose( 1485 | dropout_backward((2, 3), prob=0.26), 1486 | np.array( 1487 | [[1.3513514, 0.0, 0.0], [1.3513514, 0.0, 1.3513514]], dtype=np.float32 1488 | ), 1489 | rtol=1e-5, 1490 | atol=1e-5, 1491 | ) 1492 | 1493 | 1494 | def submit_nn_dropout(): 1495 | mugrade.submit(dropout_forward((3, 3), prob=0.4)) 1496 | mugrade.submit(dropout_backward((3, 3), prob=0.15)) 1497 | 1498 | 1499 | def test_nn_residual_forward_1(): 1500 | np.testing.assert_allclose( 1501 | residual_forward(), 1502 | np.array( 1503 | [ 1504 | [0.4660964, 3.8619597, -3.637068, 3.7489638, 2.4931884], 1505 | [-3.3769124, 2.5409935, -2.7110925, 4.9782896, -3.005401], 1506 | [-3.0222898, 3.796795, -2.101042, 6.785948, 0.9347453], 1507 | [-2.2496533, 3.635599, -2.1818666, 5.6361046, 0.9748006], 1508 | [-0.03458184, 0.0823682, -0.06686163, 1.9169499, 1.2638961], 1509 | ], 1510 | dtype=np.float32, 1511 | ), 1512 | rtol=1e-5, 1513 | atol=1e-5, 1514 | ) 1515 | 1516 | 1517 | def test_nn_residual_backward_1(): 1518 | np.testing.assert_allclose( 1519 | residual_backward(), 1520 | np.array( 1521 | [ 1522 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1523 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1524 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1525 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1526 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1527 | ], 1528 | dtype=np.float32, 1529 | ), 1530 | rtol=1e-5, 1531 | atol=1e-5, 1532 | ) 1533 | 1534 | 1535 | def submit_nn_residual(): 1536 | mugrade.submit(residual_forward(shape=(3, 4))) 1537 | mugrade.submit(residual_backward(shape=(3, 4))) 1538 | 1539 | 1540 | def test_nn_flatten_forward_1(): 1541 | np.testing.assert_allclose( 1542 | flatten_forward(3, 3), 1543 | np.array( 1544 | [[2.1, 0.95, 3.45], [3.1, 2.45, 2.3], [3.3, 0.4, 1.2]], dtype=np.float32 1545 | ), 1546 | rtol=1e-5, 1547 | atol=1e-5, 1548 | ) 1549 | 1550 | 1551 | def test_nn_flatten_forward_2(): 1552 | np.testing.assert_allclose( 1553 | flatten_forward(3, 3, 3), 1554 | np.array( 1555 | [ 1556 | [3.35, 3.25, 2.8, 2.3, 3.75, 3.75, 3.35, 2.45, 2.1], 1557 | [1.65, 0.15, 4.15, 2.8, 2.1, 0.5, 2.6, 2.25, 3.25], 1558 | [2.4, 4.55, 4.75, 0.75, 3.85, 0.05, 4.7, 1.7, 4.7], 1559 | ], 1560 | dtype=np.float32, 1561 | ), 1562 | rtol=1e-5, 1563 | atol=1e-5, 1564 | ) 1565 | 1566 | 1567 | def test_nn_flatten_forward_3(): 1568 | np.testing.assert_allclose( 1569 | flatten_forward(1, 2, 3, 4), 1570 | np.array( 1571 | [ 1572 | [ 1573 | 4.2, 1574 | 4.5, 1575 | 1.9, 1576 | 4.85, 1577 | 4.85, 1578 | 3.3, 1579 | 2.7, 1580 | 3.05, 1581 | 0.3, 1582 | 3.65, 1583 | 3.1, 1584 | 0.1, 1585 | 4.5, 1586 | 
4.05, 1587 | 3.05, 1588 | 0.15, 1589 | 3.0, 1590 | 1.65, 1591 | 4.85, 1592 | 1.3, 1593 | 3.95, 1594 | 2.9, 1595 | 1.2, 1596 | 1.0, 1597 | ] 1598 | ], 1599 | dtype=np.float32, 1600 | ), 1601 | rtol=1e-5, 1602 | atol=1e-5, 1603 | ) 1604 | 1605 | 1606 | def test_nn_flatten_forward_4(): 1607 | np.testing.assert_allclose( 1608 | flatten_forward(3, 3, 4, 4), 1609 | np.array( 1610 | [ 1611 | [ 1612 | 0.95, 1613 | 1.1, 1614 | 1.0, 1615 | 1.0, 1616 | 4.9, 1617 | 0.25, 1618 | 1.6, 1619 | 0.35, 1620 | 1.5, 1621 | 3.4, 1622 | 1.75, 1623 | 3.4, 1624 | 4.8, 1625 | 1.4, 1626 | 2.35, 1627 | 3.2, 1628 | 1.65, 1629 | 1.9, 1630 | 3.05, 1631 | 0.35, 1632 | 3.15, 1633 | 4.05, 1634 | 3.3, 1635 | 2.2, 1636 | 2.5, 1637 | 1.5, 1638 | 3.25, 1639 | 0.65, 1640 | 3.05, 1641 | 0.75, 1642 | 3.25, 1643 | 2.55, 1644 | 0.55, 1645 | 0.25, 1646 | 3.65, 1647 | 3.4, 1648 | 0.05, 1649 | 1.4, 1650 | 0.75, 1651 | 1.55, 1652 | 4.45, 1653 | 0.2, 1654 | 3.35, 1655 | 2.45, 1656 | 3.45, 1657 | 4.75, 1658 | 2.45, 1659 | 4.3, 1660 | ], 1661 | [ 1662 | 1.0, 1663 | 0.2, 1664 | 0.4, 1665 | 0.7, 1666 | 4.9, 1667 | 4.2, 1668 | 2.55, 1669 | 3.15, 1670 | 1.2, 1671 | 3.8, 1672 | 1.35, 1673 | 1.85, 1674 | 3.15, 1675 | 2.7, 1676 | 1.5, 1677 | 1.35, 1678 | 4.85, 1679 | 4.2, 1680 | 1.5, 1681 | 1.75, 1682 | 0.8, 1683 | 4.3, 1684 | 4.2, 1685 | 4.85, 1686 | 0.0, 1687 | 3.75, 1688 | 0.9, 1689 | 0.0, 1690 | 3.35, 1691 | 1.05, 1692 | 2.2, 1693 | 0.75, 1694 | 3.6, 1695 | 2.0, 1696 | 1.2, 1697 | 1.9, 1698 | 3.45, 1699 | 1.6, 1700 | 3.95, 1701 | 4.45, 1702 | 4.55, 1703 | 4.75, 1704 | 3.7, 1705 | 0.3, 1706 | 2.45, 1707 | 3.75, 1708 | 0.9, 1709 | 2.2, 1710 | ], 1711 | [ 1712 | 4.95, 1713 | 1.05, 1714 | 2.4, 1715 | 4.05, 1716 | 3.75, 1717 | 1.95, 1718 | 0.65, 1719 | 4.9, 1720 | 4.3, 1721 | 2.5, 1722 | 1.9, 1723 | 1.75, 1724 | 2.05, 1725 | 3.95, 1726 | 0.8, 1727 | 0.0, 1728 | 0.8, 1729 | 3.45, 1730 | 1.55, 1731 | 0.3, 1732 | 1.5, 1733 | 2.9, 1734 | 2.15, 1735 | 2.15, 1736 | 3.3, 1737 | 3.2, 1738 | 4.3, 1739 | 3.7, 1740 | 0.4, 1741 | 1.7, 1742 | 0.35, 1743 | 1.9, 1744 | 1.8, 1745 | 4.3, 1746 | 4.7, 1747 | 4.05, 1748 | 3.65, 1749 | 1.1, 1750 | 1.0, 1751 | 2.7, 1752 | 3.95, 1753 | 2.3, 1754 | 2.6, 1755 | 3.5, 1756 | 0.75, 1757 | 4.3, 1758 | 3.0, 1759 | 3.85, 1760 | ], 1761 | ], 1762 | dtype=np.float32, 1763 | ), 1764 | rtol=1e-5, 1765 | atol=1e-5, 1766 | ) 1767 | 1768 | 1769 | def test_nn_flatten_backward_1(): 1770 | np.testing.assert_allclose( 1771 | flatten_backward(3, 3), 1772 | np.array([[4.2, 1.9, 6.9], [6.2, 4.9, 4.6], [6.6, 0.8, 2.4]], dtype=np.float32), 1773 | rtol=1e-5, 1774 | atol=1e-5, 1775 | ) 1776 | 1777 | 1778 | def test_nn_flatten_backward_2(): 1779 | np.testing.assert_allclose( 1780 | flatten_backward(3, 3, 3), 1781 | np.array( 1782 | [ 1783 | [[6.7, 6.5, 5.6], [4.6, 7.5, 7.5], [6.7, 4.9, 4.2]], 1784 | [[3.3, 0.3, 8.3], [5.6, 4.2, 1.0], [5.2, 4.5, 6.5]], 1785 | [[4.8, 9.1, 9.5], [1.5, 7.7, 0.1], [9.4, 3.4, 9.4]], 1786 | ], 1787 | dtype=np.float32, 1788 | ), 1789 | rtol=1e-5, 1790 | atol=1e-5, 1791 | ) 1792 | 1793 | 1794 | def test_nn_flatten_backward_3(): 1795 | np.testing.assert_allclose( 1796 | flatten_backward(2, 2, 2, 2), 1797 | np.array( 1798 | [ 1799 | [[[6.8, 3.8], [5.4, 5.1]], [[8.5, 4.8], [3.1, 1.0]]], 1800 | [[[9.3, 0.8], [3.4, 1.6]], [[9.4, 3.6], [6.6, 7.0]]], 1801 | ], 1802 | dtype=np.float32, 1803 | ), 1804 | rtol=1e-5, 1805 | atol=1e-5, 1806 | ) 1807 | 1808 | 1809 | def test_nn_flatten_backward_4(): 1810 | np.testing.assert_allclose( 1811 | flatten_backward(1, 2, 3, 4), 1812 | np.array( 1813 | [ 1814 | [ 1815 | [[8.4, 9.0, 3.8, 9.7], 
[9.7, 6.6, 5.4, 6.1], [0.6, 7.3, 6.2, 0.2]], 1816 | [[9.0, 8.1, 6.1, 0.3], [6.0, 3.3, 9.7, 2.6], [7.9, 5.8, 2.4, 2.0]], 1817 | ] 1818 | ], 1819 | dtype=np.float32, 1820 | ), 1821 | rtol=1e-5, 1822 | atol=1e-5, 1823 | ) 1824 | 1825 | 1826 | def test_nn_flatten_backward_5(): 1827 | np.testing.assert_allclose( 1828 | flatten_backward(2, 2, 4, 3), 1829 | np.array( 1830 | [ 1831 | [ 1832 | [ 1833 | [9.8, 7.1, 5.4], 1834 | [4.0, 6.2, 5.7], 1835 | [7.2, 2.0, 2.4], 1836 | [8.9, 4.9, 3.3], 1837 | ], 1838 | [ 1839 | [9.0, 9.8, 5.9], 1840 | [7.1, 2.7, 9.6], 1841 | [8.5, 9.3, 5.8], 1842 | [3.1, 9.0, 6.7], 1843 | ], 1844 | ], 1845 | [ 1846 | [ 1847 | [7.4, 8.6, 6.9], 1848 | [8.2, 5.3, 8.7], 1849 | [8.8, 8.7, 4.0], 1850 | [3.9, 1.8, 2.7], 1851 | ], 1852 | [ 1853 | [5.7, 6.2, 0.0], 1854 | [6.0, 0.0, 0.3], 1855 | [2.0, 0.1, 2.7], 1856 | [2.1, 0.1, 6.7], 1857 | ], 1858 | ], 1859 | ], 1860 | dtype=np.float32, 1861 | ), 1862 | rtol=1e-5, 1863 | atol=1e-5, 1864 | ) 1865 | 1866 | 1867 | def submit_nn_flatten(): 1868 | mugrade.submit(flatten_forward(1, 2, 2)) 1869 | mugrade.submit(flatten_forward(2, 2, 2)) 1870 | mugrade.submit(flatten_forward(2, 3, 4, 2, 1, 2)) 1871 | mugrade.submit(flatten_forward(2, 3)) 1872 | mugrade.submit(flatten_backward(1, 2, 2)) 1873 | mugrade.submit(flatten_backward(2, 2, 2)) 1874 | mugrade.submit(flatten_backward(2, 3, 4, 2, 1, 2)) 1875 | mugrade.submit(flatten_backward(2, 3, 4, 4)) 1876 | 1877 | 1878 | def test_optim_sgd_vanilla_1(): 1879 | np.testing.assert_allclose( 1880 | learn_model_1d( 1881 | 64, 1882 | 16, 1883 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 1884 | ndl.optim.SGD, 1885 | lr=0.01, 1886 | momentum=0.0, 1887 | ), 1888 | np.array(3.207009), 1889 | rtol=1e-5, 1890 | atol=1e-5, 1891 | ) 1892 | 1893 | 1894 | def test_optim_sgd_momentum_1(): 1895 | np.testing.assert_allclose( 1896 | learn_model_1d( 1897 | 64, 1898 | 16, 1899 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 1900 | ndl.optim.SGD, 1901 | lr=0.01, 1902 | momentum=0.9, 1903 | ), 1904 | np.array(3.311805), 1905 | rtol=1e-5, 1906 | atol=1e-5, 1907 | ) 1908 | 1909 | 1910 | def test_optim_sgd_weight_decay_1(): 1911 | np.testing.assert_allclose( 1912 | learn_model_1d( 1913 | 64, 1914 | 16, 1915 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 1916 | ndl.optim.SGD, 1917 | lr=0.01, 1918 | momentum=0.0, 1919 | weight_decay=0.01, 1920 | ), 1921 | np.array(3.202637), 1922 | rtol=1e-5, 1923 | atol=1e-5, 1924 | ) 1925 | 1926 | 1927 | def test_optim_sgd_momentum_weight_decay_1(): 1928 | np.testing.assert_allclose( 1929 | learn_model_1d( 1930 | 64, 1931 | 16, 1932 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 1933 | ndl.optim.SGD, 1934 | lr=0.01, 1935 | momentum=0.9, 1936 | weight_decay=0.01, 1937 | ), 1938 | np.array(3.306993), 1939 | rtol=1e-5, 1940 | atol=1e-5, 1941 | ) 1942 | 1943 | 1944 | def test_optim_sgd_layernorm_residual_1(): 1945 | nn.LayerNorm1d(8) 1946 | np.testing.assert_allclose( 1947 | learn_model_1d( 1948 | 64, 1949 | 16, 1950 | lambda z: nn.Sequential( 1951 | nn.Linear(64, 8), 1952 | nn.ReLU(), 1953 | nn.Residual(nn.Linear(8, 8)), 1954 | nn.Linear(8, 16), 1955 | ), 1956 | ndl.optim.SGD, 1957 | epochs=3, 1958 | lr=0.01, 1959 | weight_decay=0.001, 1960 | ), 1961 | np.array(2.852236), 1962 | rtol=1e-5, 1963 | atol=1e-5, 1964 | ) 1965 | 1966 | 1967 | # We're checking that you have not allocated too many tensors; 1968 | # if this fails, make sure you're using .detach()/.data whenever possible. 
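# The usual way to keep this count down is to perform optimizer updates on detached
# values, e.g. something along the lines of `param.data = param.data - grad.data * lr`
# inside `step()`, so each call manipulates plain values instead of appending new
# nodes to the autograd graph. (A standalone sketch of this pattern appears after the
# end of this file.)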
1969 | def test_optim_sgd_z_memory_check_1(): 1970 | np.testing.assert_allclose( 1971 | global_tensor_count(), np.array(387), rtol=1e-5, atol=1000 1972 | ) 1973 | 1974 | 1975 | def submit_optim_sgd(): 1976 | mugrade.submit( 1977 | learn_model_1d( 1978 | 48, 1979 | 17, 1980 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 17)), 1981 | ndl.optim.SGD, 1982 | lr=0.03, 1983 | momentum=0.0, 1984 | epochs=2, 1985 | ) 1986 | ) 1987 | mugrade.submit( 1988 | learn_model_1d( 1989 | 48, 1990 | 16, 1991 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)), 1992 | ndl.optim.SGD, 1993 | lr=0.01, 1994 | momentum=0.9, 1995 | epochs=2, 1996 | ) 1997 | ) 1998 | mugrade.submit( 1999 | learn_model_1d( 2000 | 48, 2001 | 16, 2002 | lambda z: nn.Sequential( 2003 | nn.Linear(48, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2004 | ), 2005 | ndl.optim.SGD, 2006 | lr=0.01, 2007 | momentum=0.0, 2008 | weight_decay=0.01, 2009 | epochs=2, 2010 | ) 2011 | ) 2012 | mugrade.submit( 2013 | learn_model_1d( 2014 | 54, 2015 | 16, 2016 | lambda z: nn.Sequential(nn.Linear(54, 32), nn.ReLU(), nn.Linear(32, 16)), 2017 | ndl.optim.SGD, 2018 | lr=0.01, 2019 | momentum=0.9, 2020 | weight_decay=0.01, 2021 | epochs=2, 2022 | ) 2023 | ) 2024 | mugrade.submit( 2025 | learn_model_1d( 2026 | 64, 2027 | 4, 2028 | lambda z: nn.Sequential( 2029 | nn.Linear(64, 8), 2030 | nn.ReLU(), 2031 | nn.Residual(nn.Linear(8, 8)), 2032 | nn.Linear(8, 4), 2033 | ), 2034 | ndl.optim.SGD, 2035 | epochs=3, 2036 | lr=0.01, 2037 | weight_decay=0.001, 2038 | ) 2039 | ) 2040 | 2041 | 2042 | def test_optim_adam_1(): 2043 | np.testing.assert_allclose( 2044 | learn_model_1d( 2045 | 64, 2046 | 16, 2047 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 2048 | ndl.optim.Adam, 2049 | lr=0.001, 2050 | ), 2051 | np.array(3.703999), 2052 | rtol=1e-5, 2053 | atol=1e-5, 2054 | ) 2055 | 2056 | 2057 | def test_optim_adam_weight_decay_1(): 2058 | np.testing.assert_allclose( 2059 | learn_model_1d( 2060 | 64, 2061 | 16, 2062 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 2063 | ndl.optim.Adam, 2064 | lr=0.001, 2065 | weight_decay=0.01, 2066 | ), 2067 | np.array(3.705134), 2068 | rtol=1e-5, 2069 | atol=1e-5, 2070 | ) 2071 | 2072 | 2073 | def test_optim_adam_batchnorm_1(): 2074 | np.testing.assert_allclose( 2075 | learn_model_1d( 2076 | 64, 2077 | 16, 2078 | lambda z: nn.Sequential( 2079 | nn.Linear(64, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2080 | ), 2081 | ndl.optim.Adam, 2082 | lr=0.001, 2083 | weight_decay=0.001, 2084 | ), 2085 | np.array(3.296256, dtype=np.float32), 2086 | rtol=1e-5, 2087 | atol=1e-5, 2088 | ) 2089 | 2090 | 2091 | def test_optim_adam_batchnorm_eval_mode_1(): 2092 | np.testing.assert_allclose( 2093 | learn_model_1d_eval( 2094 | 64, 2095 | 16, 2096 | lambda z: nn.Sequential( 2097 | nn.Linear(64, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2098 | ), 2099 | ndl.optim.Adam, 2100 | lr=0.001, 2101 | weight_decay=0.001, 2102 | ), 2103 | np.array(3.192054, dtype=np.float32), 2104 | rtol=1e-5, 2105 | atol=1e-5, 2106 | ) 2107 | 2108 | 2109 | def test_optim_adam_layernorm_1(): 2110 | np.testing.assert_allclose( 2111 | learn_model_1d( 2112 | 64, 2113 | 16, 2114 | lambda z: nn.Sequential( 2115 | nn.Linear(64, 32), nn.ReLU(), nn.LayerNorm1d(32), nn.Linear(32, 16) 2116 | ), 2117 | ndl.optim.Adam, 2118 | lr=0.01, 2119 | weight_decay=0.01, 2120 | ), 2121 | np.array(2.82192, dtype=np.float32), 2122 | rtol=1e-5, 2123 | atol=1e-5, 2124 | ) 2125 | 2126 
| 2127 | def test_optim_adam_weight_decay_bias_correction_1(): 2128 | np.testing.assert_allclose( 2129 | learn_model_1d( 2130 | 64, 2131 | 16, 2132 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 2133 | ndl.optim.Adam, 2134 | lr=0.001, 2135 | weight_decay=0.01, 2136 | ), 2137 | np.array(3.705134), 2138 | rtol=1e-5, 2139 | atol=1e-5, 2140 | ) 2141 | 2142 | 2143 | # We're checking that you have not allocated too many tensors; 2144 | # if this fails, make sure you're using .detach()/.data whenever possible. 2145 | def test_optim_adam_z_memory_check_1(): 2146 | np.testing.assert_allclose( 2147 | global_tensor_count(), np.array(1132), rtol=1e-5, atol=1000 2148 | ) 2149 | 2150 | 2151 | def submit_optim_adam(): 2152 | mugrade.submit( 2153 | learn_model_1d( 2154 | 48, 2155 | 16, 2156 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)), 2157 | ndl.optim.Adam, 2158 | lr=0.001, 2159 | epochs=2, 2160 | ) 2161 | ) 2162 | mugrade.submit( 2163 | learn_model_1d( 2164 | 48, 2165 | 16, 2166 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)), 2167 | ndl.optim.Adam, 2168 | lr=0.001, 2169 | weight_decay=0.01, 2170 | epochs=2, 2171 | ) 2172 | ) 2173 | mugrade.submit( 2174 | learn_model_1d( 2175 | 48, 2176 | 16, 2177 | lambda z: nn.Sequential( 2178 | nn.Linear(48, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2179 | ), 2180 | ndl.optim.Adam, 2181 | lr=0.001, 2182 | weight_decay=0.001, 2183 | epochs=3, 2184 | ) 2185 | ) 2186 | mugrade.submit( 2187 | learn_model_1d_eval( 2188 | 48, 2189 | 16, 2190 | lambda z: nn.Sequential( 2191 | nn.Linear(48, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2192 | ), 2193 | ndl.optim.Adam, 2194 | lr=0.001, 2195 | weight_decay=0.001, 2196 | epochs=2, 2197 | ) 2198 | ) 2199 | mugrade.submit( 2200 | learn_model_1d( 2201 | 48, 2202 | 16, 2203 | lambda z: nn.Sequential( 2204 | nn.Linear(48, 32), nn.ReLU(), nn.LayerNorm1d(32), nn.Linear(32, 16) 2205 | ), 2206 | ndl.optim.Adam, 2207 | lr=0.01, 2208 | weight_decay=0.01, 2209 | epochs=2, 2210 | ) 2211 | ) 2212 | mugrade.submit( 2213 | learn_model_1d( 2214 | 48, 2215 | 16, 2216 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)), 2217 | ndl.optim.Adam, 2218 | lr=0.001, 2219 | weight_decay=0.01, 2220 | epochs=2, 2221 | ) 2222 | ) 2223 | 2224 | 2225 | def test_mlp_residual_block_num_params_1(): 2226 | np.testing.assert_allclose( 2227 | residual_block_num_params(15, 2, nn.BatchNorm1d), 2228 | np.array(111), 2229 | rtol=1e-5, 2230 | atol=1e-5, 2231 | ) 2232 | 2233 | 2234 | def test_mlp_residual_block_num_params_2(): 2235 | np.testing.assert_allclose( 2236 | residual_block_num_params(784, 100, nn.LayerNorm1d), 2237 | np.array(159452), 2238 | rtol=1e-5, 2239 | atol=1e-5, 2240 | ) 2241 | 2242 | 2243 | def test_mlp_residual_block_forward_1(): 2244 | np.testing.assert_allclose( 2245 | residual_block_forward(15, 10, nn.LayerNorm1d, 0.5), 2246 | np.array( 2247 | [ 2248 | [ 2249 | 0.0, 2250 | 1.358399, 2251 | 0.0, 2252 | 1.384224, 2253 | 0.0, 2254 | 0.0, 2255 | 0.255451, 2256 | 0.077662, 2257 | 0.0, 2258 | 0.939582, 2259 | 0.525591, 2260 | 1.99213, 2261 | 0.0, 2262 | 0.0, 2263 | 1.012827, 2264 | ] 2265 | ], 2266 | dtype=np.float32, 2267 | ), 2268 | rtol=1e-5, 2269 | atol=1e-5, 2270 | ) 2271 | 2272 | 2273 | def test_mlp_resnet_num_params_1(): 2274 | np.testing.assert_allclose( 2275 | mlp_resnet_num_params(150, 100, 5, 10, nn.LayerNorm1d), 2276 | np.array(68360), 2277 | rtol=1e-5, 2278 | atol=1e-5, 2279 | ) 2280 | 2281 | 2282 | def 
test_mlp_resnet_num_params_2(): 2283 | np.testing.assert_allclose( 2284 | mlp_resnet_num_params(10, 100, 1, 100, nn.BatchNorm1d), 2285 | np.array(21650), 2286 | rtol=1e-5, 2287 | atol=1e-5, 2288 | ) 2289 | 2290 | 2291 | def test_mlp_resnet_forward_1(): 2292 | np.testing.assert_allclose( 2293 | mlp_resnet_forward(10, 5, 2, 5, nn.LayerNorm1d, 0.5), 2294 | np.array( 2295 | [ 2296 | [3.046162, 1.44972, -1.921363, 0.021816, -0.433953], 2297 | [3.489114, 1.820994, -2.111306, 0.226388, -1.029428], 2298 | ], 2299 | dtype=np.float32, 2300 | ), 2301 | rtol=1e-5, 2302 | atol=1e-5, 2303 | ) 2304 | 2305 | 2306 | def test_mlp_resnet_forward_2(): 2307 | np.testing.assert_allclose( 2308 | mlp_resnet_forward(15, 25, 5, 14, nn.BatchNorm1d, 0.0), 2309 | np.array( 2310 | [ 2311 | [ 2312 | 0.92448235, 2313 | -2.745743, 2314 | -1.5077105, 2315 | 1.130784, 2316 | -1.2078242, 2317 | -0.09833566, 2318 | -0.69301605, 2319 | 2.8945382, 2320 | 1.259397, 2321 | 0.13866742, 2322 | -2.963875, 2323 | -4.8566914, 2324 | 1.7062538, 2325 | -4.846424, 2326 | ], 2327 | [ 2328 | 0.6653336, 2329 | -2.4708004, 2330 | 2.0572243, 2331 | -1.0791507, 2332 | 4.3489094, 2333 | 3.1086435, 2334 | 0.0304327, 2335 | -1.9227124, 2336 | -1.416201, 2337 | -7.2151937, 2338 | -1.4858506, 2339 | 7.1039696, 2340 | -2.1589825, 2341 | -0.7593413, 2342 | ], 2343 | ], 2344 | dtype=np.float32, 2345 | ), 2346 | rtol=1e-5, 2347 | atol=1e-5, 2348 | ) 2349 | 2350 | 2351 | def test_mlp_train_epoch_1(): 2352 | np.testing.assert_allclose( 2353 | train_epoch_1(5, 250, ndl.optim.Adam, lr=0.01, weight_decay=0.1), 2354 | np.array([0.675267, 1.84043]), 2355 | rtol=0.0001, 2356 | atol=0.0001, 2357 | ) 2358 | 2359 | 2360 | def test_mlp_eval_epoch_1(): 2361 | np.testing.assert_allclose( 2362 | eval_epoch_1(10, 150), np.array([0.9164, 4.137814]), rtol=1e-5, atol=1e-5 2363 | ) 2364 | 2365 | 2366 | def test_mlp_train_mnist_1(): 2367 | np.testing.assert_allclose( 2368 | train_mnist_1(250, 2, ndl.optim.SGD, 0.001, 0.01, 100), 2369 | np.array([0.4875, 1.462595, 0.3245, 1.049429]), 2370 | rtol=0.001, 2371 | atol=0.001, 2372 | ) 2373 | 2374 | 2375 | def submit_mlp_resnet(): 2376 | mugrade.submit(residual_block_num_params(17, 13, nn.BatchNorm1d)) 2377 | mugrade.submit(residual_block_num_params(785, 101, nn.LayerNorm1d)) 2378 | mugrade.submit(residual_block_forward(15, 5, nn.LayerNorm1d, 0.3)) 2379 | mugrade.submit(mlp_resnet_num_params(75, 75, 3, 3, nn.LayerNorm1d)) 2380 | mugrade.submit(mlp_resnet_num_params(15, 10, 10, 5, nn.BatchNorm1d)) 2381 | mugrade.submit(mlp_resnet_forward(12, 7, 1, 6, nn.LayerNorm1d, 0.8)) 2382 | mugrade.submit(mlp_resnet_forward(15, 3, 2, 15, nn.BatchNorm1d, 0.3)) 2383 | mugrade.submit(train_epoch_1(7, 256, ndl.optim.Adam, lr=0.01, weight_decay=0.01)) 2384 | mugrade.submit(eval_epoch_1(12, 154)) 2385 | mugrade.submit(train_mnist_1(550, 1, ndl.optim.SGD, 0.01, 0.01, 7)) 2386 | --------------------------------------------------------------------------------
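
The two *_z_memory_check tests above only pass if optimizer updates stop extending the
autograd graph, which is what the .detach()/.data comments in the file are pointing at.
The sketch below illustrates that pattern for SGD. It assumes the Tensor.data /
Tensor.detach() API from Homework 1's autograd.py and uses one common form of momentum
(u <- momentum * u + (1 - momentum) * grad), so treat it as an illustration of the
pattern rather than the graded implementation.

    def sgd_step_sketch(params, u, lr=0.01, momentum=0.0, weight_decay=0.0):
        # One SGD update performed entirely on detached values (.data), so the call
        # does not add nodes to the computational graph and the global tensor count
        # stays roughly constant across steps.
        for i, param in enumerate(params):
            if param.grad is None:
                continue
            grad = param.grad.data + param.data * weight_decay   # L2 weight decay
            if i in u:
                u[i] = u[i] * momentum + grad * (1 - momentum)
            else:
                u[i] = grad * (1 - momentum)
            param.data = param.data - u[i] * lr

Here u is a dict of per-parameter momentum buffers owned by the optimizer; because the
buffers are themselves built from detached values, no references to old graph nodes are
carried between iterations.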
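
test_nn_batchnorm_check_model_eval_switches_training_flag_1 together with the
running-mean/var tests pins down the mode-dependent behaviour of BatchNorm1d: batch
statistics (plus a momentum-weighted update of running_mean / running_var) while the
module is in training mode, and the stored running statistics after model.eval(). The
NumPy fragment below is only meant to make that contract concrete; the momentum of 0.1,
the eps of 1e-5, and the use of the biased batch variance are the usual conventions and
are assumptions here, not values asserted by this test file.

    import numpy as np

    def batchnorm1d_reference(x, weight, bias, running_mean, running_var,
                              training, momentum=0.1, eps=1e-5):
        # x: (batch, features); weight, bias, running_*: (features,)
        if training:
            mean = x.mean(axis=0)
            var = x.var(axis=0)                    # biased variance over the batch
            # momentum-weighted in-place update of the running statistics
            running_mean[:] = (1 - momentum) * running_mean + momentum * mean
            running_var[:] = (1 - momentum) * running_var + momentum * var
        else:
            mean, var = running_mean, running_var  # eval mode: use stored statistics
        return weight * (x - mean) / np.sqrt(var + eps) + bias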
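
Finally, the MLP-ResNet tests constrain the architecture only through parameter counts
and forward values. As a reading aid, here is a minimal sketch of one structure that is
consistent with the parameter counts (for instance, residual_block_num_params(15, 2,
nn.BatchNorm1d) == 111 matches two Linear layers plus two norm layers inside the
residual branch, and the MLPResNet counts match blocks built with hidden_dim // 2).
Module names follow python/needle/nn/nn_basic.py; the exact ordering of Dropout and the
norm layers is an assumption to be checked against figures/residualblock.png and
figures/mlp_resnet.png, so do not read this as the reference solution.

    import needle.nn as nn

    def residual_block_sketch(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1):
        # Linear -> norm -> ReLU -> Dropout -> Linear -> norm inside a Residual
        # wrapper, followed by a ReLU applied to the sum.
        return nn.Sequential(
            nn.Residual(
                nn.Sequential(
                    nn.Linear(dim, hidden_dim),
                    norm(hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(drop_prob),
                    nn.Linear(hidden_dim, dim),
                    norm(dim),
                )
            ),
            nn.ReLU(),
        )

    def mlp_resnet_sketch(dim, hidden_dim=100, num_blocks=3, num_classes=10,
                          norm=nn.BatchNorm1d, drop_prob=0.1):
        # Linear/ReLU stem, num_blocks residual blocks that bottleneck to
        # hidden_dim // 2, and a final classification layer.
        return nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.ReLU(),
            *[residual_block_sketch(hidden_dim, hidden_dim // 2, norm, drop_prob)
              for _ in range(num_blocks)],
            nn.Linear(hidden_dim, num_classes),
        )

A quick way to cross-check such a sketch against the counting tests is
sum(np.prod(p.shape) for p in model.parameters()) (with numpy imported as np), using
the parameters() method provided by nn.Module.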