├── README.md
├── apps
│   └── mlp_resnet.py
├── data
│   ├── t10k-images-idx3-ubyte.gz
│   ├── t10k-labels-idx1-ubyte.gz
│   ├── train-images-idx3-ubyte.gz
│   └── train-labels-idx1-ubyte.gz
├── figures
│   ├── mlp_resnet.png
│   └── residualblock.png
├── hw2.ipynb
├── python
│   └── needle
│       ├── __init__.py
│       ├── autograd.py
│       ├── backend_numpy.py
│       ├── data
│       │   ├── __init__.py
│       │   ├── data_basic.py
│       │   ├── data_transforms.py
│       │   └── datasets
│       │       ├── __init__.py
│       │       ├── mnist_dataset.py
│       │       └── ndarray_dataset.py
│       ├── init
│       │   ├── __init__.py
│       │   ├── init_basic.py
│       │   └── init_initializers.py
│       ├── nn
│       │   ├── __init__.py
│       │   └── nn_basic.py
│       ├── ops
│       │   ├── __init__.py
│       │   ├── ops_logarithmic.py
│       │   ├── ops_mathematic.py
│       │   └── ops_tuple.py
│       └── optim.py
└── tests
    └── hw2
        ├── test_data.py
        └── test_nn_and_optim.py
/README.md:
--------------------------------------------------------------------------------
1 | # Homework 2
2 |
3 | Public repository and stub/testing code for Homework 2 of 10-714.
4 |
5 |
--------------------------------------------------------------------------------
/apps/mlp_resnet.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append("../python")
4 | import needle as ndl
5 | import needle.nn as nn
6 | import numpy as np
7 | import time
8 | import os
9 |
10 | np.random.seed(0)
11 | # MY_DEVICE = ndl.backend_selection.cuda()
12 |
13 |
14 | def ResidualBlock(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1):
15 | ### BEGIN YOUR SOLUTION
16 | raise NotImplementedError()
17 | ### END YOUR SOLUTION
18 |
19 |
20 | def MLPResNet(
21 | dim,
22 | hidden_dim=100,
23 | num_blocks=3,
24 | num_classes=10,
25 | norm=nn.BatchNorm1d,
26 | drop_prob=0.1,
27 | ):
28 | ### BEGIN YOUR SOLUTION
29 | raise NotImplementedError()
30 | ### END YOUR SOLUTION
31 |
32 |
33 | def epoch(dataloader, model, opt=None):
34 | np.random.seed(4)
35 | ### BEGIN YOUR SOLUTION
36 | raise NotImplementedError()
37 | ### END YOUR SOLUTION
38 |
39 |
40 | def train_mnist(
41 | batch_size=100,
42 | epochs=10,
43 | optimizer=ndl.optim.Adam,
44 | lr=0.001,
45 | weight_decay=0.001,
46 | hidden_dim=100,
47 | data_dir="data",
48 | ):
49 | np.random.seed(4)
50 | ### BEGIN YOUR SOLUTION
51 | raise NotImplementedError()
52 | ### END YOUR SOLUTION
53 |
54 |
55 | if __name__ == "__main__":
56 | train_mnist(data_dir="../data")
57 |
--------------------------------------------------------------------------------
/data/t10k-images-idx3-ubyte.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/t10k-images-idx3-ubyte.gz
--------------------------------------------------------------------------------
/data/t10k-labels-idx1-ubyte.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/t10k-labels-idx1-ubyte.gz
--------------------------------------------------------------------------------
/data/train-images-idx3-ubyte.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/train-images-idx3-ubyte.gz
--------------------------------------------------------------------------------
/data/train-labels-idx1-ubyte.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/train-labels-idx1-ubyte.gz
--------------------------------------------------------------------------------
/figures/mlp_resnet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/figures/mlp_resnet.png
--------------------------------------------------------------------------------
/figures/residualblock.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/figures/residualblock.png
--------------------------------------------------------------------------------
/hw2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 10-714 Homework 2\n",
8 | "\n",
9 | "In this homework, you will be implementing a neural network library in the needle framework. Reminder: __you must save a copy in drive__."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# Code to set up the assignment\n",
19 | "from google.colab import drive\n",
20 | "drive.mount('/content/drive')\n",
21 | "%cd /content/drive/MyDrive/\n",
22 | "!mkdir -p 10714\n",
23 | "%cd /content/drive/MyDrive/10714\n",
24 | "!git clone https://github.com/dlsys10714/hw2.git\n",
25 | "%cd /content/drive/MyDrive/10714/hw2\n",
26 | "\n",
27 | "!pip3 install --upgrade --no-deps git+https://github.com/dlsys10714/mugrade.git"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "## Question 0\n",
35 | "\n",
36 | "This homework builds off of Homework 1. First, in your Homework 2 directory, copy the files `python/needle/autograd.py`, `python/needle/ops/ops_mathematic.py` from your Homework 1.\n",
37 | "\n",
38 | "***NOTE***: The default data type for the tensor is `float32`. If you want to change the data type, you can do so by setting the `dtype` parameter in the `Tensor` constructor. For example, `Tensor([1, 2, 3], dtype='float64')` will create a tensor with `float64` data type. \n",
39 | "In this homework, **make sure any tensor you create has `float32` data type to avoid any issues with the autograder**."
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "import sys\n",
49 | "sys.path.append('./python')\n",
50 | "sys.path.append('./apps')"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {
56 | "tags": []
57 | },
58 | "source": [
59 | "## Question 1\n",
60 | "\n",
61 | "In this first question, you will implement a few different methods for weight initialization. This will be done in the `python/needle/init/init_initializers.py` file, which contains a number of routines for initializing needle Tensors using various random and constant initializations. Following the same methodology of the existing initializers (you will want to call e.g. `init.rand` or `init.randn` implemented in `python/needle/init/init_basic.py` from your functions below, implement the following common initialization methods. In all cases, the functions should return `fan_in` by `fan_out` 2D tensors (extensions to other sizes can be done via e.g., reshaping).\n",
62 | "\n",
63 | "\n",
64 | "### Xavier uniform\n",
65 | "`xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs)`\n",
66 | "\n",
67 | "Fills the input Tensor with values according to the method described in [Understanding the difficulty of training deep feedforward neural networks](https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf), using a uniform distribution. The resulting Tensor will have values sampled from $\\mathcal{U}(-a, a)$ where \n",
68 | "\\begin{equation}\n",
69 | "a = \\text{gain} \\times \\sqrt{\\frac{6}{\\text{fan_in} + \\text{fan_out}}}\n",
70 | "\\end{equation}\n",
71 | "\n",
72 | "Pass remaining `**kwargs` parameters to the corresponding `init` random call.\n",
73 | "\n",
74 | "##### Parameters\n",
75 | "- `fan_in` - dimensionality of input\n",
76 | "- `fan_out` - dimensionality of output\n",
77 | "- `gain` - optional scaling factor\n",
78 | "___\n",
79 | "\n",
80 | "### Xavier normal\n",
81 | "`xavier_normal(fan_in, fan_out, gain=1.0, **kwargs)`\n",
82 | "\n",
83 | "Fills the input Tensor with values according to the method described in [Understanding the difficulty of training deep feedforward neural networks](https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf), using a normal distribution. The resulting Tensor will have values sampled from $\\mathcal{N}(0, \\text{std}^2)$ where \n",
84 | "\\begin{equation}\n",
85 | "\\text{std} = \\text{gain} \\times \\sqrt{\\frac{2}{\\text{fan_in} + \\text{fan_out}}}\n",
86 | "\\end{equation}\n",
87 | "\n",
88 | "##### Parameters\n",
89 | "- `fan_in` - dimensionality of input\n",
90 | "- `fan_out` - dimensionality of output\n",
91 | "- `gain` - optional scaling factor\n",
92 | "___\n",
93 | "\n",
94 | "### Kaiming uniform\n",
95 | "`kaiming_uniform(fan_in, fan_out, nonlinearity=\"relu\", **kwargs)`\n",
96 | "\n",
97 | "Fills the input Tensor with values according to the method described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf), using a uniform distribution. The resulting Tensor will have values sampled from $\\mathcal{U}(-\\text{bound}, \\text{bound})$ where \n",
98 | "\\begin{equation}\n",
99 | "\\text{bound} = \\text{gain} \\times \\sqrt{\\frac{3}{\\text{fan_in}}}\n",
100 | "\\end{equation}\n",
101 | "\n",
102 | "Use the recommended gain value for ReLU: $\\text{gain}=\\sqrt{2}$.\n",
103 | "\n",
104 | "##### Parameters\n",
105 | "- `fan_in` - dimensionality of input\n",
106 | "- `fan_out` - dimensionality of output\n",
107 | "- `nonlinearity` - the non-linear function\n",
108 | "___\n",
109 | "\n",
110 | "### Kaiming normal\n",
111 | "`kaiming_normal(fan_in, fan_out, nonlinearity=\"relu\", **kwargs)`\n",
112 | "\n",
113 | "Fills the input Tensor with values according to the method described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf), using a uniform distribution. The resulting Tensor will have values sampled from $\\mathcal{N}(0, \\text{std}^2)$ where \n",
114 | "\\begin{equation}\n",
115 | "\\text{std} = \\frac{\\text{gain}}{\\sqrt{\\text{fan_in}}}\n",
116 | "\\end{equation}\n",
117 | "\n",
118 | "Use the recommended gain value for ReLU: $\\text{gain}=\\sqrt{2}$.\n",
119 | "\n",
120 | "##### Parameters\n",
121 | "- `fan_in` - dimensionality of input\n",
122 | "- `fan_out` - dimensionality of output\n",
123 | "- `nonlinearity` - the non-linear function"
124 | ]
125 | },
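{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "For concreteness, here is a rough sketch (not the reference solution) of how `xavier_uniform` and `kaiming_uniform` could be written in terms of the `rand` helper from `init_basic.py`; forwarding `device`/`dtype`/`requires_grad` through `**kwargs` is an assumption about that helper's signature.\n",
  "\n",
  "```python\n",
  "import math\n",
  "\n",
  "def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs):\n",
  "    # a = gain * sqrt(6 / (fan_in + fan_out)); sample from U(-a, a)\n",
  "    a = gain * math.sqrt(6.0 / (fan_in + fan_out))\n",
  "    return rand(fan_in, fan_out, low=-a, high=a, **kwargs)\n",
  "\n",
  "def kaiming_uniform(fan_in, fan_out, nonlinearity='relu', **kwargs):\n",
  "    # recommended gain for ReLU is sqrt(2); bound = gain * sqrt(3 / fan_in)\n",
  "    bound = math.sqrt(2.0) * math.sqrt(3.0 / fan_in)\n",
  "    return rand(fan_in, fan_out, low=-bound, high=bound, **kwargs)\n",
  "```"
 ]
},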
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "!python3 -m pytest -v -k \"test_init\""
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"init\" -s"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "## Question 2\n",
149 | "\n",
150 | "In this question, you will implement additional modules in `python/needle/nn/nn_basic.py`. Specifically, for the following modules described below, initialize any variables of the module in the constructor, and fill out the `forward` method. **Note:** Be sure that you are using the `init` functions that you just implemented to initialize the parameters, and don't forget to pass the `dtype` argument.\n",
151 | "___\n",
152 | "\n",
153 | "### Linear\n",
154 | "`needle.nn.Linear(in_features, out_features, bias=True, device=None, dtype=\"float32\")`\n",
155 | "\n",
156 | "Applies a linear transformation to the incoming data: $y = xA^T + b$. The input shape is $(N, H_{in})$ where $H_{in}=\\text{in_features}$. The output shape is $(N, H_{out})$ where $H_{out}=\\text{out_features}$.\n",
157 | "\n",
158 | "**Be careful to explicitly broadcast the bias term to the correct shape -- Needle does not support implicit broadcasting.**\n",
159 | "\n",
160 | "**Note: for all layers including this one, you should initialize the weight Tensor before the bias Tensor, and should initialize all Parameters using only functions from `init`**. This does not affect the algorithm's correctness. It is only necessary to ensure the value matches the expected results in the mugrade tests for this assignment's implementation scope. \n",
161 | "\n",
162 | "##### Parameters\n",
163 | "- `in_features` - size of each input sample\n",
164 | "- `out_features` - size of each output sample\n",
165 | "- `bias` - If set to `False`, the layer will not learn an additive bias.\n",
166 | "\n",
167 | "##### Variables\n",
168 | "- `weight` - the learnable weights of shape (`in_features`, `out_features`). The values should be initialized with the Kaiming Uniform initialization with `fan_in = in_features`\n",
169 | "- `bias` - the learnable bias of shape (`out_features`). The values should be initialized with the Kaiming Uniform initialize with `fan_in = out_features`. **Note the difference in fan_in choice, due to their relative sizes**. \n",
170 | "\n",
171 | "Make sure to enclose all necessary variables e.g. (`weight`, `bias`) in the `Parameter` class so that they are visible to the optimizers which would be implemented next."
172 | ]
173 | },
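{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A minimal sketch of one possible `Linear` implementation, assuming the `kaiming_uniform` initializer from Question 1; the bias is kept as a `(1, out_features)` Tensor and explicitly broadcast in `forward`.\n",
  "\n",
  "```python\n",
  "class Linear(Module):\n",
  "    def __init__(self, in_features, out_features, bias=True, device=None, dtype='float32'):\n",
  "        super().__init__()\n",
  "        self.in_features, self.out_features = in_features, out_features\n",
  "        # weight is created before the bias, both through init functions\n",
  "        self.weight = Parameter(init.kaiming_uniform(\n",
  "            in_features, out_features, device=device, dtype=dtype))\n",
  "        self.bias = None\n",
  "        if bias:\n",
  "            self.bias = Parameter(init.kaiming_uniform(\n",
  "                out_features, 1, device=device, dtype=dtype).reshape((1, out_features)))\n",
  "\n",
  "    def forward(self, X: Tensor) -> Tensor:\n",
  "        out = X @ self.weight\n",
  "        if self.bias is not None:\n",
  "            out = out + self.bias.broadcast_to(out.shape)\n",
  "        return out\n",
  "```"
 ]
},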
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "!python3 -m pytest -v -k \"test_nn_linear\""
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_linear\""
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {},
195 | "source": [
196 | "### ReLU\n",
197 | "`needle.nn.ReLU()`\n",
198 | "\n",
199 | "Applies the rectified linear unit function element-wise:\n",
200 | "$ReLU(x) = max(0, x)$.\n",
201 | "\n",
202 | "If you have previously implemented ReLU's backwards pass in terms of itself, note that this is numerically unstable and will likely cause problems\n",
203 | "down the line.\n",
204 | "Instead, consider that we could write the derivative of ReLU as $I\\{x>0\\}$, where we arbitrarily decide that the derivative at $x=0$ is 0.\n",
205 | "(This is a _subdifferentiable_ function.)\n",
206 | "\n",
207 | "___"
208 | ]
209 | },
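{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "As a sketch of the indicator-based derivative (this lives in the `ReLU` op from `ops_mathematic.py`, not in the module itself), the mask can be computed from the op's cached input rather than by calling `relu` again:\n",
  "\n",
  "```python\n",
  "class ReLU(TensorOp):\n",
  "    def compute(self, a):\n",
  "        return array_api.maximum(a, 0)\n",
  "\n",
  "    def gradient(self, out_grad, node):\n",
  "        # derivative is 1 where the input was strictly positive, 0 elsewhere (including at x = 0)\n",
  "        a = node.inputs[0].realize_cached_data()\n",
  "        mask = Tensor(a > 0, device=out_grad.device, dtype=out_grad.dtype)\n",
  "        return out_grad * mask\n",
  "```"
 ]
},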
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "!python3 -m pytest -v -k \"test_nn_relu\""
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_relu\""
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {
231 | "tags": []
232 | },
233 | "source": [
234 | "### Sequential\n",
235 | "`needle.nn.Sequential(*modules)`\n",
236 | "\n",
237 | "Applies a sequence of modules to the input (in the order that they were passed to the constructor) and returns the output of the last module.\n",
238 | "These should be kept in a `.module` property: you should _not_ redefine any magic methods like `__getitem__`, as this may not be compatible with our tests.\n",
239 | "\n",
240 | "##### Parameters\n",
241 | "- `*modules` - any number of modules of type `needle.nn.Module`\n",
242 | "\n",
243 | "___"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "!python3 -m pytest -v -k \"test_nn_sequential\""
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_sequential\""
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {
267 | "tags": []
268 | },
269 | "source": [
270 | "### LogSumExp\n",
271 | "\n",
272 | "`needle.ops.LogSumExp(axes)`\n",
273 | "\n",
274 | "Applies a numerically stable log-sum-exp function to the input by subtracting off the maximum elements. You will need to implement this and the next operation in file `python/needle/ops/ops_logarithmic.py`.\n",
275 | "\n",
276 | "\\begin{equation}\n",
277 | "\\text{LogSumExp}(z) = \\log (\\sum_{i} \\exp (z_i - \\max{z})) + \\max{z}\n",
278 | "\\end{equation}\n",
279 | "\n",
280 | "#### Parameters\n",
281 | "- `axes` - Tuple of axes to sum and take the maximum element over. This uses the same conventions as `needle.ops.Summation()`\n",
282 | "\n",
283 | "___"
284 | ]
285 | },
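{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A numpy-level sketch of the stable forward computation (the backward pass is omitted); the max is taken once with `keepdims=True` so it broadcasts against `Z`, and once reduced so it can be added back with the result's shape. `array_api` is numpy in this homework.\n",
  "\n",
  "```python\n",
  "def stable_logsumexp(Z, axes=None):\n",
  "    # subtract the max before exponentiating to avoid overflow, then add it back\n",
  "    max_keep = array_api.max(Z, axis=axes, keepdims=True)\n",
  "    max_reduce = array_api.max(Z, axis=axes)\n",
  "    return array_api.log(array_api.sum(array_api.exp(Z - max_keep), axis=axes)) + max_reduce\n",
  "```"
 ]
},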
286 | {
287 | "cell_type": "code",
288 | "execution_count": null,
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "!python3 -m pytest -v -k \"test_op_logsumexp\""
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"op_logsumexp\""
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 | "### LogSoftmax\n",
309 | "\n",
310 | "`needle.ops.LogSoftmax(axes)`\n",
311 | "\n",
312 | "Applies a numerically stable logsoftmax function to the input by subtracting off the maximum elements. Assume the input NDArray is 2 dimensional and we are doing softmax over `axis=1`.\n",
313 | "\n",
314 | "\\begin{equation}\n",
315 | "\\text{LogSoftmax}(z) = \\log \\left(\\frac{\\exp(z_i - \\max z)}{\\sum_{i}\\exp(z_i - \\max z)}\\right) = z - \\text{LogSumExp}(z)\n",
316 | "\\end{equation}\n",
317 | "___"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "!python3 -m pytest -v -k \"test_op_logsoftmax\""
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"op_logsoftmax\""
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {
341 | "tags": []
342 | },
343 | "source": [
344 | "### SoftmaxLoss\n",
345 | "\n",
346 | "`needle.nn.SoftmaxLoss()`\n",
347 | "\n",
348 | "Applies the softmax loss as defined below (and as implemented in Homework 1), taking in as input a Tensor of logits and a Tensor of the true labels (expressed as a list of numbers, *not* one-hot encoded).\n",
349 | "\n",
350 | "Note that you can use the `init.one_hot` function now instead of writing this yourself. Note: You will need to use the numerically stable logsumexp operator you just implemented for this purpose.\n",
351 | "\n",
352 | "\\begin{equation}\n",
353 | "\\ell_\\text{softmax}(z,y) = \\log \\sum_{i=1}^k \\exp z_i - z_y\n",
354 | "\\end{equation}\n",
355 | "\n",
356 | "___"
357 | ]
358 | },
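{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A sketch of the `forward` computation, assuming `init.one_hot(k, y, ...)` returns a one-hot Tensor and that functional `logsumexp` / `summation` helpers are exposed under `needle.ops` (those names are assumptions); the loss is averaged over the batch.\n",
  "\n",
  "```python\n",
  "class SoftmaxLoss(Module):\n",
  "    def forward(self, logits: Tensor, y: Tensor) -> Tensor:\n",
  "        n, k = logits.shape\n",
  "        y_one_hot = init.one_hot(k, y, device=logits.device, dtype=logits.dtype)\n",
  "        # log-sum-exp over the classes minus the logit of the true class, averaged over the batch\n",
  "        lse = ops.logsumexp(logits, axes=(1,))\n",
  "        z_y = ops.summation(logits * y_one_hot, axes=(1,))\n",
  "        return ops.summation(lse - z_y) / n\n",
  "```"
 ]
},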
359 | {
360 | "cell_type": "code",
361 | "execution_count": null,
362 | "metadata": {},
363 | "outputs": [],
364 | "source": [
365 | "!python3 -m pytest -v -k \"test_nn_softmax_loss\""
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {},
372 | "outputs": [],
373 | "source": [
374 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_softmax_loss\""
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {
380 | "tags": []
381 | },
382 | "source": [
383 | "### LayerNorm1d\n",
384 | "`needle.nn.LayerNorm1d(dim, eps=1e-5, device=None, dtype=\"float32\")`\n",
385 | "\n",
386 | "Applies layer normalization over a mini-batch of inputs as described in the paper [Layer Normalization](https://arxiv.org/abs/1607.06450).\n",
387 | "\n",
388 | "\\begin{equation}\n",
389 | "y = w \\circ \\frac{x_i - \\textbf{E}[x]}{((\\textbf{Var}[x]+\\epsilon)^{1/2})} + b\n",
390 | "\\end{equation}\n",
391 | "\n",
392 | "where $\\textbf{E}[x]$ denotes the empirical mean of the inputs, $\\textbf{Var}[x]$ denotes their empirical variance (note that here we are using the \"biased\" estimate of the variance, i.e., dividing by $N$ rather than by $N-1$), and $w$ and $b$ denote learnable scalar weights and biases respectively. Note you can assume the input to this layer is a 2D tensor, with batches in the first dimension and features in the second. You might need to broadcast the weight and bias before applying them.\n",
393 | "\n",
394 | "##### Parameters\n",
395 | "- `dim` - number of channels\n",
396 | "- `eps` - a value added to the denominator for numerical stability.\n",
397 | "\n",
398 | "##### Variables\n",
399 | "- `weight` - the learnable weights of size `dim`, elements initialized to 1.\n",
400 | "- `bias` - the learnable bias of shape `dim`, elements initialized to 0.\n",
401 | "___"
402 | ]
403 | },
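{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A sketch of one possible `LayerNorm1d`, for a 2-D input of shape `(batch, dim)`; every reduced statistic is reshaped to a column and explicitly broadcast, since needle has no implicit broadcasting.\n",
  "\n",
  "```python\n",
  "class LayerNorm1d(Module):\n",
  "    def __init__(self, dim, eps=1e-5, device=None, dtype='float32'):\n",
  "        super().__init__()\n",
  "        self.dim, self.eps = dim, eps\n",
  "        self.weight = Parameter(init.ones(dim, device=device, dtype=dtype))\n",
  "        self.bias = Parameter(init.zeros(dim, device=device, dtype=dtype))\n",
  "\n",
  "    def forward(self, x: Tensor) -> Tensor:\n",
  "        n, d = x.shape\n",
  "        # per-example mean and biased variance, broadcast back to (n, d)\n",
  "        mean = (x.sum(axes=(1,)) / d).reshape((n, 1)).broadcast_to(x.shape)\n",
  "        var = (((x - mean) ** 2).sum(axes=(1,)) / d).reshape((n, 1)).broadcast_to(x.shape)\n",
  "        norm = (x - mean) / ((var + self.eps) ** 0.5)\n",
  "        w = self.weight.reshape((1, d)).broadcast_to(x.shape)\n",
  "        b = self.bias.reshape((1, d)).broadcast_to(x.shape)\n",
  "        return w * norm + b\n",
  "```"
 ]
},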
404 | {
405 | "cell_type": "code",
406 | "execution_count": null,
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "!python3 -m pytest -v -k \"test_nn_layernorm\""
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {},
417 | "outputs": [],
418 | "source": [
419 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_layernorm\""
420 | ]
421 | },
422 | {
423 | "cell_type": "markdown",
424 | "metadata": {},
425 | "source": [
426 | "\n",
427 | "### Flatten\n",
428 | "`needle.nn.Flatten()`\n",
429 | "\n",
430 | "Takes in a tensor of shape `(B,X_0,X_1,...)`, and flattens all non-batch dimensions so that the output is of shape `(B, X_0 * X_1 * ...)`"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": null,
436 | "metadata": {},
437 | "outputs": [],
438 | "source": [
439 | "!python3 -m pytest -v -k \"test_nn_flatten\""
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": null,
445 | "metadata": {},
446 | "outputs": [],
447 | "source": [
448 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_flatten\""
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {},
454 | "source": [
455 | "### BatchNorm1d\n",
456 | "`needle.nn.BatchNorm1d(dim, eps=1e-5, momentum=0.1, device=None, dtype=\"float32\")`\n",
457 | "\n",
458 | "Applies batch normalization over a mini-batch of inputs as described in the paper [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167).\n",
459 | "\n",
460 | "\\begin{equation}\n",
461 | "y = w \\circ \\frac{z_i - \\textbf{E}[x]}{((\\textbf{Var}[x]+\\epsilon)^{1/2})} + b\n",
462 | "\\end{equation}\n",
463 | "\n",
464 | "but where here the mean and variance refer to to the mean and variance over the _batch_dimensions. The function also computes a running average of mean/variance for all features at each layer $\\hat{\\mu}, \\hat{\\sigma}^2$, and at test time normalizes by these quantities:\n",
465 | "\n",
466 | "\\begin{equation}\n",
467 | "y = \\frac{(x - \\hat{mu})}{((\\hat{\\sigma}^2_{i+1})_j+\\epsilon)^{1/2}}\n",
468 | "\\end{equation}\n",
469 | "\n",
470 | "\n",
471 | "BatchNorm uses the running estimates of mean and variance instead of batch statistics at test time, i.e.,\n",
472 | "after `model.eval()` has been called on the BatchNorm layer's `training` flag is false.\n",
473 | "\n",
474 | "To compute the running estimates, you can use the equation $$\\hat{x_{new}} = (1 - m) \\hat{x_{old}} + mx_{observed},$$\n",
475 | "where $m$ is momentum.\n",
476 | "\n",
477 | "##### Parameters\n",
478 | "- `dim` - input dimension\n",
479 | "- `eps` - a value added to the denominator for numerical stability.\n",
480 | "- `momentum` - the value used for the running mean and running variance computation.\n",
481 | "\n",
482 | "##### Variables\n",
483 | "- `weight` - the learnable weights of size `dim`, elements initialized to 1.\n",
484 | "- `bias` - the learnable bias of size `dim`, elements initialized to 0.\n",
485 | "- `running_mean` - the running mean used at evaluation time, elements initialized to 0.\n",
486 | "- `running_var` - the running (unbiased) variance used at evaluation time, elements initialized to 1. \n",
487 | "\n",
488 | "___"
489 | ]
490 | },
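{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A sketch of the `forward` logic; the running statistics are updated through `.data` so they stay out of the autograd graph, and at evaluation time the learned scale and shift are still applied (the usual BatchNorm convention).\n",
  "\n",
  "```python\n",
  "def forward(self, x: Tensor) -> Tensor:\n",
  "    n, d = x.shape\n",
  "    if self.training:\n",
  "        mean = x.sum(axes=(0,)) / n\n",
  "        mean_b = mean.reshape((1, d)).broadcast_to(x.shape)\n",
  "        var = ((x - mean_b) ** 2).sum(axes=(0,)) / n\n",
  "        # running buffers are plain tensors, detached from the graph via .data\n",
  "        self.running_mean = ((1 - self.momentum) * self.running_mean + self.momentum * mean).data\n",
  "        self.running_var = ((1 - self.momentum) * self.running_var + self.momentum * var).data\n",
  "        var_b = var.reshape((1, d)).broadcast_to(x.shape)\n",
  "    else:\n",
  "        mean_b = self.running_mean.reshape((1, d)).broadcast_to(x.shape)\n",
  "        var_b = self.running_var.reshape((1, d)).broadcast_to(x.shape)\n",
  "    x_hat = (x - mean_b) / ((var_b + self.eps) ** 0.5)\n",
  "    w = self.weight.reshape((1, d)).broadcast_to(x.shape)\n",
  "    b = self.bias.reshape((1, d)).broadcast_to(x.shape)\n",
  "    return w * x_hat + b\n",
  "```"
 ]
},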
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": [
497 | "!python3 -m pytest -v -k \"test_nn_batchnorm\""
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": null,
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_batchnorm\""
507 | ]
508 | },
509 | {
510 | "cell_type": "markdown",
511 | "metadata": {},
512 | "source": [
513 | "### Dropout\n",
514 | "`needle.nn.Dropout(p = 0.5)`\n",
515 | "\n",
516 | "During training, randomly zeroes some of the elements of the input tensor with probability `p` using samples from a Bernoulli distribution. This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons as described in the paper [Improving neural networks by preventing co-adaption of feature detectors](https://arxiv.org/abs/1207.0580). During evaluation the module simply computes an identity function. \n",
517 | "\n",
518 | "\\begin{equation}\n",
519 | "\\hat{z}_{i+1} = \\sigma_i (W_i^T z_i + b_i) \\\\\n",
520 | "(z_{i+1})_j = \n",
521 | " \\begin{cases}\n",
522 | " (\\hat{z}_{i+1})_j /(1-p) & \\text{with probability } 1-p \\\\\n",
523 | " 0 & \\text{with probability } p \\\\\n",
524 | " \\end{cases}\n",
525 | "\\end{equation}\n",
526 | "\n",
527 | "**Important**: If the Dropout module the flag `training=False`, you shouldn't \"dropout\" any weights. That is, dropout applies during training only, not during evaluation. Note that `training` is a flag in `nn.Module`.\n",
528 | "\n",
529 | "##### Parameters\n",
530 | "- `p` - the probability of an element to be zeroed.\n",
531 | "\n",
532 | "___"
533 | ]
534 | },
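{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A sketch of `Dropout`, assuming a `randb` helper in `init_basic.py` that draws a 0/1 mask with a given keep probability (that helper's name and signature are an assumption here).\n",
  "\n",
  "```python\n",
  "class Dropout(Module):\n",
  "    def __init__(self, p=0.5):\n",
  "        super().__init__()\n",
  "        self.p = p\n",
  "\n",
  "    def forward(self, x: Tensor) -> Tensor:\n",
  "        if not self.training or self.p == 0.0:\n",
  "            return x\n",
  "        # keep each element with probability 1 - p, then rescale to preserve the expectation\n",
  "        mask = init.randb(*x.shape, p=1 - self.p, device=x.device, dtype=x.dtype)\n",
  "        return x * mask / (1 - self.p)\n",
  "```"
 ]
},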
535 | {
536 | "cell_type": "code",
537 | "execution_count": null,
538 | "metadata": {},
539 | "outputs": [],
540 | "source": [
541 | "!python3 -m pytest -v -k \"test_nn_dropout\""
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": null,
547 | "metadata": {},
548 | "outputs": [],
549 | "source": [
550 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_dropout\""
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {
556 | "tags": []
557 | },
558 | "source": [
559 | "### Residual\n",
560 | "`needle.nn.Residual(fn: Module)`\n",
561 | "\n",
562 | "Applies a residual or skip connection given module $\\mathcal{F}$ and input Tensor $x$, returning $\\mathcal{F}(x) + x$.\n",
563 | "##### Parameters\n",
564 | "- `fn` - module of type `needle.nn.Module`"
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {},
571 | "outputs": [],
572 | "source": [
573 | "!python3 -m pytest -v -k \"test_nn_residual\""
574 | ]
575 | },
576 | {
577 | "cell_type": "code",
578 | "execution_count": null,
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_residual\""
583 | ]
584 | },
585 | {
586 | "cell_type": "markdown",
587 | "metadata": {
588 | "tags": []
589 | },
590 | "source": [
591 | "## Question 3\n",
592 | "\n",
593 | "Implement the `step` function of the following optimizers in `python/needle/optim.py`.\n",
594 | "Make sure that your optimizers _don't_ modify the gradients of tensors in-place.\n",
595 | "\n",
596 | "We have included some tests to ensure that you are not consuming excessive memory, which can happen if you are\n",
597 | "not using `.data` or `.detach()` in the right places, thus building an increasingly large computational graph\n",
598 | "(not just in the optimizers, but in the previous modules as well).\n",
599 | "You can ignore these tests, which include the string `memory_check` at your own discretion.\n",
600 | "\n",
601 | "___\n",
602 | "\n",
603 | "### SGD\n",
604 | "`needle.optim.SGD(params, lr=0.01, momentum=0.0, weight_decay=0.0)`\n",
605 | "\n",
606 | "Implements stochastic gradient descent (optionally with momentum, shown as $\\beta$ below). \n",
607 | "\n",
608 | "\\begin{equation}\n",
609 | "\\begin{split}\n",
610 | " u_{t+1} &= \\beta u_t + (1-\\beta) \\nabla_\\theta f(\\theta_t) \\\\\n",
611 | " \\theta_{t+1} &= \\theta_t - \\alpha u_{t+1}\n",
612 | "\\end{split}\n",
613 | "\\end{equation}\n",
614 | "\n",
615 | "##### Parameters\n",
616 | "- `params` - iterable of parameters of type `needle.nn.Parameter` to optimize\n",
617 | "- `lr` (*float*) - learning rate\n",
618 | "- `momentum` (*float*) - momentum factor\n",
619 | "- `weight_decay` (*float*) - weight decay (L2 penalty)\n",
620 | "___"
621 | ]
622 | },
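{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A sketch of `SGD`, assuming the `Optimizer` base class in `optim.py` stores the parameter list as `self.params`; every update goes through `.data` so no computational graph accumulates across steps.\n",
  "\n",
  "```python\n",
  "class SGD(Optimizer):\n",
  "    def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0):\n",
  "        super().__init__(params)\n",
  "        self.lr, self.momentum, self.weight_decay = lr, momentum, weight_decay\n",
  "        self.u = {}\n",
  "\n",
  "    def step(self):\n",
  "        for i, p in enumerate(self.params):\n",
  "            # weight decay folds the L2 penalty into the gradient\n",
  "            grad = p.grad.data + self.weight_decay * p.data\n",
  "            u = self.momentum * self.u.get(i, 0) + (1 - self.momentum) * grad\n",
  "            self.u[i] = u.data\n",
  "            p.data = p.data - self.lr * u\n",
  "```"
 ]
},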
623 | {
624 | "cell_type": "code",
625 | "execution_count": null,
626 | "metadata": {},
627 | "outputs": [],
628 | "source": [
629 | "!python3 -m pytest -v -k \"test_optim_sgd\""
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": null,
635 | "metadata": {},
636 | "outputs": [],
637 | "source": [
638 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"optim_sgd\""
639 | ]
640 | },
641 | {
642 | "cell_type": "markdown",
643 | "metadata": {
644 | "tags": []
645 | },
646 | "source": [
647 | "### Adam\n",
648 | "`needle.optim.Adam(params, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.0)`\n",
649 | "\n",
650 | "Implements Adam algorithm, proposed in [Adam: A Method for Stochastic Optimization](https://arxiv.org/abs/1412.6980). \n",
651 | "\n",
652 | "\\begin{equation}\n",
653 | "\\begin{split}\n",
654 | "u_{t+1} &= \\beta_1 u_t + (1-\\beta_1) \\nabla_\\theta f(\\theta_t) \\\\\n",
655 | "v_{t+1} &= \\beta_2 v_t + (1-\\beta_2) (\\nabla_\\theta f(\\theta_t))^2 \\\\\n",
656 | "\\hat{u}_{t+1} &= u_{t+1} / (1 - \\beta_1^t) \\quad \\text{(bias correction)} \\\\\n",
657 | "\\hat{v}_{t+1} &= v_{t+1} / (1 - \\beta_2^t) \\quad \\text{(bias correction)}\\\\\n",
658 | "\\theta_{t+1} &= \\theta_t - \\alpha \\hat{u_{t+1}}/(\\hat{v}_{t+1}^{1/2}+\\epsilon)\n",
659 | "\\end{split}\n",
660 | " \\end{equation}\n",
661 | "\n",
662 | "**Important:** Pay attention to whether or not you are applying bias correction.\n",
663 | "\n",
664 | "##### Parameters\n",
665 | "- `params` - iterable of parameters of type `needle.nn.Parameter` to optimize\n",
666 | "- `lr` (*float*) - learning rate\n",
667 | "- `beta1` (*float*) - coefficient used for computing running average of gradient\n",
668 | "- `beta2` (*float*) - coefficient used for computing running average of square of gradient\n",
669 | "- `eps` (*float*) - term added to the denominator to improve numerical stability\n",
670 | "- `weight_decay` (*float*) - weight decay (L2 penalty)\n",
671 | "\n",
672 | "**Hint**: To help deal with memory issues, try to understand how to use `.data` or `.detach()`"
673 | ]
674 | },
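{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A sketch of `Adam` under the same assumptions as the SGD sketch above; the step counter `t` is incremented once per call to `step`, not once per parameter.\n",
  "\n",
  "```python\n",
  "class Adam(Optimizer):\n",
  "    def __init__(self, params, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.0):\n",
  "        super().__init__(params)\n",
  "        self.lr, self.beta1, self.beta2 = lr, beta1, beta2\n",
  "        self.eps, self.weight_decay = eps, weight_decay\n",
  "        self.t = 0\n",
  "        self.m, self.v = {}, {}\n",
  "\n",
  "    def step(self):\n",
  "        self.t += 1\n",
  "        for i, p in enumerate(self.params):\n",
  "            grad = p.grad.data + self.weight_decay * p.data\n",
  "            m = self.beta1 * self.m.get(i, 0) + (1 - self.beta1) * grad\n",
  "            v = self.beta2 * self.v.get(i, 0) + (1 - self.beta2) * grad * grad\n",
  "            self.m[i], self.v[i] = m.data, v.data\n",
  "            # bias-corrected first and second moment estimates\n",
  "            m_hat = m / (1 - self.beta1 ** self.t)\n",
  "            v_hat = v / (1 - self.beta2 ** self.t)\n",
  "            p.data = p.data - self.lr * m_hat / (v_hat ** 0.5 + self.eps)\n",
  "```"
 ]
},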
675 | {
676 | "cell_type": "code",
677 | "execution_count": null,
678 | "metadata": {},
679 | "outputs": [],
680 | "source": [
681 | "!python3 -m pytest -v -k \"test_optim_adam\""
682 | ]
683 | },
684 | {
685 | "cell_type": "code",
686 | "execution_count": null,
687 | "metadata": {},
688 | "outputs": [],
689 | "source": [
690 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"optim_adam\""
691 | ]
692 | },
693 | {
694 | "cell_type": "markdown",
695 | "metadata": {},
696 | "source": [
697 | "## Question 4\n",
698 | "\n",
699 | "In this question, you will implement two data primitives: `needle.data.DataLoader` and `needle.data.Dataset`. `Dataset` stores the samples and their corresponding labels, and `DataLoader` wraps an iterable around the `Dataset` to enable easy access to the samples. \n",
700 | "\n",
701 | "For this question, you will be working in the `python/needle/data` directory. \n",
702 | "\n",
703 | "### Transformations\n",
704 | "\n",
705 | "First we will implement a few transformations that are helpful when working with images. We will stick with a horizontal flip and a random crop for now. Fill out the following functions in `needle/data/data_transforms.py`.\n",
706 | "___ \n",
707 | "\n",
708 | "#### RandomFlipHorizontal\n",
709 | "`needle.data.RandomFlipHorizontal(p = 0.5)`\n",
710 | "\n",
711 | "Flips the image horizontally, with probability `p`.\n",
712 | "\n",
713 | "##### Parameters\n",
714 | "- `p` (*float*) - The probability of flipping the input image.\n",
715 | "___\n",
716 | "\n",
717 | "#### RandomCrop\n",
718 | "`needle.data.RandomCrop(padding=3)`\n",
719 | "\n",
720 | "Padding is added to all sides of the image, and then the image is cropped back to it's original size at a random location. Returns an image the same size as the original image.\n",
721 | "\n",
722 | "##### Parameters\n",
723 | "- `padding` (*int*) - The padding on each border of the image."
724 | ]
725 | },
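{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A numpy sketch of both transforms for an `H x W x C` input; the exact random draws shown here (one `rand` for the flip decision, one two-element `randint` for the crop offsets) are an assumption about how the seeded tests expect randomness to be consumed.\n",
  "\n",
  "```python\n",
  "class RandomFlipHorizontal(Transform):\n",
  "    def __init__(self, p=0.5):\n",
  "        self.p = p\n",
  "\n",
  "    def __call__(self, img):\n",
  "        # flip the width axis with probability p\n",
  "        flip_img = np.random.rand() < self.p\n",
  "        return img[:, ::-1, :] if flip_img else img\n",
  "\n",
  "\n",
  "class RandomCrop(Transform):\n",
  "    def __init__(self, padding=3):\n",
  "        self.padding = padding\n",
  "\n",
  "    def __call__(self, img):\n",
  "        # zero-pad every border, then cut an H x W window at a random offset\n",
  "        shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding + 1, size=2)\n",
  "        h, w, c = img.shape\n",
  "        padded = np.zeros((h + 2 * self.padding, w + 2 * self.padding, c), dtype=img.dtype)\n",
  "        padded[self.padding:self.padding + h, self.padding:self.padding + w, :] = img\n",
  "        top, left = self.padding + shift_x, self.padding + shift_y\n",
  "        return padded[top:top + h, left:left + w, :]\n",
  "```"
 ]
},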
726 | {
727 | "cell_type": "code",
728 | "execution_count": null,
729 | "metadata": {},
730 | "outputs": [],
731 | "source": [
732 | "!python3 -m pytest -v -k \"flip_horizontal\"\n",
733 | "!python3 -m pytest -v -k \"random_crop\""
734 | ]
735 | },
736 | {
737 | "cell_type": "code",
738 | "execution_count": null,
739 | "metadata": {},
740 | "outputs": [],
741 | "source": [
742 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"flip_horizontal\"\n",
743 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"random_crop\""
744 | ]
745 | },
746 | {
747 | "cell_type": "markdown",
748 | "metadata": {},
749 | "source": [
750 | "### Dataset\n",
751 | "\n",
752 | "Each `Dataset` subclass must implement three functions: `__init__`, `__len__`, and `__getitem__`. The `__init__` function initializes the images, labels, and transforms. The `__len__` function returns the number of samples in the dataset. The `__getitem__` function retrieves a sample from the dataset at a given index `idx`, calls the transform functions on the image (if applicable), converts the image and label to a numpy array (the data will be converted to Tensors elsewhere). The output of `__getitem__` and `__next__` should be NDArrays, and you should follow the shapes such that you're accessing an array of size (Datapoint Number, Feature Dim 1, Feature Dim 2, ...). \n",
753 | "\n",
754 | "Fill out these functions in the `MNISTDataset` class in `needle/data/datasets/mnist_dataset.py`. You can use your solution to `parse_mnist` from the previous homework for the `__init__` function.\n",
755 | "\n",
756 | "### MNISTDataset\n",
757 | "`needle.data.MNISTDataset(image_filesname, label_filesname, transforms)`\n",
758 | "\n",
759 | "##### Parameters\n",
760 | "- `image_filesname` - path of file containing images\n",
761 | "- `label_filesname` - path of file containing labels\n",
762 | "- `transforms` - an optional list of transforms to apply to data\n"
763 | ]
764 | },
765 | {
766 | "cell_type": "code",
767 | "execution_count": null,
768 | "metadata": {},
769 | "outputs": [],
770 | "source": [
771 | "!python3 -m pytest -v -k \"test_mnist_dataset\""
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": null,
777 | "metadata": {},
778 | "outputs": [],
779 | "source": [
780 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"mnist_dataset\""
781 | ]
782 | },
783 | {
784 | "cell_type": "markdown",
785 | "metadata": {},
786 | "source": [
787 | "### Dataloader\n",
788 | "\n",
789 | "In `needle/data/data_basic.py`, the Dataloader class provides an interface for assembling mini-batches of examples suitable for training using SGD-based approaches, backed by a Dataset object. In order to build the typical Dataloader interface (allowing users to iterate over all the mini-batches in the dataset), you will need to implement the `__iter__()` and `__next__()` calls in the class: `__iter__()` is called at the start of iteration, while `__next__()` is called to grab the next mini-batch. Please note that subsequent calls to next will require you to return the following batches, so next is not a pure function.\n",
790 | "\n",
791 | "### Dataloader\n",
792 | "`needle.data.Dataloader(dataset: Dataset, batch_size: Optional[int] = 1, shuffle: bool = False)`\n",
793 | "\n",
794 | "Combines a dataset and a sampler, and provides an iterable over the given dataset. \n",
795 | "\n",
796 | "##### Parameters\n",
797 | "- `dataset` - `needle.data.Dataset` - a dataset \n",
798 | "- `batch_size` - `int` - what batch size to serve the data in \n",
799 | "- `shuffle` - `bool` - set to ``True`` to have the data reshuffle at every epoch, default ``False``.\n",
800 | "___ \n",
801 | "\n",
802 | "\n",
803 | "\n"
804 | ]
805 | },
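{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A sketch of the two iterator methods of `DataLoader`, reusing the `self.ordering` batching already set up in `__init__` and regenerating it from a random permutation when `shuffle=True`; stacking each per-sample field across the batch and wrapping it in a `Tensor` is one reasonable way to form the batch.\n",
  "\n",
  "```python\n",
  "def __iter__(self):\n",
  "    if self.shuffle:\n",
  "        # reshuffle the batch ordering at the start of every epoch\n",
  "        self.ordering = np.array_split(\n",
  "            np.random.permutation(len(self.dataset)),\n",
  "            range(self.batch_size, len(self.dataset), self.batch_size))\n",
  "    self.batch_idx = 0\n",
  "    return self\n",
  "\n",
  "def __next__(self):\n",
  "    if self.batch_idx >= len(self.ordering):\n",
  "        raise StopIteration\n",
  "    samples = [self.dataset[int(i)] for i in self.ordering[self.batch_idx]]\n",
  "    self.batch_idx += 1\n",
  "    # group the per-sample fields (e.g. image, label) and stack each one across the batch\n",
  "    return tuple(Tensor(np.stack(field)) for field in zip(*samples))\n",
  "```"
 ]
},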
806 | {
807 | "cell_type": "code",
808 | "execution_count": null,
809 | "metadata": {},
810 | "outputs": [],
811 | "source": [
812 | "!python3 -m pytest -v -k \"test_dataloader\""
813 | ]
814 | },
815 | {
816 | "cell_type": "code",
817 | "execution_count": null,
818 | "metadata": {},
819 | "outputs": [],
820 | "source": [
821 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"dataloader\""
822 | ]
823 | },
824 | {
825 | "cell_type": "markdown",
826 | "metadata": {},
827 | "source": [
828 | "## Question 5\n",
829 | "\n",
830 | "Given you have now implemented all the necessary components for our neural network library, let's build and train an MLP ResNet. For this question, you will be working in `apps/mlp_resnet.py`. First, fill out the functions `ResidualBlock` and `MLPResNet` as described below:\n",
831 | "\n",
832 | "### ResidualBlock\n",
833 | "`ResidualBlock(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1)`\n",
834 | "\n",
835 | "Implements a residual block as follows:\n",
836 | "\n",
837 | "
\n",
838 | "
\n",
839 | "
\n",
840 | "\n",
841 | "**NOTE**: if the figure does not render, please see the figure in the `figures` directory.\n",
842 | "\n",
843 | "where the first linear layer has `in_features=dim` and `out_features=hidden_dim`, and the last linear layer has `out_features=dim`. Returns the block as type `nn.Module`. \n",
844 | "\n",
845 | "##### Parameters\n",
846 | "- `dim` (*int*) - input dim\n",
847 | "- `hidden_dim` (*int*) - hidden dim\n",
848 | "- `norm` (*nn.Module*) - normalization method\n",
849 | "- `drop_prob` (*float*) - dropout probability\n",
850 | "\n",
851 | "___\n",
852 | "\n",
853 | "### MLPResNet\n",
854 | "`MLPResNet(dim, hidden_dim=100, num_blocks=3, num_classes=10, norm=nn.BatchNorm1d, drop_prob=0.1)`\n",
855 | "\n",
856 | "Implements an MLP ResNet as follows:\n",
857 | "\n",
858 | "\n",
859 | "
\n",
860 | "
\n",
861 | "\n",
862 | "where the first linear layer has `in_features=dim` and `out_features=hidden_dim`, and each ResidualBlock has `dim=hidden_dim` and `hidden_dim=hidden_dim//2`. Returns a network of type `nn.Module`.\n",
863 | "\n",
864 | "##### Parameters\n",
865 | "- `dim` (*int*) - input dim\n",
866 | "- `hidden_dim` (*int*) - hidden dim\n",
867 | "- `num_blocks` (*int*) - number of ResidualBlocks\n",
868 | "- `num_classes` (*int*) - number of classes\n",
869 | "- `norm` (*nn.Module*) - normalization method\n",
870 | "- `drop_prob` (*float*) - dropout probability (0.1)\n",
871 | "\n",
872 | "**Note**: Modules should be initialized to match the order of execution in the Resnet.\n",
873 | "___ \n",
874 | "\n",
875 | "Once you have the deep learning model architecture correct, let's train the network using our new neural network library components. Specifically, implement the functions `epoch` and `train_mnist`.\n",
876 | "\n",
877 | "### Epoch\n",
878 | "\n",
879 | "`epoch(dataloader, model, opt=None)`\n",
880 | "\n",
881 | "Executes one epoch of training or evaluation, iterating over the entire training dataset once (just like `nn_epoch` from previous homeworks). Returns the average error rate (as a *float*) and the average loss over all samples (as a *float*). Set the model to `training` mode at the beginning of the function if `opt` is given; set the model to `eval` if `opt` is not given (i.e. `None`). When setting the modes, use `.train()` and `.eval()` instead of modifying the training attribute.\n",
882 | "\n",
883 | "##### Parameters\n",
884 | "- `dataloader` (*`needle.data.DataLoader`*) - dataloader returning samples from the training dataset\n",
885 | "- `model` (*`needle.nn.Module`*) - neural network\n",
886 | "- `opt` (*`needle.optim.Optimizer`*) - optimizer instance, or `None`\n",
887 | "\n",
888 | "___\n",
889 | "\n",
890 | "### Train Mnist\n",
891 | "\n",
892 | "`train_mnist(batch_size=100, epochs=10, optimizer=ndl.optim.Adam, lr=0.001, weight_decay=0.001, hidden_dim=100, data_dir=\"data\")`\n",
893 | " \n",
894 | "Initializes a training dataloader (with `shuffle` set to `True`) and a test dataloader for MNIST data, and trains an `MLPResNet` using the given optimizer (if `opt` is not None) and the softmax loss for a given number of epochs. Returns a tuple of the training error, training loss, test error, test loss computed in the last epoch of training. If any parameters are not specified, use the default parameters.\n",
895 | "\n",
896 | "##### Parameters\n",
897 | "- `batch_size` (*int*) - batch size to use for train and test dataloader\n",
898 | "- `epochs` (*int*) - number of epochs to train for\n",
899 | "- `optimizer` (*`needle.optim.Optimizer` type*) - optimizer type to use\n",
900 | "- `lr` (*float*) - learning rate \n",
901 | "- `weight_decay` (*float*) - weight decay\n",
902 | "- `hidden_dim` (*int*) - hidden dim for `MLPResNet`\n",
903 | "- `data_dir` (*int*) - directory containing MNIST image/label files\n"
904 | ]
905 | },
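{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "One plausible reading of the two figures (check it against the images in `figures/` before relying on it): each residual block wraps Linear -> norm -> ReLU -> Dropout -> Linear -> norm in an `nn.Residual` followed by a ReLU, and the full network is Linear -> ReLU -> blocks -> Linear.\n",
  "\n",
  "```python\n",
  "def ResidualBlock(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1):\n",
  "    return nn.Sequential(\n",
  "        nn.Residual(nn.Sequential(\n",
  "            nn.Linear(dim, hidden_dim),\n",
  "            norm(hidden_dim),\n",
  "            nn.ReLU(),\n",
  "            nn.Dropout(drop_prob),\n",
  "            nn.Linear(hidden_dim, dim),\n",
  "            norm(dim),\n",
  "        )),\n",
  "        nn.ReLU(),\n",
  "    )\n",
  "\n",
  "\n",
  "def MLPResNet(dim, hidden_dim=100, num_blocks=3, num_classes=10,\n",
  "              norm=nn.BatchNorm1d, drop_prob=0.1):\n",
  "    return nn.Sequential(\n",
  "        nn.Linear(dim, hidden_dim),\n",
  "        nn.ReLU(),\n",
  "        *[ResidualBlock(hidden_dim, hidden_dim // 2, norm, drop_prob) for _ in range(num_blocks)],\n",
  "        nn.Linear(hidden_dim, num_classes),\n",
  "    )\n",
  "```"
 ]
},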
906 | {
907 | "cell_type": "code",
908 | "execution_count": null,
909 | "metadata": {},
910 | "outputs": [],
911 | "source": [
912 | "!python3 -m pytest -v -k \"test_mlp\""
913 | ]
914 | },
915 | {
916 | "cell_type": "code",
917 | "execution_count": null,
918 | "metadata": {},
919 | "outputs": [],
920 | "source": [
921 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"mlp_resnet\""
922 | ]
923 | },
924 | {
925 | "cell_type": "markdown",
926 | "metadata": {},
927 | "source": [
928 | "We encourage to experiment with the `mlp_resnet.py` training script.\n",
929 | "You can investigate the effect of using different initializers on the Linear layers,\n",
930 | "increasing the dropout probability,\n",
931 | "or adding transforms (via a list to the `transforms=` keyword argument of Dataset)\n",
932 | "such as random cropping."
933 | ]
934 | }
935 | ],
936 | "metadata": {
937 | "kernelspec": {
938 | "display_name": "Python 3.8.10 64-bit",
939 | "language": "python",
940 | "name": "python3"
941 | },
942 | "language_info": {
943 | "codemirror_mode": {
944 | "name": "ipython",
945 | "version": 3
946 | },
947 | "file_extension": ".py",
948 | "mimetype": "text/x-python",
949 | "name": "python",
950 | "nbconvert_exporter": "python",
951 | "pygments_lexer": "ipython3",
952 | "version": "3.8.10"
953 | },
954 | "vscode": {
955 | "interpreter": {
956 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
957 | }
958 | }
959 | },
960 | "nbformat": 4,
961 | "nbformat_minor": 4
962 | }
963 |
--------------------------------------------------------------------------------
/python/needle/__init__.py:
--------------------------------------------------------------------------------
1 | from . import ops
2 | from .ops import *
3 | from .autograd import Tensor, cpu, all_devices
4 |
5 | from . import init
6 | from .init import ones, zeros, zeros_like, ones_like
7 |
8 | from . import data
9 | from . import nn
10 | from . import optim
11 |
--------------------------------------------------------------------------------
/python/needle/autograd.py:
--------------------------------------------------------------------------------
1 | """Core data structures."""
2 | import needle
3 | from .backend_numpy import Device, cpu, all_devices
  4 | from typing import Dict, List, Optional, NamedTuple, Tuple, Union
5 | from collections import namedtuple
6 | import numpy
7 |
8 | from needle import init
9 |
10 | # needle version
11 | LAZY_MODE = False
12 | TENSOR_COUNTER = 0
13 |
14 | # NOTE: we will import numpy as the array_api
15 | # as the backend for our computations, this line will change in later homeworks
16 |
17 | import numpy as array_api
18 | NDArray = numpy.ndarray
19 |
20 |
21 | class Op:
22 | """Operator definition."""
23 |
24 | def __call__(self, *args):
25 | raise NotImplementedError()
26 |
27 | def compute(self, *args: Tuple[NDArray]):
28 | """Calculate forward pass of operator.
29 |
30 | Parameters
31 | ----------
32 | input: np.ndarray
33 | A list of input arrays to the function
34 |
35 | Returns
36 | -------
37 | output: nd.array
38 | Array output of the operation
39 |
40 | """
41 | raise NotImplementedError()
42 |
43 | def gradient(
44 | self, out_grad: "Value", node: "Value"
45 | ) -> Union["Value", Tuple["Value"]]:
46 | """Compute partial adjoint for each input value for a given output adjoint.
47 |
48 | Parameters
49 | ----------
50 | out_grad: Value
51 | The adjoint wrt to the output value.
52 |
53 | node: Value
54 | The value node of forward evaluation.
55 |
56 | Returns
57 | -------
58 | input_grads: Value or Tuple[Value]
59 | A list containing partial gradient adjoints to be propagated to
60 | each of the input node.
61 | """
62 | raise NotImplementedError()
63 |
64 | def gradient_as_tuple(self, out_grad: "Value", node: "Value") -> Tuple["Value"]:
65 | """Convenience method to always return a tuple from gradient call"""
66 | output = self.gradient(out_grad, node)
67 | if isinstance(output, tuple):
68 | return output
69 | elif isinstance(output, list):
70 | return tuple(output)
71 | else:
72 | return (output,)
73 |
74 |
75 | class TensorOp(Op):
76 | """Op class specialized to output tensors, will be alternate subclasses for other structures"""
77 |
78 | def __call__(self, *args):
79 | return Tensor.make_from_op(self, args)
80 |
81 |
82 | class TensorTupleOp(Op):
83 | """Op class specialized to output TensorTuple"""
84 |
85 | def __call__(self, *args):
86 | return TensorTuple.make_from_op(self, args)
87 |
88 |
89 | class Value:
90 | """A value in the computational graph."""
91 |
92 | # trace of computational graph
93 | op: Optional[Op]
94 | inputs: List["Value"]
95 | # The following fields are cached fields for
96 | # dynamic computation
97 | cached_data: NDArray
98 | requires_grad: bool
99 |
100 | def realize_cached_data(self):
101 | """Run compute to realize the cached data"""
102 | # avoid recomputation
103 | if self.cached_data is not None:
104 | return self.cached_data
105 | # note: data implicitly calls realized cached data
106 | self.cached_data = self.op.compute(
107 | *[x.realize_cached_data() for x in self.inputs]
108 | )
109 | return self.cached_data
110 |
111 | def is_leaf(self):
112 | return self.op is None
113 |
114 | def __del__(self):
115 | global TENSOR_COUNTER
116 | TENSOR_COUNTER -= 1
117 |
118 | def _init(
119 | self,
120 | op: Optional[Op],
121 | inputs: List["Tensor"],
122 | *,
123 | num_outputs: int = 1,
124 | cached_data: List[object] = None,
125 | requires_grad: Optional[bool] = None
126 | ):
127 | global TENSOR_COUNTER
128 | TENSOR_COUNTER += 1
129 | if requires_grad is None:
130 | requires_grad = any(x.requires_grad for x in inputs)
131 | self.op = op
132 | self.inputs = inputs
133 | self.num_outputs = num_outputs
134 | self.cached_data = cached_data
135 | self.requires_grad = requires_grad
136 |
137 | @classmethod
138 | def make_const(cls, data, *, requires_grad=False):
139 | value = cls.__new__(cls)
140 | value._init(
141 | None,
142 | [],
143 | cached_data=data,
144 | requires_grad=requires_grad,
145 | )
146 | return value
147 |
148 | @classmethod
149 | def make_from_op(cls, op: Op, inputs: List["Value"]):
150 | value = cls.__new__(cls)
151 | value._init(op, inputs)
152 |
153 | if not LAZY_MODE:
154 | if not value.requires_grad:
155 | return value.detach()
156 | value.realize_cached_data()
157 | return value
158 |
159 |
160 | ### Not needed in HW1
161 | class TensorTuple(Value):
162 | """Represent a tuple of tensors.
163 |
164 | To keep things simple, we do not support nested tuples.
165 | """
166 |
167 | def __len__(self):
168 | cdata = self.realize_cached_data()
169 | return len(cdata)
170 |
171 | def __getitem__(self, index: int):
172 | return needle.ops.tuple_get_item(self, index)
173 |
174 | def tuple(self):
175 | return tuple([x for x in self])
176 |
177 | def __repr__(self):
178 | return "needle.TensorTuple" + str(self.tuple())
179 |
180 | def __str__(self):
181 | return self.__repr__()
182 |
183 | def __add__(self, other):
184 | assert isinstance(other, TensorTuple)
185 | assert len(self) == len(other)
186 | return needle.ops.make_tuple(*[self[i] + other[i] for i in range(len(self))])
187 |
188 | def detach(self):
189 | """Create a new tensor that shares the data but detaches from the graph."""
190 | return TensorTuple.make_const(self.realize_cached_data())
191 |
192 |
193 | class Tensor(Value):
194 | grad: "Tensor"
195 |
196 | def __init__(
197 | self,
198 | array,
199 | *,
200 | device: Optional[Device] = None,
201 | dtype=None,
202 | requires_grad=True,
203 | **kwargs
204 | ):
205 | if isinstance(array, Tensor):
206 | if device is None:
207 | device = array.device
208 | if dtype is None:
209 | dtype = array.dtype
210 | if device == array.device and dtype == array.dtype:
211 | cached_data = array.realize_cached_data()
212 | else:
213 | # fall back, copy through numpy conversion
214 | cached_data = Tensor._array_from_numpy(
215 | array.numpy(), device=device, dtype=dtype
216 | )
217 | else:
218 | device = device if device else cpu()
219 | cached_data = Tensor._array_from_numpy(array, device=device, dtype=dtype)
220 |
221 | self._init(
222 | None,
223 | [],
224 | cached_data=cached_data,
225 | requires_grad=requires_grad,
226 | )
227 |
228 | @staticmethod
229 | def _array_from_numpy(numpy_array, device, dtype):
230 | if array_api is numpy:
231 | return numpy.array(numpy_array, dtype=dtype)
232 | return array_api.array(numpy_array, device=device, dtype=dtype)
233 |
234 | @staticmethod
235 | def make_from_op(op: Op, inputs: List["Value"]):
236 | tensor = Tensor.__new__(Tensor)
237 | tensor._init(op, inputs)
238 | if not LAZY_MODE:
239 | if not tensor.requires_grad:
240 | return tensor.detach()
241 | tensor.realize_cached_data()
242 | return tensor
243 |
244 | @staticmethod
245 | def make_const(data, requires_grad=False):
246 | tensor = Tensor.__new__(Tensor)
247 | tensor._init(
248 | None,
249 | [],
250 | cached_data=data
251 | if not isinstance(data, Tensor)
252 | else data.realize_cached_data(),
253 | requires_grad=requires_grad,
254 | )
255 | return tensor
256 |
257 | @property
258 | def data(self):
259 | return self.detach()
260 |
261 | @data.setter
262 | def data(self, value):
263 | assert isinstance(value, Tensor)
264 | assert value.dtype == self.dtype, "%s %s" % (
265 | value.dtype,
266 | self.dtype,
267 | )
268 | self.cached_data = value.realize_cached_data()
269 |
270 | def detach(self):
271 | """Create a new tensor that shares the data but detaches from the graph."""
272 | return Tensor.make_const(self.realize_cached_data())
273 |
274 | @property
275 | def shape(self):
276 | return self.realize_cached_data().shape
277 |
278 | @property
279 | def dtype(self):
280 | return self.realize_cached_data().dtype
281 |
282 | @property
283 | def device(self):
284 | data = self.realize_cached_data()
285 | # numpy array always sits on cpu
286 | if array_api is numpy:
287 | return cpu()
288 | return data.device
289 |
290 | def backward(self, out_grad=None):
291 | out_grad = (
292 | out_grad
293 | if out_grad
294 | else init.ones(*self.shape, dtype=self.dtype, device=self.device)
295 | )
296 | compute_gradient_of_variables(self, out_grad)
297 |
298 | def __repr__(self):
299 | return "needle.Tensor(" + str(self.realize_cached_data()) + ")"
300 |
301 | def __str__(self):
302 | return self.realize_cached_data().__str__()
303 |
304 | def numpy(self):
305 | data = self.realize_cached_data()
306 | if array_api is numpy:
307 | return data
308 | return data.numpy()
309 |
310 | def __add__(self, other):
311 | if isinstance(other, Tensor):
312 | return needle.ops.EWiseAdd()(self, other)
313 | else:
314 | return needle.ops.AddScalar(other)(self)
315 |
316 | def __mul__(self, other):
317 | if isinstance(other, Tensor):
318 | return needle.ops.EWiseMul()(self, other)
319 | else:
320 | return needle.ops.MulScalar(other)(self)
321 |
322 | def __pow__(self, other):
323 | if isinstance(other, Tensor):
324 | return needle.ops.EWisePow()(self, other)
325 | else:
326 | return needle.ops.PowerScalar(other)(self)
327 |
328 | def __sub__(self, other):
329 | if isinstance(other, Tensor):
330 | return needle.ops.EWiseAdd()(self, needle.ops.Negate()(other))
331 | else:
332 | return needle.ops.AddScalar(-other)(self)
333 |
334 | def __truediv__(self, other):
335 | if isinstance(other, Tensor):
336 | return needle.ops.EWiseDiv()(self, other)
337 | else:
338 | return needle.ops.DivScalar(other)(self)
339 |
340 | def __matmul__(self, other):
341 | return needle.ops.MatMul()(self, other)
342 |
343 | def matmul(self, other):
344 | return needle.ops.MatMul()(self, other)
345 |
346 | def sum(self, axes=None):
347 | return needle.ops.Summation(axes)(self)
348 |
349 | def broadcast_to(self, shape):
350 | return needle.ops.BroadcastTo(shape)(self)
351 |
352 | def reshape(self, shape):
353 | return needle.ops.Reshape(shape)(self)
354 |
355 | def __neg__(self):
356 | return needle.ops.Negate()(self)
357 |
358 | def transpose(self, axes=None):
359 | return needle.ops.Transpose(axes)(self)
360 |
361 |
362 |     __radd__ = __add__
363 |     __rmul__ = __mul__
364 |
365 |
366 |
367 | def compute_gradient_of_variables(output_tensor, out_grad):
368 | """Take gradient of output node with respect to each node in node_list.
369 |
370 | Store the computed result in the grad field of each Variable.
371 | """
372 | # a map from node to a list of gradient contributions from each output node
373 | node_to_output_grads_list: Dict[Tensor, List[Tensor]] = {}
374 | # Special note on initializing gradient of
375 | # We are really taking a derivative of the scalar reduce_sum(output_node)
376 | # instead of the vector output_node. But this is the common case for loss function.
377 | node_to_output_grads_list[output_tensor] = [out_grad]
378 |
379 | # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt.
380 | reverse_topo_order = list(reversed(find_topo_sort([output_tensor])))
381 |
382 | ### BEGIN YOUR SOLUTION
383 | raise NotImplementedError()
384 | ### END YOUR SOLUTION
385 |
386 |
387 | def find_topo_sort(node_list: List[Value]) -> List[Value]:
388 | """Given a list of nodes, return a topological sort list of nodes ending in them.
389 |
390 | A simple algorithm is to do a post-order DFS traversal on the given nodes,
391 | going backwards based on input edges. Since a node is added to the ordering
392 | after all its predecessors are traversed due to post-order DFS, we get a topological
393 | sort.
394 | """
395 | ### BEGIN YOUR SOLUTION
396 | raise NotImplementedError()
397 | ### END YOUR SOLUTION
398 |
399 |
400 | def topo_sort_dfs(node, visited, topo_order):
401 | """Post-order DFS"""
402 | ### BEGIN YOUR SOLUTION
403 | raise NotImplementedError()
404 | ### END YOUR SOLUTION
405 |
406 |
407 | ##############################
408 | ####### Helper Methods #######
409 | ##############################
410 |
411 |
412 | def sum_node_list(node_list):
413 |     """Custom sum function in order to avoid creating the redundant nodes produced by Python's built-in sum implementation."""
414 | from operator import add
415 | from functools import reduce
416 |
417 | return reduce(add, node_list)
418 |
--------------------------------------------------------------------------------
/python/needle/backend_numpy.py:
--------------------------------------------------------------------------------
1 | """This file defies specific implementations of devices when using numpy as NDArray backend.
2 | """
3 | import numpy
4 |
5 |
6 | class Device:
  7 |     """Base class of all devices"""
8 |
9 |
10 | class CPUDevice(Device):
11 | """Represents data that sits in CPU"""
12 |
13 | def __repr__(self):
14 | return "needle.cpu()"
15 |
16 | def __hash__(self):
17 | return self.__repr__().__hash__()
18 |
19 | def __eq__(self, other):
20 | return isinstance(other, CPUDevice)
21 |
22 | def enabled(self):
23 | return True
24 |
25 | def zeros(self, *shape, dtype="float32"):
26 | return numpy.zeros(shape, dtype=dtype)
27 |
28 | def ones(self, *shape, dtype="float32"):
29 | return numpy.ones(shape, dtype=dtype)
30 |
31 | def randn(self, *shape):
32 | # note: numpy doesn't support types within standard random routines, and
33 | # .astype("float32") does work if we're generating a singleton
34 | return numpy.random.randn(*shape)
35 |
36 | def rand(self, *shape):
37 | # note: numpy doesn't support types within standard random routines, and
38 | # .astype("float32") does work if we're generating a singleton
39 | return numpy.random.rand(*shape)
40 |
41 | def one_hot(self, n, i, dtype="float32"):
42 | return numpy.eye(n, dtype=dtype)[i]
43 |
44 | def empty(self, shape, dtype="float32"):
45 | return numpy.empty(shape, dtype=dtype)
46 |
47 | def full(self, shape, fill_value, dtype="float32"):
48 | return numpy.full(shape, fill_value, dtype=dtype)
49 |
50 |
51 | def cpu():
52 | """Return cpu device"""
53 | return CPUDevice()
54 |
55 |
56 | def default_device():
57 | return cpu()
58 |
59 |
60 | def all_devices():
61 | """return a list of all available devices"""
62 | return [cpu()]
63 |
--------------------------------------------------------------------------------
/python/needle/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_basic import *
2 | from .data_transforms import *
3 | from .datasets import *
4 |
--------------------------------------------------------------------------------
/python/needle/data/data_basic.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from ..autograd import Tensor
3 |
4 | from typing import Iterator, Optional, List, Sized, Union, Iterable, Any
5 |
6 |
7 |
8 | class Dataset:
9 | r"""An abstract class representing a `Dataset`.
10 |
11 | All subclasses should overwrite :meth:`__getitem__`, supporting fetching a
12 | data sample for a given key. Subclasses must also overwrite
13 | :meth:`__len__`, which is expected to return the size of the dataset.
14 | """
15 |
16 | def __init__(self, transforms: Optional[List] = None):
17 | self.transforms = transforms
18 |
19 | def __getitem__(self, index) -> object:
20 | raise NotImplementedError
21 |
22 | def __len__(self) -> int:
23 | raise NotImplementedError
24 |
25 | def apply_transforms(self, x):
26 | if self.transforms is not None:
27 | # apply the transforms
28 | for tform in self.transforms:
29 | x = tform(x)
30 | return x
31 |
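# To illustrate the contract above, a toy in-memory dataset might look like the
# following (a hypothetical example for illustration only, not part of needle):
#
#     class ToyDataset(Dataset):
#         def __init__(self, images, labels, transforms=None):
#             super().__init__(transforms)
#             self.images, self.labels = images, labels
#
#         def __getitem__(self, index):
#             # run the optional transforms on the image before returning it
#             return self.apply_transforms(self.images[index]), self.labels[index]
#
#         def __len__(self):
#             return len(self.labels)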
32 |
33 | class DataLoader:
34 | r"""
35 | Data loader. Combines a dataset and a sampler, and provides an iterable over
36 | the given dataset.
37 | Args:
38 | dataset (Dataset): dataset from which to load the data.
39 | batch_size (int, optional): how many samples per batch to load
40 | (default: ``1``).
41 | shuffle (bool, optional): set to ``True`` to have the data reshuffled
42 | at every epoch (default: ``False``).
43 | """
44 | dataset: Dataset
45 | batch_size: Optional[int]
46 |
47 | def __init__(
48 | self,
49 | dataset: Dataset,
50 | batch_size: Optional[int] = 1,
51 | shuffle: bool = False,
52 | ):
53 |
54 | self.dataset = dataset
55 | self.shuffle = shuffle
56 | self.batch_size = batch_size
57 | if not self.shuffle:
58 | self.ordering = np.array_split(np.arange(len(dataset)),
59 | range(batch_size, len(dataset), batch_size))
60 |
61 | def __iter__(self):
62 | ### BEGIN YOUR SOLUTION
63 | raise NotImplementedError()
64 | ### END YOUR SOLUTION
65 | return self
66 |
67 | def __next__(self):
68 | ### BEGIN YOUR SOLUTION
69 | raise NotImplementedError()
70 | ### END YOUR SOLUTION
71 |
72 |
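# One possible iteration scheme, consistent with the ordering precomputed in
# __init__ (a sketch, not the reference solution): reshuffle once per epoch when
# shuffle=True, then yield each batch as a tuple of stacked Tensors.
#
#     def __iter__(self):
#         if self.shuffle:
#             order = np.random.permutation(len(self.dataset))
#             self.ordering = np.array_split(
#                 order, range(self.batch_size, len(self.dataset), self.batch_size))
#         self._batch_idx = 0
#         return self
#
#     def __next__(self):
#         if self._batch_idx >= len(self.ordering):
#             raise StopIteration
#         samples = [self.dataset[i] for i in self.ordering[self._batch_idx]]
#         self._batch_idx += 1
#         # stack each field (e.g. image, label) across the batch into a Tensor
#         return tuple(Tensor(np.stack(field)) for field in zip(*samples))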
--------------------------------------------------------------------------------
/python/needle/data/data_transforms.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class Transform:
4 | def __call__(self, x):
5 | raise NotImplementedError
6 |
7 |
8 | class RandomFlipHorizontal(Transform):
9 | def __init__(self, p = 0.5):
10 | self.p = p
11 |
12 | def __call__(self, img):
13 | """
14 |         Horizontally flip an image, specified as an H x W x C NDArray.
15 |         Args:
16 |             img: H x W x C NDArray of an image
17 |         Returns:
18 |             H x W x C NDArray corresponding to the image, flipped with probability self.p
19 |         Note: use the provided flip_img flag below for randomness, for easier testing
20 | """
21 | flip_img = np.random.rand() < self.p
22 | ### BEGIN YOUR SOLUTION
23 | raise NotImplementedError()
24 | ### END YOUR SOLUTION
25 |
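# A sketch of the __call__ body above (not the reference solution): flip along
# the width axis when flip_img is True.
#
#     if flip_img:
#         img = img[:, ::-1, :]
#     return img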
26 |
27 | class RandomCrop(Transform):
28 | def __init__(self, padding=3):
29 | self.padding = padding
30 |
31 | def __call__(self, img):
32 | """ Zero pad and then randomly crop an image.
33 | Args:
34 | img: H x W x C NDArray of an image
35 |         Returns:
36 |             H x W x C NDArray of the clipped image
37 | Note: generate the image shifted by shift_x, shift_y specified below
38 | """
39 | shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding+1, size=2)
40 | ### BEGIN YOUR SOLUTION
41 | raise NotImplementedError()
42 | ### END YOUR SOLUTION
43 |
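# A sketch of the pad-then-crop body above, assuming the H x W x C layout from
# the docstring (not the reference solution):
#
#     H, W, _ = img.shape
#     p = self.padding
#     padded = np.pad(img, ((p, p), (p, p), (0, 0)))   # zero padding
#     return padded[p + shift_x : p + shift_x + H,
#                   p + shift_y : p + shift_y + W, :]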
--------------------------------------------------------------------------------
/python/needle/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .mnist_dataset import *
2 | from .ndarray_dataset import *
3 |
--------------------------------------------------------------------------------
/python/needle/data/datasets/mnist_dataset.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | from ..data_basic import Dataset
3 | import numpy as np
4 |
5 | class MNISTDataset(Dataset):
6 | def __init__(
7 | self,
8 | image_filename: str,
9 | label_filename: str,
10 | transforms: Optional[List] = None,
11 | ):
12 | ### BEGIN YOUR SOLUTION
13 | raise NotImplementedError()
14 | ### END YOUR SOLUTION
15 |
16 | def __getitem__(self, index) -> object:
17 | ### BEGIN YOUR SOLUTION
18 | raise NotImplementedError()
19 | ### END YOUR SOLUTION
20 |
21 | def __len__(self) -> int:
22 | ### BEGIN YOUR SOLUTION
23 | raise NotImplementedError()
24 | ### END YOUR SOLUTION
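# A possible parsing helper following the standard MNIST file format (gzipped
# files with big-endian headers); a hypothetical sketch, not the reference solution:
#
#     import gzip, struct
#
#     def _parse_mnist(image_filename, label_filename):
#         with gzip.open(image_filename, "rb") as f:
#             _, num, rows, cols = struct.unpack(">IIII", f.read(16))
#             X = np.frombuffer(f.read(), dtype=np.uint8).reshape(num, rows * cols)
#             X = X.astype(np.float32) / 255.0      # scale pixels to [0, 1]
#         with gzip.open(label_filename, "rb") as f:
#             _, num = struct.unpack(">II", f.read(8))
#             y = np.frombuffer(f.read(), dtype=np.uint8)
#         return X, y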
--------------------------------------------------------------------------------
/python/needle/data/datasets/ndarray_dataset.py:
--------------------------------------------------------------------------------
1 | from ..data_basic import Dataset
2 |
3 | class NDArrayDataset(Dataset):
4 | def __init__(self, *arrays):
5 | self.arrays = arrays
6 |
7 | def __len__(self) -> int:
8 | return self.arrays[0].shape[0]
9 |
10 | def __getitem__(self, i) -> object:
11 | return tuple([a[i] for a in self.arrays])
--------------------------------------------------------------------------------
/python/needle/init/__init__.py:
--------------------------------------------------------------------------------
1 | from .init_basic import *
2 |
3 | from .init_initializers import *
4 |
--------------------------------------------------------------------------------
/python/needle/init/init_basic.py:
--------------------------------------------------------------------------------
1 | import math
2 | import needle as ndl
3 |
4 |
5 | def rand(*shape, low=0.0, high=1.0, device=None, dtype="float32", requires_grad=False):
6 | """Generate random numbers uniform between low and high"""
7 | device = ndl.cpu() if device is None else device
8 | array = device.rand(*shape) * (high - low) + low
9 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad)
10 |
11 |
12 | def randn(*shape, mean=0.0, std=1.0, device=None, dtype="float32", requires_grad=False):
13 | """Generate random normal with specified mean and std deviation"""
14 | device = ndl.cpu() if device is None else device
15 | array = device.randn(*shape) * std + mean
16 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad)
17 |
18 |
19 |
20 |
21 | def constant(*shape, c=1.0, device=None, dtype="float32", requires_grad=False):
22 | """Generate constant Tensor"""
23 | device = ndl.cpu() if device is None else device
24 | array = device.ones(*shape, dtype=dtype) * c # note: can change dtype
25 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad)
26 |
27 |
28 |
29 | def ones(*shape, device=None, dtype="float32", requires_grad=False):
30 | """Generate all-ones Tensor"""
31 | return constant(
32 | *shape, c=1.0, device=device, dtype=dtype, requires_grad=requires_grad
33 | )
34 |
35 |
36 | def zeros(*shape, device=None, dtype="float32", requires_grad=False):
37 | """Generate all-zeros Tensor"""
38 | return constant(
39 | *shape, c=0.0, device=device, dtype=dtype, requires_grad=requires_grad
40 | )
41 |
42 |
43 | def randb(*shape, p=0.5, device=None, dtype="bool", requires_grad=False):
44 | """Generate binary random Tensor"""
45 | device = ndl.cpu() if device is None else device
46 | array = device.rand(*shape) <= p
47 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad)
48 |
49 |
50 | def one_hot(n, i, device=None, dtype="float32", requires_grad=False):
51 | """Generate one-hot encoding Tensor"""
52 | device = ndl.cpu() if device is None else device
53 | return ndl.Tensor(
54 | device.one_hot(n, i.numpy().astype("int32"), dtype=dtype),
55 | device=device,
56 | requires_grad=requires_grad,
57 | )
58 |
59 |
60 | def zeros_like(array, *, device=None, requires_grad=False):
61 | device = device if device else array.device
62 | return zeros(
63 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad
64 | )
65 |
66 |
67 | def ones_like(array, *, device=None, requires_grad=False):
68 | device = device if device else array.device
69 | return ones(
70 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad
71 | )
72 |
--------------------------------------------------------------------------------
/python/needle/init/init_initializers.py:
--------------------------------------------------------------------------------
1 | import math
2 | from .init_basic import *
3 |
4 |
5 | def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs):
6 | ### BEGIN YOUR SOLUTION
7 | raise NotImplementedError()
8 | ### END YOUR SOLUTION
9 |
10 |
11 | def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs):
12 | ### BEGIN YOUR SOLUTION
13 | raise NotImplementedError()
14 | ### END YOUR SOLUTION
15 |
16 | def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs):
17 | assert nonlinearity == "relu", "Only relu supported currently"
18 | ### BEGIN YOUR SOLUTION
19 | raise NotImplementedError()
20 | ### END YOUR SOLUTION
21 |
22 |
23 |
24 | def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs):
25 | assert nonlinearity == "relu", "Only relu supported currently"
26 | ### BEGIN YOUR SOLUTION
27 | raise NotImplementedError()
28 | ### END YOUR SOLUTION
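# For reference, the conventional bounds these initializers use (standard
# Xavier/Kaiming formulas with gain = sqrt(2) for relu; a sketch, not the
# reference solution):
#
#     xavier_uniform:  a = gain * sqrt(6 / (fan_in + fan_out)),    U(-a, a)
#     xavier_normal:   std = gain * sqrt(2 / (fan_in + fan_out)),  N(0, std)
#     kaiming_uniform: bound = sqrt(2) * sqrt(3 / fan_in),         U(-bound, bound)
#     kaiming_normal:  std = sqrt(2) / sqrt(fan_in),               N(0, std)
#
#     def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs):
#         a = gain * math.sqrt(6 / (fan_in + fan_out))
#         return rand(fan_in, fan_out, low=-a, high=a, **kwargs)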
--------------------------------------------------------------------------------
/python/needle/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from .nn_basic import *
2 |
--------------------------------------------------------------------------------
/python/needle/nn/nn_basic.py:
--------------------------------------------------------------------------------
1 | """The module.
2 | """
3 | from typing import List, Callable, Any
4 | from needle.autograd import Tensor
5 | from needle import ops
6 | import needle.init as init
7 | import numpy as np
8 |
9 |
10 | class Parameter(Tensor):
11 | """A special kind of tensor that represents parameters."""
12 |
13 |
14 | def _unpack_params(value: object) -> List[Tensor]:
15 | if isinstance(value, Parameter):
16 | return [value]
17 | elif isinstance(value, Module):
18 | return value.parameters()
19 | elif isinstance(value, dict):
20 | params = []
21 | for k, v in value.items():
22 | params += _unpack_params(v)
23 | return params
24 | elif isinstance(value, (list, tuple)):
25 | params = []
26 | for v in value:
27 | params += _unpack_params(v)
28 | return params
29 | else:
30 | return []
31 |
32 |
33 | def _child_modules(value: object) -> List["Module"]:
34 | if isinstance(value, Module):
35 | modules = [value]
36 | modules.extend(_child_modules(value.__dict__))
37 | return modules
38 | if isinstance(value, dict):
39 | modules = []
40 | for k, v in value.items():
41 | modules += _child_modules(v)
42 | return modules
43 | elif isinstance(value, (list, tuple)):
44 | modules = []
45 | for v in value:
46 | modules += _child_modules(v)
47 | return modules
48 | else:
49 | return []
50 |
51 |
52 | class Module:
53 | def __init__(self):
54 | self.training = True
55 |
56 | def parameters(self) -> List[Tensor]:
57 | """Return the list of parameters in the module."""
58 | return _unpack_params(self.__dict__)
59 |
60 | def _children(self) -> List["Module"]:
61 | return _child_modules(self.__dict__)
62 |
63 | def eval(self):
64 | self.training = False
65 | for m in self._children():
66 | m.training = False
67 |
68 | def train(self):
69 | self.training = True
70 | for m in self._children():
71 | m.training = True
72 |
73 | def __call__(self, *args, **kwargs):
74 | return self.forward(*args, **kwargs)
75 |
76 |
77 | class Identity(Module):
78 | def forward(self, x):
79 | return x
80 |
81 |
82 | class Linear(Module):
83 | def __init__(
84 | self, in_features, out_features, bias=True, device=None, dtype="float32"
85 | ):
86 | super().__init__()
87 | self.in_features = in_features
88 | self.out_features = out_features
89 |
90 | ### BEGIN YOUR SOLUTION
91 | raise NotImplementedError()
92 | ### END YOUR SOLUTION
93 |
94 | def forward(self, X: Tensor) -> Tensor:
95 | ### BEGIN YOUR SOLUTION
96 | raise NotImplementedError()
97 | ### END YOUR SOLUTION
98 |
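# One common shape for this module (a sketch under the usual conventions, not
# the reference solution): a (in_features, out_features) weight initialized with
# kaiming_uniform, an optional (1, out_features) bias, and a forward pass that
# computes X @ weight + bias broadcast to the output shape.
#
#     self.weight = Parameter(init.kaiming_uniform(in_features, out_features,
#                                                  device=device, dtype=dtype))
#     self.bias = (Parameter(init.kaiming_uniform(out_features, 1, device=device,
#                            dtype=dtype).reshape((1, out_features)))
#                  if bias else None)
#
#     def forward(self, X):
#         out = X @ self.weight
#         if self.bias is not None:
#             out = out + self.bias.broadcast_to(out.shape)
#         return out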
99 |
100 | class Flatten(Module):
101 | def forward(self, X):
102 | ### BEGIN YOUR SOLUTION
103 | raise NotImplementedError()
104 | ### END YOUR SOLUTION
105 |
106 |
107 | class ReLU(Module):
108 | def forward(self, x: Tensor) -> Tensor:
109 | ### BEGIN YOUR SOLUTION
110 | raise NotImplementedError()
111 | ### END YOUR SOLUTION
112 |
113 | class Sequential(Module):
114 | def __init__(self, *modules):
115 | super().__init__()
116 | self.modules = modules
117 |
118 | def forward(self, x: Tensor) -> Tensor:
119 | ### BEGIN YOUR SOLUTION
120 | raise NotImplementedError()
121 | ### END YOUR SOLUTION
122 |
123 |
124 | class SoftmaxLoss(Module):
125 | def forward(self, logits: Tensor, y: Tensor):
126 | ### BEGIN YOUR SOLUTION
127 | raise NotImplementedError()
128 | ### END YOUR SOLUTION
129 |
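# For reference, the averaged softmax (cross-entropy) loss this module computes
# (standard definition; a sketch, not the reference solution):
#
#     loss = (1 / batch_size) * sum_i [ logsumexp(logits_i) - logits_i[y_i] ]
#
# where logits_i[y_i] can be selected with the one-hot trick:
#     (logits * init.one_hot(num_classes, y)).sum(axes=(1,))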
130 |
131 | class BatchNorm1d(Module):
132 | def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"):
133 | super().__init__()
134 | self.dim = dim
135 | self.eps = eps
136 | self.momentum = momentum
137 | ### BEGIN YOUR SOLUTION
138 | raise NotImplementedError()
139 | ### END YOUR SOLUTION
140 |
141 | def forward(self, x: Tensor) -> Tensor:
142 | ### BEGIN YOUR SOLUTION
143 | raise NotImplementedError()
144 | ### END YOUR SOLUTION
145 |
146 |
147 |
148 | class LayerNorm1d(Module):
149 | def __init__(self, dim, eps=1e-5, device=None, dtype="float32"):
150 | super().__init__()
151 | self.dim = dim
152 | self.eps = eps
153 | ### BEGIN YOUR SOLUTION
154 | raise NotImplementedError()
155 | ### END YOUR SOLUTION
156 |
157 | def forward(self, x: Tensor) -> Tensor:
158 | ### BEGIN YOUR SOLUTION
159 | raise NotImplementedError()
160 | ### END YOUR SOLUTION
161 |
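# For reference, the normalizations these two modules compute (standard
# definitions; a sketch, not the reference solution). BatchNorm1d normalizes
# each feature over the batch dimension and tracks running statistics, while
# LayerNorm1d normalizes each example over its feature dimension and keeps no
# running statistics:
#
#     BatchNorm1d (training):
#         y = weight * (x - batch_mean) / sqrt(batch_var + eps) + bias
#         running_mean = (1 - momentum) * running_mean + momentum * batch_mean
#         running_var  = (1 - momentum) * running_var  + momentum * batch_var
#     BatchNorm1d (eval): use running_mean / running_var in place of batch stats
#
#     LayerNorm1d:
#         y = weight * (x - mean(x, axis=1)) / sqrt(var(x, axis=1) + eps) + bias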
162 |
163 | class Dropout(Module):
164 | def __init__(self, p=0.5):
165 | super().__init__()
166 | self.p = p
167 |
168 | def forward(self, x: Tensor) -> Tensor:
169 | ### BEGIN YOUR SOLUTION
170 | raise NotImplementedError()
171 | ### END YOUR SOLUTION
172 |
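# A sketch of inverted dropout for the forward pass above (standard technique,
# not the reference solution): during training, zero entries with probability
# self.p and rescale the survivors by 1 / (1 - self.p); at eval time, pass x
# through unchanged.
#
#     if not self.training:
#         return x
#     mask = init.randb(*x.shape, p=1 - self.p, dtype="float32")
#     return x * mask / (1 - self.p)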
173 |
174 | class Residual(Module):
175 | def __init__(self, fn: Module):
176 | super().__init__()
177 | self.fn = fn
178 |
179 | def forward(self, x: Tensor) -> Tensor:
180 | ### BEGIN YOUR SOLUTION
181 | raise NotImplementedError()
182 | ### END YOUR SOLUTION
183 |
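# The residual wrapper simply adds its input back onto the submodule's output;
# a one-line sketch of forward (not the reference solution):
#
#     return self.fn(x) + x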
--------------------------------------------------------------------------------
/python/needle/ops/__init__.py:
--------------------------------------------------------------------------------
1 | from .ops_mathematic import *
2 |
3 | from .ops_logarithmic import *
4 | from .ops_tuple import *
5 |
--------------------------------------------------------------------------------
/python/needle/ops/ops_logarithmic.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from ..autograd import NDArray
3 | from ..autograd import Op, Tensor, Value, TensorOp
4 | from ..autograd import TensorTuple, TensorTupleOp
5 |
6 | from .ops_mathematic import *
7 |
8 | import numpy as array_api
9 |
10 | class LogSoftmax(TensorOp):
11 | def compute(self, Z):
12 | ### BEGIN YOUR SOLUTION
13 | raise NotImplementedError()
14 | ### END YOUR SOLUTION
15 |
16 | def gradient(self, out_grad, node):
17 | ### BEGIN YOUR SOLUTION
18 | raise NotImplementedError()
19 | ### END YOUR SOLUTION
20 |
21 |
22 | def logsoftmax(a):
23 | return LogSoftmax()(a)
24 |
25 |
26 | class LogSumExp(TensorOp):
27 | def __init__(self, axes: Optional[tuple] = None):
28 | self.axes = axes
29 |
30 | def compute(self, Z):
31 | ### BEGIN YOUR SOLUTION
32 | raise NotImplementedError()
33 | ### END YOUR SOLUTION
34 |
35 | def gradient(self, out_grad, node):
36 | ### BEGIN YOUR SOLUTION
37 | raise NotImplementedError()
38 | ### END YOUR SOLUTION
39 |
40 |
41 | def logsumexp(a, axes=None):
42 | return LogSumExp(axes=axes)(a)
43 |
44 |
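# The tests in tests/hw2 exercise inputs as large as 1e10, so compute needs the
# usual max-subtraction trick; a sketch of a numerically stable compute body
# (not the reference solution):
#
#     max_Z = array_api.max(Z, axis=self.axes, keepdims=True)
#     return array_api.log(
#         array_api.sum(array_api.exp(Z - max_Z), axis=self.axes)
#     ) + array_api.max(Z, axis=self.axes)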
--------------------------------------------------------------------------------
/python/needle/ops/ops_mathematic.py:
--------------------------------------------------------------------------------
1 | """Operator implementations."""
2 |
3 | from numbers import Number
4 | from typing import Optional, List, Tuple, Union
5 |
6 | from ..autograd import NDArray
7 | from ..autograd import Op, Tensor, Value, TensorOp
8 | from ..autograd import TensorTuple, TensorTupleOp
9 | import numpy
10 |
11 | # NOTE: we will import numpy as the array_api
12 | # as the backend for our computations; this line will change in later homeworks
13 |
14 | BACKEND = "np"
15 | import numpy as array_api
16 |
17 |
18 | class EWiseAdd(TensorOp):
19 | def compute(self, a: NDArray, b: NDArray):
20 | return a + b
21 |
22 | def gradient(self, out_grad: Tensor, node: Tensor):
23 | return out_grad, out_grad
24 |
25 |
26 | def add(a, b):
27 | return EWiseAdd()(a, b)
28 |
29 |
30 | class AddScalar(TensorOp):
31 | def __init__(self, scalar):
32 | self.scalar = scalar
33 |
34 | def compute(self, a: NDArray):
35 | return a + self.scalar
36 |
37 | def gradient(self, out_grad: Tensor, node: Tensor):
38 | return out_grad
39 |
40 |
41 | def add_scalar(a, scalar):
42 | return AddScalar(scalar)(a)
43 |
44 |
45 | class EWiseMul(TensorOp):
46 | def compute(self, a: NDArray, b: NDArray):
47 | return a * b
48 |
49 | def gradient(self, out_grad: Tensor, node: Tensor):
50 | lhs, rhs = node.inputs
51 | return out_grad * rhs, out_grad * lhs
52 |
53 |
54 | def multiply(a, b):
55 | return EWiseMul()(a, b)
56 |
57 |
58 | class MulScalar(TensorOp):
59 | def __init__(self, scalar):
60 | self.scalar = scalar
61 |
62 | def compute(self, a: NDArray):
63 | return a * self.scalar
64 |
65 | def gradient(self, out_grad: Tensor, node: Tensor):
66 | return (out_grad * self.scalar,)
67 |
68 |
69 | def mul_scalar(a, scalar):
70 | return MulScalar(scalar)(a)
71 |
72 |
73 | class EWisePow(TensorOp):
74 | """Op to element-wise raise a tensor to a power."""
75 |
76 | def compute(self, a: NDArray, b: NDArray) -> NDArray:
77 | ### BEGIN YOUR SOLUTION
78 | raise NotImplementedError()
79 | ### END YOUR SOLUTION
80 |
81 | def gradient(self, out_grad, node):
82 | ### BEGIN YOUR SOLUTION
83 | raise NotImplementedError()
84 | ### END YOUR SOLUTION
85 |
86 |
87 | def power(a, b):
88 | return EWisePow()(a, b)
89 |
90 |
91 | class PowerScalar(TensorOp):
92 |     """Op to raise a tensor to an (integer) power."""
93 |
94 | def __init__(self, scalar: int):
95 | self.scalar = scalar
96 |
97 | def compute(self, a: NDArray) -> NDArray:
98 | ### BEGIN YOUR SOLUTION
99 | raise NotImplementedError()
100 | ### END YOUR SOLUTION
101 |
102 | def gradient(self, out_grad, node):
103 | ### BEGIN YOUR SOLUTION
104 | raise NotImplementedError()
105 | ### END YOUR SOLUTION
106 |
107 |
108 | def power_scalar(a, scalar):
109 | return PowerScalar(scalar)(a)
110 |
111 |
112 | class EWiseDiv(TensorOp):
113 | """Op to element-wise divide two nodes."""
114 |
115 | def compute(self, a, b):
116 | ### BEGIN YOUR SOLUTION
117 | raise NotImplementedError()
118 | ### END YOUR SOLUTION
119 |
120 | def gradient(self, out_grad, node):
121 | ### BEGIN YOUR SOLUTION
122 | raise NotImplementedError()
123 | ### END YOUR SOLUTION
124 |
125 |
126 | def divide(a, b):
127 | return EWiseDiv()(a, b)
128 |
129 |
130 | class DivScalar(TensorOp):
131 | def __init__(self, scalar):
132 | self.scalar = scalar
133 |
134 | def compute(self, a):
135 | ### BEGIN YOUR SOLUTION
136 | raise NotImplementedError()
137 | ### END YOUR SOLUTION
138 |
139 | def gradient(self, out_grad, node):
140 | ### BEGIN YOUR SOLUTION
141 | raise NotImplementedError()
142 | ### END YOUR SOLUTION
143 |
144 |
145 | def divide_scalar(a, scalar):
146 | return DivScalar(scalar)(a)
147 |
148 |
149 | class Transpose(TensorOp):
150 | def __init__(self, axes: Optional[tuple] = None):
151 | self.axes = axes
152 |
153 | def compute(self, a):
154 | ### BEGIN YOUR SOLUTION
155 | raise NotImplementedError()
156 | ### END YOUR SOLUTION
157 |
158 | def gradient(self, out_grad, node):
159 | ### BEGIN YOUR SOLUTION
160 | raise NotImplementedError()
161 | ### END YOUR SOLUTION
162 |
163 |
164 | def transpose(a, axes=None):
165 | return Transpose(axes)(a)
166 |
167 |
168 | class Reshape(TensorOp):
169 | def __init__(self, shape):
170 | self.shape = shape
171 |
172 | def compute(self, a):
173 | ### BEGIN YOUR SOLUTION
174 | raise NotImplementedError()
175 | ### END YOUR SOLUTION
176 |
177 | def gradient(self, out_grad, node):
178 | ### BEGIN YOUR SOLUTION
179 | raise NotImplementedError()
180 | ### END YOUR SOLUTION
181 |
182 |
183 | def reshape(a, shape):
184 | return Reshape(shape)(a)
185 |
186 |
187 | class BroadcastTo(TensorOp):
188 | def __init__(self, shape):
189 | self.shape = shape
190 |
191 | def compute(self, a):
192 | ### BEGIN YOUR SOLUTION
193 | raise NotImplementedError()
194 | ### END YOUR SOLUTION
195 |
196 | def gradient(self, out_grad, node):
197 | ### BEGIN YOUR SOLUTION
198 | raise NotImplementedError()
199 | ### END YOUR SOLUTION
200 |
201 |
202 | def broadcast_to(a, shape):
203 | return BroadcastTo(shape)(a)
204 |
205 |
206 | class Summation(TensorOp):
207 | def __init__(self, axes: Optional[tuple] = None):
208 | self.axes = axes
209 |
210 | def compute(self, a):
211 | ### BEGIN YOUR SOLUTION
212 | raise NotImplementedError()
213 | ### END YOUR SOLUTION
214 |
215 | def gradient(self, out_grad, node):
216 | ### BEGIN YOUR SOLUTION
217 | raise NotImplementedError()
218 | ### END YOUR SOLUTION
219 |
220 |
221 | def summation(a, axes=None):
222 | return Summation(axes)(a)
223 |
224 |
225 | class MatMul(TensorOp):
226 | def compute(self, a, b):
227 | ### BEGIN YOUR SOLUTION
228 | raise NotImplementedError()
229 | ### END YOUR SOLUTION
230 |
231 | def gradient(self, out_grad, node):
232 | ### BEGIN YOUR SOLUTION
233 | raise NotImplementedError()
234 | ### END YOUR SOLUTION
235 |
236 |
237 | def matmul(a, b):
238 | return MatMul()(a, b)
239 |
240 |
241 | class Negate(TensorOp):
242 | def compute(self, a):
243 | ### BEGIN YOUR SOLUTION
244 | raise NotImplementedError()
245 | ### END YOUR SOLUTION
246 |
247 | def gradient(self, out_grad, node):
248 | ### BEGIN YOUR SOLUTION
249 | raise NotImplementedError()
250 | ### END YOUR SOLUTION
251 |
252 |
253 | def negate(a):
254 | return Negate()(a)
255 |
256 |
257 | class Log(TensorOp):
258 | def compute(self, a):
259 | ### BEGIN YOUR SOLUTION
260 | raise NotImplementedError()
261 | ### END YOUR SOLUTION
262 |
263 | def gradient(self, out_grad, node):
264 | ### BEGIN YOUR SOLUTION
265 | raise NotImplementedError()
266 | ### END YOUR SOLUTION
267 |
268 |
269 | def log(a):
270 | return Log()(a)
271 |
272 |
273 | class Exp(TensorOp):
274 | def compute(self, a):
275 | ### BEGIN YOUR SOLUTION
276 | raise NotImplementedError()
277 | ### END YOUR SOLUTION
278 |
279 | def gradient(self, out_grad, node):
280 | ### BEGIN YOUR SOLUTION
281 | raise NotImplementedError()
282 | ### END YOUR SOLUTION
283 |
284 |
285 | def exp(a):
286 | return Exp()(a)
287 |
288 |
289 | class ReLU(TensorOp):
290 | def compute(self, a):
291 | ### BEGIN YOUR SOLUTION
292 | raise NotImplementedError()
293 | ### END YOUR SOLUTION
294 |
295 | def gradient(self, out_grad, node):
296 | ### BEGIN YOUR SOLUTION
297 | raise NotImplementedError()
298 | ### END YOUR SOLUTION
299 |
300 |
301 | def relu(a):
302 | return ReLU()(a)
303 |
304 |
305 |
--------------------------------------------------------------------------------
/python/needle/ops/ops_tuple.py:
--------------------------------------------------------------------------------
1 | from ..autograd import Op, Tensor, TensorTuple, Value, TensorOp, TensorTupleOp
2 | import needle.init as init
3 |
4 | class MakeTensorTuple(TensorTupleOp):
5 | def compute(self, *args) -> tuple:
6 | return tuple(args)
7 |
8 | def gradient(self, out_grad, node):
9 | assert isinstance(out_grad, TensorTuple)
10 | return tuple([out_grad[i] for i in range(len(out_grad))])
11 |
12 |
13 | def make_tuple(*args):
14 | return MakeTensorTuple()(*args)
15 |
16 |
17 | class TupleGetItem(TensorOp):
18 | def __init__(self, index):
19 | self.index = index
20 |
21 | def __call__(self, a: TensorTuple, fold_const=True) -> Value:
22 | assert isinstance(a, TensorTuple)
23 | # constant folding
24 | if fold_const and isinstance(a.op, MakeTensorTuple):
25 | return a.inputs[self.index]
26 | return Tensor.make_from_op(self, [a])
27 |
28 | def compute(self, a):
29 | return a[self.index]
30 |
31 | def gradient(self, out_grad, node):
32 | index = self.index
33 | in_grad = []
34 | for i, value in enumerate(node.inputs[0]):
35 | if i != index:
36 | in_grad.append(init.zeros_like(value))
37 | else:
38 | in_grad.append(out_grad)
39 | return MakeTensorTuple()(*in_grad)
40 |
41 |
42 | def tuple_get_item(value, index):
43 | return TupleGetItem(index)(value)
44 |
45 |
46 | class FusedAddScalars(TensorTupleOp):
47 | def __init__(self, c0: float, c1: float):
48 | self.c0 = c0
49 | self.c1 = c1
50 |
51 | def compute(self, a):
52 | return a + self.c0, a + self.c1
53 |
54 | def gradient(self, out_grad, node):
55 | return out_grad[0] + out_grad[1]
56 |
57 |
58 | def fused_add_scalars(x, c0, c1):
59 | return FusedAddScalars(c0, c1)(x)
60 |
--------------------------------------------------------------------------------
/python/needle/optim.py:
--------------------------------------------------------------------------------
1 | """Optimization module"""
2 | import needle as ndl
3 | import numpy as np
4 |
5 |
6 | class Optimizer:
7 | def __init__(self, params):
8 | self.params = params
9 |
10 | def step(self):
11 | raise NotImplementedError()
12 |
13 | def reset_grad(self):
14 | for p in self.params:
15 | p.grad = None
16 |
17 |
18 | class SGD(Optimizer):
19 | def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0):
20 | super().__init__(params)
21 | self.lr = lr
22 | self.momentum = momentum
23 | self.u = {}
24 | self.weight_decay = weight_decay
25 |
26 | def step(self):
27 | ### BEGIN YOUR SOLUTION
28 | raise NotImplementedError()
29 | ### END YOUR SOLUTION
30 |
31 | def clip_grad_norm(self, max_norm=0.25):
32 | """
33 | Clips gradient norm of parameters.
34 | """
35 | ### BEGIN YOUR SOLUTION
36 | raise NotImplementedError()
37 | ### END YOUR SOLUTION
38 |
39 |
40 | class Adam(Optimizer):
41 | def __init__(
42 | self,
43 | params,
44 | lr=0.01,
45 | beta1=0.9,
46 | beta2=0.999,
47 | eps=1e-8,
48 | weight_decay=0.0,
49 | ):
50 | super().__init__(params)
51 | self.lr = lr
52 | self.beta1 = beta1
53 | self.beta2 = beta2
54 | self.eps = eps
55 | self.weight_decay = weight_decay
56 | self.t = 0
57 |
58 | self.m = {}
59 | self.v = {}
60 |
61 | def step(self):
62 | ### BEGIN YOUR SOLUTION
63 | raise NotImplementedError()
64 | ### END YOUR SOLUTION
65 |
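# For reference, the update rules these step() stubs correspond to, with L2
# weight decay folded into the gradient (a sketch of one common convention;
# check the homework handout for the exact momentum scaling expected here):
#
#     SGD with momentum:
#         grad   = p.grad.data + weight_decay * p.data
#         u[p]   = momentum * u.get(p, 0) + (1 - momentum) * grad
#         p.data = p.data - lr * u[p]
#
#     Adam (with bias correction):
#         t += 1
#         grad   = p.grad.data + weight_decay * p.data
#         m[p]   = beta1 * m.get(p, 0) + (1 - beta1) * grad
#         v[p]   = beta2 * v.get(p, 0) + (1 - beta2) * grad ** 2
#         m_hat  = m[p] / (1 - beta1 ** t)
#         v_hat  = v[p] / (1 - beta2 ** t)
#         p.data = p.data - lr * m_hat / (v_hat ** 0.5 + eps)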
--------------------------------------------------------------------------------
/tests/hw2/test_nn_and_optim.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append("./python")
4 | import numpy as np
5 | import needle as ndl
6 | import needle.nn as nn
7 |
8 | sys.path.append("./apps")
9 | from mlp_resnet import *
10 |
11 | import mugrade
12 |
13 | """Deterministically generate a matrix"""
14 |
15 |
16 | def get_tensor(*shape, entropy=1):
17 | np.random.seed(np.prod(shape) * len(shape) * entropy)
18 | return ndl.Tensor(np.random.randint(0, 100, size=shape) / 20, dtype="float32")
19 |
20 |
21 | def get_int_tensor(*shape, low=0, high=10, entropy=1):
22 | np.random.seed(np.prod(shape) * len(shape) * entropy)
23 | return ndl.Tensor(np.random.randint(low, high, size=shape))
24 |
25 |
26 | def check_prng(*shape):
27 |     """We want to ensure that numpy generates the same random matrices on your machine/colab,
28 |     such that our tests will make sense,
29 |     so this matrix should match ours to full precision.
30 | """
31 | return get_tensor(*shape).cached_data
32 |
33 |
34 | def batchnorm_forward(*shape, affine=False):
35 | x = get_tensor(*shape)
36 | bn = ndl.nn.BatchNorm1d(shape[1])
37 | if affine:
38 | bn.weight.data = get_tensor(shape[1], entropy=42)
39 | bn.bias.data = get_tensor(shape[1], entropy=1337)
40 | return bn(x).cached_data
41 |
42 |
43 | def batchnorm_backward(*shape, affine=False):
44 | x = get_tensor(*shape)
45 | bn = ndl.nn.BatchNorm1d(shape[1])
46 | if affine:
47 | bn.weight.data = get_tensor(shape[1], entropy=42)
48 | bn.bias.data = get_tensor(shape[1], entropy=1337)
49 | y = (bn(x) ** 2).sum().backward()
50 | return x.grad.cached_data
51 |
52 |
53 | def flatten_forward(*shape):
54 | x = get_tensor(*shape)
55 | tform = ndl.nn.Flatten()
56 | return tform(x).cached_data
57 |
58 |
59 | def flatten_backward(*shape):
60 | x = get_tensor(*shape)
61 | tform = ndl.nn.Flatten()
62 | (tform(x) ** 2).sum().backward()
63 | return x.grad.cached_data
64 |
65 |
66 | def batchnorm_running_mean(*shape, iters=10):
67 | bn = ndl.nn.BatchNorm1d(shape[1])
68 | for i in range(iters):
69 | x = get_tensor(*shape, entropy=i)
70 | y = bn(x)
71 | return bn.running_mean.cached_data
72 |
73 |
74 | def batchnorm_running_var(*shape, iters=10):
75 | bn = ndl.nn.BatchNorm1d(shape[1])
76 | for i in range(iters):
77 | x = get_tensor(*shape, entropy=i)
78 | y = bn(x)
79 | return bn.running_var.cached_data
80 |
81 |
82 | def batchnorm_running_grad(*shape, iters=10):
83 | bn = ndl.nn.BatchNorm1d(shape[1])
84 | for i in range(iters):
85 | x = get_tensor(*shape, entropy=i)
86 | y = bn(x)
87 | bn.eval()
88 | (y**2).sum().backward()
89 | return x.grad.cached_data
90 |
91 |
92 | def relu_forward(*shape):
93 | f = ndl.nn.ReLU()
94 | x = get_tensor(*shape)
95 | return f(x).cached_data
96 |
97 |
98 | def relu_backward(*shape):
99 | f = ndl.nn.ReLU()
100 | x = get_tensor(*shape)
101 | (f(x) ** 2).sum().backward()
102 | return x.grad.cached_data
103 |
104 |
105 | def layernorm_forward(shape, dim):
106 | f = ndl.nn.LayerNorm1d(dim)
107 | x = get_tensor(*shape)
108 | return f(x).cached_data
109 |
110 |
111 | def layernorm_backward(shape, dims):
112 | f = ndl.nn.LayerNorm1d(dims)
113 | x = get_tensor(*shape)
114 | (f(x) ** 4).sum().backward()
115 | return x.grad.cached_data
116 |
117 | def logsoftmax_forward(shape, mult=1.0):
118 | x = get_tensor(*shape) * mult
119 | return ndl.ops.logsoftmax(x).cached_data
120 |
121 | def logsoftmax_backward(shape, mult=1.0):
122 | x = get_tensor(*shape)
123 | y = ndl.ops.logsoftmax(x * mult)
124 | z = (y**2).sum()
125 | z.backward()
126 | return x.grad.cached_data
127 |
128 | def softmax_loss_forward(rows, classes):
129 | x = get_tensor(rows, classes)
130 | y = get_int_tensor(rows, low=0, high=classes)
131 | f = ndl.nn.SoftmaxLoss()
132 | return np.array(f(x, y).cached_data)
133 |
134 |
135 | def softmax_loss_backward(rows, classes):
136 | x = get_tensor(rows, classes)
137 | y = get_int_tensor(rows, low=0, high=classes)
138 | f = ndl.nn.SoftmaxLoss()
139 | loss = f(x, y)
140 | loss.backward()
141 | return x.grad.cached_data
142 |
143 |
144 | def linear_forward(lhs_shape, rhs_shape):
145 | np.random.seed(199)
146 | f = ndl.nn.Linear(*lhs_shape)
147 | f.bias.data = get_tensor(lhs_shape[-1])
148 | x = get_tensor(*rhs_shape)
149 | return f(x).cached_data
150 |
151 |
152 | def linear_backward(lhs_shape, rhs_shape):
153 | np.random.seed(199)
154 | f = ndl.nn.Linear(*lhs_shape)
155 | f.bias.data = get_tensor(lhs_shape[-1])
156 | x = get_tensor(*rhs_shape)
157 | (f(x) ** 2).sum().backward()
158 | return x.grad.cached_data
159 |
160 |
161 | def sequential_forward(batches=3):
162 | np.random.seed(42)
163 | f = nn.Sequential(nn.Linear(5, 8), nn.ReLU(), nn.Linear(8, 5))
164 | x = get_tensor(batches, 5)
165 | return f(x).cached_data
166 |
167 |
168 | def sequential_backward(batches=3):
169 | np.random.seed(42)
170 | f = nn.Sequential(nn.Linear(5, 8), nn.ReLU(), nn.Linear(8, 5))
171 | x = get_tensor(batches, 5)
172 | f(x).sum().backward()
173 | return x.grad.cached_data
174 |
175 |
176 | def residual_forward(shape=(5, 5)):
177 | np.random.seed(42)
178 | f = nn.Residual(
179 | nn.Sequential(nn.Linear(*shape), nn.ReLU(), nn.Linear(*shape[::-1]))
180 | )
181 | x = get_tensor(*shape[::-1])
182 | return f(x).cached_data
183 |
184 |
185 | def residual_backward(shape=(5, 5)):
186 | np.random.seed(42)
187 | f = nn.Residual(
188 | nn.Sequential(nn.Linear(*shape), nn.ReLU(), nn.Linear(*shape[::-1]))
189 | )
190 | x = get_tensor(*shape[::-1])
191 | f(x).sum().backward()
192 | return x.grad.cached_data
193 |
194 |
195 | def learn_model_1d(feature_size, nclasses, _model, optimizer, epochs=1, **kwargs):
196 | np.random.seed(42)
197 | model = _model([])
198 | X = get_tensor(1024, feature_size).cached_data
199 | y = get_int_tensor(1024, low=0, high=nclasses).cached_data.astype(np.uint8)
200 | m = X.shape[0]
201 | batch = 32
202 |
203 | loss_func = nn.SoftmaxLoss()
204 | opt = optimizer(model.parameters(), **kwargs)
205 |
206 | for _ in range(epochs):
207 | for i, (X0, y0) in enumerate(
208 | zip(np.array_split(X, m // batch), np.array_split(y, m // batch))
209 | ):
210 | opt.reset_grad()
211 | X0, y0 = ndl.Tensor(X0, dtype="float32"), ndl.Tensor(y0)
212 | out = model(X0)
213 | loss = loss_func(out, y0)
214 | loss.backward()
215 | # Opt should not change gradients.
216 | grad_before = model.parameters()[0].grad.detach().cached_data
217 | opt.step()
218 | grad_after = model.parameters()[0].grad.detach().cached_data
219 | np.testing.assert_allclose(
220 | grad_before,
221 | grad_after,
222 | rtol=1e-5,
223 | atol=1e-5,
224 | err_msg="Optim should not modify gradients in place",
225 | )
226 |
227 | return np.array(loss.cached_data)
228 |
229 |
230 | def learn_model_1d_eval(feature_size, nclasses, _model, optimizer, epochs=1, **kwargs):
231 | np.random.seed(42)
232 | model = _model([])
233 | X = get_tensor(1024, feature_size).cached_data
234 | y = get_int_tensor(1024, low=0, high=nclasses).cached_data.astype(np.uint8)
235 | m = X.shape[0]
236 | batch = 32
237 |
238 | loss_func = nn.SoftmaxLoss()
239 | opt = optimizer(model.parameters(), **kwargs)
240 |
241 | for i, (X0, y0) in enumerate(
242 | zip(np.array_split(X, m // batch), np.array_split(y, m // batch))
243 | ):
244 | opt.reset_grad()
245 | X0, y0 = ndl.Tensor(X0, dtype="float32"), ndl.Tensor(y0)
246 | out = model(X0)
247 | loss = loss_func(out, y0)
248 | loss.backward()
249 | opt.step()
250 |
251 | X_test = ndl.Tensor(get_tensor(batch, feature_size).cached_data)
252 | y_test = ndl.Tensor(
253 | get_int_tensor(batch, low=0, high=nclasses).cached_data.astype(np.uint8)
254 | )
255 |
256 | model.eval()
257 |
258 | return np.array(loss_func(model(X_test), y_test).cached_data)
259 |
260 |
261 | def init_a_tensor_of_shape(shape, init_fn):
262 | x = get_tensor(*shape)
263 | np.random.seed(42)
264 | init_fn(x)
265 | return x.cached_data
266 |
267 |
268 | def global_tensor_count():
269 | return np.array(ndl.autograd.TENSOR_COUNTER)
270 |
271 |
272 | def nn_linear_weight_init():
273 | np.random.seed(1337)
274 | f = ndl.nn.Linear(7, 4)
275 | f.weight.cached_data
276 | return f.weight.cached_data
277 |
278 |
279 | def nn_linear_bias_init():
280 | np.random.seed(1337)
281 | f = ndl.nn.Linear(7, 4)
282 | return f.bias.cached_data
283 |
284 |
285 | class UselessModule(ndl.nn.Module):
286 | def __init__(self):
287 | super().__init__()
288 | self.stuff = {
289 | "layer1": nn.Linear(4, 4),
290 | "layer2": [nn.Dropout(0.1), nn.Sequential(nn.Linear(4, 4))],
291 | }
292 |
293 | def forward(self, x):
294 | raise NotImplementedError()
295 |
296 |
297 | def check_training_mode():
298 | model = nn.Sequential(
299 | nn.BatchNorm1d(4),
300 | nn.Sequential(
301 | nn.LayerNorm1d(4),
302 | nn.Linear(4, 4),
303 | nn.Dropout(0.1),
304 | ),
305 | nn.Linear(4, 4),
306 | UselessModule(),
307 | )
308 |
309 | model_refs = [
310 | model.modules[0],
311 | model.modules[1].modules[0],
312 | model.modules[1].modules[1],
313 | model.modules[1].modules[2],
314 | model.modules[2],
315 | model.modules[3],
316 | model.modules[3].stuff["layer1"],
317 | model.modules[3].stuff["layer2"][0],
318 | model.modules[3].stuff["layer2"][1].modules[0],
319 | ]
320 |
321 | eval_mode = [1 if not x.training else 0 for x in model_refs]
322 | model.eval()
323 | eval_mode.extend([1 if not x.training else 0 for x in model_refs])
324 | model.train()
325 | eval_mode.extend([1 if not x.training else 0 for x in model_refs])
326 |
327 | return np.array(eval_mode)
328 |
329 |
330 | def power_scalar_forward(shape, power=2):
331 | x = get_tensor(*shape)
332 | return (x**power).cached_data
333 |
334 |
335 | def power_scalar_backward(shape, power=2):
336 | x = get_tensor(*shape)
337 | y = (x**power).sum()
338 | y.backward()
339 | return x.grad.cached_data
340 |
341 |
342 | def logsumexp_forward(shape, axes):
343 | x = get_tensor(*shape)
344 | return (ndl.ops.logsumexp(x, axes=axes)).cached_data
345 |
346 |
347 | def logsumexp_backward(shape, axes):
348 | x = get_tensor(*shape)
349 | y = (ndl.ops.logsumexp(x, axes=axes) ** 2).sum()
350 | y.backward()
351 | return x.grad.cached_data
352 |
353 |
354 | def dropout_forward(shape, prob=0.5):
355 | np.random.seed(3)
356 | x = get_tensor(*shape)
357 | f = nn.Dropout(prob)
358 | return f(x).cached_data
359 |
360 |
361 | def dropout_backward(shape, prob=0.5):
362 | np.random.seed(3)
363 | x = get_tensor(*shape)
364 | f = nn.Dropout(prob)
365 | y = f(x).sum()
366 | y.backward()
367 | return x.grad.cached_data
368 |
369 |
370 | def num_params(model):
371 | return np.sum([np.prod(x.shape) for x in model.parameters()])
372 |
373 |
374 | def residual_block_num_params(dim, hidden_dim, norm):
375 | model = ResidualBlock(dim, hidden_dim, norm)
376 | return np.array(num_params(model))
377 |
378 |
379 | def residual_block_forward(dim, hidden_dim, norm, drop_prob):
380 | np.random.seed(2)
381 | input_tensor = ndl.Tensor(np.random.randn(1, dim))
382 | output_tensor = ResidualBlock(dim, hidden_dim, norm, drop_prob)(input_tensor)
383 | return output_tensor.numpy()
384 |
385 |
386 | def mlp_resnet_num_params(dim, hidden_dim, num_blocks, num_classes, norm):
387 | model = MLPResNet(dim, hidden_dim, num_blocks, num_classes, norm)
388 | return np.array(num_params(model))
389 |
390 |
391 | def mlp_resnet_forward(dim, hidden_dim, num_blocks, num_classes, norm, drop_prob):
392 | np.random.seed(4)
393 | input_tensor = ndl.Tensor(np.random.randn(2, dim), dtype=np.float32)
394 | output_tensor = MLPResNet(
395 | dim, hidden_dim, num_blocks, num_classes, norm, drop_prob
396 | )(input_tensor)
397 | return output_tensor.numpy()
398 |
399 |
400 | def train_epoch_1(hidden_dim, batch_size, optimizer, **kwargs):
401 | np.random.seed(1)
402 | train_dataset = ndl.data.MNISTDataset(
403 | "./data/train-images-idx3-ubyte.gz", "./data/train-labels-idx1-ubyte.gz"
404 | )
405 | train_dataloader = ndl.data.DataLoader(dataset=train_dataset, batch_size=batch_size)
406 |
407 | model = MLPResNet(784, hidden_dim)
408 | opt = optimizer(model.parameters(), **kwargs)
409 | model.eval()
410 | return np.array(epoch(train_dataloader, model, opt))
411 |
412 |
413 | def eval_epoch_1(hidden_dim, batch_size):
414 | np.random.seed(1)
415 | test_dataset = ndl.data.MNISTDataset(
416 | "./data/t10k-images-idx3-ubyte.gz", "./data/t10k-labels-idx1-ubyte.gz"
417 | )
418 | test_dataloader = ndl.data.DataLoader(
419 | dataset=test_dataset, batch_size=batch_size, shuffle=False
420 | )
421 |
422 | model = MLPResNet(784, hidden_dim)
423 | model.train()
424 | return np.array(epoch(test_dataloader, model))
425 |
426 |
427 | def train_mnist_1(batch_size, epochs, optimizer, lr, weight_decay, hidden_dim):
428 | np.random.seed(1)
429 | out = train_mnist(
430 | batch_size, epochs, optimizer, lr, weight_decay, hidden_dim, data_dir="./data"
431 | )
432 | return np.array(out)
433 |
434 |
435 | def test_check_prng_contact_us_if_this_fails_1():
436 | np.testing.assert_allclose(
437 | check_prng(3, 3),
438 | np.array(
439 | [[2.1, 0.95, 3.45], [3.1, 2.45, 2.3], [3.3, 0.4, 1.2]], dtype=np.float32
440 | ),
441 | rtol=1e-08,
442 | atol=1e-08,
443 | )
444 |
445 |
446 | def test_op_power_scalar_forward_1():
447 | np.testing.assert_allclose(
448 | power_scalar_forward((2, 2), power=2),
449 | np.array([[11.222499, 17.639997], [0.0625, 20.25]], dtype=np.float32),
450 | rtol=1e-5,
451 | atol=1e-5,
452 | )
453 |
454 |
455 | def test_op_power_scalar_forward_2():
456 | np.testing.assert_allclose(
457 | power_scalar_forward((2, 2), power=-1.5),
458 | np.array([[0.16309206, 0.11617859], [8.0, 0.10475656]], dtype=np.float32),
459 | rtol=1e-5,
460 | atol=1e-5,
461 | )
462 |
463 |
464 | def test_op_power_scalar_backward_1():
465 | np.testing.assert_allclose(
466 | power_scalar_backward((2, 2), power=2),
467 | np.array([[6.7, 8.4], [0.5, 9.0]], dtype=np.float32),
468 | rtol=1e-5,
469 | atol=1e-5,
470 | )
471 |
472 |
473 | def test_op_logsoftmax_forward_1():
474 | np.testing.assert_allclose(logsoftmax_forward((3, 3)),
475 | np.array([[-1.6436583 , -2.7936583 , -0.29365814],
476 | [-0.6787312 , -1.3287311 , -1.4787312 ],
477 | [-0.16337626, -3.0633762 , -2.2633762 ]], dtype=np.float32), rtol=1e-5, atol=1e-5)
478 |
479 | def test_op_logsoftmax_stable_forward_1():
480 | np.testing.assert_allclose(logsoftmax_forward((3, 3), mult=1e5),
481 | np.array([[-135000.02, -250000. , 0. ],
482 | [ 0. , -65000. , -80000. ],
483 | [ 0. , -290000. , -210000. ]], dtype=np.float32), rtol=1e-5, atol=1e-5)
484 |
485 | def test_op_logsoftmax_backward_1():
486 | np.testing.assert_allclose(logsoftmax_backward((3, 3)),
487 | np.array([[-1.4585897 , -5.008274 , 6.4668627 ],
488 | [ 2.1793516 , -0.81108296, -1.3682691 ],
489 | [ 8.998467 , -5.613649 , -3.3848193 ]], dtype=np.float32), rtol=1e-5, atol=1e-5)
490 |
491 | def submit_op_logsoftmax():
492 | mugrade.submit(logsoftmax_forward((3, 4)))
493 | mugrade.submit(logsoftmax_forward((3, 5), mult=1e5))
494 | mugrade.submit(logsoftmax_forward((3, 6), mult=1e5))
495 | mugrade.submit(logsoftmax_backward((1, 3)))
496 | mugrade.submit(logsoftmax_backward((3, 6), mult=1e5))
497 |
498 |
499 | def test_op_logsumexp_forward_1():
500 | np.testing.assert_allclose(
501 | logsumexp_forward((3, 3, 3), (1, 2)),
502 | np.array([5.366029, 4.9753823, 6.208126], dtype=np.float32),
503 | rtol=1e-5,
504 | atol=1e-5,
505 | )
506 |
507 |
508 | def test_op_logsumexp_forward_2():
509 | np.testing.assert_allclose(
510 | logsumexp_forward((3, 3, 3), None),
511 | np.array([6.7517853], dtype=np.float32),
512 | rtol=1e-5,
513 | atol=1e-5,
514 | )
515 |
516 |
517 | def test_op_logsumexp_forward_3():
518 | np.testing.assert_allclose(
519 | logsumexp_forward((1, 2, 3, 4), (0, 2)),
520 | np.array(
521 | [
522 | [5.276974, 5.047317, 3.778802, 5.0103745],
523 | [5.087831, 4.391712, 5.025037, 2.0214698],
524 | ],
525 | dtype=np.float32,
526 | ),
527 | rtol=1e-5,
528 | atol=1e-5,
529 | )
530 |
531 |
532 | def test_op_logsumexp_forward_4():
533 | np.testing.assert_allclose(
534 | logsumexp_forward((3, 10), (1,)),
535 | np.array([5.705309, 5.976375, 5.696459], dtype=np.float32),
536 | rtol=1e-5,
537 | atol=1e-5,
538 | )
539 |
540 |
541 | def test_op_logsumexp_forward_5():
542 | test_data = ndl.ops.logsumexp(
543 | ndl.Tensor(np.array([[1e10, 1e9, 1e8, -10], [1e-10, 1e9, 1e8, -10]])), (0,)
544 | ).numpy()
545 | np.testing.assert_allclose(
546 | test_data,
547 | np.array([1.00000000e10, 1.00000000e09, 1.00000001e08, -9.30685282e00]),
548 | rtol=1e-5,
549 | atol=1e-5,
550 | )
551 |
552 |
553 | def test_op_logsumexp_backward_1():
554 | np.testing.assert_allclose(
555 | logsumexp_backward((3, 1), (1,)),
556 | np.array([[1.0], [7.3], [9.9]], dtype=np.float32),
557 | rtol=1e-5,
558 | atol=1e-5,
559 | )
560 |
561 |
562 | def test_op_logsumexp_backward_2():
563 | np.testing.assert_allclose(
564 | logsumexp_backward((3, 3, 3), (1, 2)),
565 | np.array(
566 | [
567 | [
568 | [1.4293308, 1.2933122, 0.82465225],
569 | [0.50017685, 2.1323113, 2.1323113],
570 | [1.4293308, 0.58112264, 0.40951014],
571 | ],
572 | [
573 | [0.3578173, 0.07983983, 4.359107],
574 | [1.1300558, 0.561169, 0.1132981],
575 | [0.9252113, 0.65198547, 1.7722803],
576 | ],
577 | [
578 | [0.2755132, 2.365242, 2.888913],
579 | [0.05291228, 1.1745441, 0.02627547],
580 | [2.748018, 0.13681579, 2.748018],
581 | ],
582 | ],
583 | dtype=np.float32,
584 | ),
585 | rtol=1e-5,
586 | atol=1e-5,
587 | )
588 |
589 |
590 | def test_op_logsumexp_backward_3():
591 | np.testing.assert_allclose(
592 | logsumexp_backward((3, 3, 3), (0, 2)),
593 | np.array(
594 | [
595 | [
596 | [0.92824626, 0.839912, 0.5355515],
597 | [0.59857905, 2.551811, 2.551811],
598 | [1.0213376, 0.41524494, 0.29261813],
599 | ],
600 | [
601 | [0.16957533, 0.03783737, 2.0658503],
602 | [0.98689, 0.49007502, 0.09894446],
603 | [0.48244575, 0.3399738, 0.9241446],
604 | ],
605 | [
606 | [0.358991, 3.081887, 3.764224],
607 | [0.12704718, 2.820187, 0.06308978],
608 | [3.9397335, 0.19614778, 3.9397335],
609 | ],
610 | ],
611 | dtype=np.float32,
612 | ),
613 | rtol=1e-5,
614 | atol=1e-5,
615 | )
616 |
617 |
618 | def test_op_logsumexp_backward_5():
619 | grad_compare = ndl.Tensor(np.array([[1e10, 1e9, 1e8, -10], [1e-10, 1e9, 1e8, -10]]))
620 | test_data = (ndl.ops.logsumexp(grad_compare, (0,)) ** 2).sum().backward()
621 | np.testing.assert_allclose(
622 | grad_compare.grad.cached_data,
623 | np.array(
624 | [
625 | [2.00000000e10, 9.99999999e08, 1.00000001e08, -9.30685282e00],
626 | [0.00000000e00, 9.99999999e08, 1.00000001e08, -9.30685282e00],
627 | ]
628 | ),
629 | rtol=1e-5,
630 | atol=1e-5,
631 | )
632 |
633 |
634 | def submit_op_logsumexp():
635 | mugrade.submit(logsumexp_forward((2, 2, 2), None))
636 | mugrade.submit(logsumexp_forward((1, 2, 3), (0,)))
637 | mugrade.submit(logsumexp_forward((2, 3, 3), (1, 2)))
638 | mugrade.submit(logsumexp_forward((1, 2, 2, 2, 2), (1, 2, 3, 4)))
639 | mugrade.submit(logsumexp_forward((1, 2, 2, 2, 2), (0, 1, 3)))
640 | mugrade.submit(logsumexp_backward((2, 2, 2), None))
641 | mugrade.submit(logsumexp_backward((1, 2, 3), (0,)))
642 | mugrade.submit(logsumexp_backward((2, 3, 3), (1, 2)))
643 | mugrade.submit(logsumexp_backward((1, 2, 2, 2, 2), (1, 2, 3, 4)))
644 | mugrade.submit(logsumexp_backward((1, 2, 2, 2, 2), (0, 1, 3)))
645 |
646 |
647 | def test_op_logsumexp_backward_4():
648 | np.testing.assert_allclose(
649 | logsumexp_backward((1, 2, 3, 4), None),
650 | np.array(
651 | [
652 | [
653 | [
654 | [0.96463485, 1.30212122, 0.09671321, 1.84779774],
655 | [1.84779774, 0.39219132, 0.21523925, 0.30543892],
656 | [0.01952606, 0.55654611, 0.32109909, 0.01598658],
657 | ],
658 | [
659 | [1.30212122, 0.83026929, 0.30543892, 0.01680623],
660 | [0.29054249, 0.07532032, 1.84779774, 0.05307731],
661 | [0.75125862, 0.26289377, 0.04802637, 0.03932065],
662 | ],
663 | ]
664 | ],
665 | dtype=np.float32,
666 | ),
667 | rtol=1e-5,
668 | atol=1e-5,
669 | )
670 |
671 |
672 | def test_init_kaiming_uniform():
673 | np.random.seed(42)
674 | np.testing.assert_allclose(
675 | ndl.init.kaiming_uniform(3, 5).numpy(),
676 | np.array(
677 | [
678 | [-0.35485414, 1.2748126, 0.65617794, 0.27904832, -0.9729262],
679 | [-0.97299445, -1.2499284, 1.0357026, 0.28599644, 0.58851814],
680 | [-1.3559918, 1.3291057, 0.9402898, -0.81362784, -0.8999349],
681 | ],
682 | dtype=np.float32,
683 | ),
684 | rtol=1e-4,
685 | atol=1e-4,
686 | )
687 |
688 |
689 | def test_init_kaiming_normal():
690 | np.random.seed(42)
691 | np.testing.assert_allclose(
692 | ndl.init.kaiming_normal(3, 5).numpy(),
693 | np.array(
694 | [
695 | [0.4055654, -0.11289233, 0.5288355, 1.2435486, -0.19118543],
696 | [-0.19117202, 1.2894219, 0.62660784, -0.38332424, 0.4429984],
697 | [-0.37837896, -0.38026676, 0.19756137, -1.5621868, -1.4083896],
698 | ],
699 | dtype=np.float32,
700 | ),
701 | rtol=1e-4,
702 | atol=1e-4,
703 | )
704 |
705 |
706 | def test_init_xavier_uniform():
707 | np.random.seed(42)
708 | np.testing.assert_allclose(
709 | ndl.init.xavier_uniform(3, 5, gain=1.5).numpy(),
710 | np.array(
711 | [
712 | [-0.32595432, 1.1709901, 0.60273796, 0.25632226, -0.8936898],
713 | [-0.89375246, -1.1481324, 0.95135355, 0.26270452, 0.54058844],
714 | [-1.245558, 1.2208616, 0.8637113, -0.74736494, -0.826643],
715 | ],
716 | dtype=np.float32,
717 | ),
718 | rtol=1e-4,
719 | atol=1e-4,
720 | )
721 |
722 |
723 | def test_init_xavier_normal():
724 | np.random.seed(42)
725 | np.testing.assert_allclose(
726 | ndl.init.xavier_normal(3, 5, gain=0.33).numpy(),
727 | np.array(
728 | [
729 | [0.08195783, -0.022813609, 0.10686861, 0.25129992, -0.038635306],
730 | [-0.038632598, 0.2605701, 0.12662673, -0.07746328, 0.08952241],
731 | [-0.07646392, -0.07684541, 0.039923776, -0.31569123, -0.28461143],
732 | ],
733 | dtype=np.float32,
734 | ),
735 | rtol=1e-4,
736 | atol=1e-4,
737 | )
738 |
739 |
740 | def submit_init():
741 | np.random.seed(0)
742 | mugrade.submit(ndl.init.kaiming_normal(2, 5).numpy())
743 | mugrade.submit(ndl.init.kaiming_uniform(2, 5).numpy())
744 | mugrade.submit(ndl.init.xavier_uniform(2, 5, gain=0.33).numpy())
745 | mugrade.submit(ndl.init.xavier_normal(2, 5, gain=1.3).numpy())
746 |
747 |
748 | def test_nn_linear_weight_init_1():
749 | np.testing.assert_allclose(
750 | nn_linear_weight_init(),
751 | np.array(
752 | [
753 | [-4.4064468e-01, -6.3199449e-01, -4.1082984e-01, -7.5330488e-02],
754 | [-3.3144259e-01, 3.4056887e-02, -4.4079605e-01, 8.8153863e-01],
755 | [4.3108878e-01, -7.1237373e-01, -2.1057765e-01, 2.3793796e-01],
756 | [-6.9425780e-01, 8.9535803e-01, -1.0512712e-01, 5.3615785e-01],
757 | [5.4460180e-01, -2.5689366e-01, -1.5534532e-01, 1.5601574e-01],
758 | [4.8174453e-01, -5.7806653e-01, -3.9223823e-01, 3.1518409e-01],
759 | [-6.5129338e-04, -5.9517515e-01, -1.6083106e-01, -5.5698222e-01],
760 | ],
761 | dtype=np.float32,
762 | ),
763 | rtol=1e-5,
764 | atol=1e-5,
765 | )
766 |
767 |
768 | def test_nn_linear_bias_init_1():
769 | np.testing.assert_allclose(
770 | nn_linear_bias_init(),
771 | np.array([[0.077647, 0.814139, -0.770975, 1.120297]], dtype=np.float32),
772 | rtol=1e-5,
773 | atol=1e-5,
774 | )
775 |
776 |
777 | def test_nn_linear_forward_1():
778 | np.testing.assert_allclose(
779 | linear_forward((10, 5), (1, 10)),
780 | np.array([[3.849948, 9.50499, 2.38029, 5.572587, 5.668391]], dtype=np.float32),
781 | rtol=1e-5,
782 | atol=1e-5,
783 | )
784 |
785 |
786 | def test_nn_linear_forward_2():
787 | np.testing.assert_allclose(
788 | linear_forward((10, 5), (3, 10)),
789 | np.array(
790 | [
791 | [7.763089, 10.086785, 0.380316, 6.242502, 6.944664],
792 | [2.548275, 7.747925, 5.343155, 2.065694, 9.871243],
793 | [2.871696, 7.466332, 4.236925, 2.461897, 8.209476],
794 | ],
795 | dtype=np.float32,
796 | ),
797 | rtol=1e-5,
798 | atol=1e-5,
799 | )
800 |
801 |
802 | def test_nn_linear_forward_3():
803 | np.testing.assert_allclose(
804 | linear_forward((10, 5), (1, 3, 10)),
805 | np.array(
806 | [
807 | [
808 | [4.351459, 8.782808, 3.935711, 3.03171, 8.014219],
809 | [5.214458, 8.728788, 2.376814, 5.672185, 4.974319],
810 | [1.343204, 8.639378, 2.604359, -0.282955, 9.864498],
811 | ]
812 | ],
813 | dtype=np.float32,
814 | ),
815 | rtol=1e-5,
816 | atol=1e-5,
817 | )
818 |
819 |
820 | def test_nn_linear_backward_1():
821 | np.testing.assert_allclose(
822 | linear_backward((10, 5), (1, 10)),
823 | np.array(
824 | [
825 | [
826 | 20.61148,
827 | 6.920893,
828 | -1.625556,
829 | -13.497676,
830 | -6.672813,
831 | 18.762121,
832 | 7.286628,
833 | 8.18535,
834 | 2.741301,
835 | 5.723689,
836 | ]
837 | ],
838 | dtype=np.float32,
839 | ),
840 | rtol=1e-5,
841 | atol=1e-5,
842 | )
843 |
844 |
845 | def test_nn_linear_backward_2():
846 | print(linear_backward((10, 5), (3, 10)))
847 | np.testing.assert_allclose(
848 | linear_backward((10, 5), (3, 10)),
849 | np.array(
850 | [
851 | [
852 | 24.548800,
853 | 8.775347,
854 | 4.387898,
855 | -21.248514,
856 | -3.9669373,
857 | 24.256767,
858 | 6.3171115,
859 | 6.029777,
860 | 0.8809935,
861 | 3.5995162,
862 | ],
863 | [
864 | 12.233745,
865 | -3.792646,
866 | -4.1903896,
867 | -5.106719,
868 | -12.004269,
869 | 11.967942,
870 | 11.939469,
871 | 19.314493,
872 | 10.631226,
873 | 14.510731,
874 | ],
875 | [
876 | 12.920014,
877 | -1.4545978,
878 | -3.0892954,
879 | -6.762379,
880 | -9.713004,
881 | 12.523148,
882 | 9.904757,
883 | 15.442993,
884 | 8.044141,
885 | 11.4106865,
886 | ],
887 | ],
888 | dtype=np.float32,
889 | ),
890 | rtol=1e-5,
891 | atol=1e-5,
892 | )
893 |
894 |
895 | def test_nn_linear_backward_3():
896 | print(linear_backward((10, 5), (1, 3, 10)))
897 | np.testing.assert_allclose(
898 | linear_backward((10, 5), (1, 3, 10)),
899 | np.array(
900 | [
901 | [
902 | [
903 | 16.318823,
904 | 0.3890714,
905 | -2.3196607,
906 | -10.607947,
907 | -8.891977,
908 | 16.04581,
909 | 9.475689,
910 | 14.571134,
911 | 6.581477,
912 | 10.204643,
913 | ],
914 | [
915 | 20.291656,
916 | 7.48733,
917 | 1.2581345,
918 | -14.285493,
919 | -6.0252004,
920 | 19.621624,
921 | 4.343303,
922 | 6.973201,
923 | -0.8103489,
924 | 4.037069,
925 | ],
926 | [
927 | 11.332953,
928 | -5.698288,
929 | -8.815561,
930 | -7.673438,
931 | -7.6161675,
932 | 9.361553,
933 | 17.341637,
934 | 17.269142,
935 | 18.1076,
936 | 14.261493,
937 | ],
938 | ]
939 | ],
940 | dtype=np.float32,
941 | ),
942 | rtol=1e-5,
943 | atol=1e-5,
944 | )
945 |
946 |
947 | def submit_nn_linear():
948 | mugrade.submit(linear_forward((3, 5), (1, 3)))
949 | mugrade.submit(linear_forward((3, 5), (3, 3)))
950 | mugrade.submit(linear_forward((3, 5), (1, 3, 3)))
951 | mugrade.submit(linear_backward((4, 5), (1, 4)))
952 | mugrade.submit(linear_backward((4, 5), (3, 4)))
953 | mugrade.submit(linear_backward((4, 5), (1, 3, 4)))
954 |
955 |
956 | def test_nn_relu_forward_1():
957 | np.testing.assert_allclose(
958 | relu_forward(2, 2),
959 | np.array([[3.35, 4.2], [0.25, 4.5]], dtype=np.float32),
960 | rtol=1e-5,
961 | atol=1e-5,
962 | )
963 |
964 |
965 | def test_nn_relu_backward_1():
966 | np.testing.assert_allclose(
967 | relu_backward(3, 2),
968 | np.array([[7.5, 2.7], [0.6, 0.2], [0.3, 6.7]], dtype=np.float32),
969 | rtol=1e-5,
970 | atol=1e-5,
971 | )
972 |
973 |
974 | def submit_nn_relu():
975 | mugrade.submit(relu_forward(2, 3))
976 | mugrade.submit(relu_backward(3, 4))
977 |
978 |
979 | def test_nn_sequential_forward_1():
980 | print(sequential_forward(batches=3))
981 | np.testing.assert_allclose(
982 | sequential_forward(batches=3),
983 | np.array(
984 | [
985 | [3.296263, 0.057031, 2.97568, -4.618432, -0.902491],
986 | [2.465332, -0.228394, 2.069803, -3.772378, -0.238334],
987 | [3.04427, -0.25623, 3.848721, -6.586399, -0.576819],
988 | ],
989 | dtype=np.float32,
990 | ),
991 | rtol=1e-5,
992 | atol=1e-5,
993 | )
994 |
995 |
996 | def test_nn_sequential_backward_1():
997 | np.testing.assert_allclose(
998 | sequential_backward(batches=3),
999 | np.array(
1000 | [
1001 | [0.802697, -1.0971, 0.120842, 0.033051, 0.241105],
1002 | [-0.364489, 0.651385, 0.482428, 0.925252, -1.233545],
1003 | [0.802697, -1.0971, 0.120842, 0.033051, 0.241105],
1004 | ],
1005 | dtype=np.float32,
1006 | ),
1007 | rtol=1e-5,
1008 | atol=1e-5,
1009 | )
1010 |
1011 |
1012 | def submit_nn_sequential():
1013 | mugrade.submit(sequential_forward(batches=2))
1014 | mugrade.submit(sequential_backward(batches=2))
1015 |
1016 |
1017 | def test_nn_softmax_loss_forward_1():
1018 | np.testing.assert_allclose(
1019 | softmax_loss_forward(5, 10),
1020 | np.array(4.041218, dtype=np.float32),
1021 | rtol=1e-5,
1022 | atol=1e-5,
1023 | )
1024 |
1025 |
1026 | def test_nn_softmax_loss_forward_2():
1027 | np.testing.assert_allclose(
1028 | softmax_loss_forward(3, 11),
1029 | np.array(3.3196716, dtype=np.float32),
1030 | rtol=1e-5,
1031 | atol=1e-5,
1032 | )
1033 |
1034 |
1035 | def test_nn_softmax_loss_backward_1():
1036 | np.testing.assert_allclose(
1037 | softmax_loss_backward(5, 10),
1038 | np.array(
1039 | [
1040 | [
1041 | 0.00068890385,
1042 | 0.0015331834,
1043 | 0.013162163,
1044 | -0.16422154,
1045 | 0.023983022,
1046 | 0.0050903494,
1047 | 0.00076135644,
1048 | 0.050772052,
1049 | 0.0062173656,
1050 | 0.062013146,
1051 | ],
1052 | [
1053 | 0.012363418,
1054 | 0.02368262,
1055 | 0.11730081,
1056 | 0.001758993,
1057 | 0.004781439,
1058 | 0.0029000894,
1059 | -0.19815083,
1060 | 0.017544521,
1061 | 0.015874943,
1062 | 0.0019439887,
1063 | ],
1064 | [
1065 | 0.001219767,
1066 | 0.08134181,
1067 | 0.057320606,
1068 | 0.0008595553,
1069 | 0.0030001428,
1070 | 0.0009499555,
1071 | -0.19633561,
1072 | 0.0008176346,
1073 | 0.0014898272,
1074 | 0.0493363,
1075 | ],
1076 | [
1077 | -0.19886842,
1078 | 0.08767337,
1079 | 0.017700946,
1080 | 0.026406704,
1081 | 0.0013147127,
1082 | 0.0107361665,
1083 | 0.009714483,
1084 | 0.023893777,
1085 | 0.019562569,
1086 | 0.0018656658,
1087 | ],
1088 | [
1089 | 0.007933789,
1090 | 0.017656967,
1091 | 0.027691642,
1092 | 0.0005605318,
1093 | 0.05576411,
1094 | 0.0013114461,
1095 | 0.06811045,
1096 | 0.011835824,
1097 | 0.0071787895,
1098 | -0.19804356,
1099 | ],
1100 | ],
1101 | dtype=np.float32,
1102 | ),
1103 | rtol=1e-5,
1104 | atol=1e-5,
1105 | )
1106 |
1107 |
1108 | def test_nn_softmax_loss_backward_2():
1109 | np.testing.assert_allclose(
1110 | softmax_loss_backward(3, 11),
1111 | np.array(
1112 | [
1113 | [
1114 | 0.0027466794,
1115 | 0.020295369,
1116 | 0.012940894,
1117 | 0.04748398,
1118 | 0.052477922,
1119 | 0.090957515,
1120 | 0.0028875037,
1121 | 0.012940894,
1122 | 0.040869843,
1123 | 0.04748398,
1124 | -0.33108455,
1125 | ],
1126 | [
1127 | 0.0063174255,
1128 | 0.001721699,
1129 | 0.09400159,
1130 | 0.0034670753,
1131 | 0.038218185,
1132 | 0.009424488,
1133 | 0.0042346967,
1134 | 0.08090791,
1135 | -0.29697907,
1136 | 0.0044518122,
1137 | 0.054234188,
1138 | ],
1139 | [
1140 | 0.14326698,
1141 | 0.002624026,
1142 | 0.0032049934,
1143 | 0.01176007,
1144 | 0.045363605,
1145 | 0.0043262867,
1146 | 0.039044812,
1147 | 0.017543964,
1148 | 0.0037236712,
1149 | -0.3119051,
1150 | 0.04104668,
1151 | ],
1152 | ],
1153 | dtype=np.float32,
1154 | ),
1155 | rtol=1e-5,
1156 | atol=1e-5,
1157 | )
1158 |
1159 |
1160 | def submit_nn_softmax_loss():
1161 | mugrade.submit(softmax_loss_forward(4, 9))
1162 | mugrade.submit(softmax_loss_forward(2, 7))
1163 | mugrade.submit(softmax_loss_backward(4, 9))
1164 | mugrade.submit(softmax_loss_backward(2, 7))
1165 |
1166 |
1167 | def test_nn_layernorm_forward_1():
1168 | np.testing.assert_allclose(
1169 | layernorm_forward((3, 3), 3),
1170 | np.array(
1171 | [
1172 | [-0.06525002, -1.1908097, 1.2560595],
1173 | [1.3919864, -0.47999576, -0.911992],
1174 | [1.3628436, -1.0085043, -0.3543393],
1175 | ],
1176 | dtype=np.float32,
1177 | ),
1178 | rtol=1e-5,
1179 | atol=1e-5,
1180 | )
1181 |
1182 |
1183 | def test_nn_layernorm_forward_2():
1184 | np.testing.assert_allclose(
1185 | layernorm_forward((2, 10), 10),
1186 | np.array(
1187 | [
1188 | [
1189 | 0.8297899,
1190 | 1.6147263,
1191 | -1.525019,
1192 | -0.4036814,
1193 | 0.306499,
1194 | 0.08223152,
1195 | 0.6429003,
1196 | -1.3381294,
1197 | 0.8671678,
1198 | -1.0764838,
1199 | ],
1200 | [
1201 | -1.8211555,
1202 | 0.39098236,
1203 | -0.5864739,
1204 | 0.853988,
1205 | -0.3806936,
1206 | 1.2655486,
1207 | 0.33953735,
1208 | 1.522774,
1209 | -0.8951442,
1210 | -0.68936396,
1211 | ],
1212 | ],
1213 | dtype=np.float32,
1214 | ),
1215 | rtol=1e-5,
1216 | atol=1e-5,
1217 | )
1218 |
1219 |
1220 | def test_nn_layernorm_forward_3():
1221 | np.testing.assert_allclose(
1222 | layernorm_forward((1, 5), 5),
1223 | np.array(
1224 | [[-1.0435007, -0.8478443, 0.7500162, -0.42392215, 1.565251]],
1225 | dtype=np.float32,
1226 | ),
1227 | rtol=1e-5,
1228 | atol=1e-5,
1229 | )
1230 |
1231 |
1232 | def test_nn_layernorm_backward_1():
1233 | np.testing.assert_allclose(
1234 | layernorm_backward((3, 3), 3),
1235 | np.array(
1236 | [
1237 | [-2.8312206e-06, -6.6757202e-05, 6.9618225e-05],
1238 | [1.9950867e-03, -6.8092346e-04, -1.3141632e-03],
1239 | [4.4703484e-05, -3.2544136e-05, -1.1801720e-05],
1240 | ],
1241 | dtype=np.float32,
1242 | ),
1243 | rtol=1e-5,
1244 | atol=1e-5,
1245 | )
1246 |
1247 |
1248 | def test_nn_layernorm_backward_2():
1249 | np.testing.assert_allclose(
1250 | layernorm_backward((2, 10), 10),
1251 | np.array(
1252 | [
1253 | [
1254 | -2.301574,
1255 | 4.353944,
1256 | -1.9396116,
1257 | 2.4330146,
1258 | -1.1070801,
1259 | 0.01571643,
1260 | -2.209449,
1261 | 0.49513134,
1262 | -2.261348,
1263 | 2.5212562,
1264 | ],
1265 | [
1266 | -9.042961,
1267 | -2.6184766,
1268 | 4.5592957,
1269 | -4.2109876,
1270 | 3.4247458,
1271 | -1.9075732,
1272 | -2.2689414,
1273 | 2.110825,
1274 | 5.044025,
1275 | 4.910048,
1276 | ],
1277 | ],
1278 | dtype=np.float32,
1279 | ),
1280 | rtol=1e-5,
1281 | atol=1e-5,
1282 | )
1283 |
1284 |
1285 | def test_nn_layernorm_backward_3():
1286 | np.testing.assert_allclose(
1287 | layernorm_backward((1, 5), 5),
1288 | np.array(
1289 | [[0.150192, 0.702322, -3.321343, 0.31219, 2.156639]], dtype=np.float32
1290 | ),
1291 | rtol=1e-5,
1292 | atol=1e-5,
1293 | )
1294 |
1295 |
1296 | def test_nn_layernorm_backward_4():
1297 | np.testing.assert_allclose(
1298 | layernorm_backward((5, 1), 1),
1299 | np.array([[0], [0], [0], [0], [0]], dtype=np.float32),
1300 | rtol=1e-5,
1301 | atol=1e-5,
1302 | )
1303 |
1304 |
1305 | def submit_nn_layernorm():
1306 | mugrade.submit(layernorm_forward((1, 1), 1))
1307 | mugrade.submit(layernorm_forward((10, 10), 10))
1308 | mugrade.submit(layernorm_forward((10, 30), 30))
1309 | mugrade.submit(layernorm_forward((1, 3), 3))
1310 | mugrade.submit(layernorm_backward((1, 1), 1))
1311 | mugrade.submit(layernorm_backward((10, 10), 10))
1312 | mugrade.submit(layernorm_backward((10, 30), 30))
1313 | mugrade.submit(layernorm_backward((1, 3), 3))
1314 |
1315 |
1316 | def test_nn_batchnorm_check_model_eval_switches_training_flag_1():
1317 | np.testing.assert_allclose(
1318 | check_training_mode(),
1319 | np.array(
1320 | [
1321 | 0,
1322 | 0,
1323 | 0,
1324 | 0,
1325 | 0,
1326 | 0,
1327 | 0,
1328 | 0,
1329 | 0,
1330 | 1,
1331 | 1,
1332 | 1,
1333 | 1,
1334 | 1,
1335 | 1,
1336 | 1,
1337 | 1,
1338 | 1,
1339 | 0,
1340 | 0,
1341 | 0,
1342 | 0,
1343 | 0,
1344 | 0,
1345 | 0,
1346 | 0,
1347 | 0,
1348 | ]
1349 | ),
1350 | rtol=1e-5,
1351 | atol=1e-5,
1352 | )
1353 |
1354 |
1355 | def test_nn_batchnorm_forward_1():
1356 | np.testing.assert_allclose(
1357 | batchnorm_forward(4, 4),
1358 | np.array(
1359 | [
1360 | [7.8712696e-01, -3.1676728e-01, -6.4885163e-01, 2.0828949e-01],
1361 | [-7.9508079e-03, 1.0092355e00, 1.6221288e00, 8.5209310e-01],
1362 | [8.5073310e-01, -1.4954363e00, -9.6686421e-08, -1.6852506e00],
1363 | [-1.6299094e00, 8.0296844e-01, -9.7327745e-01, 6.2486827e-01],
1364 | ],
1365 | dtype=np.float32,
1366 | ),
1367 | rtol=1e-5,
1368 | atol=1e-5,
1369 | )
1370 |
1371 |
1372 | def test_nn_batchnorm_forward_affine_1():
1373 | np.testing.assert_allclose(
1374 | batchnorm_forward(4, 4, affine=True),
1375 | np.array(
1376 | [
1377 | [7.49529, 0.047213316, 2.690084, 5.5227957],
1378 | [4.116209, 3.8263211, 7.79979, 7.293256],
1379 | [7.765616, -3.3119934, 4.15, 0.31556034],
1380 | [-2.7771149, 3.23846, 1.9601259, 6.6683874],
1381 | ],
1382 | dtype=np.float32,
1383 | ),
1384 | rtol=1e-5,
1385 | atol=1e-5,
1386 | )
1387 |
1388 |
1389 | def test_nn_batchnorm_backward_1():
1390 | np.testing.assert_allclose(
1391 | batchnorm_backward(5, 4),
1392 | np.array(
1393 | [
1394 | [2.1338463e-04, 5.2094460e-06, -2.8359889e-05, -4.4368207e-06],
1395 | [-3.8480759e-04, -4.0292739e-06, 1.8370152e-05, -1.1172146e-05],
1396 | [2.5629997e-04, -1.1003018e-05, -9.0479853e-06, 5.5171549e-06],
1397 | [-4.2676926e-04, 3.4213067e-06, 1.3601780e-05, 1.0166317e-05],
1398 | [3.4189224e-04, 6.4015389e-06, 5.4359434e-06, -7.4505806e-08],
1399 | ],
1400 | dtype=np.float32,
1401 | ),
1402 | rtol=1e-5,
1403 | atol=1e-5,
1404 | )
1405 |
1406 |
1407 | def test_nn_batchnorm_backward_affine_1():
1408 | np.testing.assert_allclose(
1409 | batchnorm_backward(5, 4, affine=True),
1410 | np.array(
1411 | [
1412 | [3.8604736e-03, 4.2676926e-05, -1.4114380e-04, -3.2424927e-05],
1413 | [-6.9427490e-03, -3.3140182e-05, 9.1552734e-05, -8.5830688e-05],
1414 | [4.6386719e-03, -8.9883804e-05, -4.5776367e-05, 4.3869019e-05],
1415 | [-7.7133179e-03, 2.7418137e-05, 6.6757202e-05, 7.4386597e-05],
1416 | [6.1874390e-03, 5.2213669e-05, 2.8610229e-05, -1.9073486e-06],
1417 | ],
1418 | dtype=np.float32,
1419 | ),
1420 | rtol=1e-5,
1421 | atol=1e-4,
1422 | )
1423 |
1424 |
1425 | def test_nn_batchnorm_running_mean_1():
1426 | np.testing.assert_allclose(
1427 | batchnorm_running_mean(4, 3),
1428 | np.array([2.020656, 1.69489, 1.498846], dtype=np.float32),
1429 | rtol=1e-5,
1430 | atol=1e-5,
1431 | )
1432 |
1433 |
1434 | def test_nn_batchnorm_running_var_1():
1435 | np.testing.assert_allclose(
1436 | batchnorm_running_var(4, 3),
1437 | np.array([1.412775, 1.386191, 1.096604], dtype=np.float32),
1438 | rtol=1e-5,
1439 | atol=1e-5,
1440 | )
1441 |
1442 |
1443 | def test_nn_batchnorm_running_grad_1():
1444 | np.testing.assert_allclose(
1445 | batchnorm_running_grad(4, 3),
1446 | np.array(
1447 | [
1448 | [8.7022781e-06, -4.9751252e-06, 9.5367432e-05],
1449 | [6.5565109e-06, -7.2401017e-06, -2.3484230e-05],
1450 | [-3.5762787e-06, -4.5262277e-07, 1.6093254e-05],
1451 | [-1.1682510e-05, 1.2667850e-05, -8.7976456e-05],
1452 | ],
1453 | dtype=np.float32,
1454 | ),
1455 | rtol=1e-5,
1456 | atol=1e-5,
1457 | )
1458 |
1459 |
1460 | def submit_nn_batchnorm():
1461 | mugrade.submit(batchnorm_forward(2, 3))
1462 | mugrade.submit(batchnorm_forward(3, 4, affine=True))
1463 | mugrade.submit(batchnorm_backward(5, 3))
1464 |
1465 | # TODO(Zico): these need to be added to mugrade
1466 | mugrade.submit(batchnorm_backward(4, 2, affine=True))
1467 | mugrade.submit(batchnorm_running_mean(3, 3))
1468 | mugrade.submit(batchnorm_running_mean(3, 3))
1469 | mugrade.submit(batchnorm_running_var(4, 3))
1470 | mugrade.submit(batchnorm_running_var(4, 4))
1471 | mugrade.submit(batchnorm_running_grad(4, 3))
1472 |
1473 |
1474 | def test_nn_dropout_forward_1():
1475 | np.testing.assert_allclose(
1476 | dropout_forward((2, 3), prob=0.45),
1477 | np.array([[6.818182, 0.0, 0.0], [0.18181819, 0.0, 6.090909]], dtype=np.float32),
1478 | rtol=1e-5,
1479 | atol=1e-5,
1480 | )
1481 |
1482 |
1483 | def test_nn_dropout_backward_1():
1484 | np.testing.assert_allclose(
1485 | dropout_backward((2, 3), prob=0.26),
1486 | np.array(
1487 | [[1.3513514, 0.0, 0.0], [1.3513514, 0.0, 1.3513514]], dtype=np.float32
1488 | ),
1489 | rtol=1e-5,
1490 | atol=1e-5,
1491 | )
1492 |
1493 |
1494 | def submit_nn_dropout():
1495 | mugrade.submit(dropout_forward((3, 3), prob=0.4))
1496 | mugrade.submit(dropout_backward((3, 3), prob=0.15))
1497 |
1498 |
1499 | def test_nn_residual_forward_1():
1500 | np.testing.assert_allclose(
1501 | residual_forward(),
1502 | np.array(
1503 | [
1504 | [0.4660964, 3.8619597, -3.637068, 3.7489638, 2.4931884],
1505 | [-3.3769124, 2.5409935, -2.7110925, 4.9782896, -3.005401],
1506 | [-3.0222898, 3.796795, -2.101042, 6.785948, 0.9347453],
1507 | [-2.2496533, 3.635599, -2.1818666, 5.6361046, 0.9748006],
1508 | [-0.03458184, 0.0823682, -0.06686163, 1.9169499, 1.2638961],
1509 | ],
1510 | dtype=np.float32,
1511 | ),
1512 | rtol=1e-5,
1513 | atol=1e-5,
1514 | )
1515 |
1516 |
1517 | def test_nn_residual_backward_1():
1518 | np.testing.assert_allclose(
1519 | residual_backward(),
1520 | np.array(
1521 | [
1522 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351],
1523 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351],
1524 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351],
1525 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351],
1526 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351],
1527 | ],
1528 | dtype=np.float32,
1529 | ),
1530 | rtol=1e-5,
1531 | atol=1e-5,
1532 | )
1533 |
1534 |
1535 | def submit_nn_residual():
1536 | mugrade.submit(residual_forward(shape=(3, 4)))
1537 | mugrade.submit(residual_backward(shape=(3, 4)))
1538 |
1539 |
1540 | def test_nn_flatten_forward_1():
1541 | np.testing.assert_allclose(
1542 | flatten_forward(3, 3),
1543 | np.array(
1544 | [[2.1, 0.95, 3.45], [3.1, 2.45, 2.3], [3.3, 0.4, 1.2]], dtype=np.float32
1545 | ),
1546 | rtol=1e-5,
1547 | atol=1e-5,
1548 | )
1549 |
1550 |
1551 | def test_nn_flatten_forward_2():
1552 | np.testing.assert_allclose(
1553 | flatten_forward(3, 3, 3),
1554 | np.array(
1555 | [
1556 | [3.35, 3.25, 2.8, 2.3, 3.75, 3.75, 3.35, 2.45, 2.1],
1557 | [1.65, 0.15, 4.15, 2.8, 2.1, 0.5, 2.6, 2.25, 3.25],
1558 | [2.4, 4.55, 4.75, 0.75, 3.85, 0.05, 4.7, 1.7, 4.7],
1559 | ],
1560 | dtype=np.float32,
1561 | ),
1562 | rtol=1e-5,
1563 | atol=1e-5,
1564 | )
1565 |
1566 |
1567 | def test_nn_flatten_forward_3():
1568 | np.testing.assert_allclose(
1569 | flatten_forward(1, 2, 3, 4),
1570 | np.array(
1571 | [
1572 | [
1573 | 4.2,
1574 | 4.5,
1575 | 1.9,
1576 | 4.85,
1577 | 4.85,
1578 | 3.3,
1579 | 2.7,
1580 | 3.05,
1581 | 0.3,
1582 | 3.65,
1583 | 3.1,
1584 | 0.1,
1585 | 4.5,
1586 | 4.05,
1587 | 3.05,
1588 | 0.15,
1589 | 3.0,
1590 | 1.65,
1591 | 4.85,
1592 | 1.3,
1593 | 3.95,
1594 | 2.9,
1595 | 1.2,
1596 | 1.0,
1597 | ]
1598 | ],
1599 | dtype=np.float32,
1600 | ),
1601 | rtol=1e-5,
1602 | atol=1e-5,
1603 | )
1604 |
1605 |
1606 | def test_nn_flatten_forward_4():
1607 | np.testing.assert_allclose(
1608 | flatten_forward(3, 3, 4, 4),
1609 | np.array(
1610 | [
1611 | [
1612 | 0.95,
1613 | 1.1,
1614 | 1.0,
1615 | 1.0,
1616 | 4.9,
1617 | 0.25,
1618 | 1.6,
1619 | 0.35,
1620 | 1.5,
1621 | 3.4,
1622 | 1.75,
1623 | 3.4,
1624 | 4.8,
1625 | 1.4,
1626 | 2.35,
1627 | 3.2,
1628 | 1.65,
1629 | 1.9,
1630 | 3.05,
1631 | 0.35,
1632 | 3.15,
1633 | 4.05,
1634 | 3.3,
1635 | 2.2,
1636 | 2.5,
1637 | 1.5,
1638 | 3.25,
1639 | 0.65,
1640 | 3.05,
1641 | 0.75,
1642 | 3.25,
1643 | 2.55,
1644 | 0.55,
1645 | 0.25,
1646 | 3.65,
1647 | 3.4,
1648 | 0.05,
1649 | 1.4,
1650 | 0.75,
1651 | 1.55,
1652 | 4.45,
1653 | 0.2,
1654 | 3.35,
1655 | 2.45,
1656 | 3.45,
1657 | 4.75,
1658 | 2.45,
1659 | 4.3,
1660 | ],
1661 | [
1662 | 1.0,
1663 | 0.2,
1664 | 0.4,
1665 | 0.7,
1666 | 4.9,
1667 | 4.2,
1668 | 2.55,
1669 | 3.15,
1670 | 1.2,
1671 | 3.8,
1672 | 1.35,
1673 | 1.85,
1674 | 3.15,
1675 | 2.7,
1676 | 1.5,
1677 | 1.35,
1678 | 4.85,
1679 | 4.2,
1680 | 1.5,
1681 | 1.75,
1682 | 0.8,
1683 | 4.3,
1684 | 4.2,
1685 | 4.85,
1686 | 0.0,
1687 | 3.75,
1688 | 0.9,
1689 | 0.0,
1690 | 3.35,
1691 | 1.05,
1692 | 2.2,
1693 | 0.75,
1694 | 3.6,
1695 | 2.0,
1696 | 1.2,
1697 | 1.9,
1698 | 3.45,
1699 | 1.6,
1700 | 3.95,
1701 | 4.45,
1702 | 4.55,
1703 | 4.75,
1704 | 3.7,
1705 | 0.3,
1706 | 2.45,
1707 | 3.75,
1708 | 0.9,
1709 | 2.2,
1710 | ],
1711 | [
1712 | 4.95,
1713 | 1.05,
1714 | 2.4,
1715 | 4.05,
1716 | 3.75,
1717 | 1.95,
1718 | 0.65,
1719 | 4.9,
1720 | 4.3,
1721 | 2.5,
1722 | 1.9,
1723 | 1.75,
1724 | 2.05,
1725 | 3.95,
1726 | 0.8,
1727 | 0.0,
1728 | 0.8,
1729 | 3.45,
1730 | 1.55,
1731 | 0.3,
1732 | 1.5,
1733 | 2.9,
1734 | 2.15,
1735 | 2.15,
1736 | 3.3,
1737 | 3.2,
1738 | 4.3,
1739 | 3.7,
1740 | 0.4,
1741 | 1.7,
1742 | 0.35,
1743 | 1.9,
1744 | 1.8,
1745 | 4.3,
1746 | 4.7,
1747 | 4.05,
1748 | 3.65,
1749 | 1.1,
1750 | 1.0,
1751 | 2.7,
1752 | 3.95,
1753 | 2.3,
1754 | 2.6,
1755 | 3.5,
1756 | 0.75,
1757 | 4.3,
1758 | 3.0,
1759 | 3.85,
1760 | ],
1761 | ],
1762 | dtype=np.float32,
1763 | ),
1764 | rtol=1e-5,
1765 | atol=1e-5,
1766 | )
1767 |
1768 |
1769 | def test_nn_flatten_backward_1():
1770 | np.testing.assert_allclose(
1771 | flatten_backward(3, 3),
1772 | np.array([[4.2, 1.9, 6.9], [6.2, 4.9, 4.6], [6.6, 0.8, 2.4]], dtype=np.float32),
1773 | rtol=1e-5,
1774 | atol=1e-5,
1775 | )
1776 |
1777 |
1778 | def test_nn_flatten_backward_2():
1779 | np.testing.assert_allclose(
1780 | flatten_backward(3, 3, 3),
1781 | np.array(
1782 | [
1783 | [[6.7, 6.5, 5.6], [4.6, 7.5, 7.5], [6.7, 4.9, 4.2]],
1784 | [[3.3, 0.3, 8.3], [5.6, 4.2, 1.0], [5.2, 4.5, 6.5]],
1785 | [[4.8, 9.1, 9.5], [1.5, 7.7, 0.1], [9.4, 3.4, 9.4]],
1786 | ],
1787 | dtype=np.float32,
1788 | ),
1789 | rtol=1e-5,
1790 | atol=1e-5,
1791 | )
1792 |
1793 |
1794 | def test_nn_flatten_backward_3():
1795 | np.testing.assert_allclose(
1796 | flatten_backward(2, 2, 2, 2),
1797 | np.array(
1798 | [
1799 | [[[6.8, 3.8], [5.4, 5.1]], [[8.5, 4.8], [3.1, 1.0]]],
1800 | [[[9.3, 0.8], [3.4, 1.6]], [[9.4, 3.6], [6.6, 7.0]]],
1801 | ],
1802 | dtype=np.float32,
1803 | ),
1804 | rtol=1e-5,
1805 | atol=1e-5,
1806 | )
1807 |
1808 |
1809 | def test_nn_flatten_backward_4():
1810 | np.testing.assert_allclose(
1811 | flatten_backward(1, 2, 3, 4),
1812 | np.array(
1813 | [
1814 | [
1815 | [[8.4, 9.0, 3.8, 9.7], [9.7, 6.6, 5.4, 6.1], [0.6, 7.3, 6.2, 0.2]],
1816 | [[9.0, 8.1, 6.1, 0.3], [6.0, 3.3, 9.7, 2.6], [7.9, 5.8, 2.4, 2.0]],
1817 | ]
1818 | ],
1819 | dtype=np.float32,
1820 | ),
1821 | rtol=1e-5,
1822 | atol=1e-5,
1823 | )
1824 |
1825 |
1826 | def test_nn_flatten_backward_5():
1827 | np.testing.assert_allclose(
1828 | flatten_backward(2, 2, 4, 3),
1829 | np.array(
1830 | [
1831 | [
1832 | [
1833 | [9.8, 7.1, 5.4],
1834 | [4.0, 6.2, 5.7],
1835 | [7.2, 2.0, 2.4],
1836 | [8.9, 4.9, 3.3],
1837 | ],
1838 | [
1839 | [9.0, 9.8, 5.9],
1840 | [7.1, 2.7, 9.6],
1841 | [8.5, 9.3, 5.8],
1842 | [3.1, 9.0, 6.7],
1843 | ],
1844 | ],
1845 | [
1846 | [
1847 | [7.4, 8.6, 6.9],
1848 | [8.2, 5.3, 8.7],
1849 | [8.8, 8.7, 4.0],
1850 | [3.9, 1.8, 2.7],
1851 | ],
1852 | [
1853 | [5.7, 6.2, 0.0],
1854 | [6.0, 0.0, 0.3],
1855 | [2.0, 0.1, 2.7],
1856 | [2.1, 0.1, 6.7],
1857 | ],
1858 | ],
1859 | ],
1860 | dtype=np.float32,
1861 | ),
1862 | rtol=1e-5,
1863 | atol=1e-5,
1864 | )
1865 |
1866 |
1867 | def submit_nn_flatten():
1868 | mugrade.submit(flatten_forward(1, 2, 2))
1869 | mugrade.submit(flatten_forward(2, 2, 2))
1870 | mugrade.submit(flatten_forward(2, 3, 4, 2, 1, 2))
1871 | mugrade.submit(flatten_forward(2, 3))
1872 | mugrade.submit(flatten_backward(1, 2, 2))
1873 | mugrade.submit(flatten_backward(2, 2, 2))
1874 | mugrade.submit(flatten_backward(2, 3, 4, 2, 1, 2))
1875 | mugrade.submit(flatten_backward(2, 3, 4, 4))
1876 |
1877 |
1878 | def test_optim_sgd_vanilla_1():
1879 | np.testing.assert_allclose(
1880 | learn_model_1d(
1881 | 64,
1882 | 16,
1883 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)),
1884 | ndl.optim.SGD,
1885 | lr=0.01,
1886 | momentum=0.0,
1887 | ),
1888 | np.array(3.207009),
1889 | rtol=1e-5,
1890 | atol=1e-5,
1891 | )
1892 |
1893 |
1894 | def test_optim_sgd_momentum_1():
1895 | np.testing.assert_allclose(
1896 | learn_model_1d(
1897 | 64,
1898 | 16,
1899 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)),
1900 | ndl.optim.SGD,
1901 | lr=0.01,
1902 | momentum=0.9,
1903 | ),
1904 | np.array(3.311805),
1905 | rtol=1e-5,
1906 | atol=1e-5,
1907 | )
1908 |
1909 |
1910 | def test_optim_sgd_weight_decay_1():
1911 | np.testing.assert_allclose(
1912 | learn_model_1d(
1913 | 64,
1914 | 16,
1915 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)),
1916 | ndl.optim.SGD,
1917 | lr=0.01,
1918 | momentum=0.0,
1919 | weight_decay=0.01,
1920 | ),
1921 | np.array(3.202637),
1922 | rtol=1e-5,
1923 | atol=1e-5,
1924 | )
1925 |
1926 |
1927 | def test_optim_sgd_momentum_weight_decay_1():
1928 | np.testing.assert_allclose(
1929 | learn_model_1d(
1930 | 64,
1931 | 16,
1932 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)),
1933 | ndl.optim.SGD,
1934 | lr=0.01,
1935 | momentum=0.9,
1936 | weight_decay=0.01,
1937 | ),
1938 | np.array(3.306993),
1939 | rtol=1e-5,
1940 | atol=1e-5,
1941 | )
1942 |
1943 |
1944 | def test_optim_sgd_layernorm_residual_1():
1945 | nn.LayerNorm1d(8)
1946 | np.testing.assert_allclose(
1947 | learn_model_1d(
1948 | 64,
1949 | 16,
1950 | lambda z: nn.Sequential(
1951 | nn.Linear(64, 8),
1952 | nn.ReLU(),
1953 | nn.Residual(nn.Linear(8, 8)),
1954 | nn.Linear(8, 16),
1955 | ),
1956 | ndl.optim.SGD,
1957 | epochs=3,
1958 | lr=0.01,
1959 | weight_decay=0.001,
1960 | ),
1961 | np.array(2.852236),
1962 | rtol=1e-5,
1963 | atol=1e-5,
1964 | )
1965 |
1966 |
1967 | # We're checking that you have not allocated too many tensors;
1968 | # if this fails, make sure you're using .detach()/.data whenever possible.
1969 | def test_optim_sgd_z_memory_check_1():
1970 | np.testing.assert_allclose(
1971 | global_tensor_count(), np.array(387), rtol=1e-5, atol=1000
1972 | )
1973 |
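# A minimal sketch of the kind of update the memory check above is nudging you
# toward: do the optimizer arithmetic on detached views (`.data` / `.detach()`)
# so each step does not extend the autograd graph.  `_sgd_step_sketch` is a
# hypothetical helper, not part of needle's optim API or of these tests; it
# assumes each parameter exposes `.grad` and a detached `.data` view.
def _sgd_step_sketch(params, lr=0.01, weight_decay=0.0):
    for p in params:
        if p.grad is None:
            continue
        # Everything below operates on detached tensors, so no graph nodes
        # survive the step.  (In the homework itself you may also need to keep
        # the result in float32.)
        grad = p.grad.data + weight_decay * p.data
        p.data = p.data - lr * grad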
1974 |
1975 | def submit_optim_sgd():
1976 | mugrade.submit(
1977 | learn_model_1d(
1978 | 48,
1979 | 17,
1980 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 17)),
1981 | ndl.optim.SGD,
1982 | lr=0.03,
1983 | momentum=0.0,
1984 | epochs=2,
1985 | )
1986 | )
1987 | mugrade.submit(
1988 | learn_model_1d(
1989 | 48,
1990 | 16,
1991 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)),
1992 | ndl.optim.SGD,
1993 | lr=0.01,
1994 | momentum=0.9,
1995 | epochs=2,
1996 | )
1997 | )
1998 | mugrade.submit(
1999 | learn_model_1d(
2000 | 48,
2001 | 16,
2002 | lambda z: nn.Sequential(
2003 | nn.Linear(48, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16)
2004 | ),
2005 | ndl.optim.SGD,
2006 | lr=0.01,
2007 | momentum=0.0,
2008 | weight_decay=0.01,
2009 | epochs=2,
2010 | )
2011 | )
2012 | mugrade.submit(
2013 | learn_model_1d(
2014 | 54,
2015 | 16,
2016 | lambda z: nn.Sequential(nn.Linear(54, 32), nn.ReLU(), nn.Linear(32, 16)),
2017 | ndl.optim.SGD,
2018 | lr=0.01,
2019 | momentum=0.9,
2020 | weight_decay=0.01,
2021 | epochs=2,
2022 | )
2023 | )
2024 | mugrade.submit(
2025 | learn_model_1d(
2026 | 64,
2027 | 4,
2028 | lambda z: nn.Sequential(
2029 | nn.Linear(64, 8),
2030 | nn.ReLU(),
2031 | nn.Residual(nn.Linear(8, 8)),
2032 | nn.Linear(8, 4),
2033 | ),
2034 | ndl.optim.SGD,
2035 | epochs=3,
2036 | lr=0.01,
2037 | weight_decay=0.001,
2038 | )
2039 | )
2040 |
2041 |
2042 | def test_optim_adam_1():
2043 | np.testing.assert_allclose(
2044 | learn_model_1d(
2045 | 64,
2046 | 16,
2047 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)),
2048 | ndl.optim.Adam,
2049 | lr=0.001,
2050 | ),
2051 | np.array(3.703999),
2052 | rtol=1e-5,
2053 | atol=1e-5,
2054 | )
2055 |
2056 |
2057 | def test_optim_adam_weight_decay_1():
2058 | np.testing.assert_allclose(
2059 | learn_model_1d(
2060 | 64,
2061 | 16,
2062 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)),
2063 | ndl.optim.Adam,
2064 | lr=0.001,
2065 | weight_decay=0.01,
2066 | ),
2067 | np.array(3.705134),
2068 | rtol=1e-5,
2069 | atol=1e-5,
2070 | )
2071 |
2072 |
2073 | def test_optim_adam_batchnorm_1():
2074 | np.testing.assert_allclose(
2075 | learn_model_1d(
2076 | 64,
2077 | 16,
2078 | lambda z: nn.Sequential(
2079 | nn.Linear(64, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16)
2080 | ),
2081 | ndl.optim.Adam,
2082 | lr=0.001,
2083 | weight_decay=0.001,
2084 | ),
2085 | np.array(3.296256, dtype=np.float32),
2086 | rtol=1e-5,
2087 | atol=1e-5,
2088 | )
2089 |
2090 |
2091 | def test_optim_adam_batchnorm_eval_mode_1():
2092 | np.testing.assert_allclose(
2093 | learn_model_1d_eval(
2094 | 64,
2095 | 16,
2096 | lambda z: nn.Sequential(
2097 | nn.Linear(64, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16)
2098 | ),
2099 | ndl.optim.Adam,
2100 | lr=0.001,
2101 | weight_decay=0.001,
2102 | ),
2103 | np.array(3.192054, dtype=np.float32),
2104 | rtol=1e-5,
2105 | atol=1e-5,
2106 | )
2107 |
2108 |
2109 | def test_optim_adam_layernorm_1():
2110 | np.testing.assert_allclose(
2111 | learn_model_1d(
2112 | 64,
2113 | 16,
2114 | lambda z: nn.Sequential(
2115 | nn.Linear(64, 32), nn.ReLU(), nn.LayerNorm1d(32), nn.Linear(32, 16)
2116 | ),
2117 | ndl.optim.Adam,
2118 | lr=0.01,
2119 | weight_decay=0.01,
2120 | ),
2121 | np.array(2.82192, dtype=np.float32),
2122 | rtol=1e-5,
2123 | atol=1e-5,
2124 | )
2125 |
2126 |
2127 | def test_optim_adam_weight_decay_bias_correction_1():
2128 | np.testing.assert_allclose(
2129 | learn_model_1d(
2130 | 64,
2131 | 16,
2132 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)),
2133 | ndl.optim.Adam,
2134 | lr=0.001,
2135 | weight_decay=0.01,
2136 | ),
2137 | np.array(3.705134),
2138 | rtol=1e-5,
2139 | atol=1e-5,
2140 | )
2141 |
2142 |
2143 | # We're checking that you have not allocated too many tensors;
2144 | # if this fails, make sure you're using .detach()/.data whenever possible.
2145 | def test_optim_adam_z_memory_check_1():
2146 | np.testing.assert_allclose(
2147 | global_tensor_count(), np.array(1132), rtol=1e-5, atol=1000
2148 | )
2149 |
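# Same idea for Adam: keep the running moments and the bias-corrected update
# out of the autograd graph.  `_adam_step_sketch` is a hypothetical helper, not
# needle's ndl.optim.Adam; `m` and `v` are assumed to be lists of
# zero-initialized, detached tensors shaped like each parameter, and `t` is the
# 1-based step count.
def _adam_step_sketch(params, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    for i, p in enumerate(params):
        if p.grad is None:
            continue
        grad = p.grad.data
        m[i] = beta1 * m[i] + (1 - beta1) * grad
        v[i] = beta2 * v[i] + (1 - beta2) * grad * grad
        m_hat = m[i] / (1 - beta1 ** t)  # bias correction
        v_hat = v[i] / (1 - beta2 ** t)
        p.data = p.data - lr * m_hat / (v_hat ** 0.5 + eps)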
2150 |
2151 | def submit_optim_adam():
2152 | mugrade.submit(
2153 | learn_model_1d(
2154 | 48,
2155 | 16,
2156 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)),
2157 | ndl.optim.Adam,
2158 | lr=0.001,
2159 | epochs=2,
2160 | )
2161 | )
2162 | mugrade.submit(
2163 | learn_model_1d(
2164 | 48,
2165 | 16,
2166 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)),
2167 | ndl.optim.Adam,
2168 | lr=0.001,
2169 | weight_decay=0.01,
2170 | epochs=2,
2171 | )
2172 | )
2173 | mugrade.submit(
2174 | learn_model_1d(
2175 | 48,
2176 | 16,
2177 | lambda z: nn.Sequential(
2178 | nn.Linear(48, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16)
2179 | ),
2180 | ndl.optim.Adam,
2181 | lr=0.001,
2182 | weight_decay=0.001,
2183 | epochs=3,
2184 | )
2185 | )
2186 | mugrade.submit(
2187 | learn_model_1d_eval(
2188 | 48,
2189 | 16,
2190 | lambda z: nn.Sequential(
2191 | nn.Linear(48, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16)
2192 | ),
2193 | ndl.optim.Adam,
2194 | lr=0.001,
2195 | weight_decay=0.001,
2196 | epochs=2,
2197 | )
2198 | )
2199 | mugrade.submit(
2200 | learn_model_1d(
2201 | 48,
2202 | 16,
2203 | lambda z: nn.Sequential(
2204 | nn.Linear(48, 32), nn.ReLU(), nn.LayerNorm1d(32), nn.Linear(32, 16)
2205 | ),
2206 | ndl.optim.Adam,
2207 | lr=0.01,
2208 | weight_decay=0.01,
2209 | epochs=2,
2210 | )
2211 | )
2212 | mugrade.submit(
2213 | learn_model_1d(
2214 | 48,
2215 | 16,
2216 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)),
2217 | ndl.optim.Adam,
2218 | lr=0.001,
2219 | weight_decay=0.01,
2220 | epochs=2,
2221 | )
2222 | )
2223 |
2224 |
2225 | def test_mlp_residual_block_num_params_1():
2226 | np.testing.assert_allclose(
2227 | residual_block_num_params(15, 2, nn.BatchNorm1d),
2228 | np.array(111),
2229 | rtol=1e-5,
2230 | atol=1e-5,
2231 | )
2232 |
2233 |
2234 | def test_mlp_residual_block_num_params_2():
2235 | np.testing.assert_allclose(
2236 | residual_block_num_params(784, 100, nn.LayerNorm1d),
2237 | np.array(159452),
2238 | rtol=1e-5,
2239 | atol=1e-5,
2240 | )
2241 |
2242 |
2243 | def test_mlp_residual_block_forward_1():
2244 | np.testing.assert_allclose(
2245 | residual_block_forward(15, 10, nn.LayerNorm1d, 0.5),
2246 | np.array(
2247 | [
2248 | [
2249 | 0.0,
2250 | 1.358399,
2251 | 0.0,
2252 | 1.384224,
2253 | 0.0,
2254 | 0.0,
2255 | 0.255451,
2256 | 0.077662,
2257 | 0.0,
2258 | 0.939582,
2259 | 0.525591,
2260 | 1.99213,
2261 | 0.0,
2262 | 0.0,
2263 | 1.012827,
2264 | ]
2265 | ],
2266 | dtype=np.float32,
2267 | ),
2268 | rtol=1e-5,
2269 | atol=1e-5,
2270 | )
2271 |
2272 |
2273 | def test_mlp_resnet_num_params_1():
2274 | np.testing.assert_allclose(
2275 | mlp_resnet_num_params(150, 100, 5, 10, nn.LayerNorm1d),
2276 | np.array(68360),
2277 | rtol=1e-5,
2278 | atol=1e-5,
2279 | )
2280 |
2281 |
2282 | def test_mlp_resnet_num_params_2():
2283 | np.testing.assert_allclose(
2284 | mlp_resnet_num_params(10, 100, 1, 100, nn.BatchNorm1d),
2285 | np.array(21650),
2286 | rtol=1e-5,
2287 | atol=1e-5,
2288 | )
2289 |
2290 |
2291 | def test_mlp_resnet_forward_1():
2292 | np.testing.assert_allclose(
2293 | mlp_resnet_forward(10, 5, 2, 5, nn.LayerNorm1d, 0.5),
2294 | np.array(
2295 | [
2296 | [3.046162, 1.44972, -1.921363, 0.021816, -0.433953],
2297 | [3.489114, 1.820994, -2.111306, 0.226388, -1.029428],
2298 | ],
2299 | dtype=np.float32,
2300 | ),
2301 | rtol=1e-5,
2302 | atol=1e-5,
2303 | )
2304 |
2305 |
2306 | def test_mlp_resnet_forward_2():
2307 | np.testing.assert_allclose(
2308 | mlp_resnet_forward(15, 25, 5, 14, nn.BatchNorm1d, 0.0),
2309 | np.array(
2310 | [
2311 | [
2312 | 0.92448235,
2313 | -2.745743,
2314 | -1.5077105,
2315 | 1.130784,
2316 | -1.2078242,
2317 | -0.09833566,
2318 | -0.69301605,
2319 | 2.8945382,
2320 | 1.259397,
2321 | 0.13866742,
2322 | -2.963875,
2323 | -4.8566914,
2324 | 1.7062538,
2325 | -4.846424,
2326 | ],
2327 | [
2328 | 0.6653336,
2329 | -2.4708004,
2330 | 2.0572243,
2331 | -1.0791507,
2332 | 4.3489094,
2333 | 3.1086435,
2334 | 0.0304327,
2335 | -1.9227124,
2336 | -1.416201,
2337 | -7.2151937,
2338 | -1.4858506,
2339 | 7.1039696,
2340 | -2.1589825,
2341 | -0.7593413,
2342 | ],
2343 | ],
2344 | dtype=np.float32,
2345 | ),
2346 | rtol=1e-5,
2347 | atol=1e-5,
2348 | )
2349 |
2350 |
2351 | def test_mlp_train_epoch_1():
2352 | np.testing.assert_allclose(
2353 | train_epoch_1(5, 250, ndl.optim.Adam, lr=0.01, weight_decay=0.1),
2354 | np.array([0.675267, 1.84043]),
2355 | rtol=0.0001,
2356 | atol=0.0001,
2357 | )
2358 |
2359 |
2360 | def test_mlp_eval_epoch_1():
2361 | np.testing.assert_allclose(
2362 | eval_epoch_1(10, 150), np.array([0.9164, 4.137814]), rtol=1e-5, atol=1e-5
2363 | )
2364 |
2365 |
2366 | def test_mlp_train_mnist_1():
2367 | np.testing.assert_allclose(
2368 | train_mnist_1(250, 2, ndl.optim.SGD, 0.001, 0.01, 100),
2369 | np.array([0.4875, 1.462595, 0.3245, 1.049429]),
2370 | rtol=0.001,
2371 | atol=0.001,
2372 | )
2373 |
2374 |
2375 | def submit_mlp_resnet():
2376 | mugrade.submit(residual_block_num_params(17, 13, nn.BatchNorm1d))
2377 | mugrade.submit(residual_block_num_params(785, 101, nn.LayerNorm1d))
2378 | mugrade.submit(residual_block_forward(15, 5, nn.LayerNorm1d, 0.3))
2379 | mugrade.submit(mlp_resnet_num_params(75, 75, 3, 3, nn.LayerNorm1d))
2380 | mugrade.submit(mlp_resnet_num_params(15, 10, 10, 5, nn.BatchNorm1d))
2381 | mugrade.submit(mlp_resnet_forward(12, 7, 1, 6, nn.LayerNorm1d, 0.8))
2382 | mugrade.submit(mlp_resnet_forward(15, 3, 2, 15, nn.BatchNorm1d, 0.3))
2383 | mugrade.submit(train_epoch_1(7, 256, ndl.optim.Adam, lr=0.01, weight_decay=0.01))
2384 | mugrade.submit(eval_epoch_1(12, 154))
2385 | mugrade.submit(train_mnist_1(550, 1, ndl.optim.SGD, 0.01, 0.01, 7))
2386 |
--------------------------------------------------------------------------------