├── README.md ├── apps └── mlp_resnet.py ├── data ├── t10k-images-idx3-ubyte.gz ├── t10k-labels-idx1-ubyte.gz ├── train-images-idx3-ubyte.gz └── train-labels-idx1-ubyte.gz ├── figures ├── mlp_resnet.png └── residualblock.png ├── hw2.ipynb ├── python └── needle │ ├── __init__.py │ ├── autograd.py │ ├── backend_numpy.py │ ├── data │ ├── __init__.py │ ├── data_basic.py │ ├── data_transforms.py │ └── datasets │ │ ├── __init__.py │ │ ├── mnist_dataset.py │ │ └── ndarray_dataset.py │ ├── init │ ├── __init__.py │ ├── init_basic.py │ └── init_initializers.py │ ├── nn │ ├── __init__.py │ └── nn_basic.py │ ├── ops │ ├── __init__.py │ ├── ops_logarithmic.py │ ├── ops_mathematic.py │ └── ops_tuple.py │ └── optim.py └── tests └── hw2 ├── test_data.py └── test_nn_and_optim.py /README.md: -------------------------------------------------------------------------------- 1 | # Homework 2 2 | 3 | Public repository and stub/testing code for Homework 2 of 10-714. 4 | 5 | -------------------------------------------------------------------------------- /apps/mlp_resnet.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../python") 4 | import needle as ndl 5 | import needle.nn as nn 6 | import numpy as np 7 | import time 8 | import os 9 | 10 | np.random.seed(0) 11 | # MY_DEVICE = ndl.backend_selection.cuda() 12 | 13 | 14 | def ResidualBlock(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1): 15 | ### BEGIN YOUR SOLUTION 16 | raise NotImplementedError() 17 | ### END YOUR SOLUTION 18 | 19 | 20 | def MLPResNet( 21 | dim, 22 | hidden_dim=100, 23 | num_blocks=3, 24 | num_classes=10, 25 | norm=nn.BatchNorm1d, 26 | drop_prob=0.1, 27 | ): 28 | ### BEGIN YOUR SOLUTION 29 | raise NotImplementedError() 30 | ### END YOUR SOLUTION 31 | 32 | 33 | def epoch(dataloader, model, opt=None): 34 | np.random.seed(4) 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION 38 | 39 | 40 | def train_mnist( 41 | batch_size=100, 42 | epochs=10, 43 | optimizer=ndl.optim.Adam, 44 | lr=0.001, 45 | weight_decay=0.001, 46 | hidden_dim=100, 47 | data_dir="data", 48 | ): 49 | np.random.seed(4) 50 | ### BEGIN YOUR SOLUTION 51 | raise NotImplementedError() 52 | ### END YOUR SOLUTION 53 | 54 | 55 | if __name__ == "__main__": 56 | train_mnist(data_dir="../data") 57 | -------------------------------------------------------------------------------- /data/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /data/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /data/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/data/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /figures/mlp_resnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/figures/mlp_resnet.png -------------------------------------------------------------------------------- /figures/residualblock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw2/633bad8ac8e52cd40531e94b1a6b6bccae92a102/figures/residualblock.png -------------------------------------------------------------------------------- /hw2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 10-714 Homework 2\n", 8 | "\n", 9 | "In this homework, you will be implementing a neural network library in the needle framework. Reminder: __you must save a copy in drive__." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# Code to set up the assignment\n", 19 | "from google.colab import drive\n", 20 | "drive.mount('/content/drive')\n", 21 | "%cd /content/drive/MyDrive/\n", 22 | "!mkdir -p 10714\n", 23 | "%cd /content/drive/MyDrive/10714\n", 24 | "!git clone https://github.com/dlsys10714/hw2.git\n", 25 | "%cd /content/drive/MyDrive/10714/hw2\n", 26 | "\n", 27 | "!pip3 install --upgrade --no-deps git+https://github.com/dlsys10714/mugrade.git" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Question 0\n", 35 | "\n", 36 | "This homework builds off of Homework 1. First, in your Homework 2 directory, copy the files `python/needle/autograd.py`, `python/needle/ops/ops_mathematic.py` from your Homework 1.\n", 37 | "\n", 38 | "***NOTE***: The default data type for the tensor is `float32`. If you want to change the data type, you can do so by setting the `dtype` parameter in the `Tensor` constructor. For example, `Tensor([1, 2, 3], dtype='float64')` will create a tensor with `float64` data type. \n", 39 | "In this homework, **make sure any tensor you create has `float32` data type to avoid any issues with the autograder**." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import sys\n", 49 | "sys.path.append('./python')\n", 50 | "sys.path.append('./apps')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "tags": [] 57 | }, 58 | "source": [ 59 | "## Question 1\n", 60 | "\n", 61 | "In this first question, you will implement a few different methods for weight initialization. This will be done in the `python/needle/init/init_initializers.py` file, which contains a number of routines for initializing needle Tensors using various random and constant initializations. Following the same methodology of the existing initializers (you will want to call e.g. `init.rand` or `init.randn` implemented in `python/needle/init/init_basic.py` from your functions below, implement the following common initialization methods. 
In all cases, the functions should return `fan_in` by `fan_out` 2D tensors (extensions to other sizes can be done via e.g., reshaping).\n", 62 | "\n", 63 | "\n", 64 | "### Xavier uniform\n", 65 | "`xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs)`\n", 66 | "\n", 67 | "Fills the input Tensor with values according to the method described in [Understanding the difficulty of training deep feedforward neural networks](https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf), using a uniform distribution. The resulting Tensor will have values sampled from $\\mathcal{U}(-a, a)$ where \n", 68 | "\\begin{equation}\n", 69 | "a = \\text{gain} \\times \\sqrt{\\frac{6}{\\text{fan_in} + \\text{fan_out}}}\n", 70 | "\\end{equation}\n", 71 | "\n", 72 | "Pass remaining `**kwargs` parameters to the corresponding `init` random call.\n", 73 | "\n", 74 | "##### Parameters\n", 75 | "- `fan_in` - dimensionality of input\n", 76 | "- `fan_out` - dimensionality of output\n", 77 | "- `gain` - optional scaling factor\n", 78 | "___\n", 79 | "\n", 80 | "### Xavier normal\n", 81 | "`xavier_normal(fan_in, fan_out, gain=1.0, **kwargs)`\n", 82 | "\n", 83 | "Fills the input Tensor with values according to the method described in [Understanding the difficulty of training deep feedforward neural networks](https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf), using a normal distribution. The resulting Tensor will have values sampled from $\\mathcal{N}(0, \\text{std}^2)$ where \n", 84 | "\\begin{equation}\n", 85 | "\\text{std} = \\text{gain} \\times \\sqrt{\\frac{2}{\\text{fan_in} + \\text{fan_out}}}\n", 86 | "\\end{equation}\n", 87 | "\n", 88 | "##### Parameters\n", 89 | "- `fan_in` - dimensionality of input\n", 90 | "- `fan_out` - dimensionality of output\n", 91 | "- `gain` - optional scaling factor\n", 92 | "___\n", 93 | "\n", 94 | "### Kaiming uniform\n", 95 | "`kaiming_uniform(fan_in, fan_out, nonlinearity=\"relu\", **kwargs)`\n", 96 | "\n", 97 | "Fills the input Tensor with values according to the method described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf), using a uniform distribution. The resulting Tensor will have values sampled from $\\mathcal{U}(-\\text{bound}, \\text{bound})$ where \n", 98 | "\\begin{equation}\n", 99 | "\\text{bound} = \\text{gain} \\times \\sqrt{\\frac{3}{\\text{fan_in}}}\n", 100 | "\\end{equation}\n", 101 | "\n", 102 | "Use the recommended gain value for ReLU: $\\text{gain}=\\sqrt{2}$.\n", 103 | "\n", 104 | "##### Parameters\n", 105 | "- `fan_in` - dimensionality of input\n", 106 | "- `fan_out` - dimensionality of output\n", 107 | "- `nonlinearity` - the non-linear function\n", 108 | "___\n", 109 | "\n", 110 | "### Kaiming normal\n", 111 | "`kaiming_normal(fan_in, fan_out, nonlinearity=\"relu\", **kwargs)`\n", 112 | "\n", 113 | "Fills the input Tensor with values according to the method described in [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification](https://arxiv.org/pdf/1502.01852.pdf), using a uniform distribution. 
The resulting Tensor will have values sampled from $\\mathcal{N}(0, \\text{std}^2)$ where \n", 114 | "\\begin{equation}\n", 115 | "\\text{std} = \\frac{\\text{gain}}{\\sqrt{\\text{fan_in}}}\n", 116 | "\\end{equation}\n", 117 | "\n", 118 | "Use the recommended gain value for ReLU: $\\text{gain}=\\sqrt{2}$.\n", 119 | "\n", 120 | "##### Parameters\n", 121 | "- `fan_in` - dimensionality of input\n", 122 | "- `fan_out` - dimensionality of output\n", 123 | "- `nonlinearity` - the non-linear function" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "!python3 -m pytest -v -k \"test_init\"" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"init\" -s" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "## Question 2\n", 149 | "\n", 150 | "In this question, you will implement additional modules in `python/needle/nn/nn_basic.py`. Specifically, for the following modules described below, initialize any variables of the module in the constructor, and fill out the `forward` method. **Note:** Be sure that you are using the `init` functions that you just implemented to initialize the parameters, and don't forget to pass the `dtype` argument.\n", 151 | "___\n", 152 | "\n", 153 | "### Linear\n", 154 | "`needle.nn.Linear(in_features, out_features, bias=True, device=None, dtype=\"float32\")`\n", 155 | "\n", 156 | "Applies a linear transformation to the incoming data: $y = xA^T + b$. The input shape is $(N, H_{in})$ where $H_{in}=\\text{in_features}$. The output shape is $(N, H_{out})$ where $H_{out}=\\text{out_features}$.\n", 157 | "\n", 158 | "**Be careful to explicitly broadcast the bias term to the correct shape -- Needle does not support implicit broadcasting.**\n", 159 | "\n", 160 | "**Note: for all layers including this one, you should initialize the weight Tensor before the bias Tensor, and should initialize all Parameters using only functions from `init`**. This does not affect the algorithm's correctness. It is only necessary to ensure the value matches the expected results in the mugrade tests for this assignment's implementation scope. \n", 161 | "\n", 162 | "##### Parameters\n", 163 | "- `in_features` - size of each input sample\n", 164 | "- `out_features` - size of each output sample\n", 165 | "- `bias` - If set to `False`, the layer will not learn an additive bias.\n", 166 | "\n", 167 | "##### Variables\n", 168 | "- `weight` - the learnable weights of shape (`in_features`, `out_features`). The values should be initialized with the Kaiming Uniform initialization with `fan_in = in_features`\n", 169 | "- `bias` - the learnable bias of shape (`out_features`). The values should be initialized with the Kaiming Uniform initialize with `fan_in = out_features`. **Note the difference in fan_in choice, due to their relative sizes**. \n", 170 | "\n", 171 | "Make sure to enclose all necessary variables e.g. (`weight`, `bias`) in the `Parameter` class so that they are visible to the optimizers which would be implemented next." 
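To make the intended wiring concrete, here is a minimal sketch (not the reference solution) of how an initializer from Question 1 and the `Linear` module could fit together. It assumes `Module` and `Parameter` are exposed by `needle.nn` as in the provided stubs, and that forwarding `device`/`dtype` through `**kwargs` to `init.rand` satisfies the `float32` requirement:

```python
import math
import needle.init as init
from needle.nn import Module, Parameter


def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs):
    # bound = gain * sqrt(3 / fan_in), with gain = sqrt(2) for ReLU
    gain = math.sqrt(2)
    bound = gain * math.sqrt(3 / fan_in)
    return init.rand(fan_in, fan_out, low=-bound, high=bound, **kwargs)


class Linear(Module):
    def __init__(self, in_features, out_features, bias=True, device=None, dtype="float32"):
        super().__init__()
        # weight is created before bias, both via init functions, both wrapped in Parameter
        self.weight = Parameter(
            kaiming_uniform(in_features, out_features, device=device, dtype=dtype)
        )
        self.bias = None
        if bias:
            self.bias = Parameter(
                kaiming_uniform(out_features, 1, device=device, dtype=dtype)
                .reshape((1, out_features))
            )

    def forward(self, X):
        out = X @ self.weight
        if self.bias is not None:
            # needle has no implicit broadcasting, so broadcast the bias explicitly
            out = out + self.bias.broadcast_to(out.shape)
        return out
```

Storing `weight` with shape `(in_features, out_features)` means the forward pass is simply `X @ weight`, matching $y = xA^T + b$ without an explicit transpose.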
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "!python3 -m pytest -v -k \"test_nn_linear\"" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_linear\"" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "### ReLU\n", 197 | "`needle.nn.ReLU()`\n", 198 | "\n", 199 | "Applies the rectified linear unit function element-wise:\n", 200 | "$ReLU(x) = max(0, x)$.\n", 201 | "\n", 202 | "If you have previously implemented ReLU's backwards pass in terms of itself, note that this is numerically unstable and will likely cause problems\n", 203 | "down the line.\n", 204 | "Instead, consider that we could write the derivative of ReLU as $I\\{x>0\\}$, where we arbitrarily decide that the derivative at $x=0$ is 0.\n", 205 | "(This is a _subdifferentiable_ function.)\n", 206 | "\n", 207 | "___" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "!python3 -m pytest -v -k \"test_nn_relu\"" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_relu\"" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "tags": [] 232 | }, 233 | "source": [ 234 | "### Sequential\n", 235 | "`needle.nn.Sequential(*modules)`\n", 236 | "\n", 237 | "Applies a sequence of modules to the input (in the order that they were passed to the constructor) and returns the output of the last module.\n", 238 | "These should be kept in a `.module` property: you should _not_ redefine any magic methods like `__getitem__`, as this may not be compatible with our tests.\n", 239 | "\n", 240 | "##### Parameters\n", 241 | "- `*modules` - any number of modules of type `needle.nn.Module`\n", 242 | "\n", 243 | "___" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "!python3 -m pytest -v -k \"test_nn_sequential\"" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_sequential\"" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "tags": [] 268 | }, 269 | "source": [ 270 | "### LogSumExp\n", 271 | "\n", 272 | "`needle.ops.LogSumExp(axes)`\n", 273 | "\n", 274 | "Applies a numerically stable log-sum-exp function to the input by subtracting off the maximum elements. You will need to implement this and the next operation in file `python/needle/ops/ops_logarithmic.py`.\n", 275 | "\n", 276 | "\\begin{equation}\n", 277 | "\\text{LogSumExp}(z) = \\log (\\sum_{i} \\exp (z_i - \\max{z})) + \\max{z}\n", 278 | "\\end{equation}\n", 279 | "\n", 280 | "#### Parameters\n", 281 | "- `axes` - Tuple of axes to sum and take the maximum element over. 
This uses the same conventions as `needle.ops.Summation()`\n", 282 | "\n", 283 | "___" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "!python3 -m pytest -v -k \"test_op_logsumexp\"" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"op_logsumexp\"" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "### LogSoftmax\n", 309 | "\n", 310 | "`needle.ops.LogSoftmax(axes)`\n", 311 | "\n", 312 | "Applies a numerically stable logsoftmax function to the input by subtracting off the maximum elements. Assume the input NDArray is 2 dimensional and we are doing softmax over `axis=1`.\n", 313 | "\n", 314 | "\\begin{equation}\n", 315 | "\\text{LogSoftmax}(z) = \\log \\left(\\frac{\\exp(z_i - \\max z)}{\\sum_{i}\\exp(z_i - \\max z)}\\right) = z - \\text{LogSumExp}(z)\n", 316 | "\\end{equation}\n", 317 | "___" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "!python3 -m pytest -v -k \"test_op_logsoftmax\"" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"op_logsoftmax\"" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "tags": [] 342 | }, 343 | "source": [ 344 | "### SoftmaxLoss\n", 345 | "\n", 346 | "`needle.nn.SoftmaxLoss()`\n", 347 | "\n", 348 | "Applies the softmax loss as defined below (and as implemented in Homework 1), taking in as input a Tensor of logits and a Tensor of the true labels (expressed as a list of numbers, *not* one-hot encoded).\n", 349 | "\n", 350 | "Note that you can use the `init.one_hot` function now instead of writing this yourself. 
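For instance, a small illustration of `init.one_hot` (the `(n, i)` argument order mirrors the device helper in `backend_numpy.py`; the exact keyword arguments are not shown here, so check `init_basic.py`):

```python
import needle as ndl
import needle.init as init

logits = ndl.Tensor([[1.0, 2.0, 0.5],
                     [0.1, 0.2, 3.0]])          # (batch, k) scores
y = ndl.Tensor([1, 2], dtype="float32")         # true class indices
y_one_hot = init.one_hot(3, y)                  # (batch, k) matrix of 0s and 1s
z_y = (logits * y_one_hot).sum(axes=(1,))       # selects z_y for each sample
```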
Note: You will need to use the numerically stable logsumexp operator you just implemented for this purpose.\n", 351 | "\n", 352 | "\\begin{equation}\n", 353 | "\\ell_\\text{softmax}(z,y) = \\log \\sum_{i=1}^k \\exp z_i - z_y\n", 354 | "\\end{equation}\n", 355 | "\n", 356 | "___" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "!python3 -m pytest -v -k \"test_nn_softmax_loss\"" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_softmax_loss\"" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": { 380 | "tags": [] 381 | }, 382 | "source": [ 383 | "### LayerNorm1d\n", 384 | "`needle.nn.LayerNorm1d(dim, eps=1e-5, device=None, dtype=\"float32\")`\n", 385 | "\n", 386 | "Applies layer normalization over a mini-batch of inputs as described in the paper [Layer Normalization](https://arxiv.org/abs/1607.06450).\n", 387 | "\n", 388 | "\\begin{equation}\n", 389 | "y = w \\circ \\frac{x_i - \\textbf{E}[x]}{((\\textbf{Var}[x]+\\epsilon)^{1/2})} + b\n", 390 | "\\end{equation}\n", 391 | "\n", 392 | "where $\\textbf{E}[x]$ denotes the empirical mean of the inputs, $\\textbf{Var}[x]$ denotes their empirical variance (note that here we are using the \"biased\" estimate of the variance, i.e., dividing by $N$ rather than by $N-1$), and $w$ and $b$ denote learnable scalar weights and biases respectively. Note you can assume the input to this layer is a 2D tensor, with batches in the first dimension and features in the second. You might need to broadcast the weight and bias before applying them.\n", 393 | "\n", 394 | "##### Parameters\n", 395 | "- `dim` - number of channels\n", 396 | "- `eps` - a value added to the denominator for numerical stability.\n", 397 | "\n", 398 | "##### Variables\n", 399 | "- `weight` - the learnable weights of size `dim`, elements initialized to 1.\n", 400 | "- `bias` - the learnable bias of shape `dim`, elements initialized to 0.\n", 401 | "___" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "!python3 -m pytest -v -k \"test_nn_layernorm\"" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_layernorm\"" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "\n", 427 | "### Flatten\n", 428 | "`needle.nn.Flatten()`\n", 429 | "\n", 430 | "Takes in a tensor of shape `(B,X_0,X_1,...)`, and flattens all non-batch dimensions so that the output is of shape `(B, X_0 * X_1 * ...)`" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "!python3 -m pytest -v -k \"test_nn_flatten\"" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_flatten\"" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "### BatchNorm1d\n", 456 | "`needle.nn.BatchNorm1d(dim, eps=1e-5, momentum=0.1, device=None, dtype=\"float32\")`\n", 457 | "\n", 
458 | "Applies batch normalization over a mini-batch of inputs as described in the paper [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167).\n", 459 | "\n", 460 | "\\begin{equation}\n", 461 | "y = w \\circ \\frac{z_i - \\textbf{E}[x]}{((\\textbf{Var}[x]+\\epsilon)^{1/2})} + b\n", 462 | "\\end{equation}\n", 463 | "\n", 464 | "but where here the mean and variance refer to to the mean and variance over the _batch_dimensions. The function also computes a running average of mean/variance for all features at each layer $\\hat{\\mu}, \\hat{\\sigma}^2$, and at test time normalizes by these quantities:\n", 465 | "\n", 466 | "\\begin{equation}\n", 467 | "y = \\frac{(x - \\hat{mu})}{((\\hat{\\sigma}^2_{i+1})_j+\\epsilon)^{1/2}}\n", 468 | "\\end{equation}\n", 469 | "\n", 470 | "\n", 471 | "BatchNorm uses the running estimates of mean and variance instead of batch statistics at test time, i.e.,\n", 472 | "after `model.eval()` has been called on the BatchNorm layer's `training` flag is false.\n", 473 | "\n", 474 | "To compute the running estimates, you can use the equation $$\\hat{x_{new}} = (1 - m) \\hat{x_{old}} + mx_{observed},$$\n", 475 | "where $m$ is momentum.\n", 476 | "\n", 477 | "##### Parameters\n", 478 | "- `dim` - input dimension\n", 479 | "- `eps` - a value added to the denominator for numerical stability.\n", 480 | "- `momentum` - the value used for the running mean and running variance computation.\n", 481 | "\n", 482 | "##### Variables\n", 483 | "- `weight` - the learnable weights of size `dim`, elements initialized to 1.\n", 484 | "- `bias` - the learnable bias of size `dim`, elements initialized to 0.\n", 485 | "- `running_mean` - the running mean used at evaluation time, elements initialized to 0.\n", 486 | "- `running_var` - the running (unbiased) variance used at evaluation time, elements initialized to 1. \n", 487 | "\n", 488 | "___" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "!python3 -m pytest -v -k \"test_nn_batchnorm\"" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_batchnorm\"" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "### Dropout\n", 514 | "`needle.nn.Dropout(p = 0.5)`\n", 515 | "\n", 516 | "During training, randomly zeroes some of the elements of the input tensor with probability `p` using samples from a Bernoulli distribution. This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons as described in the paper [Improving neural networks by preventing co-adaption of feature detectors](https://arxiv.org/abs/1207.0580). During evaluation the module simply computes an identity function. \n", 517 | "\n", 518 | "\\begin{equation}\n", 519 | "\\hat{z}_{i+1} = \\sigma_i (W_i^T z_i + b_i) \\\\\n", 520 | "(z_{i+1})_j = \n", 521 | " \\begin{cases}\n", 522 | " (\\hat{z}_{i+1})_j /(1-p) & \\text{with probability } 1-p \\\\\n", 523 | " 0 & \\text{with probability } p \\\\\n", 524 | " \\end{cases}\n", 525 | "\\end{equation}\n", 526 | "\n", 527 | "**Important**: If the Dropout module the flag `training=False`, you shouldn't \"dropout\" any weights. That is, dropout applies during training only, not during evaluation. 
Note that `training` is a flag in `nn.Module`.\n", 528 | "\n", 529 | "##### Parameters\n", 530 | "- `p` - the probability of an element to be zeroed.\n", 531 | "\n", 532 | "___" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "!python3 -m pytest -v -k \"test_nn_dropout\"" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_dropout\"" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": { 556 | "tags": [] 557 | }, 558 | "source": [ 559 | "### Residual\n", 560 | "`needle.nn.Residual(fn: Module)`\n", 561 | "\n", 562 | "Applies a residual or skip connection given module $\\mathcal{F}$ and input Tensor $x$, returning $\\mathcal{F}(x) + x$.\n", 563 | "##### Parameters\n", 564 | "- `fn` - module of type `needle.nn.Module`" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "!python3 -m pytest -v -k \"test_nn_residual\"" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"nn_residual\"" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": { 588 | "tags": [] 589 | }, 590 | "source": [ 591 | "## Question 3\n", 592 | "\n", 593 | "Implement the `step` function of the following optimizers in `python/needle/optim.py`.\n", 594 | "Make sure that your optimizers _don't_ modify the gradients of tensors in-place.\n", 595 | "\n", 596 | "We have included some tests to ensure that you are not consuming excessive memory, which can happen if you are\n", 597 | "not using `.data` or `.detach()` in the right places, thus building an increasingly large computational graph\n", 598 | "(not just in the optimizers, but in the previous modules as well).\n", 599 | "You can ignore these tests, which include the string `memory_check` at your own discretion.\n", 600 | "\n", 601 | "___\n", 602 | "\n", 603 | "### SGD\n", 604 | "`needle.optim.SGD(params, lr=0.01, momentum=0.0, weight_decay=0.0)`\n", 605 | "\n", 606 | "Implements stochastic gradient descent (optionally with momentum, shown as $\\beta$ below). 
\n", 607 | "\n", 608 | "\\begin{equation}\n", 609 | "\\begin{split}\n", 610 | " u_{t+1} &= \\beta u_t + (1-\\beta) \\nabla_\\theta f(\\theta_t) \\\\\n", 611 | " \\theta_{t+1} &= \\theta_t - \\alpha u_{t+1}\n", 612 | "\\end{split}\n", 613 | "\\end{equation}\n", 614 | "\n", 615 | "##### Parameters\n", 616 | "- `params` - iterable of parameters of type `needle.nn.Parameter` to optimize\n", 617 | "- `lr` (*float*) - learning rate\n", 618 | "- `momentum` (*float*) - momentum factor\n", 619 | "- `weight_decay` (*float*) - weight decay (L2 penalty)\n", 620 | "___" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "!python3 -m pytest -v -k \"test_optim_sgd\"" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"optim_sgd\"" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": { 644 | "tags": [] 645 | }, 646 | "source": [ 647 | "### Adam\n", 648 | "`needle.optim.Adam(params, lr=0.01, beta1=0.9, beta2=0.999, eps=1e-8, weight_decay=0.0)`\n", 649 | "\n", 650 | "Implements Adam algorithm, proposed in [Adam: A Method for Stochastic Optimization](https://arxiv.org/abs/1412.6980). \n", 651 | "\n", 652 | "\\begin{equation}\n", 653 | "\\begin{split}\n", 654 | "u_{t+1} &= \\beta_1 u_t + (1-\\beta_1) \\nabla_\\theta f(\\theta_t) \\\\\n", 655 | "v_{t+1} &= \\beta_2 v_t + (1-\\beta_2) (\\nabla_\\theta f(\\theta_t))^2 \\\\\n", 656 | "\\hat{u}_{t+1} &= u_{t+1} / (1 - \\beta_1^t) \\quad \\text{(bias correction)} \\\\\n", 657 | "\\hat{v}_{t+1} &= v_{t+1} / (1 - \\beta_2^t) \\quad \\text{(bias correction)}\\\\\n", 658 | "\\theta_{t+1} &= \\theta_t - \\alpha \\hat{u_{t+1}}/(\\hat{v}_{t+1}^{1/2}+\\epsilon)\n", 659 | "\\end{split}\n", 660 | " \\end{equation}\n", 661 | "\n", 662 | "**Important:** Pay attention to whether or not you are applying bias correction.\n", 663 | "\n", 664 | "##### Parameters\n", 665 | "- `params` - iterable of parameters of type `needle.nn.Parameter` to optimize\n", 666 | "- `lr` (*float*) - learning rate\n", 667 | "- `beta1` (*float*) - coefficient used for computing running average of gradient\n", 668 | "- `beta2` (*float*) - coefficient used for computing running average of square of gradient\n", 669 | "- `eps` (*float*) - term added to the denominator to improve numerical stability\n", 670 | "- `weight_decay` (*float*) - weight decay (L2 penalty)\n", 671 | "\n", 672 | "**Hint**: To help deal with memory issues, try to understand how to use `.data` or `.detach()`" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "!python3 -m pytest -v -k \"test_optim_adam\"" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"optim_adam\"" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "## Question 4\n", 698 | "\n", 699 | "In this question, you will implement two data primitives: `needle.data.DataLoader` and `needle.data.Dataset`. `Dataset` stores the samples and their corresponding labels, and `DataLoader` wraps an iterable around the `Dataset` to enable easy access to the samples. 
\n", 700 | "\n", 701 | "For this question, you will be working in the `python/needle/data` directory. \n", 702 | "\n", 703 | "### Transformations\n", 704 | "\n", 705 | "First we will implement a few transformations that are helpful when working with images. We will stick with a horizontal flip and a random crop for now. Fill out the following functions in `needle/data/data_transforms.py`.\n", 706 | "___ \n", 707 | "\n", 708 | "#### RandomFlipHorizontal\n", 709 | "`needle.data.RandomFlipHorizontal(p = 0.5)`\n", 710 | "\n", 711 | "Flips the image horizontally, with probability `p`.\n", 712 | "\n", 713 | "##### Parameters\n", 714 | "- `p` (*float*) - The probability of flipping the input image.\n", 715 | "___\n", 716 | "\n", 717 | "#### RandomCrop\n", 718 | "`needle.data.RandomCrop(padding=3)`\n", 719 | "\n", 720 | "Padding is added to all sides of the image, and then the image is cropped back to it's original size at a random location. Returns an image the same size as the original image.\n", 721 | "\n", 722 | "##### Parameters\n", 723 | "- `padding` (*int*) - The padding on each border of the image." 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "!python3 -m pytest -v -k \"flip_horizontal\"\n", 733 | "!python3 -m pytest -v -k \"random_crop\"" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "metadata": {}, 740 | "outputs": [], 741 | "source": [ 742 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"flip_horizontal\"\n", 743 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"random_crop\"" 744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "metadata": {}, 749 | "source": [ 750 | "### Dataset\n", 751 | "\n", 752 | "Each `Dataset` subclass must implement three functions: `__init__`, `__len__`, and `__getitem__`. The `__init__` function initializes the images, labels, and transforms. The `__len__` function returns the number of samples in the dataset. The `__getitem__` function retrieves a sample from the dataset at a given index `idx`, calls the transform functions on the image (if applicable), converts the image and label to a numpy array (the data will be converted to Tensors elsewhere). The output of `__getitem__` and `__next__` should be NDArrays, and you should follow the shapes such that you're accessing an array of size (Datapoint Number, Feature Dim 1, Feature Dim 2, ...). \n", 753 | "\n", 754 | "Fill out these functions in the `MNISTDataset` class in `needle/data/datasets/mnist_dataset.py`. 
You can use your solution to `parse_mnist` from the previous homework for the `__init__` function.\n", 755 | "\n", 756 | "### MNISTDataset\n", 757 | "`needle.data.MNISTDataset(image_filesname, label_filesname, transforms)`\n", 758 | "\n", 759 | "##### Parameters\n", 760 | "- `image_filesname` - path of file containing images\n", 761 | "- `label_filesname` - path of file containing labels\n", 762 | "- `transforms` - an optional list of transforms to apply to data\n" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "!python3 -m pytest -v -k \"test_mnist_dataset\"" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": {}, 778 | "outputs": [], 779 | "source": [ 780 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"mnist_dataset\"" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "### Dataloader\n", 788 | "\n", 789 | "In `needle/data/data_basic.py`, the Dataloader class provides an interface for assembling mini-batches of examples suitable for training using SGD-based approaches, backed by a Dataset object. In order to build the typical Dataloader interface (allowing users to iterate over all the mini-batches in the dataset), you will need to implement the `__iter__()` and `__next__()` calls in the class: `__iter__()` is called at the start of iteration, while `__next__()` is called to grab the next mini-batch. Please note that subsequent calls to next will require you to return the following batches, so next is not a pure function.\n", 790 | "\n", 791 | "### Dataloader\n", 792 | "`needle.data.Dataloader(dataset: Dataset, batch_size: Optional[int] = 1, shuffle: bool = False)`\n", 793 | "\n", 794 | "Combines a dataset and a sampler, and provides an iterable over the given dataset. \n", 795 | "\n", 796 | "##### Parameters\n", 797 | "- `dataset` - `needle.data.Dataset` - a dataset \n", 798 | "- `batch_size` - `int` - what batch size to serve the data in \n", 799 | "- `shuffle` - `bool` - set to ``True`` to have the data reshuffle at every epoch, default ``False``.\n", 800 | "___ \n", 801 | "\n", 802 | "\n", 803 | "\n" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "!python3 -m pytest -v -k \"test_dataloader\"" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": {}, 819 | "outputs": [], 820 | "source": [ 821 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"dataloader\"" 822 | ] 823 | }, 824 | { 825 | "cell_type": "markdown", 826 | "metadata": {}, 827 | "source": [ 828 | "## Question 5\n", 829 | "\n", 830 | "Given you have now implemented all the necessary components for our neural network library, let's build and train an MLP ResNet. For this question, you will be working in `apps/mlp_resnet.py`. First, fill out the functions `ResidualBlock` and `MLPResNet` as described below:\n", 831 | "\n", 832 | "### ResidualBlock\n", 833 | "`ResidualBlock(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1)`\n", 834 | "\n", 835 | "Implements a residual block as follows:\n", 836 | "\n", 837 | "

\n", 838 | " \"Residual\n", 839 | "

\n", 840 | "\n", 841 | "**NOTE**: if the figure does not render, please see the figure in the `figures` directory.\n", 842 | "\n", 843 | "where the first linear layer has `in_features=dim` and `out_features=hidden_dim`, and the last linear layer has `out_features=dim`. Returns the block as type `nn.Module`. \n", 844 | "\n", 845 | "##### Parameters\n", 846 | "- `dim` (*int*) - input dim\n", 847 | "- `hidden_dim` (*int*) - hidden dim\n", 848 | "- `norm` (*nn.Module*) - normalization method\n", 849 | "- `drop_prob` (*float*) - dropout probability\n", 850 | "\n", 851 | "___\n", 852 | "\n", 853 | "### MLPResNet\n", 854 | "`MLPResNet(dim, hidden_dim=100, num_blocks=3, num_classes=10, norm=nn.BatchNorm1d, drop_prob=0.1)`\n", 855 | "\n", 856 | "Implements an MLP ResNet as follows:\n", 857 | "\n", 858 | "

\n", 859 | " \"MLP\n", 860 | "

\n", 861 | "\n", 862 | "where the first linear layer has `in_features=dim` and `out_features=hidden_dim`, and each ResidualBlock has `dim=hidden_dim` and `hidden_dim=hidden_dim//2`. Returns a network of type `nn.Module`.\n", 863 | "\n", 864 | "##### Parameters\n", 865 | "- `dim` (*int*) - input dim\n", 866 | "- `hidden_dim` (*int*) - hidden dim\n", 867 | "- `num_blocks` (*int*) - number of ResidualBlocks\n", 868 | "- `num_classes` (*int*) - number of classes\n", 869 | "- `norm` (*nn.Module*) - normalization method\n", 870 | "- `drop_prob` (*float*) - dropout probability (0.1)\n", 871 | "\n", 872 | "**Note**: Modules should be initialized to match the order of execution in the Resnet.\n", 873 | "___ \n", 874 | "\n", 875 | "Once you have the deep learning model architecture correct, let's train the network using our new neural network library components. Specifically, implement the functions `epoch` and `train_mnist`.\n", 876 | "\n", 877 | "### Epoch\n", 878 | "\n", 879 | "`epoch(dataloader, model, opt=None)`\n", 880 | "\n", 881 | "Executes one epoch of training or evaluation, iterating over the entire training dataset once (just like `nn_epoch` from previous homeworks). Returns the average error rate (as a *float*) and the average loss over all samples (as a *float*). Set the model to `training` mode at the beginning of the function if `opt` is given; set the model to `eval` if `opt` is not given (i.e. `None`). When setting the modes, use `.train()` and `.eval()` instead of modifying the training attribute.\n", 882 | "\n", 883 | "##### Parameters\n", 884 | "- `dataloader` (*`needle.data.DataLoader`*) - dataloader returning samples from the training dataset\n", 885 | "- `model` (*`needle.nn.Module`*) - neural network\n", 886 | "- `opt` (*`needle.optim.Optimizer`*) - optimizer instance, or `None`\n", 887 | "\n", 888 | "___\n", 889 | "\n", 890 | "### Train Mnist\n", 891 | "\n", 892 | "`train_mnist(batch_size=100, epochs=10, optimizer=ndl.optim.Adam, lr=0.001, weight_decay=0.001, hidden_dim=100, data_dir=\"data\")`\n", 893 | " \n", 894 | "Initializes a training dataloader (with `shuffle` set to `True`) and a test dataloader for MNIST data, and trains an `MLPResNet` using the given optimizer (if `opt` is not None) and the softmax loss for a given number of epochs. Returns a tuple of the training error, training loss, test error, test loss computed in the last epoch of training. 
If any parameters are not specified, use the default parameters.\n", 895 | "\n", 896 | "##### Parameters\n", 897 | "- `batch_size` (*int*) - batch size to use for train and test dataloader\n", 898 | "- `epochs` (*int*) - number of epochs to train for\n", 899 | "- `optimizer` (*`needle.optim.Optimizer` type*) - optimizer type to use\n", 900 | "- `lr` (*float*) - learning rate \n", 901 | "- `weight_decay` (*float*) - weight decay\n", 902 | "- `hidden_dim` (*int*) - hidden dim for `MLPResNet`\n", 903 | "- `data_dir` (*int*) - directory containing MNIST image/label files\n" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": null, 909 | "metadata": {}, 910 | "outputs": [], 911 | "source": [ 912 | "!python3 -m pytest -v -k \"test_mlp\"" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": null, 918 | "metadata": {}, 919 | "outputs": [], 920 | "source": [ 921 | "!python -m mugrade submit 'YOUR_GRADER_KEY_HERE' -k \"mlp_resnet\"" 922 | ] 923 | }, 924 | { 925 | "cell_type": "markdown", 926 | "metadata": {}, 927 | "source": [ 928 | "We encourage to experiment with the `mlp_resnet.py` training script.\n", 929 | "You can investigate the effect of using different initializers on the Linear layers,\n", 930 | "increasing the dropout probability,\n", 931 | "or adding transforms (via a list to the `transforms=` keyword argument of Dataset)\n", 932 | "such as random cropping." 933 | ] 934 | } 935 | ], 936 | "metadata": { 937 | "kernelspec": { 938 | "display_name": "Python 3.8.10 64-bit", 939 | "language": "python", 940 | "name": "python3" 941 | }, 942 | "language_info": { 943 | "codemirror_mode": { 944 | "name": "ipython", 945 | "version": 3 946 | }, 947 | "file_extension": ".py", 948 | "mimetype": "text/x-python", 949 | "name": "python", 950 | "nbconvert_exporter": "python", 951 | "pygments_lexer": "ipython3", 952 | "version": "3.8.10" 953 | }, 954 | "vscode": { 955 | "interpreter": { 956 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" 957 | } 958 | } 959 | }, 960 | "nbformat": 4, 961 | "nbformat_minor": 4 962 | } 963 | -------------------------------------------------------------------------------- /python/needle/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ops 2 | from .ops import * 3 | from .autograd import Tensor, cpu, all_devices 4 | 5 | from . import init 6 | from .init import ones, zeros, zeros_like, ones_like 7 | 8 | from . import data 9 | from . import nn 10 | from . import optim 11 | -------------------------------------------------------------------------------- /python/needle/autograd.py: -------------------------------------------------------------------------------- 1 | """Core data structures.""" 2 | import needle 3 | from .backend_numpy import Device, cpu, all_devices 4 | from typing import List, Optional, NamedTuple, Tuple, Union 5 | from collections import namedtuple 6 | import numpy 7 | 8 | from needle import init 9 | 10 | # needle version 11 | LAZY_MODE = False 12 | TENSOR_COUNTER = 0 13 | 14 | # NOTE: we will import numpy as the array_api 15 | # as the backend for our computations, this line will change in later homeworks 16 | 17 | import numpy as array_api 18 | NDArray = numpy.ndarray 19 | 20 | 21 | class Op: 22 | """Operator definition.""" 23 | 24 | def __call__(self, *args): 25 | raise NotImplementedError() 26 | 27 | def compute(self, *args: Tuple[NDArray]): 28 | """Calculate forward pass of operator. 
29 | 30 | Parameters 31 | ---------- 32 | input: np.ndarray 33 | A list of input arrays to the function 34 | 35 | Returns 36 | ------- 37 | output: nd.array 38 | Array output of the operation 39 | 40 | """ 41 | raise NotImplementedError() 42 | 43 | def gradient( 44 | self, out_grad: "Value", node: "Value" 45 | ) -> Union["Value", Tuple["Value"]]: 46 | """Compute partial adjoint for each input value for a given output adjoint. 47 | 48 | Parameters 49 | ---------- 50 | out_grad: Value 51 | The adjoint wrt to the output value. 52 | 53 | node: Value 54 | The value node of forward evaluation. 55 | 56 | Returns 57 | ------- 58 | input_grads: Value or Tuple[Value] 59 | A list containing partial gradient adjoints to be propagated to 60 | each of the input node. 61 | """ 62 | raise NotImplementedError() 63 | 64 | def gradient_as_tuple(self, out_grad: "Value", node: "Value") -> Tuple["Value"]: 65 | """Convenience method to always return a tuple from gradient call""" 66 | output = self.gradient(out_grad, node) 67 | if isinstance(output, tuple): 68 | return output 69 | elif isinstance(output, list): 70 | return tuple(output) 71 | else: 72 | return (output,) 73 | 74 | 75 | class TensorOp(Op): 76 | """Op class specialized to output tensors, will be alternate subclasses for other structures""" 77 | 78 | def __call__(self, *args): 79 | return Tensor.make_from_op(self, args) 80 | 81 | 82 | class TensorTupleOp(Op): 83 | """Op class specialized to output TensorTuple""" 84 | 85 | def __call__(self, *args): 86 | return TensorTuple.make_from_op(self, args) 87 | 88 | 89 | class Value: 90 | """A value in the computational graph.""" 91 | 92 | # trace of computational graph 93 | op: Optional[Op] 94 | inputs: List["Value"] 95 | # The following fields are cached fields for 96 | # dynamic computation 97 | cached_data: NDArray 98 | requires_grad: bool 99 | 100 | def realize_cached_data(self): 101 | """Run compute to realize the cached data""" 102 | # avoid recomputation 103 | if self.cached_data is not None: 104 | return self.cached_data 105 | # note: data implicitly calls realized cached data 106 | self.cached_data = self.op.compute( 107 | *[x.realize_cached_data() for x in self.inputs] 108 | ) 109 | return self.cached_data 110 | 111 | def is_leaf(self): 112 | return self.op is None 113 | 114 | def __del__(self): 115 | global TENSOR_COUNTER 116 | TENSOR_COUNTER -= 1 117 | 118 | def _init( 119 | self, 120 | op: Optional[Op], 121 | inputs: List["Tensor"], 122 | *, 123 | num_outputs: int = 1, 124 | cached_data: List[object] = None, 125 | requires_grad: Optional[bool] = None 126 | ): 127 | global TENSOR_COUNTER 128 | TENSOR_COUNTER += 1 129 | if requires_grad is None: 130 | requires_grad = any(x.requires_grad for x in inputs) 131 | self.op = op 132 | self.inputs = inputs 133 | self.num_outputs = num_outputs 134 | self.cached_data = cached_data 135 | self.requires_grad = requires_grad 136 | 137 | @classmethod 138 | def make_const(cls, data, *, requires_grad=False): 139 | value = cls.__new__(cls) 140 | value._init( 141 | None, 142 | [], 143 | cached_data=data, 144 | requires_grad=requires_grad, 145 | ) 146 | return value 147 | 148 | @classmethod 149 | def make_from_op(cls, op: Op, inputs: List["Value"]): 150 | value = cls.__new__(cls) 151 | value._init(op, inputs) 152 | 153 | if not LAZY_MODE: 154 | if not value.requires_grad: 155 | return value.detach() 156 | value.realize_cached_data() 157 | return value 158 | 159 | 160 | ### Not needed in HW1 161 | class TensorTuple(Value): 162 | """Represent a tuple of tensors. 
163 | 164 | To keep things simple, we do not support nested tuples. 165 | """ 166 | 167 | def __len__(self): 168 | cdata = self.realize_cached_data() 169 | return len(cdata) 170 | 171 | def __getitem__(self, index: int): 172 | return needle.ops.tuple_get_item(self, index) 173 | 174 | def tuple(self): 175 | return tuple([x for x in self]) 176 | 177 | def __repr__(self): 178 | return "needle.TensorTuple" + str(self.tuple()) 179 | 180 | def __str__(self): 181 | return self.__repr__() 182 | 183 | def __add__(self, other): 184 | assert isinstance(other, TensorTuple) 185 | assert len(self) == len(other) 186 | return needle.ops.make_tuple(*[self[i] + other[i] for i in range(len(self))]) 187 | 188 | def detach(self): 189 | """Create a new tensor that shares the data but detaches from the graph.""" 190 | return TensorTuple.make_const(self.realize_cached_data()) 191 | 192 | 193 | class Tensor(Value): 194 | grad: "Tensor" 195 | 196 | def __init__( 197 | self, 198 | array, 199 | *, 200 | device: Optional[Device] = None, 201 | dtype=None, 202 | requires_grad=True, 203 | **kwargs 204 | ): 205 | if isinstance(array, Tensor): 206 | if device is None: 207 | device = array.device 208 | if dtype is None: 209 | dtype = array.dtype 210 | if device == array.device and dtype == array.dtype: 211 | cached_data = array.realize_cached_data() 212 | else: 213 | # fall back, copy through numpy conversion 214 | cached_data = Tensor._array_from_numpy( 215 | array.numpy(), device=device, dtype=dtype 216 | ) 217 | else: 218 | device = device if device else cpu() 219 | cached_data = Tensor._array_from_numpy(array, device=device, dtype=dtype) 220 | 221 | self._init( 222 | None, 223 | [], 224 | cached_data=cached_data, 225 | requires_grad=requires_grad, 226 | ) 227 | 228 | @staticmethod 229 | def _array_from_numpy(numpy_array, device, dtype): 230 | if array_api is numpy: 231 | return numpy.array(numpy_array, dtype=dtype) 232 | return array_api.array(numpy_array, device=device, dtype=dtype) 233 | 234 | @staticmethod 235 | def make_from_op(op: Op, inputs: List["Value"]): 236 | tensor = Tensor.__new__(Tensor) 237 | tensor._init(op, inputs) 238 | if not LAZY_MODE: 239 | if not tensor.requires_grad: 240 | return tensor.detach() 241 | tensor.realize_cached_data() 242 | return tensor 243 | 244 | @staticmethod 245 | def make_const(data, requires_grad=False): 246 | tensor = Tensor.__new__(Tensor) 247 | tensor._init( 248 | None, 249 | [], 250 | cached_data=data 251 | if not isinstance(data, Tensor) 252 | else data.realize_cached_data(), 253 | requires_grad=requires_grad, 254 | ) 255 | return tensor 256 | 257 | @property 258 | def data(self): 259 | return self.detach() 260 | 261 | @data.setter 262 | def data(self, value): 263 | assert isinstance(value, Tensor) 264 | assert value.dtype == self.dtype, "%s %s" % ( 265 | value.dtype, 266 | self.dtype, 267 | ) 268 | self.cached_data = value.realize_cached_data() 269 | 270 | def detach(self): 271 | """Create a new tensor that shares the data but detaches from the graph.""" 272 | return Tensor.make_const(self.realize_cached_data()) 273 | 274 | @property 275 | def shape(self): 276 | return self.realize_cached_data().shape 277 | 278 | @property 279 | def dtype(self): 280 | return self.realize_cached_data().dtype 281 | 282 | @property 283 | def device(self): 284 | data = self.realize_cached_data() 285 | # numpy array always sits on cpu 286 | if array_api is numpy: 287 | return cpu() 288 | return data.device 289 | 290 | def backward(self, out_grad=None): 291 | out_grad = ( 292 | out_grad 293 | if 
out_grad 294 | else init.ones(*self.shape, dtype=self.dtype, device=self.device) 295 | ) 296 | compute_gradient_of_variables(self, out_grad) 297 | 298 | def __repr__(self): 299 | return "needle.Tensor(" + str(self.realize_cached_data()) + ")" 300 | 301 | def __str__(self): 302 | return self.realize_cached_data().__str__() 303 | 304 | def numpy(self): 305 | data = self.realize_cached_data() 306 | if array_api is numpy: 307 | return data 308 | return data.numpy() 309 | 310 | def __add__(self, other): 311 | if isinstance(other, Tensor): 312 | return needle.ops.EWiseAdd()(self, other) 313 | else: 314 | return needle.ops.AddScalar(other)(self) 315 | 316 | def __mul__(self, other): 317 | if isinstance(other, Tensor): 318 | return needle.ops.EWiseMul()(self, other) 319 | else: 320 | return needle.ops.MulScalar(other)(self) 321 | 322 | def __pow__(self, other): 323 | if isinstance(other, Tensor): 324 | return needle.ops.EWisePow()(self, other) 325 | else: 326 | return needle.ops.PowerScalar(other)(self) 327 | 328 | def __sub__(self, other): 329 | if isinstance(other, Tensor): 330 | return needle.ops.EWiseAdd()(self, needle.ops.Negate()(other)) 331 | else: 332 | return needle.ops.AddScalar(-other)(self) 333 | 334 | def __truediv__(self, other): 335 | if isinstance(other, Tensor): 336 | return needle.ops.EWiseDiv()(self, other) 337 | else: 338 | return needle.ops.DivScalar(other)(self) 339 | 340 | def __matmul__(self, other): 341 | return needle.ops.MatMul()(self, other) 342 | 343 | def matmul(self, other): 344 | return needle.ops.MatMul()(self, other) 345 | 346 | def sum(self, axes=None): 347 | return needle.ops.Summation(axes)(self) 348 | 349 | def broadcast_to(self, shape): 350 | return needle.ops.BroadcastTo(shape)(self) 351 | 352 | def reshape(self, shape): 353 | return needle.ops.Reshape(shape)(self) 354 | 355 | def __neg__(self): 356 | return needle.ops.Negate()(self) 357 | 358 | def transpose(self, axes=None): 359 | return needle.ops.Transpose(axes)(self) 360 | 361 | 362 | __radd__ = __add__ 363 | __rmul__ = __mul__ 364 | 365 | 366 | 367 | def compute_gradient_of_variables(output_tensor, out_grad): 368 | """Take gradient of output node with respect to each node in node_list. 369 | 370 | Store the computed result in the grad field of each Variable. 371 | """ 372 | # a map from node to a list of gradient contributions from each output node 373 | node_to_output_grads_list: Dict[Tensor, List[Tensor]] = {} 374 | # Special note on initializing gradient of 375 | # We are really taking a derivative of the scalar reduce_sum(output_node) 376 | # instead of the vector output_node. But this is the common case for loss function. 377 | node_to_output_grads_list[output_tensor] = [out_grad] 378 | 379 | # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt. 380 | reverse_topo_order = list(reversed(find_topo_sort([output_tensor]))) 381 | 382 | ### BEGIN YOUR SOLUTION 383 | raise NotImplementedError() 384 | ### END YOUR SOLUTION 385 | 386 | 387 | def find_topo_sort(node_list: List[Value]) -> List[Value]: 388 | """Given a list of nodes, return a topological sort list of nodes ending in them. 389 | 390 | A simple algorithm is to do a post-order DFS traversal on the given nodes, 391 | going backwards based on input edges. Since a node is added to the ordering 392 | after all its predecessors are traversed due to post-order DFS, we get a topological 393 | sort. 
394 | """ 395 | ### BEGIN YOUR SOLUTION 396 | raise NotImplementedError() 397 | ### END YOUR SOLUTION 398 | 399 | 400 | def topo_sort_dfs(node, visited, topo_order): 401 | """Post-order DFS""" 402 | ### BEGIN YOUR SOLUTION 403 | raise NotImplementedError() 404 | ### END YOUR SOLUTION 405 | 406 | 407 | ############################## 408 | ####### Helper Methods ####### 409 | ############################## 410 | 411 | 412 | def sum_node_list(node_list): 413 | """Custom sum function in order to avoid create redundant nodes in Python sum implementation.""" 414 | from operator import add 415 | from functools import reduce 416 | 417 | return reduce(add, node_list) 418 | -------------------------------------------------------------------------------- /python/needle/backend_numpy.py: -------------------------------------------------------------------------------- 1 | """This file defies specific implementations of devices when using numpy as NDArray backend. 2 | """ 3 | import numpy 4 | 5 | 6 | class Device: 7 | """Baseclass of all device""" 8 | 9 | 10 | class CPUDevice(Device): 11 | """Represents data that sits in CPU""" 12 | 13 | def __repr__(self): 14 | return "needle.cpu()" 15 | 16 | def __hash__(self): 17 | return self.__repr__().__hash__() 18 | 19 | def __eq__(self, other): 20 | return isinstance(other, CPUDevice) 21 | 22 | def enabled(self): 23 | return True 24 | 25 | def zeros(self, *shape, dtype="float32"): 26 | return numpy.zeros(shape, dtype=dtype) 27 | 28 | def ones(self, *shape, dtype="float32"): 29 | return numpy.ones(shape, dtype=dtype) 30 | 31 | def randn(self, *shape): 32 | # note: numpy doesn't support types within standard random routines, and 33 | # .astype("float32") does work if we're generating a singleton 34 | return numpy.random.randn(*shape) 35 | 36 | def rand(self, *shape): 37 | # note: numpy doesn't support types within standard random routines, and 38 | # .astype("float32") does work if we're generating a singleton 39 | return numpy.random.rand(*shape) 40 | 41 | def one_hot(self, n, i, dtype="float32"): 42 | return numpy.eye(n, dtype=dtype)[i] 43 | 44 | def empty(self, shape, dtype="float32"): 45 | return numpy.empty(shape, dtype=dtype) 46 | 47 | def full(self, shape, fill_value, dtype="float32"): 48 | return numpy.full(shape, fill_value, dtype=dtype) 49 | 50 | 51 | def cpu(): 52 | """Return cpu device""" 53 | return CPUDevice() 54 | 55 | 56 | def default_device(): 57 | return cpu() 58 | 59 | 60 | def all_devices(): 61 | """return a list of all available devices""" 62 | return [cpu()] 63 | -------------------------------------------------------------------------------- /python/needle/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_basic import * 2 | from .data_transforms import * 3 | from .datasets import * 4 | -------------------------------------------------------------------------------- /python/needle/data/data_basic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..autograd import Tensor 3 | 4 | from typing import Iterator, Optional, List, Sized, Union, Iterable, Any 5 | 6 | 7 | 8 | class Dataset: 9 | r"""An abstract class representing a `Dataset`. 10 | 11 | All subclasses should overwrite :meth:`__getitem__`, supporting fetching a 12 | data sample for a given key. Subclasses must also overwrite 13 | :meth:`__len__`, which is expected to return the size of the dataset. 
14 | """ 15 | 16 | def __init__(self, transforms: Optional[List] = None): 17 | self.transforms = transforms 18 | 19 | def __getitem__(self, index) -> object: 20 | raise NotImplementedError 21 | 22 | def __len__(self) -> int: 23 | raise NotImplementedError 24 | 25 | def apply_transforms(self, x): 26 | if self.transforms is not None: 27 | # apply the transforms 28 | for tform in self.transforms: 29 | x = tform(x) 30 | return x 31 | 32 | 33 | class DataLoader: 34 | r""" 35 | Data loader. Combines a dataset and a sampler, and provides an iterable over 36 | the given dataset. 37 | Args: 38 | dataset (Dataset): dataset from which to load the data. 39 | batch_size (int, optional): how many samples per batch to load 40 | (default: ``1``). 41 | shuffle (bool, optional): set to ``True`` to have the data reshuffled 42 | at every epoch (default: ``False``). 43 | """ 44 | dataset: Dataset 45 | batch_size: Optional[int] 46 | 47 | def __init__( 48 | self, 49 | dataset: Dataset, 50 | batch_size: Optional[int] = 1, 51 | shuffle: bool = False, 52 | ): 53 | 54 | self.dataset = dataset 55 | self.shuffle = shuffle 56 | self.batch_size = batch_size 57 | if not self.shuffle: 58 | self.ordering = np.array_split(np.arange(len(dataset)), 59 | range(batch_size, len(dataset), batch_size)) 60 | 61 | def __iter__(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | return self 66 | 67 | def __next__(self): 68 | ### BEGIN YOUR SOLUTION 69 | raise NotImplementedError() 70 | ### END YOUR SOLUTION 71 | 72 | -------------------------------------------------------------------------------- /python/needle/data/data_transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Transform: 4 | def __call__(self, x): 5 | raise NotImplementedError 6 | 7 | 8 | class RandomFlipHorizontal(Transform): 9 | def __init__(self, p = 0.5): 10 | self.p = p 11 | 12 | def __call__(self, img): 13 | """ 14 | Horizonally flip an image, specified as an H x W x C NDArray. 15 | Args: 16 | img: H x W x C NDArray of an image 17 | Returns: 18 | H x W x C ndarray corresponding to image flipped with probability self.p 19 | Note: use the provided code to provide randomness, for easier testing 20 | """ 21 | flip_img = np.random.rand() < self.p 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION 25 | 26 | 27 | class RandomCrop(Transform): 28 | def __init__(self, padding=3): 29 | self.padding = padding 30 | 31 | def __call__(self, img): 32 | """ Zero pad and then randomly crop an image. 
33 | Args: 34 | img: H x W x C NDArray of an image 35 | Return 36 | H x W x C NAArray of cliped image 37 | Note: generate the image shifted by shift_x, shift_y specified below 38 | """ 39 | shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding+1, size=2) 40 | ### BEGIN YOUR SOLUTION 41 | raise NotImplementedError() 42 | ### END YOUR SOLUTION 43 | -------------------------------------------------------------------------------- /python/needle/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .mnist_dataset import * 2 | from .ndarray_dataset import * 3 | -------------------------------------------------------------------------------- /python/needle/data/datasets/mnist_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from ..data_basic import Dataset 3 | import numpy as np 4 | 5 | class MNISTDataset(Dataset): 6 | def __init__( 7 | self, 8 | image_filename: str, 9 | label_filename: str, 10 | transforms: Optional[List] = None, 11 | ): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def __getitem__(self, index) -> object: 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | def __len__(self) -> int: 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/data/datasets/ndarray_dataset.py: -------------------------------------------------------------------------------- 1 | from ..data_basic import Dataset 2 | 3 | class NDArrayDataset(Dataset): 4 | def __init__(self, *arrays): 5 | self.arrays = arrays 6 | 7 | def __len__(self) -> int: 8 | return self.arrays[0].shape[0] 9 | 10 | def __getitem__(self, i) -> object: 11 | return tuple([a[i] for a in self.arrays]) -------------------------------------------------------------------------------- /python/needle/init/__init__.py: -------------------------------------------------------------------------------- 1 | from .init_basic import * 2 | 3 | from .init_initializers import * 4 | -------------------------------------------------------------------------------- /python/needle/init/init_basic.py: -------------------------------------------------------------------------------- 1 | import math 2 | import needle as ndl 3 | 4 | 5 | def rand(*shape, low=0.0, high=1.0, device=None, dtype="float32", requires_grad=False): 6 | """Generate random numbers uniform between low and high""" 7 | device = ndl.cpu() if device is None else device 8 | array = device.rand(*shape) * (high - low) + low 9 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 10 | 11 | 12 | def randn(*shape, mean=0.0, std=1.0, device=None, dtype="float32", requires_grad=False): 13 | """Generate random normal with specified mean and std deviation""" 14 | device = ndl.cpu() if device is None else device 15 | array = device.randn(*shape) * std + mean 16 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 17 | 18 | 19 | 20 | 21 | def constant(*shape, c=1.0, device=None, dtype="float32", requires_grad=False): 22 | """Generate constant Tensor""" 23 | device = ndl.cpu() if device is None else device 24 | array = device.ones(*shape, dtype=dtype) * c # note: can change dtype 25 | return ndl.Tensor(array, device=device, dtype=dtype, 
requires_grad=requires_grad) 26 | 27 | 28 | 29 | def ones(*shape, device=None, dtype="float32", requires_grad=False): 30 | """Generate all-ones Tensor""" 31 | return constant( 32 | *shape, c=1.0, device=device, dtype=dtype, requires_grad=requires_grad 33 | ) 34 | 35 | 36 | def zeros(*shape, device=None, dtype="float32", requires_grad=False): 37 | """Generate all-zeros Tensor""" 38 | return constant( 39 | *shape, c=0.0, device=device, dtype=dtype, requires_grad=requires_grad 40 | ) 41 | 42 | 43 | def randb(*shape, p=0.5, device=None, dtype="bool", requires_grad=False): 44 | """Generate binary random Tensor""" 45 | device = ndl.cpu() if device is None else device 46 | array = device.rand(*shape) <= p 47 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 48 | 49 | 50 | def one_hot(n, i, device=None, dtype="float32", requires_grad=False): 51 | """Generate one-hot encoding Tensor""" 52 | device = ndl.cpu() if device is None else device 53 | return ndl.Tensor( 54 | device.one_hot(n, i.numpy().astype("int32"), dtype=dtype), 55 | device=device, 56 | requires_grad=requires_grad, 57 | ) 58 | 59 | 60 | def zeros_like(array, *, device=None, requires_grad=False): 61 | device = device if device else array.device 62 | return zeros( 63 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad 64 | ) 65 | 66 | 67 | def ones_like(array, *, device=None, requires_grad=False): 68 | device = device if device else array.device 69 | return ones( 70 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad 71 | ) 72 | -------------------------------------------------------------------------------- /python/needle/init/init_initializers.py: -------------------------------------------------------------------------------- 1 | import math 2 | from .init_basic import * 3 | 4 | 5 | def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs): 6 | ### BEGIN YOUR SOLUTION 7 | raise NotImplementedError() 8 | ### END YOUR SOLUTION 9 | 10 | 11 | def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs): 17 | assert nonlinearity == "relu", "Only relu supported currently" 18 | ### BEGIN YOUR SOLUTION 19 | raise NotImplementedError() 20 | ### END YOUR SOLUTION 21 | 22 | 23 | 24 | def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs): 25 | assert nonlinearity == "relu", "Only relu supported currently" 26 | ### BEGIN YOUR SOLUTION 27 | raise NotImplementedError() 28 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .nn_basic import * 2 | -------------------------------------------------------------------------------- /python/needle/nn/nn_basic.py: -------------------------------------------------------------------------------- 1 | """The module. 
2 | """ 3 | from typing import List, Callable, Any 4 | from needle.autograd import Tensor 5 | from needle import ops 6 | import needle.init as init 7 | import numpy as np 8 | 9 | 10 | class Parameter(Tensor): 11 | """A special kind of tensor that represents parameters.""" 12 | 13 | 14 | def _unpack_params(value: object) -> List[Tensor]: 15 | if isinstance(value, Parameter): 16 | return [value] 17 | elif isinstance(value, Module): 18 | return value.parameters() 19 | elif isinstance(value, dict): 20 | params = [] 21 | for k, v in value.items(): 22 | params += _unpack_params(v) 23 | return params 24 | elif isinstance(value, (list, tuple)): 25 | params = [] 26 | for v in value: 27 | params += _unpack_params(v) 28 | return params 29 | else: 30 | return [] 31 | 32 | 33 | def _child_modules(value: object) -> List["Module"]: 34 | if isinstance(value, Module): 35 | modules = [value] 36 | modules.extend(_child_modules(value.__dict__)) 37 | return modules 38 | if isinstance(value, dict): 39 | modules = [] 40 | for k, v in value.items(): 41 | modules += _child_modules(v) 42 | return modules 43 | elif isinstance(value, (list, tuple)): 44 | modules = [] 45 | for v in value: 46 | modules += _child_modules(v) 47 | return modules 48 | else: 49 | return [] 50 | 51 | 52 | class Module: 53 | def __init__(self): 54 | self.training = True 55 | 56 | def parameters(self) -> List[Tensor]: 57 | """Return the list of parameters in the module.""" 58 | return _unpack_params(self.__dict__) 59 | 60 | def _children(self) -> List["Module"]: 61 | return _child_modules(self.__dict__) 62 | 63 | def eval(self): 64 | self.training = False 65 | for m in self._children(): 66 | m.training = False 67 | 68 | def train(self): 69 | self.training = True 70 | for m in self._children(): 71 | m.training = True 72 | 73 | def __call__(self, *args, **kwargs): 74 | return self.forward(*args, **kwargs) 75 | 76 | 77 | class Identity(Module): 78 | def forward(self, x): 79 | return x 80 | 81 | 82 | class Linear(Module): 83 | def __init__( 84 | self, in_features, out_features, bias=True, device=None, dtype="float32" 85 | ): 86 | super().__init__() 87 | self.in_features = in_features 88 | self.out_features = out_features 89 | 90 | ### BEGIN YOUR SOLUTION 91 | raise NotImplementedError() 92 | ### END YOUR SOLUTION 93 | 94 | def forward(self, X: Tensor) -> Tensor: 95 | ### BEGIN YOUR SOLUTION 96 | raise NotImplementedError() 97 | ### END YOUR SOLUTION 98 | 99 | 100 | class Flatten(Module): 101 | def forward(self, X): 102 | ### BEGIN YOUR SOLUTION 103 | raise NotImplementedError() 104 | ### END YOUR SOLUTION 105 | 106 | 107 | class ReLU(Module): 108 | def forward(self, x: Tensor) -> Tensor: 109 | ### BEGIN YOUR SOLUTION 110 | raise NotImplementedError() 111 | ### END YOUR SOLUTION 112 | 113 | class Sequential(Module): 114 | def __init__(self, *modules): 115 | super().__init__() 116 | self.modules = modules 117 | 118 | def forward(self, x: Tensor) -> Tensor: 119 | ### BEGIN YOUR SOLUTION 120 | raise NotImplementedError() 121 | ### END YOUR SOLUTION 122 | 123 | 124 | class SoftmaxLoss(Module): 125 | def forward(self, logits: Tensor, y: Tensor): 126 | ### BEGIN YOUR SOLUTION 127 | raise NotImplementedError() 128 | ### END YOUR SOLUTION 129 | 130 | 131 | class BatchNorm1d(Module): 132 | def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"): 133 | super().__init__() 134 | self.dim = dim 135 | self.eps = eps 136 | self.momentum = momentum 137 | ### BEGIN YOUR SOLUTION 138 | raise NotImplementedError() 139 | ### END YOUR SOLUTION 140 
| 141 | def forward(self, x: Tensor) -> Tensor: 142 | ### BEGIN YOUR SOLUTION 143 | raise NotImplementedError() 144 | ### END YOUR SOLUTION 145 | 146 | 147 | 148 | class LayerNorm1d(Module): 149 | def __init__(self, dim, eps=1e-5, device=None, dtype="float32"): 150 | super().__init__() 151 | self.dim = dim 152 | self.eps = eps 153 | ### BEGIN YOUR SOLUTION 154 | raise NotImplementedError() 155 | ### END YOUR SOLUTION 156 | 157 | def forward(self, x: Tensor) -> Tensor: 158 | ### BEGIN YOUR SOLUTION 159 | raise NotImplementedError() 160 | ### END YOUR SOLUTION 161 | 162 | 163 | class Dropout(Module): 164 | def __init__(self, p=0.5): 165 | super().__init__() 166 | self.p = p 167 | 168 | def forward(self, x: Tensor) -> Tensor: 169 | ### BEGIN YOUR SOLUTION 170 | raise NotImplementedError() 171 | ### END YOUR SOLUTION 172 | 173 | 174 | class Residual(Module): 175 | def __init__(self, fn: Module): 176 | super().__init__() 177 | self.fn = fn 178 | 179 | def forward(self, x: Tensor) -> Tensor: 180 | ### BEGIN YOUR SOLUTION 181 | raise NotImplementedError() 182 | ### END YOUR SOLUTION 183 | -------------------------------------------------------------------------------- /python/needle/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .ops_mathematic import * 2 | 3 | from .ops_logarithmic import * 4 | from .ops_tuple import * 5 | -------------------------------------------------------------------------------- /python/needle/ops/ops_logarithmic.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from ..autograd import NDArray 3 | from ..autograd import Op, Tensor, Value, TensorOp 4 | from ..autograd import TensorTuple, TensorTupleOp 5 | 6 | from .ops_mathematic import * 7 | 8 | import numpy as array_api 9 | 10 | class LogSoftmax(TensorOp): 11 | def compute(self, Z): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def gradient(self, out_grad, node): 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | 22 | def logsoftmax(a): 23 | return LogSoftmax()(a) 24 | 25 | 26 | class LogSumExp(TensorOp): 27 | def __init__(self, axes: Optional[tuple] = None): 28 | self.axes = axes 29 | 30 | def compute(self, Z): 31 | ### BEGIN YOUR SOLUTION 32 | raise NotImplementedError() 33 | ### END YOUR SOLUTION 34 | 35 | def gradient(self, out_grad, node): 36 | ### BEGIN YOUR SOLUTION 37 | raise NotImplementedError() 38 | ### END YOUR SOLUTION 39 | 40 | 41 | def logsumexp(a, axes=None): 42 | return LogSumExp(axes=axes)(a) 43 | 44 | -------------------------------------------------------------------------------- /python/needle/ops/ops_mathematic.py: -------------------------------------------------------------------------------- 1 | """Operator implementations.""" 2 | 3 | from numbers import Number 4 | from typing import Optional, List, Tuple, Union 5 | 6 | from ..autograd import NDArray 7 | from ..autograd import Op, Tensor, Value, TensorOp 8 | from ..autograd import TensorTuple, TensorTupleOp 9 | import numpy 10 | 11 | # NOTE: we will import numpy as the array_api 12 | # as the backend for our computations, this line will change in later homeworks 13 | 14 | BACKEND = "np" 15 | import numpy as array_api 16 | 17 | 18 | class EWiseAdd(TensorOp): 19 | def compute(self, a: NDArray, b: NDArray): 20 | return a + b 21 | 22 | def gradient(self, out_grad: Tensor, node: Tensor): 23 | return out_grad, out_grad 24 | 
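# (Editor's note) The LogSumExp op stubbed out in ops_logarithmic.py above is normally written
# with the max-subtraction trick, so that exp() cannot overflow on inputs like the 1e10 values
# exercised in tests/hw2/test_nn_and_optim.py. A hedged, numpy-only sketch (the helper name is
# hypothetical and not part of the needle API; `numpy` is the module imported at the top of
# this file):

def _stable_logsumexp_sketch(Z, axes=None):
    # subtract a broadcastable max before exponentiating, then add the reduced max back
    max_keep = numpy.max(Z, axis=axes, keepdims=True)
    max_reduced = numpy.max(Z, axis=axes)
    return numpy.log(numpy.sum(numpy.exp(Z - max_keep), axis=axes)) + max_reduced

# Its gradient is essentially softmax(Z - max), broadcast back to Z's shape and scaled by
# out_grad; the reshape/broadcast bookkeeping is left to the LogSumExp.gradient stub above.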
25 | 26 | def add(a, b): 27 | return EWiseAdd()(a, b) 28 | 29 | 30 | class AddScalar(TensorOp): 31 | def __init__(self, scalar): 32 | self.scalar = scalar 33 | 34 | def compute(self, a: NDArray): 35 | return a + self.scalar 36 | 37 | def gradient(self, out_grad: Tensor, node: Tensor): 38 | return out_grad 39 | 40 | 41 | def add_scalar(a, scalar): 42 | return AddScalar(scalar)(a) 43 | 44 | 45 | class EWiseMul(TensorOp): 46 | def compute(self, a: NDArray, b: NDArray): 47 | return a * b 48 | 49 | def gradient(self, out_grad: Tensor, node: Tensor): 50 | lhs, rhs = node.inputs 51 | return out_grad * rhs, out_grad * lhs 52 | 53 | 54 | def multiply(a, b): 55 | return EWiseMul()(a, b) 56 | 57 | 58 | class MulScalar(TensorOp): 59 | def __init__(self, scalar): 60 | self.scalar = scalar 61 | 62 | def compute(self, a: NDArray): 63 | return a * self.scalar 64 | 65 | def gradient(self, out_grad: Tensor, node: Tensor): 66 | return (out_grad * self.scalar,) 67 | 68 | 69 | def mul_scalar(a, scalar): 70 | return MulScalar(scalar)(a) 71 | 72 | 73 | class EWisePow(TensorOp): 74 | """Op to element-wise raise a tensor to a power.""" 75 | 76 | def compute(self, a: NDArray, b: NDArray) -> NDArray: 77 | ### BEGIN YOUR SOLUTION 78 | raise NotImplementedError() 79 | ### END YOUR SOLUTION 80 | 81 | def gradient(self, out_grad, node): 82 | ### BEGIN YOUR SOLUTION 83 | raise NotImplementedError() 84 | ### END YOUR SOLUTION 85 | 86 | 87 | def power(a, b): 88 | return EWisePow()(a, b) 89 | 90 | 91 | class PowerScalar(TensorOp): 92 | """Op raise a tensor to an (integer) power.""" 93 | 94 | def __init__(self, scalar: int): 95 | self.scalar = scalar 96 | 97 | def compute(self, a: NDArray) -> NDArray: 98 | ### BEGIN YOUR SOLUTION 99 | raise NotImplementedError() 100 | ### END YOUR SOLUTION 101 | 102 | def gradient(self, out_grad, node): 103 | ### BEGIN YOUR SOLUTION 104 | raise NotImplementedError() 105 | ### END YOUR SOLUTION 106 | 107 | 108 | def power_scalar(a, scalar): 109 | return PowerScalar(scalar)(a) 110 | 111 | 112 | class EWiseDiv(TensorOp): 113 | """Op to element-wise divide two nodes.""" 114 | 115 | def compute(self, a, b): 116 | ### BEGIN YOUR SOLUTION 117 | raise NotImplementedError() 118 | ### END YOUR SOLUTION 119 | 120 | def gradient(self, out_grad, node): 121 | ### BEGIN YOUR SOLUTION 122 | raise NotImplementedError() 123 | ### END YOUR SOLUTION 124 | 125 | 126 | def divide(a, b): 127 | return EWiseDiv()(a, b) 128 | 129 | 130 | class DivScalar(TensorOp): 131 | def __init__(self, scalar): 132 | self.scalar = scalar 133 | 134 | def compute(self, a): 135 | ### BEGIN YOUR SOLUTION 136 | raise NotImplementedError() 137 | ### END YOUR SOLUTION 138 | 139 | def gradient(self, out_grad, node): 140 | ### BEGIN YOUR SOLUTION 141 | raise NotImplementedError() 142 | ### END YOUR SOLUTION 143 | 144 | 145 | def divide_scalar(a, scalar): 146 | return DivScalar(scalar)(a) 147 | 148 | 149 | class Transpose(TensorOp): 150 | def __init__(self, axes: Optional[tuple] = None): 151 | self.axes = axes 152 | 153 | def compute(self, a): 154 | ### BEGIN YOUR SOLUTION 155 | raise NotImplementedError() 156 | ### END YOUR SOLUTION 157 | 158 | def gradient(self, out_grad, node): 159 | ### BEGIN YOUR SOLUTION 160 | raise NotImplementedError() 161 | ### END YOUR SOLUTION 162 | 163 | 164 | def transpose(a, axes=None): 165 | return Transpose(axes)(a) 166 | 167 | 168 | class Reshape(TensorOp): 169 | def __init__(self, shape): 170 | self.shape = shape 171 | 172 | def compute(self, a): 173 | ### BEGIN YOUR SOLUTION 174 | raise 
NotImplementedError() 175 | ### END YOUR SOLUTION 176 | 177 | def gradient(self, out_grad, node): 178 | ### BEGIN YOUR SOLUTION 179 | raise NotImplementedError() 180 | ### END YOUR SOLUTION 181 | 182 | 183 | def reshape(a, shape): 184 | return Reshape(shape)(a) 185 | 186 | 187 | class BroadcastTo(TensorOp): 188 | def __init__(self, shape): 189 | self.shape = shape 190 | 191 | def compute(self, a): 192 | ### BEGIN YOUR SOLUTION 193 | raise NotImplementedError() 194 | ### END YOUR SOLUTION 195 | 196 | def gradient(self, out_grad, node): 197 | ### BEGIN YOUR SOLUTION 198 | raise NotImplementedError() 199 | ### END YOUR SOLUTION 200 | 201 | 202 | def broadcast_to(a, shape): 203 | return BroadcastTo(shape)(a) 204 | 205 | 206 | class Summation(TensorOp): 207 | def __init__(self, axes: Optional[tuple] = None): 208 | self.axes = axes 209 | 210 | def compute(self, a): 211 | ### BEGIN YOUR SOLUTION 212 | raise NotImplementedError() 213 | ### END YOUR SOLUTION 214 | 215 | def gradient(self, out_grad, node): 216 | ### BEGIN YOUR SOLUTION 217 | raise NotImplementedError() 218 | ### END YOUR SOLUTION 219 | 220 | 221 | def summation(a, axes=None): 222 | return Summation(axes)(a) 223 | 224 | 225 | class MatMul(TensorOp): 226 | def compute(self, a, b): 227 | ### BEGIN YOUR SOLUTION 228 | raise NotImplementedError() 229 | ### END YOUR SOLUTION 230 | 231 | def gradient(self, out_grad, node): 232 | ### BEGIN YOUR SOLUTION 233 | raise NotImplementedError() 234 | ### END YOUR SOLUTION 235 | 236 | 237 | def matmul(a, b): 238 | return MatMul()(a, b) 239 | 240 | 241 | class Negate(TensorOp): 242 | def compute(self, a): 243 | ### BEGIN YOUR SOLUTION 244 | raise NotImplementedError() 245 | ### END YOUR SOLUTION 246 | 247 | def gradient(self, out_grad, node): 248 | ### BEGIN YOUR SOLUTION 249 | raise NotImplementedError() 250 | ### END YOUR SOLUTION 251 | 252 | 253 | def negate(a): 254 | return Negate()(a) 255 | 256 | 257 | class Log(TensorOp): 258 | def compute(self, a): 259 | ### BEGIN YOUR SOLUTION 260 | raise NotImplementedError() 261 | ### END YOUR SOLUTION 262 | 263 | def gradient(self, out_grad, node): 264 | ### BEGIN YOUR SOLUTION 265 | raise NotImplementedError() 266 | ### END YOUR SOLUTION 267 | 268 | 269 | def log(a): 270 | return Log()(a) 271 | 272 | 273 | class Exp(TensorOp): 274 | def compute(self, a): 275 | ### BEGIN YOUR SOLUTION 276 | raise NotImplementedError() 277 | ### END YOUR SOLUTION 278 | 279 | def gradient(self, out_grad, node): 280 | ### BEGIN YOUR SOLUTION 281 | raise NotImplementedError() 282 | ### END YOUR SOLUTION 283 | 284 | 285 | def exp(a): 286 | return Exp()(a) 287 | 288 | 289 | class ReLU(TensorOp): 290 | def compute(self, a): 291 | ### BEGIN YOUR SOLUTION 292 | raise NotImplementedError() 293 | ### END YOUR SOLUTION 294 | 295 | def gradient(self, out_grad, node): 296 | ### BEGIN YOUR SOLUTION 297 | raise NotImplementedError() 298 | ### END YOUR SOLUTION 299 | 300 | 301 | def relu(a): 302 | return ReLU()(a) 303 | 304 | 305 | -------------------------------------------------------------------------------- /python/needle/ops/ops_tuple.py: -------------------------------------------------------------------------------- 1 | from ..autograd import Op, Tensor, TensorTuple, Value, TensorOp, TensorTupleOp 2 | import needle.init as init 3 | 4 | class MakeTensorTuple(TensorTupleOp): 5 | def compute(self, *args) -> tuple: 6 | return tuple(args) 7 | 8 | def gradient(self, out_grad, node): 9 | assert isinstance(out_grad, TensorTuple) 10 | return tuple([out_grad[i] for i in range(len(out_grad))]) 
11 | 12 | 13 | def make_tuple(*args): 14 | return MakeTensorTuple()(*args) 15 | 16 | 17 | class TupleGetItem(TensorOp): 18 | def __init__(self, index): 19 | self.index = index 20 | 21 | def __call__(self, a: TensorTuple, fold_const=True) -> Value: 22 | assert isinstance(a, TensorTuple) 23 | # constant folding 24 | if fold_const and isinstance(a.op, MakeTensorTuple): 25 | return a.inputs[self.index] 26 | return Tensor.make_from_op(self, [a]) 27 | 28 | def compute(self, a): 29 | return a[self.index] 30 | 31 | def gradient(self, out_grad, node): 32 | index = self.index 33 | in_grad = [] 34 | for i, value in enumerate(node.inputs[0]): 35 | if i != index: 36 | in_grad.append(init.zeros_like(value)) 37 | else: 38 | in_grad.append(out_grad) 39 | return MakeTensorTuple()(*in_grad) 40 | 41 | 42 | def tuple_get_item(value, index): 43 | return TupleGetItem(index)(value) 44 | 45 | 46 | class FusedAddScalars(TensorTupleOp): 47 | def __init__(self, c0: float, c1: float): 48 | self.c0 = c0 49 | self.c1 = c1 50 | 51 | def compute(self, a): 52 | return a + self.c0, a + self.c1 53 | 54 | def gradient(self, out_grad, node): 55 | return out_grad[0] + out_grad[1] 56 | 57 | 58 | def fused_add_scalars(x, c0, c1): 59 | return FusedAddScalars(c0, c1)(x) 60 | -------------------------------------------------------------------------------- /python/needle/optim.py: -------------------------------------------------------------------------------- 1 | """Optimization module""" 2 | import needle as ndl 3 | import numpy as np 4 | 5 | 6 | class Optimizer: 7 | def __init__(self, params): 8 | self.params = params 9 | 10 | def step(self): 11 | raise NotImplementedError() 12 | 13 | def reset_grad(self): 14 | for p in self.params: 15 | p.grad = None 16 | 17 | 18 | class SGD(Optimizer): 19 | def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0): 20 | super().__init__(params) 21 | self.lr = lr 22 | self.momentum = momentum 23 | self.u = {} 24 | self.weight_decay = weight_decay 25 | 26 | def step(self): 27 | ### BEGIN YOUR SOLUTION 28 | raise NotImplementedError() 29 | ### END YOUR SOLUTION 30 | 31 | def clip_grad_norm(self, max_norm=0.25): 32 | """ 33 | Clips gradient norm of parameters. 
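        (Editor's note -- hedged sketch of the usual "global norm" recipe, which may or may not be
        exactly what the autograder expects: compute
        total_norm = sqrt(sum over all params p of ||p.grad||^2); if total_norm exceeds max_norm,
        scale every gradient by max_norm / (total_norm + eps) for a small eps such as 1e-6, and
        otherwise leave the gradients unchanged.)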
34 | """ 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION 38 | 39 | 40 | class Adam(Optimizer): 41 | def __init__( 42 | self, 43 | params, 44 | lr=0.01, 45 | beta1=0.9, 46 | beta2=0.999, 47 | eps=1e-8, 48 | weight_decay=0.0, 49 | ): 50 | super().__init__(params) 51 | self.lr = lr 52 | self.beta1 = beta1 53 | self.beta2 = beta2 54 | self.eps = eps 55 | self.weight_decay = weight_decay 56 | self.t = 0 57 | 58 | self.m = {} 59 | self.v = {} 60 | 61 | def step(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | -------------------------------------------------------------------------------- /tests/hw2/test_nn_and_optim.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("./python") 4 | import numpy as np 5 | import needle as ndl 6 | import needle.nn as nn 7 | 8 | sys.path.append("./apps") 9 | from mlp_resnet import * 10 | 11 | import mugrade 12 | 13 | """Deterministically generate a matrix""" 14 | 15 | 16 | def get_tensor(*shape, entropy=1): 17 | np.random.seed(np.prod(shape) * len(shape) * entropy) 18 | return ndl.Tensor(np.random.randint(0, 100, size=shape) / 20, dtype="float32") 19 | 20 | 21 | def get_int_tensor(*shape, low=0, high=10, entropy=1): 22 | np.random.seed(np.prod(shape) * len(shape) * entropy) 23 | return ndl.Tensor(np.random.randint(low, high, size=shape)) 24 | 25 | 26 | def check_prng(*shape): 27 | """We want to ensure that numpy generates random matrices on your machine/colab 28 | Such that our tests will make sense 29 | So this matrix should match our to full precision 30 | """ 31 | return get_tensor(*shape).cached_data 32 | 33 | 34 | def batchnorm_forward(*shape, affine=False): 35 | x = get_tensor(*shape) 36 | bn = ndl.nn.BatchNorm1d(shape[1]) 37 | if affine: 38 | bn.weight.data = get_tensor(shape[1], entropy=42) 39 | bn.bias.data = get_tensor(shape[1], entropy=1337) 40 | return bn(x).cached_data 41 | 42 | 43 | def batchnorm_backward(*shape, affine=False): 44 | x = get_tensor(*shape) 45 | bn = ndl.nn.BatchNorm1d(shape[1]) 46 | if affine: 47 | bn.weight.data = get_tensor(shape[1], entropy=42) 48 | bn.bias.data = get_tensor(shape[1], entropy=1337) 49 | y = (bn(x) ** 2).sum().backward() 50 | return x.grad.cached_data 51 | 52 | 53 | def flatten_forward(*shape): 54 | x = get_tensor(*shape) 55 | tform = ndl.nn.Flatten() 56 | return tform(x).cached_data 57 | 58 | 59 | def flatten_backward(*shape): 60 | x = get_tensor(*shape) 61 | tform = ndl.nn.Flatten() 62 | (tform(x) ** 2).sum().backward() 63 | return x.grad.cached_data 64 | 65 | 66 | def batchnorm_running_mean(*shape, iters=10): 67 | bn = ndl.nn.BatchNorm1d(shape[1]) 68 | for i in range(iters): 69 | x = get_tensor(*shape, entropy=i) 70 | y = bn(x) 71 | return bn.running_mean.cached_data 72 | 73 | 74 | def batchnorm_running_var(*shape, iters=10): 75 | bn = ndl.nn.BatchNorm1d(shape[1]) 76 | for i in range(iters): 77 | x = get_tensor(*shape, entropy=i) 78 | y = bn(x) 79 | return bn.running_var.cached_data 80 | 81 | 82 | def batchnorm_running_grad(*shape, iters=10): 83 | bn = ndl.nn.BatchNorm1d(shape[1]) 84 | for i in range(iters): 85 | x = get_tensor(*shape, entropy=i) 86 | y = bn(x) 87 | bn.eval() 88 | (y**2).sum().backward() 89 | return x.grad.cached_data 90 | 91 | 92 | def relu_forward(*shape): 93 | f = ndl.nn.ReLU() 94 | x = get_tensor(*shape) 95 | return f(x).cached_data 96 | 97 | 98 | def relu_backward(*shape): 99 | f = ndl.nn.ReLU() 100 | x = get_tensor(*shape) 101 
| (f(x) ** 2).sum().backward() 102 | return x.grad.cached_data 103 | 104 | 105 | def layernorm_forward(shape, dim): 106 | f = ndl.nn.LayerNorm1d(dim) 107 | x = get_tensor(*shape) 108 | return f(x).cached_data 109 | 110 | 111 | def layernorm_backward(shape, dims): 112 | f = ndl.nn.LayerNorm1d(dims) 113 | x = get_tensor(*shape) 114 | (f(x) ** 4).sum().backward() 115 | return x.grad.cached_data 116 | 117 | def logsoftmax_forward(shape, mult=1.0): 118 | x = get_tensor(*shape) * mult 119 | return ndl.ops.logsoftmax(x).cached_data 120 | 121 | def logsoftmax_backward(shape, mult=1.0): 122 | x = get_tensor(*shape) 123 | y = ndl.ops.logsoftmax(x * mult) 124 | z = (y**2).sum() 125 | z.backward() 126 | return x.grad.cached_data 127 | 128 | def softmax_loss_forward(rows, classes): 129 | x = get_tensor(rows, classes) 130 | y = get_int_tensor(rows, low=0, high=classes) 131 | f = ndl.nn.SoftmaxLoss() 132 | return np.array(f(x, y).cached_data) 133 | 134 | 135 | def softmax_loss_backward(rows, classes): 136 | x = get_tensor(rows, classes) 137 | y = get_int_tensor(rows, low=0, high=classes) 138 | f = ndl.nn.SoftmaxLoss() 139 | loss = f(x, y) 140 | loss.backward() 141 | return x.grad.cached_data 142 | 143 | 144 | def linear_forward(lhs_shape, rhs_shape): 145 | np.random.seed(199) 146 | f = ndl.nn.Linear(*lhs_shape) 147 | f.bias.data = get_tensor(lhs_shape[-1]) 148 | x = get_tensor(*rhs_shape) 149 | return f(x).cached_data 150 | 151 | 152 | def linear_backward(lhs_shape, rhs_shape): 153 | np.random.seed(199) 154 | f = ndl.nn.Linear(*lhs_shape) 155 | f.bias.data = get_tensor(lhs_shape[-1]) 156 | x = get_tensor(*rhs_shape) 157 | (f(x) ** 2).sum().backward() 158 | return x.grad.cached_data 159 | 160 | 161 | def sequential_forward(batches=3): 162 | np.random.seed(42) 163 | f = nn.Sequential(nn.Linear(5, 8), nn.ReLU(), nn.Linear(8, 5)) 164 | x = get_tensor(batches, 5) 165 | return f(x).cached_data 166 | 167 | 168 | def sequential_backward(batches=3): 169 | np.random.seed(42) 170 | f = nn.Sequential(nn.Linear(5, 8), nn.ReLU(), nn.Linear(8, 5)) 171 | x = get_tensor(batches, 5) 172 | f(x).sum().backward() 173 | return x.grad.cached_data 174 | 175 | 176 | def residual_forward(shape=(5, 5)): 177 | np.random.seed(42) 178 | f = nn.Residual( 179 | nn.Sequential(nn.Linear(*shape), nn.ReLU(), nn.Linear(*shape[::-1])) 180 | ) 181 | x = get_tensor(*shape[::-1]) 182 | return f(x).cached_data 183 | 184 | 185 | def residual_backward(shape=(5, 5)): 186 | np.random.seed(42) 187 | f = nn.Residual( 188 | nn.Sequential(nn.Linear(*shape), nn.ReLU(), nn.Linear(*shape[::-1])) 189 | ) 190 | x = get_tensor(*shape[::-1]) 191 | f(x).sum().backward() 192 | return x.grad.cached_data 193 | 194 | 195 | def learn_model_1d(feature_size, nclasses, _model, optimizer, epochs=1, **kwargs): 196 | np.random.seed(42) 197 | model = _model([]) 198 | X = get_tensor(1024, feature_size).cached_data 199 | y = get_int_tensor(1024, low=0, high=nclasses).cached_data.astype(np.uint8) 200 | m = X.shape[0] 201 | batch = 32 202 | 203 | loss_func = nn.SoftmaxLoss() 204 | opt = optimizer(model.parameters(), **kwargs) 205 | 206 | for _ in range(epochs): 207 | for i, (X0, y0) in enumerate( 208 | zip(np.array_split(X, m // batch), np.array_split(y, m // batch)) 209 | ): 210 | opt.reset_grad() 211 | X0, y0 = ndl.Tensor(X0, dtype="float32"), ndl.Tensor(y0) 212 | out = model(X0) 213 | loss = loss_func(out, y0) 214 | loss.backward() 215 | # Opt should not change gradients. 
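            # (Editor's note) i.e. Optimizer.step() should build new tensors for the update --
            # roughly the pattern  p.data = p.data - lr * <update computed from p.grad>  --
            # rather than mutating p.grad (or its cached_data) in place; the check below compares
            # the gradient before and after step() to enforce exactly that. (Hedged sketch of the
            # pattern only, not the required implementation.)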
216 | grad_before = model.parameters()[0].grad.detach().cached_data 217 | opt.step() 218 | grad_after = model.parameters()[0].grad.detach().cached_data 219 | np.testing.assert_allclose( 220 | grad_before, 221 | grad_after, 222 | rtol=1e-5, 223 | atol=1e-5, 224 | err_msg="Optim should not modify gradients in place", 225 | ) 226 | 227 | return np.array(loss.cached_data) 228 | 229 | 230 | def learn_model_1d_eval(feature_size, nclasses, _model, optimizer, epochs=1, **kwargs): 231 | np.random.seed(42) 232 | model = _model([]) 233 | X = get_tensor(1024, feature_size).cached_data 234 | y = get_int_tensor(1024, low=0, high=nclasses).cached_data.astype(np.uint8) 235 | m = X.shape[0] 236 | batch = 32 237 | 238 | loss_func = nn.SoftmaxLoss() 239 | opt = optimizer(model.parameters(), **kwargs) 240 | 241 | for i, (X0, y0) in enumerate( 242 | zip(np.array_split(X, m // batch), np.array_split(y, m // batch)) 243 | ): 244 | opt.reset_grad() 245 | X0, y0 = ndl.Tensor(X0, dtype="float32"), ndl.Tensor(y0) 246 | out = model(X0) 247 | loss = loss_func(out, y0) 248 | loss.backward() 249 | opt.step() 250 | 251 | X_test = ndl.Tensor(get_tensor(batch, feature_size).cached_data) 252 | y_test = ndl.Tensor( 253 | get_int_tensor(batch, low=0, high=nclasses).cached_data.astype(np.uint8) 254 | ) 255 | 256 | model.eval() 257 | 258 | return np.array(loss_func(model(X_test), y_test).cached_data) 259 | 260 | 261 | def init_a_tensor_of_shape(shape, init_fn): 262 | x = get_tensor(*shape) 263 | np.random.seed(42) 264 | init_fn(x) 265 | return x.cached_data 266 | 267 | 268 | def global_tensor_count(): 269 | return np.array(ndl.autograd.TENSOR_COUNTER) 270 | 271 | 272 | def nn_linear_weight_init(): 273 | np.random.seed(1337) 274 | f = ndl.nn.Linear(7, 4) 275 | f.weight.cached_data 276 | return f.weight.cached_data 277 | 278 | 279 | def nn_linear_bias_init(): 280 | np.random.seed(1337) 281 | f = ndl.nn.Linear(7, 4) 282 | return f.bias.cached_data 283 | 284 | 285 | class UselessModule(ndl.nn.Module): 286 | def __init__(self): 287 | super().__init__() 288 | self.stuff = { 289 | "layer1": nn.Linear(4, 4), 290 | "layer2": [nn.Dropout(0.1), nn.Sequential(nn.Linear(4, 4))], 291 | } 292 | 293 | def forward(self, x): 294 | raise NotImplementedError() 295 | 296 | 297 | def check_training_mode(): 298 | model = nn.Sequential( 299 | nn.BatchNorm1d(4), 300 | nn.Sequential( 301 | nn.LayerNorm1d(4), 302 | nn.Linear(4, 4), 303 | nn.Dropout(0.1), 304 | ), 305 | nn.Linear(4, 4), 306 | UselessModule(), 307 | ) 308 | 309 | model_refs = [ 310 | model.modules[0], 311 | model.modules[1].modules[0], 312 | model.modules[1].modules[1], 313 | model.modules[1].modules[2], 314 | model.modules[2], 315 | model.modules[3], 316 | model.modules[3].stuff["layer1"], 317 | model.modules[3].stuff["layer2"][0], 318 | model.modules[3].stuff["layer2"][1].modules[0], 319 | ] 320 | 321 | eval_mode = [1 if not x.training else 0 for x in model_refs] 322 | model.eval() 323 | eval_mode.extend([1 if not x.training else 0 for x in model_refs]) 324 | model.train() 325 | eval_mode.extend([1 if not x.training else 0 for x in model_refs]) 326 | 327 | return np.array(eval_mode) 328 | 329 | 330 | def power_scalar_forward(shape, power=2): 331 | x = get_tensor(*shape) 332 | return (x**power).cached_data 333 | 334 | 335 | def power_scalar_backward(shape, power=2): 336 | x = get_tensor(*shape) 337 | y = (x**power).sum() 338 | y.backward() 339 | return x.grad.cached_data 340 | 341 | 342 | def logsumexp_forward(shape, axes): 343 | x = get_tensor(*shape) 344 | return (ndl.ops.logsumexp(x, 
axes=axes)).cached_data 345 | 346 | 347 | def logsumexp_backward(shape, axes): 348 | x = get_tensor(*shape) 349 | y = (ndl.ops.logsumexp(x, axes=axes) ** 2).sum() 350 | y.backward() 351 | return x.grad.cached_data 352 | 353 | 354 | def dropout_forward(shape, prob=0.5): 355 | np.random.seed(3) 356 | x = get_tensor(*shape) 357 | f = nn.Dropout(prob) 358 | return f(x).cached_data 359 | 360 | 361 | def dropout_backward(shape, prob=0.5): 362 | np.random.seed(3) 363 | x = get_tensor(*shape) 364 | f = nn.Dropout(prob) 365 | y = f(x).sum() 366 | y.backward() 367 | return x.grad.cached_data 368 | 369 | 370 | def num_params(model): 371 | return np.sum([np.prod(x.shape) for x in model.parameters()]) 372 | 373 | 374 | def residual_block_num_params(dim, hidden_dim, norm): 375 | model = ResidualBlock(dim, hidden_dim, norm) 376 | return np.array(num_params(model)) 377 | 378 | 379 | def residual_block_forward(dim, hidden_dim, norm, drop_prob): 380 | np.random.seed(2) 381 | input_tensor = ndl.Tensor(np.random.randn(1, dim)) 382 | output_tensor = ResidualBlock(dim, hidden_dim, norm, drop_prob)(input_tensor) 383 | return output_tensor.numpy() 384 | 385 | 386 | def mlp_resnet_num_params(dim, hidden_dim, num_blocks, num_classes, norm): 387 | model = MLPResNet(dim, hidden_dim, num_blocks, num_classes, norm) 388 | return np.array(num_params(model)) 389 | 390 | 391 | def mlp_resnet_forward(dim, hidden_dim, num_blocks, num_classes, norm, drop_prob): 392 | np.random.seed(4) 393 | input_tensor = ndl.Tensor(np.random.randn(2, dim), dtype=np.float32) 394 | output_tensor = MLPResNet( 395 | dim, hidden_dim, num_blocks, num_classes, norm, drop_prob 396 | )(input_tensor) 397 | return output_tensor.numpy() 398 | 399 | 400 | def train_epoch_1(hidden_dim, batch_size, optimizer, **kwargs): 401 | np.random.seed(1) 402 | train_dataset = ndl.data.MNISTDataset( 403 | "./data/train-images-idx3-ubyte.gz", "./data/train-labels-idx1-ubyte.gz" 404 | ) 405 | train_dataloader = ndl.data.DataLoader(dataset=train_dataset, batch_size=batch_size) 406 | 407 | model = MLPResNet(784, hidden_dim) 408 | opt = optimizer(model.parameters(), **kwargs) 409 | model.eval() 410 | return np.array(epoch(train_dataloader, model, opt)) 411 | 412 | 413 | def eval_epoch_1(hidden_dim, batch_size): 414 | np.random.seed(1) 415 | test_dataset = ndl.data.MNISTDataset( 416 | "./data/t10k-images-idx3-ubyte.gz", "./data/t10k-labels-idx1-ubyte.gz" 417 | ) 418 | test_dataloader = ndl.data.DataLoader( 419 | dataset=test_dataset, batch_size=batch_size, shuffle=False 420 | ) 421 | 422 | model = MLPResNet(784, hidden_dim) 423 | model.train() 424 | return np.array(epoch(test_dataloader, model)) 425 | 426 | 427 | def train_mnist_1(batch_size, epochs, optimizer, lr, weight_decay, hidden_dim): 428 | np.random.seed(1) 429 | out = train_mnist( 430 | batch_size, epochs, optimizer, lr, weight_decay, hidden_dim, data_dir="./data" 431 | ) 432 | return np.array(out) 433 | 434 | 435 | def test_check_prng_contact_us_if_this_fails_1(): 436 | np.testing.assert_allclose( 437 | check_prng(3, 3), 438 | np.array( 439 | [[2.1, 0.95, 3.45], [3.1, 2.45, 2.3], [3.3, 0.4, 1.2]], dtype=np.float32 440 | ), 441 | rtol=1e-08, 442 | atol=1e-08, 443 | ) 444 | 445 | 446 | def test_op_power_scalar_forward_1(): 447 | np.testing.assert_allclose( 448 | power_scalar_forward((2, 2), power=2), 449 | np.array([[11.222499, 17.639997], [0.0625, 20.25]], dtype=np.float32), 450 | rtol=1e-5, 451 | atol=1e-5, 452 | ) 453 | 454 | 455 | def test_op_power_scalar_forward_2(): 456 | np.testing.assert_allclose( 457 | 
power_scalar_forward((2, 2), power=-1.5), 458 | np.array([[0.16309206, 0.11617859], [8.0, 0.10475656]], dtype=np.float32), 459 | rtol=1e-5, 460 | atol=1e-5, 461 | ) 462 | 463 | 464 | def test_op_power_scalar_backward_1(): 465 | np.testing.assert_allclose( 466 | power_scalar_backward((2, 2), power=2), 467 | np.array([[6.7, 8.4], [0.5, 9.0]], dtype=np.float32), 468 | rtol=1e-5, 469 | atol=1e-5, 470 | ) 471 | 472 | 473 | def test_op_logsoftmax_forward_1(): 474 | np.testing.assert_allclose(logsoftmax_forward((3, 3)), 475 | np.array([[-1.6436583 , -2.7936583 , -0.29365814], 476 | [-0.6787312 , -1.3287311 , -1.4787312 ], 477 | [-0.16337626, -3.0633762 , -2.2633762 ]], dtype=np.float32), rtol=1e-5, atol=1e-5) 478 | 479 | def test_op_logsoftmax_stable_forward_1(): 480 | np.testing.assert_allclose(logsoftmax_forward((3, 3), mult=1e5), 481 | np.array([[-135000.02, -250000. , 0. ], 482 | [ 0. , -65000. , -80000. ], 483 | [ 0. , -290000. , -210000. ]], dtype=np.float32), rtol=1e-5, atol=1e-5) 484 | 485 | def test_op_logsoftmax_backward_1(): 486 | np.testing.assert_allclose(logsoftmax_backward((3, 3)), 487 | np.array([[-1.4585897 , -5.008274 , 6.4668627 ], 488 | [ 2.1793516 , -0.81108296, -1.3682691 ], 489 | [ 8.998467 , -5.613649 , -3.3848193 ]], dtype=np.float32), rtol=1e-5, atol=1e-5) 490 | 491 | def submit_op_logsoftmax(): 492 | mugrade.submit(logsoftmax_forward((3, 4))) 493 | mugrade.submit(logsoftmax_forward((3, 5), mult=1e5)) 494 | mugrade.submit(logsoftmax_forward((3, 6), mult=1e5)) 495 | mugrade.submit(logsoftmax_backward((1, 3))) 496 | mugrade.submit(logsoftmax_backward((3, 6), mult=1e5)) 497 | 498 | 499 | def test_op_logsumexp_forward_1(): 500 | np.testing.assert_allclose( 501 | logsumexp_forward((3, 3, 3), (1, 2)), 502 | np.array([5.366029, 4.9753823, 6.208126], dtype=np.float32), 503 | rtol=1e-5, 504 | atol=1e-5, 505 | ) 506 | 507 | 508 | def test_op_logsumexp_forward_2(): 509 | np.testing.assert_allclose( 510 | logsumexp_forward((3, 3, 3), None), 511 | np.array([6.7517853], dtype=np.float32), 512 | rtol=1e-5, 513 | atol=1e-5, 514 | ) 515 | 516 | 517 | def test_op_logsumexp_forward_3(): 518 | np.testing.assert_allclose( 519 | logsumexp_forward((1, 2, 3, 4), (0, 2)), 520 | np.array( 521 | [ 522 | [5.276974, 5.047317, 3.778802, 5.0103745], 523 | [5.087831, 4.391712, 5.025037, 2.0214698], 524 | ], 525 | dtype=np.float32, 526 | ), 527 | rtol=1e-5, 528 | atol=1e-5, 529 | ) 530 | 531 | 532 | def test_op_logsumexp_forward_4(): 533 | np.testing.assert_allclose( 534 | logsumexp_forward((3, 10), (1,)), 535 | np.array([5.705309, 5.976375, 5.696459], dtype=np.float32), 536 | rtol=1e-5, 537 | atol=1e-5, 538 | ) 539 | 540 | 541 | def test_op_logsumexp_forward_5(): 542 | test_data = ndl.ops.logsumexp( 543 | ndl.Tensor(np.array([[1e10, 1e9, 1e8, -10], [1e-10, 1e9, 1e8, -10]])), (0,) 544 | ).numpy() 545 | np.testing.assert_allclose( 546 | test_data, 547 | np.array([1.00000000e10, 1.00000000e09, 1.00000001e08, -9.30685282e00]), 548 | rtol=1e-5, 549 | atol=1e-5, 550 | ) 551 | 552 | 553 | def test_op_logsumexp_backward_1(): 554 | np.testing.assert_allclose( 555 | logsumexp_backward((3, 1), (1,)), 556 | np.array([[1.0], [7.3], [9.9]], dtype=np.float32), 557 | rtol=1e-5, 558 | atol=1e-5, 559 | ) 560 | 561 | 562 | def test_op_logsumexp_backward_2(): 563 | np.testing.assert_allclose( 564 | logsumexp_backward((3, 3, 3), (1, 2)), 565 | np.array( 566 | [ 567 | [ 568 | [1.4293308, 1.2933122, 0.82465225], 569 | [0.50017685, 2.1323113, 2.1323113], 570 | [1.4293308, 0.58112264, 0.40951014], 571 | ], 572 | [ 573 | 
[0.3578173, 0.07983983, 4.359107], 574 | [1.1300558, 0.561169, 0.1132981], 575 | [0.9252113, 0.65198547, 1.7722803], 576 | ], 577 | [ 578 | [0.2755132, 2.365242, 2.888913], 579 | [0.05291228, 1.1745441, 0.02627547], 580 | [2.748018, 0.13681579, 2.748018], 581 | ], 582 | ], 583 | dtype=np.float32, 584 | ), 585 | rtol=1e-5, 586 | atol=1e-5, 587 | ) 588 | 589 | 590 | def test_op_logsumexp_backward_3(): 591 | np.testing.assert_allclose( 592 | logsumexp_backward((3, 3, 3), (0, 2)), 593 | np.array( 594 | [ 595 | [ 596 | [0.92824626, 0.839912, 0.5355515], 597 | [0.59857905, 2.551811, 2.551811], 598 | [1.0213376, 0.41524494, 0.29261813], 599 | ], 600 | [ 601 | [0.16957533, 0.03783737, 2.0658503], 602 | [0.98689, 0.49007502, 0.09894446], 603 | [0.48244575, 0.3399738, 0.9241446], 604 | ], 605 | [ 606 | [0.358991, 3.081887, 3.764224], 607 | [0.12704718, 2.820187, 0.06308978], 608 | [3.9397335, 0.19614778, 3.9397335], 609 | ], 610 | ], 611 | dtype=np.float32, 612 | ), 613 | rtol=1e-5, 614 | atol=1e-5, 615 | ) 616 | 617 | 618 | def test_op_logsumexp_backward_5(): 619 | grad_compare = ndl.Tensor(np.array([[1e10, 1e9, 1e8, -10], [1e-10, 1e9, 1e8, -10]])) 620 | test_data = (ndl.ops.logsumexp(grad_compare, (0,)) ** 2).sum().backward() 621 | np.testing.assert_allclose( 622 | grad_compare.grad.cached_data, 623 | np.array( 624 | [ 625 | [2.00000000e10, 9.99999999e08, 1.00000001e08, -9.30685282e00], 626 | [0.00000000e00, 9.99999999e08, 1.00000001e08, -9.30685282e00], 627 | ] 628 | ), 629 | rtol=1e-5, 630 | atol=1e-5, 631 | ) 632 | 633 | 634 | def submit_op_logsumexp(): 635 | mugrade.submit(logsumexp_forward((2, 2, 2), None)) 636 | mugrade.submit(logsumexp_forward((1, 2, 3), (0,))) 637 | mugrade.submit(logsumexp_forward((2, 3, 3), (1, 2))) 638 | mugrade.submit(logsumexp_forward((1, 2, 2, 2, 2), (1, 2, 3, 4))) 639 | mugrade.submit(logsumexp_forward((1, 2, 2, 2, 2), (0, 1, 3))) 640 | mugrade.submit(logsumexp_backward((2, 2, 2), None)) 641 | mugrade.submit(logsumexp_backward((1, 2, 3), (0,))) 642 | mugrade.submit(logsumexp_backward((2, 3, 3), (1, 2))) 643 | mugrade.submit(logsumexp_backward((1, 2, 2, 2, 2), (1, 2, 3, 4))) 644 | mugrade.submit(logsumexp_backward((1, 2, 2, 2, 2), (0, 1, 3))) 645 | 646 | 647 | def test_op_logsumexp_backward_4(): 648 | np.testing.assert_allclose( 649 | logsumexp_backward((1, 2, 3, 4), None), 650 | np.array( 651 | [ 652 | [ 653 | [ 654 | [0.96463485, 1.30212122, 0.09671321, 1.84779774], 655 | [1.84779774, 0.39219132, 0.21523925, 0.30543892], 656 | [0.01952606, 0.55654611, 0.32109909, 0.01598658], 657 | ], 658 | [ 659 | [1.30212122, 0.83026929, 0.30543892, 0.01680623], 660 | [0.29054249, 0.07532032, 1.84779774, 0.05307731], 661 | [0.75125862, 0.26289377, 0.04802637, 0.03932065], 662 | ], 663 | ] 664 | ], 665 | dtype=np.float32, 666 | ), 667 | rtol=1e-5, 668 | atol=1e-5, 669 | ) 670 | 671 | 672 | def test_init_kaiming_uniform(): 673 | np.random.seed(42) 674 | np.testing.assert_allclose( 675 | ndl.init.kaiming_uniform(3, 5).numpy(), 676 | np.array( 677 | [ 678 | [-0.35485414, 1.2748126, 0.65617794, 0.27904832, -0.9729262], 679 | [-0.97299445, -1.2499284, 1.0357026, 0.28599644, 0.58851814], 680 | [-1.3559918, 1.3291057, 0.9402898, -0.81362784, -0.8999349], 681 | ], 682 | dtype=np.float32, 683 | ), 684 | rtol=1e-4, 685 | atol=1e-4, 686 | ) 687 | 688 | 689 | def test_init_kaiming_normal(): 690 | np.random.seed(42) 691 | np.testing.assert_allclose( 692 | ndl.init.kaiming_normal(3, 5).numpy(), 693 | np.array( 694 | [ 695 | [0.4055654, -0.11289233, 0.5288355, 1.2435486, -0.19118543], 696 | 
[-0.19117202, 1.2894219, 0.62660784, -0.38332424, 0.4429984], 697 | [-0.37837896, -0.38026676, 0.19756137, -1.5621868, -1.4083896], 698 | ], 699 | dtype=np.float32, 700 | ), 701 | rtol=1e-4, 702 | atol=1e-4, 703 | ) 704 | 705 | 706 | def test_init_xavier_uniform(): 707 | np.random.seed(42) 708 | np.testing.assert_allclose( 709 | ndl.init.xavier_uniform(3, 5, gain=1.5).numpy(), 710 | np.array( 711 | [ 712 | [-0.32595432, 1.1709901, 0.60273796, 0.25632226, -0.8936898], 713 | [-0.89375246, -1.1481324, 0.95135355, 0.26270452, 0.54058844], 714 | [-1.245558, 1.2208616, 0.8637113, -0.74736494, -0.826643], 715 | ], 716 | dtype=np.float32, 717 | ), 718 | rtol=1e-4, 719 | atol=1e-4, 720 | ) 721 | 722 | 723 | def test_init_xavier_normal(): 724 | np.random.seed(42) 725 | np.testing.assert_allclose( 726 | ndl.init.xavier_normal(3, 5, gain=0.33).numpy(), 727 | np.array( 728 | [ 729 | [0.08195783, -0.022813609, 0.10686861, 0.25129992, -0.038635306], 730 | [-0.038632598, 0.2605701, 0.12662673, -0.07746328, 0.08952241], 731 | [-0.07646392, -0.07684541, 0.039923776, -0.31569123, -0.28461143], 732 | ], 733 | dtype=np.float32, 734 | ), 735 | rtol=1e-4, 736 | atol=1e-4, 737 | ) 738 | 739 | 740 | def submit_init(): 741 | np.random.seed(0) 742 | mugrade.submit(ndl.init.kaiming_normal(2, 5).numpy()) 743 | mugrade.submit(ndl.init.kaiming_uniform(2, 5).numpy()) 744 | mugrade.submit(ndl.init.xavier_uniform(2, 5, gain=0.33).numpy()) 745 | mugrade.submit(ndl.init.xavier_normal(2, 5, gain=1.3).numpy()) 746 | 747 | 748 | def test_nn_linear_weight_init_1(): 749 | np.testing.assert_allclose( 750 | nn_linear_weight_init(), 751 | np.array( 752 | [ 753 | [-4.4064468e-01, -6.3199449e-01, -4.1082984e-01, -7.5330488e-02], 754 | [-3.3144259e-01, 3.4056887e-02, -4.4079605e-01, 8.8153863e-01], 755 | [4.3108878e-01, -7.1237373e-01, -2.1057765e-01, 2.3793796e-01], 756 | [-6.9425780e-01, 8.9535803e-01, -1.0512712e-01, 5.3615785e-01], 757 | [5.4460180e-01, -2.5689366e-01, -1.5534532e-01, 1.5601574e-01], 758 | [4.8174453e-01, -5.7806653e-01, -3.9223823e-01, 3.1518409e-01], 759 | [-6.5129338e-04, -5.9517515e-01, -1.6083106e-01, -5.5698222e-01], 760 | ], 761 | dtype=np.float32, 762 | ), 763 | rtol=1e-5, 764 | atol=1e-5, 765 | ) 766 | 767 | 768 | def test_nn_linear_bias_init_1(): 769 | np.testing.assert_allclose( 770 | nn_linear_bias_init(), 771 | np.array([[0.077647, 0.814139, -0.770975, 1.120297]], dtype=np.float32), 772 | rtol=1e-5, 773 | atol=1e-5, 774 | ) 775 | 776 | 777 | def test_nn_linear_forward_1(): 778 | np.testing.assert_allclose( 779 | linear_forward((10, 5), (1, 10)), 780 | np.array([[3.849948, 9.50499, 2.38029, 5.572587, 5.668391]], dtype=np.float32), 781 | rtol=1e-5, 782 | atol=1e-5, 783 | ) 784 | 785 | 786 | def test_nn_linear_forward_2(): 787 | np.testing.assert_allclose( 788 | linear_forward((10, 5), (3, 10)), 789 | np.array( 790 | [ 791 | [7.763089, 10.086785, 0.380316, 6.242502, 6.944664], 792 | [2.548275, 7.747925, 5.343155, 2.065694, 9.871243], 793 | [2.871696, 7.466332, 4.236925, 2.461897, 8.209476], 794 | ], 795 | dtype=np.float32, 796 | ), 797 | rtol=1e-5, 798 | atol=1e-5, 799 | ) 800 | 801 | 802 | def test_nn_linear_forward_3(): 803 | np.testing.assert_allclose( 804 | linear_forward((10, 5), (1, 3, 10)), 805 | np.array( 806 | [ 807 | [ 808 | [4.351459, 8.782808, 3.935711, 3.03171, 8.014219], 809 | [5.214458, 8.728788, 2.376814, 5.672185, 4.974319], 810 | [1.343204, 8.639378, 2.604359, -0.282955, 9.864498], 811 | ] 812 | ], 813 | dtype=np.float32, 814 | ), 815 | rtol=1e-5, 816 | atol=1e-5, 817 | ) 818 | 819 | 820 
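# (Editor's note) In the three forward tests above, linear_forward((10, 5), shape) builds
# nn.Linear(10, 5) (10 input features, 5 output features); the (1, 10), (3, 10) and (1, 3, 10)
# inputs therefore produce outputs of shape (1, 5), (3, 5) and (1, 3, 5), i.e. X @ W plus a bias
# broadcast over all leading batch dimensions. The backward tests below check the gradient with
# respect to X for the same shapes.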
| def test_nn_linear_backward_1(): 821 | np.testing.assert_allclose( 822 | linear_backward((10, 5), (1, 10)), 823 | np.array( 824 | [ 825 | [ 826 | 20.61148, 827 | 6.920893, 828 | -1.625556, 829 | -13.497676, 830 | -6.672813, 831 | 18.762121, 832 | 7.286628, 833 | 8.18535, 834 | 2.741301, 835 | 5.723689, 836 | ] 837 | ], 838 | dtype=np.float32, 839 | ), 840 | rtol=1e-5, 841 | atol=1e-5, 842 | ) 843 | 844 | 845 | def test_nn_linear_backward_2(): 846 | print(linear_backward((10, 5), (3, 10))) 847 | np.testing.assert_allclose( 848 | linear_backward((10, 5), (3, 10)), 849 | np.array( 850 | [ 851 | [ 852 | 24.548800, 853 | 8.775347, 854 | 4.387898, 855 | -21.248514, 856 | -3.9669373, 857 | 24.256767, 858 | 6.3171115, 859 | 6.029777, 860 | 0.8809935, 861 | 3.5995162, 862 | ], 863 | [ 864 | 12.233745, 865 | -3.792646, 866 | -4.1903896, 867 | -5.106719, 868 | -12.004269, 869 | 11.967942, 870 | 11.939469, 871 | 19.314493, 872 | 10.631226, 873 | 14.510731, 874 | ], 875 | [ 876 | 12.920014, 877 | -1.4545978, 878 | -3.0892954, 879 | -6.762379, 880 | -9.713004, 881 | 12.523148, 882 | 9.904757, 883 | 15.442993, 884 | 8.044141, 885 | 11.4106865, 886 | ], 887 | ], 888 | dtype=np.float32, 889 | ), 890 | rtol=1e-5, 891 | atol=1e-5, 892 | ) 893 | 894 | 895 | def test_nn_linear_backward_3(): 896 | print(linear_backward((10, 5), (1, 3, 10))) 897 | np.testing.assert_allclose( 898 | linear_backward((10, 5), (1, 3, 10)), 899 | np.array( 900 | [ 901 | [ 902 | [ 903 | 16.318823, 904 | 0.3890714, 905 | -2.3196607, 906 | -10.607947, 907 | -8.891977, 908 | 16.04581, 909 | 9.475689, 910 | 14.571134, 911 | 6.581477, 912 | 10.204643, 913 | ], 914 | [ 915 | 20.291656, 916 | 7.48733, 917 | 1.2581345, 918 | -14.285493, 919 | -6.0252004, 920 | 19.621624, 921 | 4.343303, 922 | 6.973201, 923 | -0.8103489, 924 | 4.037069, 925 | ], 926 | [ 927 | 11.332953, 928 | -5.698288, 929 | -8.815561, 930 | -7.673438, 931 | -7.6161675, 932 | 9.361553, 933 | 17.341637, 934 | 17.269142, 935 | 18.1076, 936 | 14.261493, 937 | ], 938 | ] 939 | ], 940 | dtype=np.float32, 941 | ), 942 | rtol=1e-5, 943 | atol=1e-5, 944 | ) 945 | 946 | 947 | def submit_nn_linear(): 948 | mugrade.submit(linear_forward((3, 5), (1, 3))) 949 | mugrade.submit(linear_forward((3, 5), (3, 3))) 950 | mugrade.submit(linear_forward((3, 5), (1, 3, 3))) 951 | mugrade.submit(linear_backward((4, 5), (1, 4))) 952 | mugrade.submit(linear_backward((4, 5), (3, 4))) 953 | mugrade.submit(linear_backward((4, 5), (1, 3, 4))) 954 | 955 | 956 | def test_nn_relu_forward_1(): 957 | np.testing.assert_allclose( 958 | relu_forward(2, 2), 959 | np.array([[3.35, 4.2], [0.25, 4.5]], dtype=np.float32), 960 | rtol=1e-5, 961 | atol=1e-5, 962 | ) 963 | 964 | 965 | def test_nn_relu_backward_1(): 966 | np.testing.assert_allclose( 967 | relu_backward(3, 2), 968 | np.array([[7.5, 2.7], [0.6, 0.2], [0.3, 6.7]], dtype=np.float32), 969 | rtol=1e-5, 970 | atol=1e-5, 971 | ) 972 | 973 | 974 | def submit_nn_relu(): 975 | mugrade.submit(relu_forward(2, 3)) 976 | mugrade.submit(relu_backward(3, 4)) 977 | 978 | 979 | def test_nn_sequential_forward_1(): 980 | print(sequential_forward(batches=3)) 981 | np.testing.assert_allclose( 982 | sequential_forward(batches=3), 983 | np.array( 984 | [ 985 | [3.296263, 0.057031, 2.97568, -4.618432, -0.902491], 986 | [2.465332, -0.228394, 2.069803, -3.772378, -0.238334], 987 | [3.04427, -0.25623, 3.848721, -6.586399, -0.576819], 988 | ], 989 | dtype=np.float32, 990 | ), 991 | rtol=1e-5, 992 | atol=1e-5, 993 | ) 994 | 995 | 996 | def test_nn_sequential_backward_1(): 997 | 
np.testing.assert_allclose( 998 | sequential_backward(batches=3), 999 | np.array( 1000 | [ 1001 | [0.802697, -1.0971, 0.120842, 0.033051, 0.241105], 1002 | [-0.364489, 0.651385, 0.482428, 0.925252, -1.233545], 1003 | [0.802697, -1.0971, 0.120842, 0.033051, 0.241105], 1004 | ], 1005 | dtype=np.float32, 1006 | ), 1007 | rtol=1e-5, 1008 | atol=1e-5, 1009 | ) 1010 | 1011 | 1012 | def submit_nn_sequential(): 1013 | mugrade.submit(sequential_forward(batches=2)) 1014 | mugrade.submit(sequential_backward(batches=2)) 1015 | 1016 | 1017 | def test_nn_softmax_loss_forward_1(): 1018 | np.testing.assert_allclose( 1019 | softmax_loss_forward(5, 10), 1020 | np.array(4.041218, dtype=np.float32), 1021 | rtol=1e-5, 1022 | atol=1e-5, 1023 | ) 1024 | 1025 | 1026 | def test_nn_softmax_loss_forward_2(): 1027 | np.testing.assert_allclose( 1028 | softmax_loss_forward(3, 11), 1029 | np.array(3.3196716, dtype=np.float32), 1030 | rtol=1e-5, 1031 | atol=1e-5, 1032 | ) 1033 | 1034 | 1035 | def test_nn_softmax_loss_backward_1(): 1036 | np.testing.assert_allclose( 1037 | softmax_loss_backward(5, 10), 1038 | np.array( 1039 | [ 1040 | [ 1041 | 0.00068890385, 1042 | 0.0015331834, 1043 | 0.013162163, 1044 | -0.16422154, 1045 | 0.023983022, 1046 | 0.0050903494, 1047 | 0.00076135644, 1048 | 0.050772052, 1049 | 0.0062173656, 1050 | 0.062013146, 1051 | ], 1052 | [ 1053 | 0.012363418, 1054 | 0.02368262, 1055 | 0.11730081, 1056 | 0.001758993, 1057 | 0.004781439, 1058 | 0.0029000894, 1059 | -0.19815083, 1060 | 0.017544521, 1061 | 0.015874943, 1062 | 0.0019439887, 1063 | ], 1064 | [ 1065 | 0.001219767, 1066 | 0.08134181, 1067 | 0.057320606, 1068 | 0.0008595553, 1069 | 0.0030001428, 1070 | 0.0009499555, 1071 | -0.19633561, 1072 | 0.0008176346, 1073 | 0.0014898272, 1074 | 0.0493363, 1075 | ], 1076 | [ 1077 | -0.19886842, 1078 | 0.08767337, 1079 | 0.017700946, 1080 | 0.026406704, 1081 | 0.0013147127, 1082 | 0.0107361665, 1083 | 0.009714483, 1084 | 0.023893777, 1085 | 0.019562569, 1086 | 0.0018656658, 1087 | ], 1088 | [ 1089 | 0.007933789, 1090 | 0.017656967, 1091 | 0.027691642, 1092 | 0.0005605318, 1093 | 0.05576411, 1094 | 0.0013114461, 1095 | 0.06811045, 1096 | 0.011835824, 1097 | 0.0071787895, 1098 | -0.19804356, 1099 | ], 1100 | ], 1101 | dtype=np.float32, 1102 | ), 1103 | rtol=1e-5, 1104 | atol=1e-5, 1105 | ) 1106 | 1107 | 1108 | def test_nn_softmax_loss_backward_2(): 1109 | np.testing.assert_allclose( 1110 | softmax_loss_backward(3, 11), 1111 | np.array( 1112 | [ 1113 | [ 1114 | 0.0027466794, 1115 | 0.020295369, 1116 | 0.012940894, 1117 | 0.04748398, 1118 | 0.052477922, 1119 | 0.090957515, 1120 | 0.0028875037, 1121 | 0.012940894, 1122 | 0.040869843, 1123 | 0.04748398, 1124 | -0.33108455, 1125 | ], 1126 | [ 1127 | 0.0063174255, 1128 | 0.001721699, 1129 | 0.09400159, 1130 | 0.0034670753, 1131 | 0.038218185, 1132 | 0.009424488, 1133 | 0.0042346967, 1134 | 0.08090791, 1135 | -0.29697907, 1136 | 0.0044518122, 1137 | 0.054234188, 1138 | ], 1139 | [ 1140 | 0.14326698, 1141 | 0.002624026, 1142 | 0.0032049934, 1143 | 0.01176007, 1144 | 0.045363605, 1145 | 0.0043262867, 1146 | 0.039044812, 1147 | 0.017543964, 1148 | 0.0037236712, 1149 | -0.3119051, 1150 | 0.04104668, 1151 | ], 1152 | ], 1153 | dtype=np.float32, 1154 | ), 1155 | rtol=1e-5, 1156 | atol=1e-5, 1157 | ) 1158 | 1159 | 1160 | def submit_nn_softmax_loss(): 1161 | mugrade.submit(softmax_loss_forward(4, 9)) 1162 | mugrade.submit(softmax_loss_forward(2, 7)) 1163 | mugrade.submit(softmax_loss_backward(4, 9)) 1164 | mugrade.submit(softmax_loss_backward(2, 7)) 1165 | 1166 | 1167 | def 
test_nn_layernorm_forward_1(): 1168 | np.testing.assert_allclose( 1169 | layernorm_forward((3, 3), 3), 1170 | np.array( 1171 | [ 1172 | [-0.06525002, -1.1908097, 1.2560595], 1173 | [1.3919864, -0.47999576, -0.911992], 1174 | [1.3628436, -1.0085043, -0.3543393], 1175 | ], 1176 | dtype=np.float32, 1177 | ), 1178 | rtol=1e-5, 1179 | atol=1e-5, 1180 | ) 1181 | 1182 | 1183 | def test_nn_layernorm_forward_2(): 1184 | np.testing.assert_allclose( 1185 | layernorm_forward((2, 10), 10), 1186 | np.array( 1187 | [ 1188 | [ 1189 | 0.8297899, 1190 | 1.6147263, 1191 | -1.525019, 1192 | -0.4036814, 1193 | 0.306499, 1194 | 0.08223152, 1195 | 0.6429003, 1196 | -1.3381294, 1197 | 0.8671678, 1198 | -1.0764838, 1199 | ], 1200 | [ 1201 | -1.8211555, 1202 | 0.39098236, 1203 | -0.5864739, 1204 | 0.853988, 1205 | -0.3806936, 1206 | 1.2655486, 1207 | 0.33953735, 1208 | 1.522774, 1209 | -0.8951442, 1210 | -0.68936396, 1211 | ], 1212 | ], 1213 | dtype=np.float32, 1214 | ), 1215 | rtol=1e-5, 1216 | atol=1e-5, 1217 | ) 1218 | 1219 | 1220 | def test_nn_layernorm_forward_3(): 1221 | np.testing.assert_allclose( 1222 | layernorm_forward((1, 5), 5), 1223 | np.array( 1224 | [[-1.0435007, -0.8478443, 0.7500162, -0.42392215, 1.565251]], 1225 | dtype=np.float32, 1226 | ), 1227 | rtol=1e-5, 1228 | atol=1e-5, 1229 | ) 1230 | 1231 | 1232 | def test_nn_layernorm_backward_1(): 1233 | np.testing.assert_allclose( 1234 | layernorm_backward((3, 3), 3), 1235 | np.array( 1236 | [ 1237 | [-2.8312206e-06, -6.6757202e-05, 6.9618225e-05], 1238 | [1.9950867e-03, -6.8092346e-04, -1.3141632e-03], 1239 | [4.4703484e-05, -3.2544136e-05, -1.1801720e-05], 1240 | ], 1241 | dtype=np.float32, 1242 | ), 1243 | rtol=1e-5, 1244 | atol=1e-5, 1245 | ) 1246 | 1247 | 1248 | def test_nn_layernorm_backward_2(): 1249 | np.testing.assert_allclose( 1250 | layernorm_backward((2, 10), 10), 1251 | np.array( 1252 | [ 1253 | [ 1254 | -2.301574, 1255 | 4.353944, 1256 | -1.9396116, 1257 | 2.4330146, 1258 | -1.1070801, 1259 | 0.01571643, 1260 | -2.209449, 1261 | 0.49513134, 1262 | -2.261348, 1263 | 2.5212562, 1264 | ], 1265 | [ 1266 | -9.042961, 1267 | -2.6184766, 1268 | 4.5592957, 1269 | -4.2109876, 1270 | 3.4247458, 1271 | -1.9075732, 1272 | -2.2689414, 1273 | 2.110825, 1274 | 5.044025, 1275 | 4.910048, 1276 | ], 1277 | ], 1278 | dtype=np.float32, 1279 | ), 1280 | rtol=1e-5, 1281 | atol=1e-5, 1282 | ) 1283 | 1284 | 1285 | def test_nn_layernorm_backward_3(): 1286 | np.testing.assert_allclose( 1287 | layernorm_backward((1, 5), 5), 1288 | np.array( 1289 | [[0.150192, 0.702322, -3.321343, 0.31219, 2.156639]], dtype=np.float32 1290 | ), 1291 | rtol=1e-5, 1292 | atol=1e-5, 1293 | ) 1294 | 1295 | 1296 | def test_nn_layernorm_backward_4(): 1297 | np.testing.assert_allclose( 1298 | layernorm_backward((5, 1), 1), 1299 | np.array([[0], [0], [0], [0], [0]], dtype=np.float32), 1300 | rtol=1e-5, 1301 | atol=1e-5, 1302 | ) 1303 | 1304 | 1305 | def submit_nn_layernorm(): 1306 | mugrade.submit(layernorm_forward((1, 1), 1)) 1307 | mugrade.submit(layernorm_forward((10, 10), 10)) 1308 | mugrade.submit(layernorm_forward((10, 30), 30)) 1309 | mugrade.submit(layernorm_forward((1, 3), 3)) 1310 | mugrade.submit(layernorm_backward((1, 1), 1)) 1311 | mugrade.submit(layernorm_backward((10, 10), 10)) 1312 | mugrade.submit(layernorm_backward((10, 30), 30)) 1313 | mugrade.submit(layernorm_backward((1, 3), 3)) 1314 | 1315 | 1316 | def test_nn_batchnorm_check_model_eval_switches_training_flag_1(): 1317 | np.testing.assert_allclose( 1318 | check_training_mode(), 1319 | np.array( 1320 | [ 1321 | 0, 1322 
| 0, 1323 | 0, 1324 | 0, 1325 | 0, 1326 | 0, 1327 | 0, 1328 | 0, 1329 | 0, 1330 | 1, 1331 | 1, 1332 | 1, 1333 | 1, 1334 | 1, 1335 | 1, 1336 | 1, 1337 | 1, 1338 | 1, 1339 | 0, 1340 | 0, 1341 | 0, 1342 | 0, 1343 | 0, 1344 | 0, 1345 | 0, 1346 | 0, 1347 | 0, 1348 | ] 1349 | ), 1350 | rtol=1e-5, 1351 | atol=1e-5, 1352 | ) 1353 | 1354 | 1355 | def test_nn_batchnorm_forward_1(): 1356 | np.testing.assert_allclose( 1357 | batchnorm_forward(4, 4), 1358 | np.array( 1359 | [ 1360 | [7.8712696e-01, -3.1676728e-01, -6.4885163e-01, 2.0828949e-01], 1361 | [-7.9508079e-03, 1.0092355e00, 1.6221288e00, 8.5209310e-01], 1362 | [8.5073310e-01, -1.4954363e00, -9.6686421e-08, -1.6852506e00], 1363 | [-1.6299094e00, 8.0296844e-01, -9.7327745e-01, 6.2486827e-01], 1364 | ], 1365 | dtype=np.float32, 1366 | ), 1367 | rtol=1e-5, 1368 | atol=1e-5, 1369 | ) 1370 | 1371 | 1372 | def test_nn_batchnorm_forward_affine_1(): 1373 | np.testing.assert_allclose( 1374 | batchnorm_forward(4, 4, affine=True), 1375 | np.array( 1376 | [ 1377 | [7.49529, 0.047213316, 2.690084, 5.5227957], 1378 | [4.116209, 3.8263211, 7.79979, 7.293256], 1379 | [7.765616, -3.3119934, 4.15, 0.31556034], 1380 | [-2.7771149, 3.23846, 1.9601259, 6.6683874], 1381 | ], 1382 | dtype=np.float32, 1383 | ), 1384 | rtol=1e-5, 1385 | atol=1e-5, 1386 | ) 1387 | 1388 | 1389 | def test_nn_batchnorm_backward_1(): 1390 | np.testing.assert_allclose( 1391 | batchnorm_backward(5, 4), 1392 | np.array( 1393 | [ 1394 | [2.1338463e-04, 5.2094460e-06, -2.8359889e-05, -4.4368207e-06], 1395 | [-3.8480759e-04, -4.0292739e-06, 1.8370152e-05, -1.1172146e-05], 1396 | [2.5629997e-04, -1.1003018e-05, -9.0479853e-06, 5.5171549e-06], 1397 | [-4.2676926e-04, 3.4213067e-06, 1.3601780e-05, 1.0166317e-05], 1398 | [3.4189224e-04, 6.4015389e-06, 5.4359434e-06, -7.4505806e-08], 1399 | ], 1400 | dtype=np.float32, 1401 | ), 1402 | rtol=1e-5, 1403 | atol=1e-5, 1404 | ) 1405 | 1406 | 1407 | def test_nn_batchnorm_backward_affine_1(): 1408 | np.testing.assert_allclose( 1409 | batchnorm_backward(5, 4, affine=True), 1410 | np.array( 1411 | [ 1412 | [3.8604736e-03, 4.2676926e-05, -1.4114380e-04, -3.2424927e-05], 1413 | [-6.9427490e-03, -3.3140182e-05, 9.1552734e-05, -8.5830688e-05], 1414 | [4.6386719e-03, -8.9883804e-05, -4.5776367e-05, 4.3869019e-05], 1415 | [-7.7133179e-03, 2.7418137e-05, 6.6757202e-05, 7.4386597e-05], 1416 | [6.1874390e-03, 5.2213669e-05, 2.8610229e-05, -1.9073486e-06], 1417 | ], 1418 | dtype=np.float32, 1419 | ), 1420 | rtol=1e-5, 1421 | atol=1e-4, 1422 | ) 1423 | 1424 | 1425 | def test_nn_batchnorm_running_mean_1(): 1426 | np.testing.assert_allclose( 1427 | batchnorm_running_mean(4, 3), 1428 | np.array([2.020656, 1.69489, 1.498846], dtype=np.float32), 1429 | rtol=1e-5, 1430 | atol=1e-5, 1431 | ) 1432 | 1433 | 1434 | def test_nn_batchnorm_running_var_1(): 1435 | np.testing.assert_allclose( 1436 | batchnorm_running_var(4, 3), 1437 | np.array([1.412775, 1.386191, 1.096604], dtype=np.float32), 1438 | rtol=1e-5, 1439 | atol=1e-5, 1440 | ) 1441 | 1442 | 1443 | def test_nn_batchnorm_running_grad_1(): 1444 | np.testing.assert_allclose( 1445 | batchnorm_running_grad(4, 3), 1446 | np.array( 1447 | [ 1448 | [8.7022781e-06, -4.9751252e-06, 9.5367432e-05], 1449 | [6.5565109e-06, -7.2401017e-06, -2.3484230e-05], 1450 | [-3.5762787e-06, -4.5262277e-07, 1.6093254e-05], 1451 | [-1.1682510e-05, 1.2667850e-05, -8.7976456e-05], 1452 | ], 1453 | dtype=np.float32, 1454 | ), 1455 | rtol=1e-5, 1456 | atol=1e-5, 1457 | ) 1458 | 1459 | 1460 | def submit_nn_batchnorm(): 1461 | 
mugrade.submit(batchnorm_forward(2, 3)) 1462 | mugrade.submit(batchnorm_forward(3, 4, affine=True)) 1463 | mugrade.submit(batchnorm_backward(5, 3)) 1464 | 1465 | # todo(Zico) : these need to be added to mugrade 1466 | mugrade.submit(batchnorm_backward(4, 2, affine=True)) 1467 | mugrade.submit(batchnorm_running_mean(3, 3)) 1468 | mugrade.submit(batchnorm_running_mean(3, 3)) 1469 | mugrade.submit(batchnorm_running_var(4, 3)) 1470 | mugrade.submit(batchnorm_running_var(4, 4)) 1471 | mugrade.submit(batchnorm_running_grad(4, 3)) 1472 | 1473 | 1474 | def test_nn_dropout_forward_1(): 1475 | np.testing.assert_allclose( 1476 | dropout_forward((2, 3), prob=0.45), 1477 | np.array([[6.818182, 0.0, 0.0], [0.18181819, 0.0, 6.090909]], dtype=np.float32), 1478 | rtol=1e-5, 1479 | atol=1e-5, 1480 | ) 1481 | 1482 | 1483 | def test_nn_dropout_backward_1(): 1484 | np.testing.assert_allclose( 1485 | dropout_backward((2, 3), prob=0.26), 1486 | np.array( 1487 | [[1.3513514, 0.0, 0.0], [1.3513514, 0.0, 1.3513514]], dtype=np.float32 1488 | ), 1489 | rtol=1e-5, 1490 | atol=1e-5, 1491 | ) 1492 | 1493 | 1494 | def submit_nn_dropout(): 1495 | mugrade.submit(dropout_forward((3, 3), prob=0.4)) 1496 | mugrade.submit(dropout_backward((3, 3), prob=0.15)) 1497 | 1498 | 1499 | def test_nn_residual_forward_1(): 1500 | np.testing.assert_allclose( 1501 | residual_forward(), 1502 | np.array( 1503 | [ 1504 | [0.4660964, 3.8619597, -3.637068, 3.7489638, 2.4931884], 1505 | [-3.3769124, 2.5409935, -2.7110925, 4.9782896, -3.005401], 1506 | [-3.0222898, 3.796795, -2.101042, 6.785948, 0.9347453], 1507 | [-2.2496533, 3.635599, -2.1818666, 5.6361046, 0.9748006], 1508 | [-0.03458184, 0.0823682, -0.06686163, 1.9169499, 1.2638961], 1509 | ], 1510 | dtype=np.float32, 1511 | ), 1512 | rtol=1e-5, 1513 | atol=1e-5, 1514 | ) 1515 | 1516 | 1517 | def test_nn_residual_backward_1(): 1518 | np.testing.assert_allclose( 1519 | residual_backward(), 1520 | np.array( 1521 | [ 1522 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1523 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1524 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1525 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1526 | [0.24244219, -0.19571924, -0.08556509, 0.9191598, 1.6787351], 1527 | ], 1528 | dtype=np.float32, 1529 | ), 1530 | rtol=1e-5, 1531 | atol=1e-5, 1532 | ) 1533 | 1534 | 1535 | def submit_nn_residual(): 1536 | mugrade.submit(residual_forward(shape=(3, 4))) 1537 | mugrade.submit(residual_backward(shape=(3, 4))) 1538 | 1539 | 1540 | def test_nn_flatten_forward_1(): 1541 | np.testing.assert_allclose( 1542 | flatten_forward(3, 3), 1543 | np.array( 1544 | [[2.1, 0.95, 3.45], [3.1, 2.45, 2.3], [3.3, 0.4, 1.2]], dtype=np.float32 1545 | ), 1546 | rtol=1e-5, 1547 | atol=1e-5, 1548 | ) 1549 | 1550 | 1551 | def test_nn_flatten_forward_2(): 1552 | np.testing.assert_allclose( 1553 | flatten_forward(3, 3, 3), 1554 | np.array( 1555 | [ 1556 | [3.35, 3.25, 2.8, 2.3, 3.75, 3.75, 3.35, 2.45, 2.1], 1557 | [1.65, 0.15, 4.15, 2.8, 2.1, 0.5, 2.6, 2.25, 3.25], 1558 | [2.4, 4.55, 4.75, 0.75, 3.85, 0.05, 4.7, 1.7, 4.7], 1559 | ], 1560 | dtype=np.float32, 1561 | ), 1562 | rtol=1e-5, 1563 | atol=1e-5, 1564 | ) 1565 | 1566 | 1567 | def test_nn_flatten_forward_3(): 1568 | np.testing.assert_allclose( 1569 | flatten_forward(1, 2, 3, 4), 1570 | np.array( 1571 | [ 1572 | [ 1573 | 4.2, 1574 | 4.5, 1575 | 1.9, 1576 | 4.85, 1577 | 4.85, 1578 | 3.3, 1579 | 2.7, 1580 | 3.05, 1581 | 0.3, 1582 | 3.65, 1583 | 3.1, 1584 | 0.1, 1585 | 4.5, 1586 | 
4.05, 1587 | 3.05, 1588 | 0.15, 1589 | 3.0, 1590 | 1.65, 1591 | 4.85, 1592 | 1.3, 1593 | 3.95, 1594 | 2.9, 1595 | 1.2, 1596 | 1.0, 1597 | ] 1598 | ], 1599 | dtype=np.float32, 1600 | ), 1601 | rtol=1e-5, 1602 | atol=1e-5, 1603 | ) 1604 | 1605 | 1606 | def test_nn_flatten_forward_4(): 1607 | np.testing.assert_allclose( 1608 | flatten_forward(3, 3, 4, 4), 1609 | np.array( 1610 | [ 1611 | [ 1612 | 0.95, 1613 | 1.1, 1614 | 1.0, 1615 | 1.0, 1616 | 4.9, 1617 | 0.25, 1618 | 1.6, 1619 | 0.35, 1620 | 1.5, 1621 | 3.4, 1622 | 1.75, 1623 | 3.4, 1624 | 4.8, 1625 | 1.4, 1626 | 2.35, 1627 | 3.2, 1628 | 1.65, 1629 | 1.9, 1630 | 3.05, 1631 | 0.35, 1632 | 3.15, 1633 | 4.05, 1634 | 3.3, 1635 | 2.2, 1636 | 2.5, 1637 | 1.5, 1638 | 3.25, 1639 | 0.65, 1640 | 3.05, 1641 | 0.75, 1642 | 3.25, 1643 | 2.55, 1644 | 0.55, 1645 | 0.25, 1646 | 3.65, 1647 | 3.4, 1648 | 0.05, 1649 | 1.4, 1650 | 0.75, 1651 | 1.55, 1652 | 4.45, 1653 | 0.2, 1654 | 3.35, 1655 | 2.45, 1656 | 3.45, 1657 | 4.75, 1658 | 2.45, 1659 | 4.3, 1660 | ], 1661 | [ 1662 | 1.0, 1663 | 0.2, 1664 | 0.4, 1665 | 0.7, 1666 | 4.9, 1667 | 4.2, 1668 | 2.55, 1669 | 3.15, 1670 | 1.2, 1671 | 3.8, 1672 | 1.35, 1673 | 1.85, 1674 | 3.15, 1675 | 2.7, 1676 | 1.5, 1677 | 1.35, 1678 | 4.85, 1679 | 4.2, 1680 | 1.5, 1681 | 1.75, 1682 | 0.8, 1683 | 4.3, 1684 | 4.2, 1685 | 4.85, 1686 | 0.0, 1687 | 3.75, 1688 | 0.9, 1689 | 0.0, 1690 | 3.35, 1691 | 1.05, 1692 | 2.2, 1693 | 0.75, 1694 | 3.6, 1695 | 2.0, 1696 | 1.2, 1697 | 1.9, 1698 | 3.45, 1699 | 1.6, 1700 | 3.95, 1701 | 4.45, 1702 | 4.55, 1703 | 4.75, 1704 | 3.7, 1705 | 0.3, 1706 | 2.45, 1707 | 3.75, 1708 | 0.9, 1709 | 2.2, 1710 | ], 1711 | [ 1712 | 4.95, 1713 | 1.05, 1714 | 2.4, 1715 | 4.05, 1716 | 3.75, 1717 | 1.95, 1718 | 0.65, 1719 | 4.9, 1720 | 4.3, 1721 | 2.5, 1722 | 1.9, 1723 | 1.75, 1724 | 2.05, 1725 | 3.95, 1726 | 0.8, 1727 | 0.0, 1728 | 0.8, 1729 | 3.45, 1730 | 1.55, 1731 | 0.3, 1732 | 1.5, 1733 | 2.9, 1734 | 2.15, 1735 | 2.15, 1736 | 3.3, 1737 | 3.2, 1738 | 4.3, 1739 | 3.7, 1740 | 0.4, 1741 | 1.7, 1742 | 0.35, 1743 | 1.9, 1744 | 1.8, 1745 | 4.3, 1746 | 4.7, 1747 | 4.05, 1748 | 3.65, 1749 | 1.1, 1750 | 1.0, 1751 | 2.7, 1752 | 3.95, 1753 | 2.3, 1754 | 2.6, 1755 | 3.5, 1756 | 0.75, 1757 | 4.3, 1758 | 3.0, 1759 | 3.85, 1760 | ], 1761 | ], 1762 | dtype=np.float32, 1763 | ), 1764 | rtol=1e-5, 1765 | atol=1e-5, 1766 | ) 1767 | 1768 | 1769 | def test_nn_flatten_backward_1(): 1770 | np.testing.assert_allclose( 1771 | flatten_backward(3, 3), 1772 | np.array([[4.2, 1.9, 6.9], [6.2, 4.9, 4.6], [6.6, 0.8, 2.4]], dtype=np.float32), 1773 | rtol=1e-5, 1774 | atol=1e-5, 1775 | ) 1776 | 1777 | 1778 | def test_nn_flatten_backward_2(): 1779 | np.testing.assert_allclose( 1780 | flatten_backward(3, 3, 3), 1781 | np.array( 1782 | [ 1783 | [[6.7, 6.5, 5.6], [4.6, 7.5, 7.5], [6.7, 4.9, 4.2]], 1784 | [[3.3, 0.3, 8.3], [5.6, 4.2, 1.0], [5.2, 4.5, 6.5]], 1785 | [[4.8, 9.1, 9.5], [1.5, 7.7, 0.1], [9.4, 3.4, 9.4]], 1786 | ], 1787 | dtype=np.float32, 1788 | ), 1789 | rtol=1e-5, 1790 | atol=1e-5, 1791 | ) 1792 | 1793 | 1794 | def test_nn_flatten_backward_3(): 1795 | np.testing.assert_allclose( 1796 | flatten_backward(2, 2, 2, 2), 1797 | np.array( 1798 | [ 1799 | [[[6.8, 3.8], [5.4, 5.1]], [[8.5, 4.8], [3.1, 1.0]]], 1800 | [[[9.3, 0.8], [3.4, 1.6]], [[9.4, 3.6], [6.6, 7.0]]], 1801 | ], 1802 | dtype=np.float32, 1803 | ), 1804 | rtol=1e-5, 1805 | atol=1e-5, 1806 | ) 1807 | 1808 | 1809 | def test_nn_flatten_backward_4(): 1810 | np.testing.assert_allclose( 1811 | flatten_backward(1, 2, 3, 4), 1812 | np.array( 1813 | [ 1814 | [ 1815 | [[8.4, 9.0, 3.8, 9.7], 
[9.7, 6.6, 5.4, 6.1], [0.6, 7.3, 6.2, 0.2]], 1816 | [[9.0, 8.1, 6.1, 0.3], [6.0, 3.3, 9.7, 2.6], [7.9, 5.8, 2.4, 2.0]], 1817 | ] 1818 | ], 1819 | dtype=np.float32, 1820 | ), 1821 | rtol=1e-5, 1822 | atol=1e-5, 1823 | ) 1824 | 1825 | 1826 | def test_nn_flatten_backward_5(): 1827 | np.testing.assert_allclose( 1828 | flatten_backward(2, 2, 4, 3), 1829 | np.array( 1830 | [ 1831 | [ 1832 | [ 1833 | [9.8, 7.1, 5.4], 1834 | [4.0, 6.2, 5.7], 1835 | [7.2, 2.0, 2.4], 1836 | [8.9, 4.9, 3.3], 1837 | ], 1838 | [ 1839 | [9.0, 9.8, 5.9], 1840 | [7.1, 2.7, 9.6], 1841 | [8.5, 9.3, 5.8], 1842 | [3.1, 9.0, 6.7], 1843 | ], 1844 | ], 1845 | [ 1846 | [ 1847 | [7.4, 8.6, 6.9], 1848 | [8.2, 5.3, 8.7], 1849 | [8.8, 8.7, 4.0], 1850 | [3.9, 1.8, 2.7], 1851 | ], 1852 | [ 1853 | [5.7, 6.2, 0.0], 1854 | [6.0, 0.0, 0.3], 1855 | [2.0, 0.1, 2.7], 1856 | [2.1, 0.1, 6.7], 1857 | ], 1858 | ], 1859 | ], 1860 | dtype=np.float32, 1861 | ), 1862 | rtol=1e-5, 1863 | atol=1e-5, 1864 | ) 1865 | 1866 | 1867 | def submit_nn_flatten(): 1868 | mugrade.submit(flatten_forward(1, 2, 2)) 1869 | mugrade.submit(flatten_forward(2, 2, 2)) 1870 | mugrade.submit(flatten_forward(2, 3, 4, 2, 1, 2)) 1871 | mugrade.submit(flatten_forward(2, 3)) 1872 | mugrade.submit(flatten_backward(1, 2, 2)) 1873 | mugrade.submit(flatten_backward(2, 2, 2)) 1874 | mugrade.submit(flatten_backward(2, 3, 4, 2, 1, 2)) 1875 | mugrade.submit(flatten_backward(2, 3, 4, 4)) 1876 | 1877 | 1878 | def test_optim_sgd_vanilla_1(): 1879 | np.testing.assert_allclose( 1880 | learn_model_1d( 1881 | 64, 1882 | 16, 1883 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 1884 | ndl.optim.SGD, 1885 | lr=0.01, 1886 | momentum=0.0, 1887 | ), 1888 | np.array(3.207009), 1889 | rtol=1e-5, 1890 | atol=1e-5, 1891 | ) 1892 | 1893 | 1894 | def test_optim_sgd_momentum_1(): 1895 | np.testing.assert_allclose( 1896 | learn_model_1d( 1897 | 64, 1898 | 16, 1899 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 1900 | ndl.optim.SGD, 1901 | lr=0.01, 1902 | momentum=0.9, 1903 | ), 1904 | np.array(3.311805), 1905 | rtol=1e-5, 1906 | atol=1e-5, 1907 | ) 1908 | 1909 | 1910 | def test_optim_sgd_weight_decay_1(): 1911 | np.testing.assert_allclose( 1912 | learn_model_1d( 1913 | 64, 1914 | 16, 1915 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 1916 | ndl.optim.SGD, 1917 | lr=0.01, 1918 | momentum=0.0, 1919 | weight_decay=0.01, 1920 | ), 1921 | np.array(3.202637), 1922 | rtol=1e-5, 1923 | atol=1e-5, 1924 | ) 1925 | 1926 | 1927 | def test_optim_sgd_momentum_weight_decay_1(): 1928 | np.testing.assert_allclose( 1929 | learn_model_1d( 1930 | 64, 1931 | 16, 1932 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 1933 | ndl.optim.SGD, 1934 | lr=0.01, 1935 | momentum=0.9, 1936 | weight_decay=0.01, 1937 | ), 1938 | np.array(3.306993), 1939 | rtol=1e-5, 1940 | atol=1e-5, 1941 | ) 1942 | 1943 | 1944 | def test_optim_sgd_layernorm_residual_1(): 1945 | nn.LayerNorm1d(8) 1946 | np.testing.assert_allclose( 1947 | learn_model_1d( 1948 | 64, 1949 | 16, 1950 | lambda z: nn.Sequential( 1951 | nn.Linear(64, 8), 1952 | nn.ReLU(), 1953 | nn.Residual(nn.Linear(8, 8)), 1954 | nn.Linear(8, 16), 1955 | ), 1956 | ndl.optim.SGD, 1957 | epochs=3, 1958 | lr=0.01, 1959 | weight_decay=0.001, 1960 | ), 1961 | np.array(2.852236), 1962 | rtol=1e-5, 1963 | atol=1e-5, 1964 | ) 1965 | 1966 | 1967 | # We're checking that you have not allocated too many tensors; 1968 | # if this fails, make sure you're using .detach()/.data whenever possible. 
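# The usual way to keep this count down is to perform optimizer updates on detached
# values, e.g. something along the lines of `param.data = param.data - grad.data * lr`
# inside `step()`, so each call manipulates plain values instead of appending new
# nodes to the autograd graph. (A standalone sketch of this pattern appears after the
# end of this file.)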
1969 | def test_optim_sgd_z_memory_check_1(): 1970 | np.testing.assert_allclose( 1971 | global_tensor_count(), np.array(387), rtol=1e-5, atol=1000 1972 | ) 1973 | 1974 | 1975 | def submit_optim_sgd(): 1976 | mugrade.submit( 1977 | learn_model_1d( 1978 | 48, 1979 | 17, 1980 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 17)), 1981 | ndl.optim.SGD, 1982 | lr=0.03, 1983 | momentum=0.0, 1984 | epochs=2, 1985 | ) 1986 | ) 1987 | mugrade.submit( 1988 | learn_model_1d( 1989 | 48, 1990 | 16, 1991 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)), 1992 | ndl.optim.SGD, 1993 | lr=0.01, 1994 | momentum=0.9, 1995 | epochs=2, 1996 | ) 1997 | ) 1998 | mugrade.submit( 1999 | learn_model_1d( 2000 | 48, 2001 | 16, 2002 | lambda z: nn.Sequential( 2003 | nn.Linear(48, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2004 | ), 2005 | ndl.optim.SGD, 2006 | lr=0.01, 2007 | momentum=0.0, 2008 | weight_decay=0.01, 2009 | epochs=2, 2010 | ) 2011 | ) 2012 | mugrade.submit( 2013 | learn_model_1d( 2014 | 54, 2015 | 16, 2016 | lambda z: nn.Sequential(nn.Linear(54, 32), nn.ReLU(), nn.Linear(32, 16)), 2017 | ndl.optim.SGD, 2018 | lr=0.01, 2019 | momentum=0.9, 2020 | weight_decay=0.01, 2021 | epochs=2, 2022 | ) 2023 | ) 2024 | mugrade.submit( 2025 | learn_model_1d( 2026 | 64, 2027 | 4, 2028 | lambda z: nn.Sequential( 2029 | nn.Linear(64, 8), 2030 | nn.ReLU(), 2031 | nn.Residual(nn.Linear(8, 8)), 2032 | nn.Linear(8, 4), 2033 | ), 2034 | ndl.optim.SGD, 2035 | epochs=3, 2036 | lr=0.01, 2037 | weight_decay=0.001, 2038 | ) 2039 | ) 2040 | 2041 | 2042 | def test_optim_adam_1(): 2043 | np.testing.assert_allclose( 2044 | learn_model_1d( 2045 | 64, 2046 | 16, 2047 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 2048 | ndl.optim.Adam, 2049 | lr=0.001, 2050 | ), 2051 | np.array(3.703999), 2052 | rtol=1e-5, 2053 | atol=1e-5, 2054 | ) 2055 | 2056 | 2057 | def test_optim_adam_weight_decay_1(): 2058 | np.testing.assert_allclose( 2059 | learn_model_1d( 2060 | 64, 2061 | 16, 2062 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 2063 | ndl.optim.Adam, 2064 | lr=0.001, 2065 | weight_decay=0.01, 2066 | ), 2067 | np.array(3.705134), 2068 | rtol=1e-5, 2069 | atol=1e-5, 2070 | ) 2071 | 2072 | 2073 | def test_optim_adam_batchnorm_1(): 2074 | np.testing.assert_allclose( 2075 | learn_model_1d( 2076 | 64, 2077 | 16, 2078 | lambda z: nn.Sequential( 2079 | nn.Linear(64, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2080 | ), 2081 | ndl.optim.Adam, 2082 | lr=0.001, 2083 | weight_decay=0.001, 2084 | ), 2085 | np.array(3.296256, dtype=np.float32), 2086 | rtol=1e-5, 2087 | atol=1e-5, 2088 | ) 2089 | 2090 | 2091 | def test_optim_adam_batchnorm_eval_mode_1(): 2092 | np.testing.assert_allclose( 2093 | learn_model_1d_eval( 2094 | 64, 2095 | 16, 2096 | lambda z: nn.Sequential( 2097 | nn.Linear(64, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2098 | ), 2099 | ndl.optim.Adam, 2100 | lr=0.001, 2101 | weight_decay=0.001, 2102 | ), 2103 | np.array(3.192054, dtype=np.float32), 2104 | rtol=1e-5, 2105 | atol=1e-5, 2106 | ) 2107 | 2108 | 2109 | def test_optim_adam_layernorm_1(): 2110 | np.testing.assert_allclose( 2111 | learn_model_1d( 2112 | 64, 2113 | 16, 2114 | lambda z: nn.Sequential( 2115 | nn.Linear(64, 32), nn.ReLU(), nn.LayerNorm1d(32), nn.Linear(32, 16) 2116 | ), 2117 | ndl.optim.Adam, 2118 | lr=0.01, 2119 | weight_decay=0.01, 2120 | ), 2121 | np.array(2.82192, dtype=np.float32), 2122 | rtol=1e-5, 2123 | atol=1e-5, 2124 | ) 2125 | 2126 
| 2127 | def test_optim_adam_weight_decay_bias_correction_1(): 2128 | np.testing.assert_allclose( 2129 | learn_model_1d( 2130 | 64, 2131 | 16, 2132 | lambda z: nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 16)), 2133 | ndl.optim.Adam, 2134 | lr=0.001, 2135 | weight_decay=0.01, 2136 | ), 2137 | np.array(3.705134), 2138 | rtol=1e-5, 2139 | atol=1e-5, 2140 | ) 2141 | 2142 | 2143 | # We're checking that you have not allocated too many tensors; 2144 | # if this fails, make sure you're using .detach()/.data whenever possible. 2145 | def test_optim_adam_z_memory_check_1(): 2146 | np.testing.assert_allclose( 2147 | global_tensor_count(), np.array(1132), rtol=1e-5, atol=1000 2148 | ) 2149 | 2150 | 2151 | def submit_optim_adam(): 2152 | mugrade.submit( 2153 | learn_model_1d( 2154 | 48, 2155 | 16, 2156 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)), 2157 | ndl.optim.Adam, 2158 | lr=0.001, 2159 | epochs=2, 2160 | ) 2161 | ) 2162 | mugrade.submit( 2163 | learn_model_1d( 2164 | 48, 2165 | 16, 2166 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)), 2167 | ndl.optim.Adam, 2168 | lr=0.001, 2169 | weight_decay=0.01, 2170 | epochs=2, 2171 | ) 2172 | ) 2173 | mugrade.submit( 2174 | learn_model_1d( 2175 | 48, 2176 | 16, 2177 | lambda z: nn.Sequential( 2178 | nn.Linear(48, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2179 | ), 2180 | ndl.optim.Adam, 2181 | lr=0.001, 2182 | weight_decay=0.001, 2183 | epochs=3, 2184 | ) 2185 | ) 2186 | mugrade.submit( 2187 | learn_model_1d_eval( 2188 | 48, 2189 | 16, 2190 | lambda z: nn.Sequential( 2191 | nn.Linear(48, 32), nn.ReLU(), nn.BatchNorm1d(32), nn.Linear(32, 16) 2192 | ), 2193 | ndl.optim.Adam, 2194 | lr=0.001, 2195 | weight_decay=0.001, 2196 | epochs=2, 2197 | ) 2198 | ) 2199 | mugrade.submit( 2200 | learn_model_1d( 2201 | 48, 2202 | 16, 2203 | lambda z: nn.Sequential( 2204 | nn.Linear(48, 32), nn.ReLU(), nn.LayerNorm1d(32), nn.Linear(32, 16) 2205 | ), 2206 | ndl.optim.Adam, 2207 | lr=0.01, 2208 | weight_decay=0.01, 2209 | epochs=2, 2210 | ) 2211 | ) 2212 | mugrade.submit( 2213 | learn_model_1d( 2214 | 48, 2215 | 16, 2216 | lambda z: nn.Sequential(nn.Linear(48, 32), nn.ReLU(), nn.Linear(32, 16)), 2217 | ndl.optim.Adam, 2218 | lr=0.001, 2219 | weight_decay=0.01, 2220 | epochs=2, 2221 | ) 2222 | ) 2223 | 2224 | 2225 | def test_mlp_residual_block_num_params_1(): 2226 | np.testing.assert_allclose( 2227 | residual_block_num_params(15, 2, nn.BatchNorm1d), 2228 | np.array(111), 2229 | rtol=1e-5, 2230 | atol=1e-5, 2231 | ) 2232 | 2233 | 2234 | def test_mlp_residual_block_num_params_2(): 2235 | np.testing.assert_allclose( 2236 | residual_block_num_params(784, 100, nn.LayerNorm1d), 2237 | np.array(159452), 2238 | rtol=1e-5, 2239 | atol=1e-5, 2240 | ) 2241 | 2242 | 2243 | def test_mlp_residual_block_forward_1(): 2244 | np.testing.assert_allclose( 2245 | residual_block_forward(15, 10, nn.LayerNorm1d, 0.5), 2246 | np.array( 2247 | [ 2248 | [ 2249 | 0.0, 2250 | 1.358399, 2251 | 0.0, 2252 | 1.384224, 2253 | 0.0, 2254 | 0.0, 2255 | 0.255451, 2256 | 0.077662, 2257 | 0.0, 2258 | 0.939582, 2259 | 0.525591, 2260 | 1.99213, 2261 | 0.0, 2262 | 0.0, 2263 | 1.012827, 2264 | ] 2265 | ], 2266 | dtype=np.float32, 2267 | ), 2268 | rtol=1e-5, 2269 | atol=1e-5, 2270 | ) 2271 | 2272 | 2273 | def test_mlp_resnet_num_params_1(): 2274 | np.testing.assert_allclose( 2275 | mlp_resnet_num_params(150, 100, 5, 10, nn.LayerNorm1d), 2276 | np.array(68360), 2277 | rtol=1e-5, 2278 | atol=1e-5, 2279 | ) 2280 | 2281 | 2282 | def 
test_mlp_resnet_num_params_2(): 2283 | np.testing.assert_allclose( 2284 | mlp_resnet_num_params(10, 100, 1, 100, nn.BatchNorm1d), 2285 | np.array(21650), 2286 | rtol=1e-5, 2287 | atol=1e-5, 2288 | ) 2289 | 2290 | 2291 | def test_mlp_resnet_forward_1(): 2292 | np.testing.assert_allclose( 2293 | mlp_resnet_forward(10, 5, 2, 5, nn.LayerNorm1d, 0.5), 2294 | np.array( 2295 | [ 2296 | [3.046162, 1.44972, -1.921363, 0.021816, -0.433953], 2297 | [3.489114, 1.820994, -2.111306, 0.226388, -1.029428], 2298 | ], 2299 | dtype=np.float32, 2300 | ), 2301 | rtol=1e-5, 2302 | atol=1e-5, 2303 | ) 2304 | 2305 | 2306 | def test_mlp_resnet_forward_2(): 2307 | np.testing.assert_allclose( 2308 | mlp_resnet_forward(15, 25, 5, 14, nn.BatchNorm1d, 0.0), 2309 | np.array( 2310 | [ 2311 | [ 2312 | 0.92448235, 2313 | -2.745743, 2314 | -1.5077105, 2315 | 1.130784, 2316 | -1.2078242, 2317 | -0.09833566, 2318 | -0.69301605, 2319 | 2.8945382, 2320 | 1.259397, 2321 | 0.13866742, 2322 | -2.963875, 2323 | -4.8566914, 2324 | 1.7062538, 2325 | -4.846424, 2326 | ], 2327 | [ 2328 | 0.6653336, 2329 | -2.4708004, 2330 | 2.0572243, 2331 | -1.0791507, 2332 | 4.3489094, 2333 | 3.1086435, 2334 | 0.0304327, 2335 | -1.9227124, 2336 | -1.416201, 2337 | -7.2151937, 2338 | -1.4858506, 2339 | 7.1039696, 2340 | -2.1589825, 2341 | -0.7593413, 2342 | ], 2343 | ], 2344 | dtype=np.float32, 2345 | ), 2346 | rtol=1e-5, 2347 | atol=1e-5, 2348 | ) 2349 | 2350 | 2351 | def test_mlp_train_epoch_1(): 2352 | np.testing.assert_allclose( 2353 | train_epoch_1(5, 250, ndl.optim.Adam, lr=0.01, weight_decay=0.1), 2354 | np.array([0.675267, 1.84043]), 2355 | rtol=0.0001, 2356 | atol=0.0001, 2357 | ) 2358 | 2359 | 2360 | def test_mlp_eval_epoch_1(): 2361 | np.testing.assert_allclose( 2362 | eval_epoch_1(10, 150), np.array([0.9164, 4.137814]), rtol=1e-5, atol=1e-5 2363 | ) 2364 | 2365 | 2366 | def test_mlp_train_mnist_1(): 2367 | np.testing.assert_allclose( 2368 | train_mnist_1(250, 2, ndl.optim.SGD, 0.001, 0.01, 100), 2369 | np.array([0.4875, 1.462595, 0.3245, 1.049429]), 2370 | rtol=0.001, 2371 | atol=0.001, 2372 | ) 2373 | 2374 | 2375 | def submit_mlp_resnet(): 2376 | mugrade.submit(residual_block_num_params(17, 13, nn.BatchNorm1d)) 2377 | mugrade.submit(residual_block_num_params(785, 101, nn.LayerNorm1d)) 2378 | mugrade.submit(residual_block_forward(15, 5, nn.LayerNorm1d, 0.3)) 2379 | mugrade.submit(mlp_resnet_num_params(75, 75, 3, 3, nn.LayerNorm1d)) 2380 | mugrade.submit(mlp_resnet_num_params(15, 10, 10, 5, nn.BatchNorm1d)) 2381 | mugrade.submit(mlp_resnet_forward(12, 7, 1, 6, nn.LayerNorm1d, 0.8)) 2382 | mugrade.submit(mlp_resnet_forward(15, 3, 2, 15, nn.BatchNorm1d, 0.3)) 2383 | mugrade.submit(train_epoch_1(7, 256, ndl.optim.Adam, lr=0.01, weight_decay=0.01)) 2384 | mugrade.submit(eval_epoch_1(12, 154)) 2385 | mugrade.submit(train_mnist_1(550, 1, ndl.optim.SGD, 0.01, 0.01, 7)) 2386 | --------------------------------------------------------------------------------
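
The two *_z_memory_check tests above only pass if optimizer updates stop extending the
autograd graph, which is what the .detach()/.data comments in the file are pointing at.
The sketch below illustrates that pattern for SGD. It assumes the Tensor.data /
Tensor.detach() API from Homework 1's autograd.py and uses one common form of momentum
(u <- momentum * u + (1 - momentum) * grad), so treat it as an illustration of the
pattern rather than the graded implementation.

    def sgd_step_sketch(params, u, lr=0.01, momentum=0.0, weight_decay=0.0):
        # One SGD update performed entirely on detached values (.data), so the call
        # does not add nodes to the computational graph and the global tensor count
        # stays roughly constant across steps.
        for i, param in enumerate(params):
            if param.grad is None:
                continue
            grad = param.grad.data + param.data * weight_decay   # L2 weight decay
            if i in u:
                u[i] = u[i] * momentum + grad * (1 - momentum)
            else:
                u[i] = grad * (1 - momentum)
            param.data = param.data - u[i] * lr

Here u is a dict of per-parameter momentum buffers owned by the optimizer; because the
buffers are themselves built from detached values, no references to old graph nodes are
carried between iterations.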
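
test_nn_batchnorm_check_model_eval_switches_training_flag_1 together with the
running-mean/var tests pins down the mode-dependent behaviour of BatchNorm1d: batch
statistics (plus a momentum-weighted update of running_mean / running_var) while the
module is in training mode, and the stored running statistics after model.eval(). The
NumPy fragment below is only meant to make that contract concrete; the momentum of 0.1,
the eps of 1e-5, and the use of the biased batch variance are the usual conventions and
are assumptions here, not values asserted by this test file.

    import numpy as np

    def batchnorm1d_reference(x, weight, bias, running_mean, running_var,
                              training, momentum=0.1, eps=1e-5):
        # x: (batch, features); weight, bias, running_*: (features,)
        if training:
            mean = x.mean(axis=0)
            var = x.var(axis=0)                    # biased variance over the batch
            # momentum-weighted in-place update of the running statistics
            running_mean[:] = (1 - momentum) * running_mean + momentum * mean
            running_var[:] = (1 - momentum) * running_var + momentum * var
        else:
            mean, var = running_mean, running_var  # eval mode: use stored statistics
        return weight * (x - mean) / np.sqrt(var + eps) + bias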
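
Finally, the MLP-ResNet tests constrain the architecture only through parameter counts
and forward values. As a reading aid, here is a minimal sketch of one structure that is
consistent with the parameter counts (for instance, residual_block_num_params(15, 2,
nn.BatchNorm1d) == 111 matches two Linear layers plus two norm layers inside the
residual branch, and the MLPResNet counts match blocks built with hidden_dim // 2).
Module names follow python/needle/nn/nn_basic.py; the exact ordering of Dropout and the
norm layers is an assumption to be checked against figures/residualblock.png and
figures/mlp_resnet.png, so do not read this as the reference solution.

    import needle.nn as nn

    def residual_block_sketch(dim, hidden_dim, norm=nn.BatchNorm1d, drop_prob=0.1):
        # Linear -> norm -> ReLU -> Dropout -> Linear -> norm inside a Residual
        # wrapper, followed by a ReLU applied to the sum.
        return nn.Sequential(
            nn.Residual(
                nn.Sequential(
                    nn.Linear(dim, hidden_dim),
                    norm(hidden_dim),
                    nn.ReLU(),
                    nn.Dropout(drop_prob),
                    nn.Linear(hidden_dim, dim),
                    norm(dim),
                )
            ),
            nn.ReLU(),
        )

    def mlp_resnet_sketch(dim, hidden_dim=100, num_blocks=3, num_classes=10,
                          norm=nn.BatchNorm1d, drop_prob=0.1):
        # Linear/ReLU stem, num_blocks residual blocks that bottleneck to
        # hidden_dim // 2, and a final classification layer.
        return nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.ReLU(),
            *[residual_block_sketch(hidden_dim, hidden_dim // 2, norm, drop_prob)
              for _ in range(num_blocks)],
            nn.Linear(hidden_dim, num_classes),
        )

A quick way to cross-check such a sketch against the counting tests is
sum(np.prod(p.shape) for p in model.parameters()) (with numpy imported as np), using
the parameters() method provided by nn.Module.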