├── CMakeLists.txt ├── Makefile ├── README.md ├── ResNet9.png ├── apps ├── models.py └── simple_ml.py ├── hw4.ipynb ├── python └── needle │ ├── __init__.py │ ├── autograd.py │ ├── backend_ndarray │ ├── __init__.py │ ├── ndarray.py │ └── ndarray_backend_numpy.py │ ├── backend_numpy.py │ ├── backend_selection.py │ ├── data │ ├── __init__.py │ ├── data_basic.py │ ├── data_transforms.py │ └── datasets │ │ ├── __init__.py │ │ ├── cifar10_dataset.py │ │ ├── mnist_dataset.py │ │ ├── ndarray_dataset.py │ │ └── ptb_dataset.py │ ├── init │ ├── __init__.py │ ├── init_basic.py │ └── init_initializers.py │ ├── nn │ ├── __init__.py │ ├── nn_basic.py │ ├── nn_conv.py │ └── nn_sequence.py │ ├── ops │ ├── __init__.py │ ├── ops_logarithmic.py │ ├── ops_mathematic.py │ └── ops_tuple.py │ └── optim.py ├── src ├── ndarray_backend_cpu.cc └── ndarray_backend_cuda.cu └── tests └── hw4 ├── test_cifar_ptb_data.py ├── test_conv.py ├── test_nd_backend.py └── test_sequence_models.py /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(needle C CXX) 3 | cmake_policy(SET CMP0146 OLD) 4 | 5 | # find correct version of Python 6 | execute_process(COMMAND python3-config --prefix 7 | OUTPUT_VARIABLE Python_ROOT_DIR) 8 | find_package(Python COMPONENTS Development Interpreter REQUIRED) 9 | include_directories(${Python_INCLUDE_DIRS}) 10 | 11 | # find pybind 12 | execute_process(COMMAND python3 -m pybind11 --cmakedir 13 | RESULT_VARIABLE __pybind_exit_code 14 | OUTPUT_VARIABLE __pybind_path 15 | OUTPUT_STRIP_TRAILING_WHITESPACE) 16 | find_package(pybind11 PATHS ${__pybind_path}) 17 | 18 | 19 | if(NOT MSVC) 20 | set(CMAKE_CXX_FLAGS "-std=c++11 -O2 -march=native ${CMAKE_CXX_FLAGS}") 21 | set(CMAKE_CUDA_STANDARD 14) 22 | else() 23 | set(CMAKE_CXX_FLAGS "/std:c++11 -O2 -march=native ${CMAKE_CXX_FLAGS}") 24 | set(CMAKE_CUDA_STANDARD 14) 25 | endif() 26 | 27 | include_directories(SYSTEM ${pybind11_INCLUDE_DIRS}) 28 | list(APPEND LINKER_LIBS ${pybind11_LIBRARIES}) 29 | 30 | 31 | ################### 32 | ### CPU BACKEND ### 33 | ################### 34 | add_library(ndarray_backend_cpu MODULE src/ndarray_backend_cpu.cc) 35 | target_link_libraries(ndarray_backend_cpu PUBLIC ${LINKER_LIBS}) 36 | pybind11_extension(ndarray_backend_cpu) 37 | pybind11_strip(ndarray_backend_cpu) 38 | 39 | 40 | # directly output to ffi folder 41 | set_target_properties(ndarray_backend_cpu 42 | PROPERTIES 43 | LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/needle/backend_ndarray 44 | CXX_VISIBILITY_PRESET "hidden" 45 | ) 46 | 47 | if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") 48 | set_property(TARGET ndarray_backend_cpu PROPERTY LINK_OPTIONS -undefined dynamic_lookup) 49 | endif() 50 | 51 | 52 | 53 | #################### 54 | ### CUDA BACKEND ### 55 | #################### 56 | find_package(CUDA) 57 | if(CUDA_FOUND) 58 | message(STATUS "Found cuda, building cuda backend") 59 | 60 | include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) 61 | list(APPEND LINKER_LIBS ${CUDA_CUDART_LIBRARY}) 62 | 63 | # invoke nvidia smi to detect if we really have a GPU 64 | execute_process(COMMAND "nvidia-smi" ERROR_QUIET RESULT_VARIABLE NV_RET) 65 | if(NV_RET EQUAL "0") 66 | CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Auto) 67 | else() 68 | # set to 3.7 the flag of K80 69 | CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.7) 70 | endif() 71 | 72 | # set arch flags properly 73 | CUDA_ADD_LIBRARY(ndarray_backend_cuda MODULE src/ndarray_backend_cuda.cu OPTIONS ${ARCH_FLAGS}) 74 | 75 | 
target_link_libraries(ndarray_backend_cuda ${LINKER_LIBS}) 76 | pybind11_extension(ndarray_backend_cuda) 77 | pybind11_strip(ndarray_backend_cuda) 78 | 79 | # directly output to ffi folder 80 | set_target_properties(ndarray_backend_cuda 81 | PROPERTIES 82 | LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/needle/backend_ndarray 83 | CXX_VISIBILITY_PRESET "hidden" 84 | CUDA_VISIBILITY_PRESET "hidden" 85 | ) 86 | 87 | endif() 88 | 89 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: lib, pybind, clean, format, all 2 | 3 | all: lib 4 | 5 | 6 | lib: 7 | @mkdir -p build 8 | @cd build; cmake .. 9 | @cd build; $(MAKE) 10 | 11 | format: 12 | python3 -m black . 13 | clang-format -i src/*.cc src/*.cu 14 | 15 | clean: 16 | rm -rf build python/needle/backend_ndarray/ndarray_backend*.so 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Homework 4 2 | Public repository and stub/testing code for Homework 4 of 10-714. 3 | -------------------------------------------------------------------------------- /ResNet9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw4/a1fefc20753cb30afeb222fb92351153c522dea5/ResNet9.png -------------------------------------------------------------------------------- /apps/models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | import needle as ndl 4 | import needle.nn as nn 5 | import math 6 | import numpy as np 7 | np.random.seed(0) 8 | 9 | 10 | class ResNet9(ndl.nn.Module): 11 | def __init__(self, device=None, dtype="float32"): 12 | super().__init__() 13 | ### BEGIN YOUR SOLUTION ### 14 | raise NotImplementedError() ### 15 | ### END YOUR SOLUTION 16 | 17 | def forward(self, x): 18 | ### BEGIN YOUR SOLUTION 19 | raise NotImplementedError() 20 | ### END YOUR SOLUTION 21 | 22 | 23 | class LanguageModel(nn.Module): 24 | def __init__(self, embedding_size, output_size, hidden_size, num_layers=1, 25 | seq_model='rnn', seq_len=40, device=None, dtype="float32"): 26 | """ 27 | Consists of an embedding layer, a sequence model (either RNN or LSTM), and a 28 | linear layer. 29 | Parameters: 30 | output_size: Size of dictionary 31 | embedding_size: Size of embeddings 32 | hidden_size: The number of features in the hidden state of LSTM or RNN 33 | seq_model: 'rnn' or 'lstm', whether to use RNN or LSTM 34 | num_layers: Number of layers in RNN or LSTM 35 | """ 36 | super(LanguageModel, self).__init__() 37 | ### BEGIN YOUR SOLUTION 38 | raise NotImplementedError() 39 | ### END YOUR SOLUTION 40 | 41 | def forward(self, x, h=None): 42 | """ 43 | Given sequence (and the previous hidden state if given), returns probabilities of next word 44 | (along with the last hidden state from the sequence model). 
45 | Inputs: 46 | x of shape (seq_len, bs) 47 | h of shape (num_layers, bs, hidden_size) if using RNN, 48 | else h is tuple of (h0, c0), each of shape (num_layers, bs, hidden_size) 49 | Returns (out, h) 50 | out of shape (seq_len*bs, output_size) 51 | h of shape (num_layers, bs, hidden_size) if using RNN, 52 | else h is tuple of (h0, c0), each of shape (num_layers, bs, hidden_size) 53 | """ 54 | ### BEGIN YOUR SOLUTION 55 | raise NotImplementedError() 56 | ### END YOUR SOLUTION 57 | 58 | 59 | if __name__ == "__main__": 60 | model = ResNet9() 61 | x = ndl.ops.randu((1, 32, 32, 3), requires_grad=True) 62 | model(x) 63 | cifar10_train_dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=True) 64 | train_loader = ndl.data.DataLoader(cifar10_train_dataset, 128, ndl.cpu(), dtype="float32") 65 | print(cifar10_train_dataset[1][0].shape) 66 | -------------------------------------------------------------------------------- /apps/simple_ml.py: -------------------------------------------------------------------------------- 1 | """hw1/apps/simple_ml.py""" 2 | 3 | import struct 4 | import gzip 5 | import numpy as np 6 | 7 | import sys 8 | 9 | sys.path.append("python/") 10 | import needle as ndl 11 | 12 | import needle.nn as nn 13 | from apps.models import * 14 | import time 15 | device = ndl.cpu() 16 | 17 | def parse_mnist(image_filesname, label_filename): 18 | """Read an images file and a labels file in MNIST format. See this page: 19 | http://yann.lecun.com/exdb/mnist/ for a description of the file format. 20 | 21 | Args: 22 | image_filename (str): name of gzipped images file in MNIST format 23 | label_filename (str): name of gzipped labels file in MNIST format 24 | 25 | Returns: 26 | Tuple (X,y): 27 | X (numpy.ndarray[np.float32]): 2D numpy array containing the loaded 28 | data. The dimensionality of the data should be 29 | (num_examples x input_dim) where 'input_dim' is the full 30 | dimension of the data, e.g., since MNIST images are 28x28, it 31 | will be 784. Values should be of type np.float32, and the data 32 | should be normalized to have a minimum value of 0.0 and a 33 | maximum value of 1.0. 34 | 35 | y (numpy.ndarray[dtype=np.int8]): 1D numpy array containing the 36 | labels of the examples. Values should be of type np.int8 and 37 | for MNIST will contain the values 0-9. 38 | """ 39 | ### BEGIN YOUR SOLUTION 40 | raise NotImplementedError() 41 | ### END YOUR SOLUTION 42 | 43 | 44 | def softmax_loss(Z, y_one_hot): 45 | """Return softmax loss. Note that for the purposes of this assignment, 46 | you don't need to worry about "nicely" scaling the numerical properties 47 | of the log-sum-exp computation, but can just compute this directly. 48 | 49 | Args: 50 | Z (ndl.Tensor[np.float32]): 2D Tensor of shape 51 | (batch_size, num_classes), containing the logit predictions for 52 | each class. 53 | y (ndl.Tensor[np.int8]): 2D Tensor of shape (batch_size, num_classes) 54 | containing a 1 at the index of the true label of each example and 55 | zeros elsewhere. 56 | 57 | Returns: 58 | Average softmax loss over the sample. (ndl.Tensor[np.float32]) 59 | """ 60 | ### BEGIN YOUR SOLUTION 61 | raise NotImplementedError() 62 | ### END YOUR SOLUTION 63 | 64 | 65 | def nn_epoch(X, y, W1, W2, lr=0.1, batch=100): 66 | """Run a single epoch of SGD for a two-layer neural network defined by the 67 | weights W1 and W2 (with no bias terms): 68 | logits = ReLU(X * W1) * W2 69 | The function should use the step size lr, and the specified batch size (and 70 | again, without randomizing the order of X).
71 | 72 | Args: 73 | X (np.ndarray[np.float32]): 2D input array of size 74 | (num_examples x input_dim). 75 | y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,) 76 | W1 (ndl.Tensor[np.float32]): 2D array of first layer weights, of shape 77 | (input_dim, hidden_dim) 78 | W2 (ndl.Tensor[np.float32]): 2D array of second layer weights, of shape 79 | (hidden_dim, num_classes) 80 | lr (float): step size (learning rate) for SGD 81 | batch (int): size of SGD mini-batch 82 | 83 | Returns: 84 | Tuple: (W1, W2) 85 | W1: ndl.Tensor[np.float32] 86 | W2: ndl.Tensor[np.float32] 87 | """ 88 | 89 | ### BEGIN YOUR SOLUTION 90 | raise NotImplementedError() 91 | ### END YOUR SOLUTION 92 | 93 | ### CIFAR-10 training ### 94 | def epoch_general_cifar10(dataloader, model, loss_fn=nn.SoftmaxLoss(), opt=None): 95 | """ 96 | Iterates over the dataloader. If optimizer is not None, sets the 97 | model to train mode, and for each batch updates the model parameters. 98 | If optimizer is None, sets the model to eval mode, and simply computes 99 | the loss/accuracy. 100 | 101 | Args: 102 | dataloader: Dataloader instance 103 | model: nn.Module instance 104 | loss_fn: nn.Module instance 105 | opt: Optimizer instance (optional) 106 | 107 | Returns: 108 | avg_acc: average accuracy over dataset 109 | avg_loss: average loss over dataset 110 | """ 111 | np.random.seed(4) 112 | ### BEGIN YOUR SOLUTION 113 | raise NotImplementedError() 114 | ### END YOUR SOLUTION 115 | 116 | 117 | def train_cifar10(model, dataloader, n_epochs=1, optimizer=ndl.optim.Adam, 118 | lr=0.001, weight_decay=0.001, loss_fn=nn.SoftmaxLoss): 119 | """ 120 | Performs {n_epochs} epochs of training. 121 | 122 | Args: 123 | dataloader: Dataloader instance 124 | model: nn.Module instance 125 | n_epochs: number of epochs (int) 126 | optimizer: Optimizer class 127 | lr: learning rate (float) 128 | weight_decay: weight decay (float) 129 | loss_fn: nn.Module class 130 | 131 | Returns: 132 | avg_acc: average accuracy over dataset from last epoch of training 133 | avg_loss: average loss over dataset from last epoch of training 134 | """ 135 | np.random.seed(4) 136 | ### BEGIN YOUR SOLUTION 137 | raise NotImplementedError() 138 | ### END YOUR SOLUTION 139 | 140 | 141 | def evaluate_cifar10(model, dataloader, loss_fn=nn.SoftmaxLoss): 142 | """ 143 | Computes the test accuracy and loss of the model. 144 | 145 | Args: 146 | dataloader: Dataloader instance 147 | model: nn.Module instance 148 | loss_fn: nn.Module class 149 | 150 | Returns: 151 | avg_acc: average accuracy over dataset 152 | avg_loss: average loss over dataset 153 | """ 154 | np.random.seed(4) 155 | ### BEGIN YOUR SOLUTION 156 | raise NotImplementedError() 157 | ### END YOUR SOLUTION 158 | 159 | 160 | ### PTB training ### 161 | def epoch_general_ptb(data, model, seq_len=40, loss_fn=nn.SoftmaxLoss(), opt=None, 162 | clip=None, device=None, dtype="float32"): 163 | """ 164 | Iterates over the data. If optimizer is not None, sets the 165 | model to train mode, and for each batch updates the model parameters. 166 | If optimizer is None, sets the model to eval mode, and simply computes 167 | the loss/accuracy. 168 | 169 | Args: 170 | data: data of shape (nbatch, batch_size) given from batchify function 171 | model: LanguageModel instance 172 | seq_len: i.e. 
bptt, sequence length 173 | loss_fn: nn.Module instance 174 | opt: Optimizer instance (optional) 175 | clip: max norm of gradients (optional) 176 | 177 | Returns: 178 | avg_acc: average accuracy over dataset 179 | avg_loss: average loss over dataset 180 | """ 181 | np.random.seed(4) 182 | ### BEGIN YOUR SOLUTION 183 | raise NotImplementedError() 184 | ### END YOUR SOLUTION 185 | 186 | 187 | def train_ptb(model, data, seq_len=40, n_epochs=1, optimizer=ndl.optim.SGD, 188 | lr=4.0, weight_decay=0.0, loss_fn=nn.SoftmaxLoss, clip=None, 189 | device=None, dtype="float32"): 190 | """ 191 | Performs {n_epochs} epochs of training. 192 | 193 | Args: 194 | model: LanguageModel instance 195 | data: data of shape (nbatch, batch_size) given from batchify function 196 | seq_len: i.e. bptt, sequence length 197 | n_epochs: number of epochs (int) 198 | optimizer: Optimizer class 199 | lr: learning rate (float) 200 | weight_decay: weight decay (float) 201 | loss_fn: nn.Module class 202 | clip: max norm of gradients (optional) 203 | 204 | Returns: 205 | avg_acc: average accuracy over dataset from last epoch of training 206 | avg_loss: average loss over dataset from last epoch of training 207 | """ 208 | np.random.seed(4) 209 | ### BEGIN YOUR SOLUTION 210 | raise NotImplementedError() 211 | ### END YOUR SOLUTION 212 | 213 | def evaluate_ptb(model, data, seq_len=40, loss_fn=nn.SoftmaxLoss, 214 | device=None, dtype="float32"): 215 | """ 216 | Computes the test accuracy and loss of the model. 217 | 218 | Args: 219 | model: LanguageModel instance 220 | data: data of shape (nbatch, batch_size) given from batchify function 221 | seq_len: i.e. bptt, sequence length 222 | loss_fn: nn.Module class 223 | 224 | Returns: 225 | avg_acc: average accuracy over dataset 226 | avg_loss: average loss over dataset 227 | """ 228 | np.random.seed(4) 229 | ### BEGIN YOUR SOLUTION 230 | raise NotImplementedError() 231 | ### END YOUR SOLUTION 232 | 233 | ### CODE BELOW IS FOR ILLUSTRATION, YOU DO NOT NEED TO EDIT 234 | 235 | 236 | def loss_err(h, y): 237 | """Helper function to compute both loss and error""" 238 | y_one_hot = np.zeros((y.shape[0], h.shape[-1])) 239 | y_one_hot[np.arange(y.size), y] = 1 240 | y_ = ndl.Tensor(y_one_hot) 241 | return softmax_loss(h, y_).numpy(), np.mean(h.numpy().argmax(axis=1) != y) 242 | -------------------------------------------------------------------------------- /python/needle/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ops 2 | from .ops import * 3 | from .autograd import Tensor, cpu, all_devices 4 | 5 | from . import init 6 | from .init import ones, zeros, zeros_like, ones_like 7 | 8 | from . import data 9 | from . import nn 10 | from . 
import optim 11 | from .backend_selection import * 12 | -------------------------------------------------------------------------------- /python/needle/autograd.py: -------------------------------------------------------------------------------- 1 | """Core data structures.""" 2 | import needle 3 | from .backend_numpy import Device, cpu, all_devices 4 | from typing import List, Optional, NamedTuple, Tuple, Union 5 | from collections import namedtuple 6 | import numpy 7 | 8 | from needle import init 9 | 10 | # needle version 11 | LAZY_MODE = False 12 | TENSOR_COUNTER = 0 13 | 14 | # NOTE: we will import numpy as the array_api 15 | # as the backend for our computations, this line will change in later homeworks 16 | 17 | import numpy as array_api 18 | NDArray = numpy.ndarray 19 | 20 | from .backend_selection import array_api, NDArray, default_device 21 | 22 | class Op: 23 | """Operator definition.""" 24 | 25 | def __call__(self, *args): 26 | raise NotImplementedError() 27 | 28 | def compute(self, *args: Tuple[NDArray]): 29 | """Calculate forward pass of operator. 30 | 31 | Parameters 32 | ---------- 33 | input: np.ndarray 34 | A list of input arrays to the function 35 | 36 | Returns 37 | ------- 38 | output: nd.array 39 | Array output of the operation 40 | 41 | """ 42 | raise NotImplementedError() 43 | 44 | def gradient( 45 | self, out_grad: "Value", node: "Value" 46 | ) -> Union["Value", Tuple["Value"]]: 47 | """Compute partial adjoint for each input value for a given output adjoint. 48 | 49 | Parameters 50 | ---------- 51 | out_grad: Value 52 | The adjoint wrt to the output value. 53 | 54 | node: Value 55 | The value node of forward evaluation. 56 | 57 | Returns 58 | ------- 59 | input_grads: Value or Tuple[Value] 60 | A list containing partial gradient adjoints to be propagated to 61 | each of the input node. 
62 | """ 63 | raise NotImplementedError() 64 | 65 | def gradient_as_tuple(self, out_grad: "Value", node: "Value") -> Tuple["Value"]: 66 | """Convenience method to always return a tuple from gradient call""" 67 | output = self.gradient(out_grad, node) 68 | if isinstance(output, tuple): 69 | return output 70 | elif isinstance(output, list): 71 | return tuple(output) 72 | else: 73 | return (output,) 74 | 75 | 76 | class TensorOp(Op): 77 | """Op class specialized to output tensors, will be alternate subclasses for other structures""" 78 | 79 | def __call__(self, *args): 80 | return Tensor.make_from_op(self, args) 81 | 82 | 83 | class TensorTupleOp(Op): 84 | """Op class specialized to output TensorTuple""" 85 | 86 | def __call__(self, *args): 87 | return TensorTuple.make_from_op(self, args) 88 | 89 | 90 | class Value: 91 | """A value in the computational graph.""" 92 | 93 | # trace of computational graph 94 | op: Optional[Op] 95 | inputs: List["Value"] 96 | # The following fields are cached fields for 97 | # dynamic computation 98 | cached_data: NDArray 99 | requires_grad: bool 100 | 101 | def realize_cached_data(self): 102 | """Run compute to realize the cached data""" 103 | # avoid recomputation 104 | if self.cached_data is not None: 105 | return self.cached_data 106 | # note: data implicitly calls realized cached data 107 | self.cached_data = self.op.compute( 108 | *[x.realize_cached_data() for x in self.inputs] 109 | ) 110 | return self.cached_data 111 | 112 | def is_leaf(self): 113 | return self.op is None 114 | 115 | def __del__(self): 116 | global TENSOR_COUNTER 117 | TENSOR_COUNTER -= 1 118 | 119 | def _init( 120 | self, 121 | op: Optional[Op], 122 | inputs: List["Tensor"], 123 | *, 124 | num_outputs: int = 1, 125 | cached_data: List[object] = None, 126 | requires_grad: Optional[bool] = None 127 | ): 128 | global TENSOR_COUNTER 129 | TENSOR_COUNTER += 1 130 | if requires_grad is None: 131 | requires_grad = any(x.requires_grad for x in inputs) 132 | self.op = op 133 | self.inputs = inputs 134 | self.num_outputs = num_outputs 135 | self.cached_data = cached_data 136 | self.requires_grad = requires_grad 137 | 138 | @classmethod 139 | def make_const(cls, data, *, requires_grad=False): 140 | value = cls.__new__(cls) 141 | value._init( 142 | None, 143 | [], 144 | cached_data=data, 145 | requires_grad=requires_grad, 146 | ) 147 | return value 148 | 149 | @classmethod 150 | def make_from_op(cls, op: Op, inputs: List["Value"]): 151 | value = cls.__new__(cls) 152 | value._init(op, inputs) 153 | 154 | if not LAZY_MODE: 155 | if not value.requires_grad: 156 | return value.detach() 157 | value.realize_cached_data() 158 | return value 159 | 160 | 161 | ### Not needed in HW1 162 | class TensorTuple(Value): 163 | """Represent a tuple of tensors. 164 | 165 | To keep things simple, we do not support nested tuples. 
166 | """ 167 | 168 | def __len__(self): 169 | cdata = self.realize_cached_data() 170 | return len(cdata) 171 | 172 | def __getitem__(self, index: int): 173 | return needle.ops.tuple_get_item(self, index) 174 | 175 | def tuple(self): 176 | return tuple([x for x in self]) 177 | 178 | def __repr__(self): 179 | return "needle.TensorTuple" + str(self.tuple()) 180 | 181 | def __str__(self): 182 | return self.__repr__() 183 | 184 | def __add__(self, other): 185 | assert isinstance(other, TensorTuple) 186 | assert len(self) == len(other) 187 | return needle.ops.make_tuple(*[self[i] + other[i] for i in range(len(self))]) 188 | 189 | def detach(self): 190 | """Create a new tensor that shares the data but detaches from the graph.""" 191 | return TensorTuple.make_const(self.realize_cached_data()) 192 | 193 | 194 | class Tensor(Value): 195 | grad: "Tensor" 196 | 197 | def __init__( 198 | self, 199 | array, 200 | *, 201 | device: Optional[Device] = None, 202 | dtype=None, 203 | requires_grad=True, 204 | **kwargs 205 | ): 206 | if isinstance(array, Tensor): 207 | if device is None: 208 | device = array.device 209 | if dtype is None: 210 | dtype = array.dtype 211 | if device == array.device and dtype == array.dtype: 212 | cached_data = array.realize_cached_data() 213 | else: 214 | # fall back, copy through numpy conversion 215 | cached_data = Tensor._array_from_numpy( 216 | array.numpy(), device=device, dtype=dtype 217 | ) 218 | else: 219 | device = device if device else default_device() 220 | cached_data = Tensor._array_from_numpy(array, device=device, dtype=dtype) 221 | 222 | self._init( 223 | None, 224 | [], 225 | cached_data=cached_data, 226 | requires_grad=requires_grad, 227 | ) 228 | 229 | @staticmethod 230 | def _array_from_numpy(numpy_array, device, dtype): 231 | if array_api is numpy: 232 | return numpy.array(numpy_array, dtype=dtype) 233 | return array_api.array(numpy_array, device=device, dtype=dtype) 234 | 235 | @staticmethod 236 | def make_from_op(op: Op, inputs: List["Value"]): 237 | tensor = Tensor.__new__(Tensor) 238 | tensor._init(op, inputs) 239 | if not LAZY_MODE: 240 | if not tensor.requires_grad: 241 | return tensor.detach() 242 | tensor.realize_cached_data() 243 | return tensor 244 | 245 | @staticmethod 246 | def make_const(data, requires_grad=False): 247 | tensor = Tensor.__new__(Tensor) 248 | tensor._init( 249 | None, 250 | [], 251 | cached_data=data 252 | if not isinstance(data, Tensor) 253 | else data.realize_cached_data(), 254 | requires_grad=requires_grad, 255 | ) 256 | return tensor 257 | 258 | @property 259 | def data(self): 260 | return self.detach() 261 | 262 | @data.setter 263 | def data(self, value): 264 | assert isinstance(value, Tensor) 265 | assert value.dtype == self.dtype, "%s %s" % ( 266 | value.dtype, 267 | self.dtype, 268 | ) 269 | self.cached_data = value.realize_cached_data() 270 | 271 | def detach(self): 272 | """Create a new tensor that shares the data but detaches from the graph.""" 273 | return Tensor.make_const(self.realize_cached_data()) 274 | 275 | @property 276 | def shape(self): 277 | return self.realize_cached_data().shape 278 | 279 | @property 280 | def dtype(self): 281 | return self.realize_cached_data().dtype 282 | 283 | @property 284 | def device(self): 285 | data = self.realize_cached_data() 286 | # numpy array always sits on cpu 287 | if array_api is numpy: 288 | return cpu() 289 | return data.device 290 | 291 | def backward(self, out_grad=None): 292 | out_grad = ( 293 | out_grad 294 | if out_grad 295 | else init.ones(*self.shape, 
dtype=self.dtype, device=self.device) 296 | ) 297 | compute_gradient_of_variables(self, out_grad) 298 | 299 | def __repr__(self): 300 | return "needle.Tensor(" + str(self.realize_cached_data()) + ")" 301 | 302 | def __str__(self): 303 | return self.realize_cached_data().__str__() 304 | 305 | def numpy(self): 306 | data = self.realize_cached_data() 307 | if array_api is numpy: 308 | return data 309 | return data.numpy() 310 | 311 | def __add__(self, other): 312 | if isinstance(other, Tensor): 313 | return needle.ops.EWiseAdd()(self, other) 314 | else: 315 | return needle.ops.AddScalar(other)(self) 316 | 317 | def __mul__(self, other): 318 | if isinstance(other, Tensor): 319 | return needle.ops.EWiseMul()(self, other) 320 | else: 321 | return needle.ops.MulScalar(other)(self) 322 | 323 | def __pow__(self, other): 324 | if isinstance(other, Tensor): 325 | return needle.ops.EWisePow()(self, other) 326 | else: 327 | return needle.ops.PowerScalar(other)(self) 328 | 329 | def __sub__(self, other): 330 | if isinstance(other, Tensor): 331 | return needle.ops.EWiseAdd()(self, needle.ops.Negate()(other)) 332 | else: 333 | return needle.ops.AddScalar(-other)(self) 334 | 335 | def __truediv__(self, other): 336 | if isinstance(other, Tensor): 337 | return needle.ops.EWiseDiv()(self, other) 338 | else: 339 | return needle.ops.DivScalar(other)(self) 340 | 341 | def __matmul__(self, other): 342 | return needle.ops.MatMul()(self, other) 343 | 344 | def matmul(self, other): 345 | return needle.ops.MatMul()(self, other) 346 | 347 | def sum(self, axes=None): 348 | return needle.ops.Summation(axes)(self) 349 | 350 | def broadcast_to(self, shape): 351 | return needle.ops.BroadcastTo(shape)(self) 352 | 353 | def reshape(self, shape): 354 | return needle.ops.Reshape(shape)(self) 355 | 356 | def __neg__(self): 357 | return needle.ops.Negate()(self) 358 | 359 | def transpose(self, axes=None): 360 | return needle.ops.Transpose(axes)(self) 361 | 362 | 363 | 364 | 365 | __radd__ = __add__ 366 | __rmul__ = __mul__ 367 | 368 | def compute_gradient_of_variables(output_tensor, out_grad): 369 | """Take gradient of output node with respect to each node in node_list. 370 | 371 | Store the computed result in the grad field of each Variable. 372 | """ 373 | # a map from node to a list of gradient contributions from each output node 374 | node_to_output_grads_list: Dict[Tensor, List[Tensor]] = {} 375 | # Special note on initializing gradient of 376 | # We are really taking a derivative of the scalar reduce_sum(output_node) 377 | # instead of the vector output_node. But this is the common case for loss function. 378 | node_to_output_grads_list[output_tensor] = [out_grad] 379 | 380 | # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt. 381 | reverse_topo_order = list(reversed(find_topo_sort([output_tensor]))) 382 | 383 | ### BEGIN YOUR SOLUTION 384 | raise NotImplementedError() 385 | ### END YOUR SOLUTION 386 | 387 | 388 | def find_topo_sort(node_list: List[Value]) -> List[Value]: 389 | """Given a list of nodes, return a topological sort list of nodes ending in them. 390 | 391 | A simple algorithm is to do a post-order DFS traversal on the given nodes, 392 | going backwards based on input edges. Since a node is added to the ordering 393 | after all its predecessors are traversed due to post-order DFS, we get a topological 394 | sort. 
395 | """ 396 | ### BEGIN YOUR SOLUTION 397 | raise NotImplementedError() 398 | ### END YOUR SOLUTION 399 | 400 | 401 | def topo_sort_dfs(node, visited, topo_order): 402 | """Post-order DFS""" 403 | ### BEGIN YOUR SOLUTION 404 | raise NotImplementedError() 405 | ### END YOUR SOLUTION 406 | 407 | 408 | ############################## 409 | ####### Helper Methods ####### 410 | ############################## 411 | 412 | 413 | def sum_node_list(node_list): 414 | """Custom sum function in order to avoid create redundant nodes in Python sum implementation.""" 415 | from operator import add 416 | from functools import reduce 417 | 418 | return reduce(add, node_list) 419 | -------------------------------------------------------------------------------- /python/needle/backend_ndarray/__init__.py: -------------------------------------------------------------------------------- 1 | from .ndarray import * 2 | -------------------------------------------------------------------------------- /python/needle/backend_ndarray/ndarray.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | from functools import reduce 4 | import numpy as np 5 | from . import ndarray_backend_numpy 6 | from . import ndarray_backend_cpu 7 | 8 | 9 | # math.prod not in Python 3.7 10 | def prod(x): 11 | return reduce(operator.mul, x, 1) 12 | 13 | 14 | class BackendDevice: 15 | """A backend device, wrapps the implementation module.""" 16 | 17 | def __init__(self, name, mod): 18 | self.name = name 19 | self.mod = mod 20 | 21 | def __eq__(self, other): 22 | return self.name == other.name 23 | 24 | def __repr__(self): 25 | return self.name + "()" 26 | 27 | def __getattr__(self, name): 28 | return getattr(self.mod, name) 29 | 30 | def enabled(self): 31 | return self.mod is not None 32 | 33 | def randn(self, *shape, dtype="float32"): 34 | # note: numpy doesn't support types within standard random routines, and 35 | # .astype("float32") does work if we're generating a singleton 36 | return NDArray(np.random.randn(*shape).astype(dtype), device=self) 37 | 38 | def rand(self, *shape, dtype="float32"): 39 | # note: numpy doesn't support types within standard random routines, and 40 | # .astype("float32") does work if we're generating a singleton 41 | return NDArray(np.random.rand(*shape).astype(dtype), device=self) 42 | 43 | def one_hot(self, n, i, dtype="float32"): 44 | return NDArray(np.eye(n, dtype=dtype)[i], device=self) 45 | 46 | def empty(self, shape, dtype="float32"): 47 | dtype = "float32" if dtype is None else dtype 48 | assert dtype == "float32" 49 | return NDArray.make(shape, device=self) 50 | 51 | def full(self, shape, fill_value, dtype="float32"): 52 | dtype = "float32" if dtype is None else dtype 53 | assert dtype == "float32" 54 | arr = self.empty(shape, dtype) 55 | arr.fill(fill_value) 56 | return arr 57 | 58 | 59 | def cuda(): 60 | """Return cuda device""" 61 | try: 62 | from . 
import ndarray_backend_cuda 63 | 64 | return BackendDevice("cuda", ndarray_backend_cuda) 65 | except ImportError: 66 | return BackendDevice("cuda", None) 67 | 68 | 69 | def cpu_numpy(): 70 | """Return numpy device""" 71 | return BackendDevice("cpu_numpy", ndarray_backend_numpy) 72 | 73 | 74 | def cpu(): 75 | """Return cpu device""" 76 | return BackendDevice("cpu", ndarray_backend_cpu) 77 | 78 | 79 | def default_device(): 80 | return cpu_numpy() 81 | 82 | 83 | def all_devices(): 84 | """return a list of all available devices""" 85 | return [cpu(), cuda(), cpu_numpy()] 86 | 87 | 88 | class NDArray: 89 | """A generic ND array class that may contain multipe different backends 90 | i.e., a Numpy backend, a native CPU backend, or a GPU backend. 91 | 92 | This class will only contains those functions that you need to implement 93 | to actually get the desired functionality for the programming examples 94 | in the homework, and no more. 95 | 96 | For now, for simplicity the class only supports float32 types, though 97 | this can be extended if desired. 98 | """ 99 | 100 | def __init__(self, other, device=None): 101 | """Create by copying another NDArray, or from numpy""" 102 | if isinstance(other, NDArray): 103 | # create a copy of existing NDArray 104 | if device is None: 105 | device = other.device 106 | self._init(other.to(device) + 0.0) # this creates a copy 107 | elif isinstance(other, np.ndarray): 108 | # create copy from numpy array 109 | device = device if device is not None else default_device() 110 | array = self.make(other.shape, device=device) 111 | array.device.from_numpy(np.ascontiguousarray(other), array._handle) 112 | self._init(array) 113 | else: 114 | # see if we can create a numpy array from input 115 | array = NDArray(np.array(other), device=device) 116 | self._init(array) 117 | 118 | def _init(self, other): 119 | self._shape = other._shape 120 | self._strides = other._strides 121 | self._offset = other._offset 122 | self._device = other._device 123 | self._handle = other._handle 124 | 125 | @staticmethod 126 | def compact_strides(shape): 127 | """Utility function to compute compact strides""" 128 | stride = 1 129 | res = [] 130 | for i in range(1, len(shape) + 1): 131 | res.append(stride) 132 | stride *= shape[-i] 133 | return tuple(res[::-1]) 134 | 135 | @staticmethod 136 | def make(shape, strides=None, device=None, handle=None, offset=0): 137 | """Create a new NDArray with the given properties. 
This will allocate the 138 | memory if handle=None, otherwise it will use the handle of an existing 139 | array.""" 140 | array = NDArray.__new__(NDArray) 141 | array._shape = tuple(shape) 142 | array._strides = NDArray.compact_strides(shape) if strides is None else strides 143 | array._offset = offset 144 | array._device = device if device is not None else default_device() 145 | if handle is None: 146 | array._handle = array.device.Array(prod(shape)) 147 | else: 148 | array._handle = handle 149 | return array 150 | 151 | ### Properties and string representations 152 | @property 153 | def shape(self): 154 | return self._shape 155 | 156 | @property 157 | def strides(self): 158 | return self._strides 159 | 160 | @property 161 | def device(self): 162 | return self._device 163 | 164 | @property 165 | def dtype(self): 166 | # only support float32 for now 167 | return "float32" 168 | 169 | @property 170 | def ndim(self): 171 | """Return number of dimensions.""" 172 | return len(self._shape) 173 | 174 | @property 175 | def size(self): 176 | return prod(self._shape) 177 | 178 | def __repr__(self): 179 | return "NDArray(" + self.numpy().__str__() + f", device={self.device})" 180 | 181 | def __str__(self): 182 | return self.numpy().__str__() 183 | 184 | ### Basic array manipulation 185 | def fill(self, value): 186 | """Fill (in place) with a constant value.""" 187 | self._device.fill(self._handle, value) 188 | 189 | def to(self, device): 190 | """Convert between devices, using to/from numpy calls as the unifying bridge.""" 191 | if device == self.device: 192 | return self 193 | else: 194 | return NDArray(self.numpy(), device=device) 195 | 196 | def numpy(self): 197 | """convert to a numpy array""" 198 | return self.device.to_numpy( 199 | self._handle, self.shape, self.strides, self._offset 200 | ) 201 | 202 | def is_compact(self): 203 | """Return true if array is compact in memory and internal size equals product 204 | of the shape dimensions""" 205 | return ( 206 | self._strides == self.compact_strides(self._shape) 207 | and prod(self.shape) == self._handle.size 208 | ) 209 | 210 | def compact(self): 211 | """Convert a matrix to be compact""" 212 | if self.is_compact(): 213 | return self 214 | else: 215 | out = NDArray.make(self.shape, device=self.device) 216 | self.device.compact( 217 | self._handle, out._handle, self.shape, self.strides, self._offset 218 | ) 219 | return out 220 | 221 | def as_strided(self, shape, strides): 222 | """Restride the matrix without copying memory.""" 223 | assert len(shape) == len(strides) 224 | return NDArray.make( 225 | shape, strides=strides, device=self.device, handle=self._handle, offset=self._offset 226 | ) 227 | 228 | @property 229 | def flat(self): 230 | return self.reshape((self.size,)) 231 | 232 | def reshape(self, new_shape): 233 | """ 234 | Reshape the matrix without copying memory. This will return a matrix 235 | that corresponds to a reshaped array but points to the same memory as 236 | the original array. 237 | 238 | Raises: 239 | ValueError if product of current shape is not equal to the product 240 | of the new shape, or if the matrix is not compact. 241 | 242 | Args: 243 | new_shape (tuple): new shape of the array 244 | 245 | Returns: 246 | NDArray : reshaped array; this will point to the same memory as the original array. 247 | """ 248 | 249 | ### BEGIN YOUR SOLUTION 250 | raise NotImplementedError() 251 | ### END YOUR SOLUTION 252 | 253 | def permute(self, new_axes): 254 | """ 255 | Permute order of the dimensions.
new_axes describes a permutation of the 256 | existing axes, so e.g.: 257 | - If we have an array with dimension "BHWC" then .permute((0,3,1,2)) 258 | would convert this to "BCHW" order. 259 | - For a 2D array, .permute((1,0)) would transpose the array. 260 | Like reshape, this operation should not copy memory, but achieves the 261 | permuting by just adjusting the shape/strides of the array. That is, 262 | it returns a new array that has the dimensions permuted as desired, but 263 | which points to the same memory as the original array. 264 | 265 | Args: 266 | new_axes (tuple): permutation order of the dimensions 267 | 268 | Returns: 269 | NDArray : new NDArray object with permuted dimensions, pointing 270 | to the same memory as the original NDArray (i.e., just shape and 271 | strides changed). 272 | """ 273 | 274 | ### BEGIN YOUR SOLUTION 275 | raise NotImplementedError() 276 | ### END YOUR SOLUTION 277 | 278 | def broadcast_to(self, new_shape): 279 | """ 280 | Broadcast an array to a new shape. new_shape's elements must be the 281 | same as the original shape, except for dimensions in self where 282 | the size = 1 (which can then be broadcast to any size). As with the 283 | previous calls, this will not copy memory, and just achieves 284 | broadcasting by manipulating the strides. 285 | 286 | Raises: 287 | assertion error if new_shape[i] != shape[i] for any i where 288 | shape[i] != 1 289 | 290 | Args: 291 | new_shape (tuple): shape to broadcast to 292 | 293 | Returns: 294 | NDArray: the new NDArray object with the new broadcast shape; should 295 | point to the same memory as the original array. 296 | """ 297 | 298 | ### BEGIN YOUR SOLUTION 299 | raise NotImplementedError() 300 | ### END YOUR SOLUTION 301 | 302 | ### Get and set elements 303 | 304 | def process_slice(self, sl, dim): 305 | """Convert a slice to an explicit start/stop/step""" 306 | start, stop, step = sl.start, sl.stop, sl.step 307 | if start == None: 308 | start = 0 309 | if start < 0: 310 | start = self.shape[dim] 311 | if stop == None: 312 | stop = self.shape[dim] 313 | if stop < 0: 314 | stop = self.shape[dim] + stop 315 | if step == None: 316 | step = 1 317 | 318 | # we're not gonna handle negative strides and that kind of thing 319 | assert stop > start, "Start must be less than stop" 320 | assert step > 0, "No support for negative increments" 321 | return slice(start, stop, step) 322 | 323 | def __getitem__(self, idxs): 324 | """ 325 | The __getitem__ operator in Python allows us to access elements of our 326 | array. When passed notation such as a[1:5,:-1:2,4,:] etc, Python will 327 | convert this to a tuple of slices and integers (for singletons like the 328 | '4' in this example). Slices can be a bit odd to work with (they have 329 | three elements .start .stop .step), which can be None or have negative 330 | entries, so for simplicity we wrote the code for you to convert these 331 | to always be a tuple of slices, one for each dimension. 332 | 333 | For this tuple of slices, return an array that subsets the desired 334 | elements. As before, this can be done entirely by computing a new 335 | shape, stride, and offset for the new "view" into the original array, 336 | pointing to the same memory. 337 | 338 | Raises: 339 | AssertionError if a slice has negative size or step, or if number 340 | of slices is not equal to the number of dimensions (the stub code 341 | already raises all these errors).
342 | 343 | Args: 344 | idxs tuple: (after stub code processes), a tuple of slice elements 345 | coresponding to the subset of the matrix to get 346 | 347 | Returns: 348 | NDArray: a new NDArray object corresponding to the selected 349 | subset of elements. As before, this should not copy memroy but just 350 | manipulate the shape/strides/offset of the new array, referecing 351 | the same array as the original one. 352 | """ 353 | 354 | # handle singleton as tuple, everything as slices 355 | if not isinstance(idxs, tuple): 356 | idxs = (idxs,) 357 | idxs = tuple( 358 | [ 359 | self.process_slice(s, i) if isinstance(s, slice) else slice(s, s + 1, 1) 360 | for i, s in enumerate(idxs) 361 | ] 362 | ) 363 | assert len(idxs) == self.ndim, "Need indexes equal to number of dimensions" 364 | 365 | ### BEGIN YOUR SOLUTION 366 | raise NotImplementedError() 367 | ### END YOUR SOLUTION 368 | 369 | def __setitem__(self, idxs, other): 370 | """Set the values of a view into an array, using the same semantics 371 | as __getitem__().""" 372 | view = self.__getitem__(idxs) 373 | if isinstance(other, NDArray): 374 | assert prod(view.shape) == prod(other.shape) 375 | self.device.ewise_setitem( 376 | other.compact()._handle, 377 | view._handle, 378 | view.shape, 379 | view.strides, 380 | view._offset, 381 | ) 382 | else: 383 | self.device.scalar_setitem( 384 | prod(view.shape), 385 | other, 386 | view._handle, 387 | view.shape, 388 | view.strides, 389 | view._offset, 390 | ) 391 | 392 | ### Collection of elementwise and scalar function: add, multiply, boolean, etc 393 | 394 | def ewise_or_scalar(self, other, ewise_func, scalar_func): 395 | """Run either an elementwise or scalar version of a function, 396 | depending on whether "other" is an NDArray or scalar 397 | """ 398 | out = NDArray.make(self.shape, device=self.device) 399 | if isinstance(other, NDArray): 400 | assert self.shape == other.shape, "operation needs two equal-sized arrays" 401 | ewise_func(self.compact()._handle, other.compact()._handle, out._handle) 402 | else: 403 | scalar_func(self.compact()._handle, other, out._handle) 404 | return out 405 | 406 | def __add__(self, other): 407 | return self.ewise_or_scalar( 408 | other, self.device.ewise_add, self.device.scalar_add 409 | ) 410 | 411 | __radd__ = __add__ 412 | 413 | def __sub__(self, other): 414 | return self + (-other) 415 | 416 | def __rsub__(self, other): 417 | return other + (-self) 418 | 419 | def __mul__(self, other): 420 | return self.ewise_or_scalar( 421 | other, self.device.ewise_mul, self.device.scalar_mul 422 | ) 423 | 424 | __rmul__ = __mul__ 425 | 426 | def __truediv__(self, other): 427 | return self.ewise_or_scalar( 428 | other, self.device.ewise_div, self.device.scalar_div 429 | ) 430 | 431 | def __neg__(self): 432 | return self * (-1) 433 | 434 | def __pow__(self, other): 435 | out = NDArray.make(self.shape, device=self.device) 436 | self.device.scalar_power(self.compact()._handle, other, out._handle) 437 | return out 438 | 439 | def maximum(self, other): 440 | return self.ewise_or_scalar( 441 | other, self.device.ewise_maximum, self.device.scalar_maximum 442 | ) 443 | 444 | ### Binary operators all return (0.0, 1.0) floating point values, could of course be optimized 445 | def __eq__(self, other): 446 | return self.ewise_or_scalar(other, self.device.ewise_eq, self.device.scalar_eq) 447 | 448 | def __ge__(self, other): 449 | return self.ewise_or_scalar(other, self.device.ewise_ge, self.device.scalar_ge) 450 | 451 | def __ne__(self, other): 452 | return 1 - (self == other) 
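    # Only ewise_eq / ewise_ge (and their scalar variants) are backend primitives;
    # the remaining comparisons below are composed arithmetically from the 0.0/1.0
    # masks they return: a != b is 1 - (a == b), a > b is (a >= b) * (a != b),
    # a < b is 1 - (a >= b), and a <= b is 1 - (a > b).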
453 | 454 | def __gt__(self, other): 455 | return (self >= other) * (self != other) 456 | 457 | def __lt__(self, other): 458 | return 1 - (self >= other) 459 | 460 | def __le__(self, other): 461 | return 1 - (self > other) 462 | 463 | ### Elementwise functions 464 | 465 | def log(self): 466 | out = NDArray.make(self.shape, device=self.device) 467 | self.device.ewise_log(self.compact()._handle, out._handle) 468 | return out 469 | 470 | def exp(self): 471 | out = NDArray.make(self.shape, device=self.device) 472 | self.device.ewise_exp(self.compact()._handle, out._handle) 473 | return out 474 | 475 | def tanh(self): 476 | out = NDArray.make(self.shape, device=self.device) 477 | self.device.ewise_tanh(self.compact()._handle, out._handle) 478 | return out 479 | 480 | ### Matrix multiplication 481 | def __matmul__(self, other): 482 | """Matrix multplication of two arrays. This requires that both arrays 483 | be 2D (i.e., we don't handle batch matrix multiplication), and that the 484 | sizes match up properly for matrix multiplication. 485 | 486 | In the case of the CPU backend, you will implement an efficient "tiled" 487 | version of matrix multiplication for the case when all dimensions of 488 | the array are divisible by self.device.__tile_size__. In this case, 489 | the code below will restride and compact the matrix into tiled form, 490 | and then pass to the relevant CPU backend. For the CPU version we will 491 | just fall back to the naive CPU implementation if the array shape is not 492 | a multiple of the tile size 493 | 494 | The GPU (and numpy) versions don't have any tiled version (or rather, 495 | the GPU version will just work natively by tiling any input size). 496 | """ 497 | 498 | assert self.ndim == 2 and other.ndim == 2 499 | assert self.shape[1] == other.shape[0] 500 | 501 | m, n, p = self.shape[0], self.shape[1], other.shape[1] 502 | 503 | # if the matrix is aligned, use tiled matrix multiplication 504 | if hasattr(self.device, "matmul_tiled") and all( 505 | d % self.device.__tile_size__ == 0 for d in (m, n, p) 506 | ): 507 | 508 | def tile(a, tile): 509 | return a.as_strided( 510 | (a.shape[0] // tile, a.shape[1] // tile, tile, tile), 511 | (a.shape[1] * tile, tile, a.shape[1], 1), 512 | ) 513 | 514 | t = self.device.__tile_size__ 515 | a = tile(self.compact(), t).compact() 516 | b = tile(other.compact(), t).compact() 517 | out = NDArray.make((a.shape[0], b.shape[1], t, t), device=self.device) 518 | self.device.matmul_tiled(a._handle, b._handle, out._handle, m, n, p) 519 | 520 | return ( 521 | out.permute((0, 2, 1, 3)) 522 | .compact() 523 | .reshape((self.shape[0], other.shape[1])) 524 | ) 525 | 526 | else: 527 | out = NDArray.make((m, p), device=self.device) 528 | self.device.matmul( 529 | self.compact()._handle, other.compact()._handle, out._handle, m, n, p 530 | ) 531 | return out 532 | 533 | ### Reductions, i.e., sum/max over all element or over given axis 534 | def reduce_view_out(self, axis, keepdims=False): 535 | """ Return a view to the array set up for reduction functions and output array. 
""" 536 | if isinstance(axis, tuple) and not axis: 537 | raise ValueError("Empty axis in reduce") 538 | 539 | if axis is None: 540 | view = self.compact().reshape((1,) * (self.ndim - 1) + (prod(self.shape),)) 541 | #out = NDArray.make((1,) * self.ndim, device=self.device) 542 | out = NDArray.make((1,), device=self.device) 543 | 544 | else: 545 | if isinstance(axis, (tuple, list)): 546 | assert len(axis) == 1, "Only support reduction over a single axis" 547 | axis = axis[0] 548 | 549 | view = self.permute( 550 | tuple([a for a in range(self.ndim) if a != axis]) + (axis,) 551 | ) 552 | out = NDArray.make( 553 | tuple([1 if i == axis else s for i, s in enumerate(self.shape)]) 554 | if keepdims else 555 | tuple([s for i, s in enumerate(self.shape) if i != axis]), 556 | device=self.device, 557 | ) 558 | return view, out 559 | 560 | def sum(self, axis=None, keepdims=False): 561 | view, out = self.reduce_view_out(axis, keepdims=keepdims) 562 | self.device.reduce_sum(view.compact()._handle, out._handle, view.shape[-1]) 563 | return out 564 | 565 | def max(self, axis=None, keepdims=False): 566 | view, out = self.reduce_view_out(axis, keepdims=keepdims) 567 | self.device.reduce_max(view.compact()._handle, out._handle, view.shape[-1]) 568 | return out 569 | 570 | def flip(self, axes): 571 | """ 572 | Flip this ndarray along the specified axes. 573 | Note: compact() before returning. 574 | """ 575 | ### BEGIN YOUR SOLUTION 576 | raise NotImplementedError() 577 | ### END YOUR SOLUTION 578 | 579 | def pad(self, axes): 580 | """ 581 | Pad this ndarray by zeros by the specified amount in `axes`, 582 | which lists for _all_ axes the left and right padding amount, e.g., 583 | axes = ( (0, 0), (1, 1), (0, 0)) pads the middle axis with a 0 on the left and right side. 584 | """ 585 | ### BEGIN YOUR SOLUTION 586 | raise NotImplementedError() 587 | ### END YOUR SOLUTION 588 | 589 | def array(a, dtype="float32", device=None): 590 | """Convenience methods to match numpy a bit more closely.""" 591 | dtype = "float32" if dtype is None else dtype 592 | assert dtype == "float32" 593 | return NDArray(a, device=device) 594 | 595 | 596 | def empty(shape, dtype="float32", device=None): 597 | device = device if device is not None else default_device() 598 | return device.empty(shape, dtype) 599 | 600 | 601 | def full(shape, fill_value, dtype="float32", device=None): 602 | device = device if device is not None else default_device() 603 | return device.full(shape, fill_value, dtype) 604 | 605 | 606 | def broadcast_to(array, new_shape): 607 | return array.broadcast_to(new_shape) 608 | 609 | 610 | def reshape(array, new_shape): 611 | return array.reshape(new_shape) 612 | 613 | 614 | def maximum(a, b): 615 | return a.maximum(b) 616 | 617 | 618 | def log(a): 619 | return a.log() 620 | 621 | 622 | def exp(a): 623 | return a.exp() 624 | 625 | 626 | def tanh(a): 627 | return a.tanh() 628 | 629 | 630 | def sum(a, axis=None, keepdims=False): 631 | return a.sum(axis=axis, keepdims=keepdims) 632 | 633 | 634 | def flip(a, axes): 635 | return a.flip(axes) 636 | -------------------------------------------------------------------------------- /python/needle/backend_ndarray/ndarray_backend_numpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | __device_name__ = "numpy" 5 | _datatype = np.float32 6 | _datetype_size = np.dtype(_datatype).itemsize 7 | 8 | 9 | class Array: 10 | def __init__(self, size): 11 | self.array = np.empty(size, dtype=np.float32) 12 | 13 | @property 
14 | def size(self): 15 | return self.array.size 16 | 17 | 18 | def to_numpy(a, shape, strides, offset): 19 | return np.lib.stride_tricks.as_strided( 20 | a.array[offset:], shape, tuple([s * _datetype_size for s in strides]) 21 | ) 22 | 23 | 24 | def from_numpy(a, out): 25 | out.array[:] = a.flatten() 26 | 27 | 28 | def fill(out, val): 29 | out.array.fill(val) 30 | 31 | 32 | def compact(a, out, shape, strides, offset): 33 | out.array[:] = to_numpy(a, shape, strides, offset).flatten() 34 | 35 | 36 | def ewise_setitem(a, out, shape, strides, offset): 37 | to_numpy(out, shape, strides, offset)[:] = a.array.reshape(shape) 38 | 39 | 40 | def scalar_setitem(size, val, out, shape, strides, offset): 41 | to_numpy(out, shape, strides, offset)[:] = val 42 | 43 | 44 | def ewise_add(a, b, out): 45 | out.array[:] = a.array + b.array 46 | 47 | 48 | def scalar_add(a, val, out): 49 | out.array[:] = a.array + val 50 | 51 | 52 | def ewise_mul(a, b, out): 53 | out.array[:] = a.array * b.array 54 | 55 | 56 | def scalar_mul(a, val, out): 57 | out.array[:] = a.array * val 58 | 59 | 60 | def ewise_div(a, b, out): 61 | out.array[:] = a.array / b.array 62 | 63 | 64 | def scalar_div(a, val, out): 65 | out.array[:] = a.array / val 66 | 67 | 68 | def scalar_power(a, val, out): 69 | out.array[:] = a.array**val 70 | 71 | 72 | def ewise_maximum(a, b, out): 73 | out.array[:] = np.maximum(a.array, b.array) 74 | 75 | 76 | def scalar_maximum(a, val, out): 77 | out.array[:] = np.maximum(a.array, val) 78 | 79 | 80 | def ewise_eq(a, b, out): 81 | out.array[:] = (a.array == b.array).astype(np.float32) 82 | 83 | 84 | def scalar_eq(a, val, out): 85 | out.array[:] = (a.array == val).astype(np.float32) 86 | 87 | 88 | def ewise_ge(a, b, out): 89 | out.array[:] = (a.array >= b.array).astype(np.float32) 90 | 91 | 92 | def scalar_ge(a, val, out): 93 | out.array[:] = (a.array >= val).astype(np.float32) 94 | 95 | 96 | def ewise_log(a, out): 97 | out.array[:] = np.log(a.array) 98 | 99 | 100 | def ewise_exp(a, out): 101 | out.array[:] = np.exp(a.array) 102 | 103 | 104 | def ewise_tanh(a, out): 105 | out.array[:] = np.tanh(a.array) 106 | 107 | 108 | def matmul(a, b, out, m, n, p): 109 | out.array[:] = (a.array.reshape(m, n) @ b.array.reshape(n, p)).reshape(-1) 110 | 111 | 112 | def reduce_max(a, out, reduce_size): 113 | out.array[:] = a.array[:].reshape(-1, reduce_size).max(axis=1) 114 | 115 | 116 | def reduce_sum(a, out, reduce_size): 117 | out.array[:] = a.array[:].reshape(-1, reduce_size).sum(axis=1) 118 | -------------------------------------------------------------------------------- /python/needle/backend_numpy.py: -------------------------------------------------------------------------------- 1 | """This file defies specific implementations of devices when using numpy as NDArray backend. 
2 | """ 3 | import numpy 4 | 5 | 6 | class Device: 7 | """Baseclass of all device""" 8 | 9 | 10 | class CPUDevice(Device): 11 | """Represents data that sits in CPU""" 12 | 13 | def __repr__(self): 14 | return "needle.cpu()" 15 | 16 | def __hash__(self): 17 | return self.__repr__().__hash__() 18 | 19 | def __eq__(self, other): 20 | return isinstance(other, CPUDevice) 21 | 22 | def enabled(self): 23 | return True 24 | 25 | def zeros(self, *shape, dtype="float32"): 26 | return numpy.zeros(shape, dtype=dtype) 27 | 28 | def ones(self, *shape, dtype="float32"): 29 | return numpy.ones(shape, dtype=dtype) 30 | 31 | def randn(self, *shape): 32 | # note: numpy doesn't support types within standard random routines, and 33 | # .astype("float32") does work if we're generating a singleton 34 | return numpy.random.randn(*shape) 35 | 36 | def rand(self, *shape): 37 | # note: numpy doesn't support types within standard random routines, and 38 | # .astype("float32") does work if we're generating a singleton 39 | return numpy.random.rand(*shape) 40 | 41 | def one_hot(self, n, i, dtype="float32"): 42 | return numpy.eye(n, dtype=dtype)[i] 43 | 44 | def empty(self, shape, dtype="float32"): 45 | return numpy.empty(shape, dtype=dtype) 46 | 47 | def full(self, shape, fill_value, dtype="float32"): 48 | return numpy.full(shape, fill_value, dtype=dtype) 49 | 50 | 51 | def cpu(): 52 | """Return cpu device""" 53 | return CPUDevice() 54 | 55 | 56 | def default_device(): 57 | return cpu() 58 | 59 | 60 | def all_devices(): 61 | """return a list of all available devices""" 62 | return [cpu()] 63 | -------------------------------------------------------------------------------- /python/needle/backend_selection.py: -------------------------------------------------------------------------------- 1 | """Logic for backend selection""" 2 | import os 3 | 4 | 5 | BACKEND = os.environ.get("NEEDLE_BACKEND", "nd") 6 | 7 | 8 | if BACKEND == "nd": 9 | print("Using needle backend") 10 | from . import backend_ndarray as array_api 11 | from .backend_ndarray import ( 12 | all_devices, 13 | cuda, 14 | cpu, 15 | cpu_numpy, 16 | default_device, 17 | BackendDevice as Device, 18 | ) 19 | 20 | NDArray = array_api.NDArray 21 | elif BACKEND == "np": 22 | print("Using numpy backend") 23 | import numpy as array_api 24 | from .backend_numpy import all_devices, cpu, default_device, Device 25 | 26 | NDArray = array_api.ndarray 27 | else: 28 | raise RuntimeError("Unknown needle array backend %s" % BACKEND) 29 | -------------------------------------------------------------------------------- /python/needle/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_basic import * 2 | from .data_transforms import * 3 | from .datasets import * 4 | -------------------------------------------------------------------------------- /python/needle/data/data_basic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..autograd import Tensor 3 | 4 | from typing import Iterator, Optional, List, Sized, Union, Iterable, Any 5 | 6 | 7 | 8 | class Dataset: 9 | r"""An abstract class representing a `Dataset`. 10 | 11 | All subclasses should overwrite :meth:`__getitem__`, supporting fetching a 12 | data sample for a given key. Subclasses must also overwrite 13 | :meth:`__len__`, which is expected to return the size of the dataset. 
14 | """ 15 | 16 | def __init__(self, transforms: Optional[List] = None): 17 | self.transforms = transforms 18 | 19 | def __getitem__(self, index) -> object: 20 | raise NotImplementedError 21 | 22 | def __len__(self) -> int: 23 | raise NotImplementedError 24 | 25 | def apply_transforms(self, x): 26 | if self.transforms is not None: 27 | # apply the transforms 28 | for tform in self.transforms: 29 | x = tform(x) 30 | return x 31 | 32 | 33 | class DataLoader: 34 | r""" 35 | Data loader. Combines a dataset and a sampler, and provides an iterable over 36 | the given dataset. 37 | Args: 38 | dataset (Dataset): dataset from which to load the data. 39 | batch_size (int, optional): how many samples per batch to load 40 | (default: ``1``). 41 | shuffle (bool, optional): set to ``True`` to have the data reshuffled 42 | at every epoch (default: ``False``). 43 | """ 44 | dataset: Dataset 45 | batch_size: Optional[int] 46 | 47 | def __init__( 48 | self, 49 | dataset: Dataset, 50 | batch_size: Optional[int] = 1, 51 | shuffle: bool = False, 52 | ): 53 | 54 | self.dataset = dataset 55 | self.shuffle = shuffle 56 | self.batch_size = batch_size 57 | if not self.shuffle: 58 | self.ordering = np.array_split(np.arange(len(dataset)), 59 | range(batch_size, len(dataset), batch_size)) 60 | 61 | def __iter__(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | return self 66 | 67 | def __next__(self): 68 | ### BEGIN YOUR SOLUTION 69 | raise NotImplementedError() 70 | ### END YOUR SOLUTION 71 | 72 | -------------------------------------------------------------------------------- /python/needle/data/data_transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Transform: 4 | def __call__(self, x): 5 | raise NotImplementedError 6 | 7 | 8 | class RandomFlipHorizontal(Transform): 9 | def __init__(self, p = 0.5): 10 | self.p = p 11 | 12 | def __call__(self, img): 13 | """ 14 | Horizonally flip an image, specified as an H x W x C NDArray. 15 | Args: 16 | img: H x W x C NDArray of an image 17 | Returns: 18 | H x W x C ndarray corresponding to image flipped with probability self.p 19 | Note: use the provided code to provide randomness, for easier testing 20 | """ 21 | flip_img = np.random.rand() < self.p 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION 25 | 26 | 27 | class RandomCrop(Transform): 28 | def __init__(self, padding=3): 29 | self.padding = padding 30 | 31 | def __call__(self, img): 32 | """ Zero pad and then randomly crop an image. 
33 | Args: 34 | img: H x W x C NDArray of an image 35 | Return 36 | H x W x C NAArray of cliped image 37 | Note: generate the image shifted by shift_x, shift_y specified below 38 | """ 39 | shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding+1, size=2) 40 | ### BEGIN YOUR SOLUTION 41 | raise NotImplementedError() 42 | ### END YOUR SOLUTION 43 | -------------------------------------------------------------------------------- /python/needle/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .mnist_dataset import * 2 | from .ndarray_dataset import * 3 | from .cifar10_dataset import * 4 | from .ptb_dataset import * 5 | -------------------------------------------------------------------------------- /python/needle/data/datasets/cifar10_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Iterator, Optional, List, Sized, Union, Iterable, Any 4 | import numpy as np 5 | from ..data_basic import Dataset 6 | 7 | class CIFAR10Dataset(Dataset): 8 | def __init__( 9 | self, 10 | base_folder: str, 11 | train: bool, 12 | p: Optional[int] = 0.5, 13 | transforms: Optional[List] = None 14 | ): 15 | """ 16 | Parameters: 17 | base_folder - cifar-10-batches-py folder filepath 18 | train - bool, if True load training dataset, else load test dataset 19 | Divide pixel values by 255. so that images are in 0-1 range. 20 | Attributes: 21 | X - numpy array of images 22 | y - numpy array of labels 23 | """ 24 | ### BEGIN YOUR SOLUTION 25 | raise NotImplementedError() 26 | ### END YOUR SOLUTION 27 | 28 | def __getitem__(self, index) -> object: 29 | """ 30 | Returns the image, label at given index 31 | Image should be of shape (3, 32, 32) 32 | """ 33 | ### BEGIN YOUR SOLUTION 34 | raise NotImplementedError() 35 | ### END YOUR SOLUTION 36 | 37 | def __len__(self) -> int: 38 | """ 39 | Returns the total number of examples in the dataset 40 | """ 41 | ### BEGIN YOUR SOLUTION 42 | raise NotImplementedError() 43 | ### END YOUR SOLUTION 44 | -------------------------------------------------------------------------------- /python/needle/data/datasets/mnist_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from ..data_basic import Dataset 3 | import numpy as np 4 | 5 | class MNISTDataset(Dataset): 6 | def __init__( 7 | self, 8 | image_filename: str, 9 | label_filename: str, 10 | transforms: Optional[List] = None, 11 | ): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def __getitem__(self, index) -> object: 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | def __len__(self) -> int: 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/data/datasets/ndarray_dataset.py: -------------------------------------------------------------------------------- 1 | from ..data_basic import Dataset 2 | 3 | class NDArrayDataset(Dataset): 4 | def __init__(self, *arrays): 5 | self.arrays = arrays 6 | 7 | def __len__(self) -> int: 8 | return self.arrays[0].shape[0] 9 | 10 | def __getitem__(self, i) -> object: 11 | return tuple([a[i] for a in self.arrays]) -------------------------------------------------------------------------------- 
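The transform and `DataLoader` stubs above all follow the same pattern, so before moving on to the PTB dataset, here is one possible way to fill them in. This is a hedged sketch rather than the reference solution: it assumes `img` is an H x W x C NumPy array (as the docstrings state), that each dataset item is a tuple of per-field arrays such as `(image, label)`, and that a batch should come back as a tuple of `Tensor`s; the free-standing helper names (`flip_call`, `loader_iter`, ...) are purely illustrative stand-ins for the corresponding methods.

```python
import numpy as np
from needle.autograd import Tensor

# RandomFlipHorizontal.__call__: flip the width axis with probability p
# (`flip_img` is the randomness provided by the stub code).
def flip_call(self, img):
    flip_img = np.random.rand() < self.p
    return img[:, ::-1, :] if flip_img else img

# RandomCrop.__call__: zero-pad H and W by self.padding, then take the H x W x C
# window shifted by the (shift_x, shift_y) generated by the stub code.
def crop_call(self, img):
    shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding + 1, size=2)
    H, W, C = img.shape
    p = self.padding
    padded = np.zeros((H + 2 * p, W + 2 * p, C), dtype=img.dtype)
    padded[p:p + H, p:p + W, :] = img
    return padded[p + shift_x:p + shift_x + H, p + shift_y:p + shift_y + W, :]

# DataLoader.__iter__ / __next__: reshuffle the batch ordering each epoch when
# shuffle=True, then emit one batch per __next__ call.
def loader_iter(self):
    if self.shuffle:
        self.ordering = np.array_split(
            np.random.permutation(len(self.dataset)),
            range(self.batch_size, len(self.dataset), self.batch_size))
    self._batch_idx = 0
    return self

def loader_next(self):
    if self._batch_idx >= len(self.ordering):
        raise StopIteration
    idxs = self.ordering[self._batch_idx]
    self._batch_idx += 1
    samples = [self.dataset[int(i)] for i in idxs]
    # each sample is assumed to be a tuple of per-field arrays; stack field-wise
    return tuple(Tensor(np.stack(field)) for field in zip(*samples))
```

Whether a batch should be a tuple of `Tensor`s or plain arrays depends on the tests, so treat the last line as a placeholder for whatever format the test suite expects.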
/python/needle/data/datasets/ptb_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from needle import backend_ndarray as nd 5 | from needle import Tensor 6 | 7 | class Dictionary(object): 8 | """ 9 | Creates a dictionary from a list of words, mapping each word to a 10 | unique integer. 11 | Attributes: 12 | word2idx: dictionary mapping from a word to its unique ID 13 | idx2word: list of words in the dictionary, in the order they were added 14 | to the dictionary (i.e. each word only appears once in this list) 15 | """ 16 | def __init__(self): 17 | self.word2idx = {} 18 | self.idx2word = [] 19 | 20 | def add_word(self, word): 21 | """ 22 | Input: word of type str 23 | If the word is not in the dictionary, adds the word to the dictionary 24 | and appends to the list of words. 25 | Returns the word's unique ID. 26 | """ 27 | ### BEGIN YOUR SOLUTION 28 | raise NotImplementedError() 29 | ### END YOUR SOLUTION 30 | 31 | def __len__(self): 32 | """ 33 | Returns the number of unique words in the dictionary. 34 | """ 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION 38 | 39 | 40 | 41 | class Corpus(object): 42 | """ 43 | Creates corpus from train, and test txt files. 44 | """ 45 | def __init__(self, base_dir, max_lines=None): 46 | self.dictionary = Dictionary() 47 | self.train = self.tokenize(os.path.join(base_dir, 'train.txt'), max_lines) 48 | self.test = self.tokenize(os.path.join(base_dir, 'test.txt'), max_lines) 49 | 50 | def tokenize(self, path, max_lines=None): 51 | """ 52 | Input: 53 | path - path to text file 54 | max_lines - maximum number of lines to read in 55 | Tokenizes a text file, first adding each word in the file to the dictionary, 56 | and then tokenizing the text file to a list of IDs. When adding words to the 57 | dictionary (and tokenizing the file content) '' should be appended to 58 | the end of each line in order to properly account for the end of the sentence. 59 | Output: 60 | ids: List of ids 61 | """ 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | 66 | 67 | def batchify(data, batch_size, device, dtype): 68 | """ 69 | Starting from sequential data, batchify arranges the dataset into columns. 70 | For instance, with the alphabet as the sequence and batch size 4, we'd get 71 | ┌ a g m s ┐ 72 | │ b h n t │ 73 | │ c i o u │ 74 | │ d j p v │ 75 | │ e k q w │ 76 | └ f l r x ┘. 77 | These columns are treated as independent by the model, which means that the 78 | dependence of e. g. 'g' on 'f' cannot be learned, but allows more efficient 79 | batch processing. 80 | If the data cannot be evenly divided by the batch size, trim off the remainder. 81 | Returns the data as a numpy array of shape (nbatch, batch_size). 82 | """ 83 | ### BEGIN YOUR SOLUTION 84 | raise NotImplementedError() 85 | ### END YOUR SOLUTION 86 | 87 | 88 | def get_batch(batches, i, bptt, device=None, dtype=None): 89 | """ 90 | get_batch subdivides the source data into chunks of length bptt. 91 | If source is equal to the example output of the batchify function, with 92 | a bptt-limit of 2, we'd get the following two Variables for i = 0: 93 | ┌ a g m s ┐ ┌ b h n t ┐ 94 | └ b h n t ┘ └ c i o u ┘ 95 | Note that despite the name of the function, the subdivison of data is not 96 | done along the batch dimension (i.e. dimension 1), since that was handled 97 | by the batchify function. 
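(The description of `get_batch` continues below.) For the pieces specified so far, a short sketch of how `Dictionary.add_word` and `batchify` could satisfy their docstrings; the free-standing names and the unused `device`/`dtype` handling are illustrative only, not the reference solution.

```python
import numpy as np

# Dictionary.add_word: assign the next integer ID to unseen words.
def add_word_sketch(self, word):
    if word not in self.word2idx:
        self.word2idx[word] = len(self.idx2word)
        self.idx2word.append(word)
    return self.word2idx[word]

# batchify: trim the token stream to a multiple of batch_size, then lay the
# k-th contiguous chunk of the corpus down column k, giving (nbatch, batch_size).
def batchify_sketch(data, batch_size, device, dtype):
    nbatch = len(data) // batch_size
    data = np.array(data[:nbatch * batch_size], dtype=dtype)
    return data.reshape(batch_size, nbatch).T
```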
The chunks are along dimension 0, corresponding 98 | to the seq_len dimension in the LSTM or RNN. 99 | Inputs: 100 | batches - numpy array returned from batchify function 101 | i - index 102 | bptt - Sequence length 103 | Returns: 104 | data - Tensor of shape (bptt, bs) with cached data as NDArray 105 | target - Tensor of shape (bptt*bs,) with cached data as NDArray 106 | """ 107 | ### BEGIN YOUR SOLUTION 108 | raise NotImplementedError() 109 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/init/__init__.py: -------------------------------------------------------------------------------- 1 | from .init_basic import * 2 | 3 | from .init_initializers import * 4 | -------------------------------------------------------------------------------- /python/needle/init/init_basic.py: -------------------------------------------------------------------------------- 1 | import math 2 | import needle as ndl 3 | 4 | 5 | def rand(*shape, low=0.0, high=1.0, device=None, dtype="float32", requires_grad=False): 6 | """Generate random numbers uniform between low and high""" 7 | device = ndl.cpu() if device is None else device 8 | array = device.rand(*shape) * (high - low) + low 9 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 10 | 11 | 12 | def randn(*shape, mean=0.0, std=1.0, device=None, dtype="float32", requires_grad=False): 13 | """Generate random normal with specified mean and std deviation""" 14 | device = ndl.cpu() if device is None else device 15 | array = device.randn(*shape) * std + mean 16 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 17 | 18 | 19 | 20 | 21 | 22 | 23 | def constant(*shape, c=1.0, device=None, dtype="float32", requires_grad=False): 24 | """Generate constant Tensor""" 25 | device = ndl.cpu() if device is None else device 26 | array = device.full(shape, c, dtype=dtype) 27 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 28 | 29 | def ones(*shape, device=None, dtype="float32", requires_grad=False): 30 | """Generate all-ones Tensor""" 31 | return constant( 32 | *shape, c=1.0, device=device, dtype=dtype, requires_grad=requires_grad 33 | ) 34 | 35 | 36 | def zeros(*shape, device=None, dtype="float32", requires_grad=False): 37 | """Generate all-zeros Tensor""" 38 | return constant( 39 | *shape, c=0.0, device=device, dtype=dtype, requires_grad=requires_grad 40 | ) 41 | 42 | 43 | def randb(*shape, p=0.5, device=None, dtype="bool", requires_grad=False): 44 | """Generate binary random Tensor""" 45 | device = ndl.cpu() if device is None else device 46 | array = device.rand(*shape) <= p 47 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 48 | 49 | 50 | def one_hot(n, i, device=None, dtype="float32", requires_grad=False): 51 | """Generate one-hot encoding Tensor""" 52 | device = ndl.cpu() if device is None else device 53 | return ndl.Tensor( 54 | device.one_hot(n, i.numpy().astype("int32"), dtype=dtype), 55 | device=device, 56 | requires_grad=requires_grad, 57 | ) 58 | 59 | 60 | def zeros_like(array, *, device=None, requires_grad=False): 61 | device = device if device else array.device 62 | return zeros( 63 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad 64 | ) 65 | 66 | 67 | def ones_like(array, *, device=None, requires_grad=False): 68 | device = device if device else array.device 69 | return ones( 70 | *array.shape, dtype=array.dtype, device=device, 
requires_grad=requires_grad 71 | ) 72 | -------------------------------------------------------------------------------- /python/needle/init/init_initializers.py: -------------------------------------------------------------------------------- 1 | import math 2 | from .init_basic import * 3 | 4 | 5 | def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs): 6 | ### BEGIN YOUR SOLUTION 7 | raise NotImplementedError() 8 | ### END YOUR SOLUTION 9 | 10 | 11 | def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | 17 | 18 | def kaiming_uniform(fan_in, fan_out, shape=None, nonlinearity="relu", **kwargs): 19 | assert nonlinearity == "relu", "Only relu supported currently" 20 | ### BEGIN YOUR SOLUTION 21 | raise NotImplementedError() 22 | ### END YOUR SOLUTION 23 | 24 | def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs): 25 | assert nonlinearity == "relu", "Only relu supported currently" 26 | ### BEGIN YOUR SOLUTION 27 | raise NotImplementedError() 28 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .nn_basic import * 2 | from .nn_conv import * 3 | from .nn_sequence import * 4 | # from .nn_transformer import * 5 | -------------------------------------------------------------------------------- /python/needle/nn/nn_basic.py: -------------------------------------------------------------------------------- 1 | """The module. 2 | """ 3 | from typing import List, Callable, Any 4 | from needle.autograd import Tensor 5 | from needle import ops 6 | import needle.init as init 7 | import numpy as np 8 | 9 | 10 | class Parameter(Tensor): 11 | """A special kind of tensor that represents parameters.""" 12 | 13 | 14 | def _unpack_params(value: object) -> List[Tensor]: 15 | if isinstance(value, Parameter): 16 | return [value] 17 | elif isinstance(value, Module): 18 | return value.parameters() 19 | elif isinstance(value, dict): 20 | params = [] 21 | for k, v in value.items(): 22 | params += _unpack_params(v) 23 | return params 24 | elif isinstance(value, (list, tuple)): 25 | params = [] 26 | for v in value: 27 | params += _unpack_params(v) 28 | return params 29 | else: 30 | return [] 31 | 32 | 33 | def _child_modules(value: object) -> List["Module"]: 34 | if isinstance(value, Module): 35 | modules = [value] 36 | modules.extend(_child_modules(value.__dict__)) 37 | return modules 38 | if isinstance(value, dict): 39 | modules = [] 40 | for k, v in value.items(): 41 | modules += _child_modules(v) 42 | return modules 43 | elif isinstance(value, (list, tuple)): 44 | modules = [] 45 | for v in value: 46 | modules += _child_modules(v) 47 | return modules 48 | else: 49 | return [] 50 | 51 | 52 | class Module: 53 | def __init__(self): 54 | self.training = True 55 | 56 | def parameters(self) -> List[Tensor]: 57 | """Return the list of parameters in the module.""" 58 | return _unpack_params(self.__dict__) 59 | 60 | def _children(self) -> List["Module"]: 61 | return _child_modules(self.__dict__) 62 | 63 | def eval(self): 64 | self.training = False 65 | for m in self._children(): 66 | m.training = False 67 | 68 | def train(self): 69 | self.training = True 70 | for m in self._children(): 71 | m.training = True 72 | 73 | def __call__(self, *args, **kwargs): 74 | return self.forward(*args, **kwargs) 75 | 76 | 77 | class Identity(Module): 78 | 
def forward(self, x): 79 | return x 80 | 81 | 82 | class Linear(Module): 83 | def __init__( 84 | self, in_features, out_features, bias=True, device=None, dtype="float32" 85 | ): 86 | super().__init__() 87 | self.in_features = in_features 88 | self.out_features = out_features 89 | 90 | ### BEGIN YOUR SOLUTION 91 | raise NotImplementedError() 92 | ### END YOUR SOLUTION 93 | 94 | def forward(self, X: Tensor) -> Tensor: 95 | ### BEGIN YOUR SOLUTION 96 | raise NotImplementedError() 97 | ### END YOUR SOLUTION 98 | 99 | 100 | class Flatten(Module): 101 | def forward(self, X): 102 | ### BEGIN YOUR SOLUTION 103 | raise NotImplementedError() 104 | ### END YOUR SOLUTION 105 | 106 | 107 | class ReLU(Module): 108 | def forward(self, x: Tensor) -> Tensor: 109 | ### BEGIN YOUR SOLUTION 110 | raise NotImplementedError() 111 | ### END YOUR SOLUTION 112 | 113 | class Sequential(Module): 114 | def __init__(self, *modules): 115 | super().__init__() 116 | self.modules = modules 117 | 118 | def forward(self, x: Tensor) -> Tensor: 119 | ### BEGIN YOUR SOLUTION 120 | raise NotImplementedError() 121 | ### END YOUR SOLUTION 122 | 123 | 124 | class SoftmaxLoss(Module): 125 | def forward(self, logits: Tensor, y: Tensor): 126 | ### BEGIN YOUR SOLUTION 127 | raise NotImplementedError() 128 | ### END YOUR SOLUTION 129 | 130 | 131 | class BatchNorm1d(Module): 132 | def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"): 133 | super().__init__() 134 | self.dim = dim 135 | self.eps = eps 136 | self.momentum = momentum 137 | ### BEGIN YOUR SOLUTION 138 | raise NotImplementedError() 139 | ### END YOUR SOLUTION 140 | 141 | def forward(self, x: Tensor) -> Tensor: 142 | ### BEGIN YOUR SOLUTION 143 | raise NotImplementedError() 144 | ### END YOUR SOLUTION 145 | 146 | class BatchNorm2d(BatchNorm1d): 147 | def __init__(self, *args, **kwargs): 148 | super().__init__(*args, **kwargs) 149 | 150 | def forward(self, x: Tensor): 151 | # nchw -> nhcw -> nhwc 152 | s = x.shape 153 | _x = x.transpose((1, 2)).transpose((2, 3)).reshape((s[0] * s[2] * s[3], s[1])) 154 | y = super().forward(_x).reshape((s[0], s[2], s[3], s[1])) 155 | return y.transpose((2,3)).transpose((1,2)) 156 | 157 | 158 | class LayerNorm1d(Module): 159 | def __init__(self, dim, eps=1e-5, device=None, dtype="float32"): 160 | super().__init__() 161 | self.dim = dim 162 | self.eps = eps 163 | ### BEGIN YOUR SOLUTION 164 | raise NotImplementedError() 165 | ### END YOUR SOLUTION 166 | 167 | def forward(self, x: Tensor) -> Tensor: 168 | ### BEGIN YOUR SOLUTION 169 | raise NotImplementedError() 170 | ### END YOUR SOLUTION 171 | 172 | 173 | class Dropout(Module): 174 | def __init__(self, p=0.5): 175 | super().__init__() 176 | self.p = p 177 | 178 | def forward(self, x: Tensor) -> Tensor: 179 | ### BEGIN YOUR SOLUTION 180 | raise NotImplementedError() 181 | ### END YOUR SOLUTION 182 | 183 | 184 | class Residual(Module): 185 | def __init__(self, fn: Module): 186 | super().__init__() 187 | self.fn = fn 188 | 189 | def forward(self, x: Tensor) -> Tensor: 190 | ### BEGIN YOUR SOLUTION 191 | raise NotImplementedError() 192 | ### END YOUR SOLUTION 193 | -------------------------------------------------------------------------------- /python/needle/nn/nn_conv.py: -------------------------------------------------------------------------------- 1 | """The module. 
2 | """ 3 | from typing import List, Callable, Any 4 | from needle.autograd import Tensor 5 | from needle import ops 6 | import needle.init as init 7 | import numpy as np 8 | from .nn_basic import Parameter, Module 9 | 10 | 11 | class Conv(Module): 12 | """ 13 | Multi-channel 2D convolutional layer 14 | IMPORTANT: Accepts inputs in NCHW format, outputs also in NCHW format 15 | Only supports padding=same 16 | No grouped convolution or dilation 17 | Only supports square kernels 18 | """ 19 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, device=None, dtype="float32"): 20 | super().__init__() 21 | if isinstance(kernel_size, tuple): 22 | kernel_size = kernel_size[0] 23 | if isinstance(stride, tuple): 24 | stride = stride[0] 25 | self.in_channels = in_channels 26 | self.out_channels = out_channels 27 | self.kernel_size = kernel_size 28 | self.stride = stride 29 | 30 | ### BEGIN YOUR SOLUTION 31 | raise NotImplementedError() 32 | ### END YOUR SOLUTION 33 | 34 | def forward(self, x: Tensor) -> Tensor: 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/nn/nn_sequence.py: -------------------------------------------------------------------------------- 1 | """The module. 2 | """ 3 | from typing import List 4 | from needle.autograd import Tensor 5 | from needle import ops 6 | import needle.init as init 7 | import numpy as np 8 | from .nn_basic import Parameter, Module 9 | 10 | 11 | class Sigmoid(Module): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def forward(self, x: Tensor) -> Tensor: 16 | ### BEGIN YOUR SOLUTION 17 | raise NotImplementedError() 18 | ### END YOUR SOLUTION 19 | 20 | class RNNCell(Module): 21 | def __init__(self, input_size, hidden_size, bias=True, nonlinearity='tanh', device=None, dtype="float32"): 22 | """ 23 | Applies an RNN cell with tanh or ReLU nonlinearity. 24 | 25 | Parameters: 26 | input_size: The number of expected features in the input X 27 | hidden_size: The number of features in the hidden state h 28 | bias: If False, then the layer does not use bias weights 29 | nonlinearity: The non-linearity to use. Can be either 'tanh' or 'relu'. 30 | 31 | Variables: 32 | W_ih: The learnable input-hidden weights of shape (input_size, hidden_size). 33 | W_hh: The learnable hidden-hidden weights of shape (hidden_size, hidden_size). 34 | bias_ih: The learnable input-hidden bias of shape (hidden_size,). 35 | bias_hh: The learnable hidden-hidden bias of shape (hidden_size,). 36 | 37 | Weights and biases are initialized from U(-sqrt(k), sqrt(k)) where k = 1/hidden_size 38 | """ 39 | super().__init__() 40 | ### BEGIN YOUR SOLUTION 41 | raise NotImplementedError() 42 | ### END YOUR SOLUTION 43 | 44 | def forward(self, X, h=None): 45 | """ 46 | Inputs: 47 | X of shape (bs, input_size): Tensor containing input features 48 | h of shape (bs, hidden_size): Tensor containing the initial hidden state 49 | for each element in the batch. Defaults to zero if not provided. 50 | 51 | Outputs: 52 | h' of shape (bs, hidden_size): Tensor contianing the next hidden state 53 | for each element in the batch. 
54 | """ 55 | ### BEGIN YOUR SOLUTION 56 | raise NotImplementedError() 57 | ### END YOUR SOLUTION 58 | 59 | 60 | class RNN(Module): 61 | def __init__(self, input_size, hidden_size, num_layers=1, bias=True, nonlinearity='tanh', device=None, dtype="float32"): 62 | """ 63 | Applies a multi-layer RNN with tanh or ReLU non-linearity to an input sequence. 64 | 65 | Parameters: 66 | input_size - The number of expected features in the input x 67 | hidden_size - The number of features in the hidden state h 68 | num_layers - Number of recurrent layers. 69 | nonlinearity - The non-linearity to use. Can be either 'tanh' or 'relu'. 70 | bias - If False, then the layer does not use bias weights. 71 | 72 | Variables: 73 | rnn_cells[k].W_ih: The learnable input-hidden weights of the k-th layer, 74 | of shape (input_size, hidden_size) for k=0. Otherwise the shape is 75 | (hidden_size, hidden_size). 76 | rnn_cells[k].W_hh: The learnable hidden-hidden weights of the k-th layer, 77 | of shape (hidden_size, hidden_size). 78 | rnn_cells[k].bias_ih: The learnable input-hidden bias of the k-th layer, 79 | of shape (hidden_size,). 80 | rnn_cells[k].bias_hh: The learnable hidden-hidden bias of the k-th layer, 81 | of shape (hidden_size,). 82 | """ 83 | super().__init__() 84 | ### BEGIN YOUR SOLUTION 85 | raise NotImplementedError() 86 | ### END YOUR SOLUTION 87 | 88 | def forward(self, X, h0=None): 89 | """ 90 | Inputs: 91 | X of shape (seq_len, bs, input_size) containing the features of the input sequence. 92 | h_0 of shape (num_layers, bs, hidden_size) containing the initial 93 | hidden state for each element in the batch. Defaults to zeros if not provided. 94 | 95 | Outputs 96 | output of shape (seq_len, bs, hidden_size) containing the output features 97 | (h_t) from the last layer of the RNN, for each t. 98 | h_n of shape (num_layers, bs, hidden_size) containing the final hidden state for each element in the batch. 99 | """ 100 | ### BEGIN YOUR SOLUTION 101 | raise NotImplementedError() 102 | ### END YOUR SOLUTION 103 | 104 | 105 | class LSTMCell(Module): 106 | def __init__(self, input_size, hidden_size, bias=True, device=None, dtype="float32"): 107 | """ 108 | A long short-term memory (LSTM) cell. 109 | 110 | Parameters: 111 | input_size - The number of expected features in the input X 112 | hidden_size - The number of features in the hidden state h 113 | bias - If False, then the layer does not use bias weights 114 | 115 | Variables: 116 | W_ih - The learnable input-hidden weights, of shape (input_size, 4*hidden_size). 117 | W_hh - The learnable hidden-hidden weights, of shape (hidden_size, 4*hidden_size). 118 | bias_ih - The learnable input-hidden bias, of shape (4*hidden_size,). 119 | bias_hh - The learnable hidden-hidden bias, of shape (4*hidden_size,). 120 | 121 | Weights and biases are initialized from U(-sqrt(k), sqrt(k)) where k = 1/hidden_size 122 | """ 123 | super().__init__() 124 | ### BEGIN YOUR SOLUTION 125 | raise NotImplementedError() 126 | ### END YOUR SOLUTION 127 | 128 | 129 | def forward(self, X, h=None): 130 | """ 131 | Inputs: X, h 132 | X of shape (batch, input_size): Tensor containing input features 133 | h, tuple of (h0, c0), with 134 | h0 of shape (bs, hidden_size): Tensor containing the initial hidden state 135 | for each element in the batch. Defaults to zero if not provided. 136 | c0 of shape (bs, hidden_size): Tensor containing the initial cell state 137 | for each element in the batch. Defaults to zero if not provided. 
138 | 139 | Outputs: (h', c') 140 | h' of shape (bs, hidden_size): Tensor containing the next hidden state for each 141 | element in the batch. 142 | c' of shape (bs, hidden_size): Tensor containing the next cell state for each 143 | element in the batch. 144 | """ 145 | ### BEGIN YOUR SOLUTION 146 | raise NotImplementedError() 147 | ### END YOUR SOLUTION 148 | 149 | 150 | class LSTM(Module): 151 | def __init__(self, input_size, hidden_size, num_layers=1, bias=True, device=None, dtype="float32"): 152 | super().__init__() 153 | """ 154 | Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence. 155 | 156 | Parameters: 157 | input_size - The number of expected features in the input x 158 | hidden_size - The number of features in the hidden state h 159 | num_layers - Number of recurrent layers. 160 | bias - If False, then the layer does not use bias weights. 161 | 162 | Variables: 163 | lstm_cells[k].W_ih: The learnable input-hidden weights of the k-th layer, 164 | of shape (input_size, 4*hidden_size) for k=0. Otherwise the shape is 165 | (hidden_size, 4*hidden_size). 166 | lstm_cells[k].W_hh: The learnable hidden-hidden weights of the k-th layer, 167 | of shape (hidden_size, 4*hidden_size). 168 | lstm_cells[k].bias_ih: The learnable input-hidden bias of the k-th layer, 169 | of shape (4*hidden_size,). 170 | lstm_cells[k].bias_hh: The learnable hidden-hidden bias of the k-th layer, 171 | of shape (4*hidden_size,). 172 | """ 173 | ### BEGIN YOUR SOLUTION 174 | raise NotImplementedError() 175 | ### END YOUR SOLUTION 176 | 177 | def forward(self, X, h=None): 178 | """ 179 | Inputs: X, h 180 | X of shape (seq_len, bs, input_size) containing the features of the input sequence. 181 | h, tuple of (h0, c0) with 182 | h_0 of shape (num_layers, bs, hidden_size) containing the initial 183 | hidden state for each element in the batch. Defaults to zeros if not provided. 184 | c0 of shape (num_layers, bs, hidden_size) containing the initial 185 | hidden cell state for each element in the batch. Defaults to zeros if not provided. 186 | 187 | Outputs: (output, (h_n, c_n)) 188 | output of shape (seq_len, bs, hidden_size) containing the output features 189 | (h_t) from the last layer of the LSTM, for each t. 190 | tuple of (h_n, c_n) with 191 | h_n of shape (num_layers, bs, hidden_size) containing the final hidden state for each element in the batch. 192 | h_n of shape (num_layers, bs, hidden_size) containing the final hidden cell state for each element in the batch. 193 | """ 194 | ### BEGIN YOUR SOLUTION 195 | raise NotImplementedError() 196 | ### END YOUR SOLUTION 197 | 198 | class Embedding(Module): 199 | def __init__(self, num_embeddings, embedding_dim, device=None, dtype="float32"): 200 | super().__init__() 201 | """ 202 | Maps one-hot word vectors from a dictionary of fixed size to embeddings. 203 | 204 | Parameters: 205 | num_embeddings (int) - Size of the dictionary 206 | embedding_dim (int) - The size of each embedding vector 207 | 208 | Variables: 209 | weight - The learnable weights of shape (num_embeddings, embedding_dim) 210 | initialized from N(0, 1). 
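A sketch of how this initialization, and the forward pass described next, might look using the `init.randn` and `init.one_hot` helpers defined earlier. Flattening the indices to shape (seq_len*bs,) before the one-hot projection is an implementation choice of this sketch, not a requirement, and the free-standing names are illustrative.

```python
# A sketch of Embedding.__init__ / forward (not the reference solution).
def embedding_init_sketch(self, num_embeddings, embedding_dim, device=None, dtype="float32"):
    self.num_embeddings = num_embeddings
    self.embedding_dim = embedding_dim
    self.weight = Parameter(init.randn(num_embeddings, embedding_dim,
                                       device=device, dtype=dtype, requires_grad=True))

def embedding_forward_sketch(self, x):
    seq_len, bs = x.shape
    one_hot = init.one_hot(self.num_embeddings, x.reshape((seq_len * bs,)),
                           device=x.device, dtype=x.dtype)        # (seq_len*bs, num_embeddings)
    out = ops.matmul(one_hot, self.weight)                         # (seq_len*bs, embedding_dim)
    return out.reshape((seq_len, bs, self.embedding_dim))
```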
211 | """ 212 | ### BEGIN YOUR SOLUTION 213 | raise NotImplementedError() 214 | ### END YOUR SOLUTION 215 | 216 | def forward(self, x: Tensor) -> Tensor: 217 | """ 218 | Maps word indices to one-hot vectors, and projects to embedding vectors 219 | 220 | Input: 221 | x of shape (seq_len, bs) 222 | 223 | Output: 224 | output of shape (seq_len, bs, embedding_dim) 225 | """ 226 | ### BEGIN YOUR SOLUTION 227 | raise NotImplementedError() 228 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .ops_mathematic import * 2 | 3 | from .ops_logarithmic import * 4 | from .ops_tuple import * 5 | -------------------------------------------------------------------------------- /python/needle/ops/ops_logarithmic.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from ..autograd import NDArray 3 | from ..autograd import Op, Tensor, Value, TensorOp 4 | from ..autograd import TensorTuple, TensorTupleOp 5 | 6 | from .ops_mathematic import * 7 | 8 | from ..backend_selection import array_api, BACKEND 9 | 10 | class LogSoftmax(TensorOp): 11 | def compute(self, Z): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def gradient(self, out_grad, node): 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | 22 | def logsoftmax(a): 23 | return LogSoftmax()(a) 24 | 25 | 26 | class LogSumExp(TensorOp): 27 | def __init__(self, axes: Optional[tuple] = None): 28 | self.axes = axes 29 | 30 | def compute(self, Z): 31 | ### BEGIN YOUR SOLUTION 32 | raise NotImplementedError() 33 | ### END YOUR SOLUTION 34 | 35 | def gradient(self, out_grad, node): 36 | ### BEGIN YOUR SOLUTION 37 | raise NotImplementedError() 38 | ### END YOUR SOLUTION 39 | 40 | 41 | def logsumexp(a, axes=None): 42 | return LogSumExp(axes=axes)(a) 43 | 44 | -------------------------------------------------------------------------------- /python/needle/ops/ops_mathematic.py: -------------------------------------------------------------------------------- 1 | """Operator implementations.""" 2 | 3 | from numbers import Number 4 | from typing import Optional, List, Tuple, Union 5 | 6 | from ..autograd import NDArray 7 | from ..autograd import Op, Tensor, Value, TensorOp 8 | from ..autograd import TensorTuple, TensorTupleOp 9 | import numpy 10 | 11 | # NOTE: we will import numpy as the array_api 12 | # as the backend for our computations, this line will change in later homeworks 13 | 14 | from ..backend_selection import array_api, BACKEND 15 | from .ops_tuple import * 16 | 17 | 18 | class EWiseAdd(TensorOp): 19 | def compute(self, a: NDArray, b: NDArray): 20 | return a + b 21 | 22 | def gradient(self, out_grad: Tensor, node: Tensor): 23 | return out_grad, out_grad 24 | 25 | 26 | def add(a, b): 27 | return EWiseAdd()(a, b) 28 | 29 | 30 | class AddScalar(TensorOp): 31 | def __init__(self, scalar): 32 | self.scalar = scalar 33 | 34 | def compute(self, a: NDArray): 35 | return a + self.scalar 36 | 37 | def gradient(self, out_grad: Tensor, node: Tensor): 38 | return out_grad 39 | 40 | 41 | def add_scalar(a, scalar): 42 | return AddScalar(scalar)(a) 43 | 44 | 45 | class EWiseMul(TensorOp): 46 | def compute(self, a: NDArray, b: NDArray): 47 | return a * b 48 | 49 | def gradient(self, out_grad: Tensor, node: Tensor): 50 | lhs, rhs = 
node.inputs 51 | return out_grad * rhs, out_grad * lhs 52 | 53 | 54 | def multiply(a, b): 55 | return EWiseMul()(a, b) 56 | 57 | 58 | class MulScalar(TensorOp): 59 | def __init__(self, scalar): 60 | self.scalar = scalar 61 | 62 | def compute(self, a: NDArray): 63 | return a * self.scalar 64 | 65 | def gradient(self, out_grad: Tensor, node: Tensor): 66 | return (out_grad * self.scalar,) 67 | 68 | 69 | def mul_scalar(a, scalar): 70 | return MulScalar(scalar)(a) 71 | 72 | 73 | class EWisePow(TensorOp): 74 | """Op to element-wise raise a tensor to a power.""" 75 | 76 | def compute(self, a: NDArray, b: NDArray) -> NDArray: 77 | ### BEGIN YOUR SOLUTION 78 | raise NotImplementedError() 79 | ### END YOUR SOLUTION 80 | 81 | def gradient(self, out_grad, node): 82 | ### BEGIN YOUR SOLUTION 83 | raise NotImplementedError() 84 | ### END YOUR SOLUTION 85 | 86 | 87 | def power(a, b): 88 | return EWisePow()(a, b) 89 | 90 | 91 | class PowerScalar(TensorOp): 92 | """Op raise a tensor to an (integer) power.""" 93 | 94 | def __init__(self, scalar: int): 95 | self.scalar = scalar 96 | 97 | def compute(self, a: NDArray) -> NDArray: 98 | ### BEGIN YOUR SOLUTION 99 | raise NotImplementedError() 100 | ### END YOUR SOLUTION 101 | 102 | def gradient(self, out_grad, node): 103 | ### BEGIN YOUR SOLUTION 104 | raise NotImplementedError() 105 | ### END YOUR SOLUTION 106 | 107 | 108 | def power_scalar(a, scalar): 109 | return PowerScalar(scalar)(a) 110 | 111 | 112 | class EWiseDiv(TensorOp): 113 | """Op to element-wise divide two nodes.""" 114 | 115 | def compute(self, a, b): 116 | ### BEGIN YOUR SOLUTION 117 | raise NotImplementedError() 118 | ### END YOUR SOLUTION 119 | 120 | def gradient(self, out_grad, node): 121 | ### BEGIN YOUR SOLUTION 122 | raise NotImplementedError() 123 | ### END YOUR SOLUTION 124 | 125 | 126 | def divide(a, b): 127 | return EWiseDiv()(a, b) 128 | 129 | 130 | class DivScalar(TensorOp): 131 | def __init__(self, scalar): 132 | self.scalar = scalar 133 | 134 | def compute(self, a): 135 | ### BEGIN YOUR SOLUTION 136 | raise NotImplementedError() 137 | ### END YOUR SOLUTION 138 | 139 | def gradient(self, out_grad, node): 140 | ### BEGIN YOUR SOLUTION 141 | raise NotImplementedError() 142 | ### END YOUR SOLUTION 143 | 144 | 145 | def divide_scalar(a, scalar): 146 | return DivScalar(scalar)(a) 147 | 148 | 149 | class Transpose(TensorOp): 150 | def __init__(self, axes: Optional[tuple] = None): 151 | self.axes = axes 152 | 153 | def compute(self, a): 154 | ### BEGIN YOUR SOLUTION 155 | raise NotImplementedError() 156 | ### END YOUR SOLUTION 157 | 158 | def gradient(self, out_grad, node): 159 | ### BEGIN YOUR SOLUTION 160 | raise NotImplementedError() 161 | ### END YOUR SOLUTION 162 | 163 | 164 | def transpose(a, axes=None): 165 | return Transpose(axes)(a) 166 | 167 | 168 | class Reshape(TensorOp): 169 | def __init__(self, shape): 170 | self.shape = shape 171 | 172 | def compute(self, a): 173 | ### BEGIN YOUR SOLUTION 174 | raise NotImplementedError() 175 | ### END YOUR SOLUTION 176 | 177 | def gradient(self, out_grad, node): 178 | ### BEGIN YOUR SOLUTION 179 | raise NotImplementedError() 180 | ### END YOUR SOLUTION 181 | 182 | 183 | def reshape(a, shape): 184 | return Reshape(shape)(a) 185 | 186 | 187 | class BroadcastTo(TensorOp): 188 | def __init__(self, shape): 189 | self.shape = shape 190 | 191 | def compute(self, a): 192 | ### BEGIN YOUR SOLUTION 193 | raise NotImplementedError() 194 | ### END YOUR SOLUTION 195 | 196 | def gradient(self, out_grad, node): 197 | ### BEGIN YOUR SOLUTION 198 | raise 
NotImplementedError() 199 | ### END YOUR SOLUTION 200 | 201 | 202 | def broadcast_to(a, shape): 203 | return BroadcastTo(shape)(a) 204 | 205 | 206 | class Summation(TensorOp): 207 | def __init__(self, axes: Optional[tuple] = None): 208 | self.axes = axes 209 | 210 | def compute(self, a): 211 | ### BEGIN YOUR SOLUTION 212 | raise NotImplementedError() 213 | ### END YOUR SOLUTION 214 | 215 | def gradient(self, out_grad, node): 216 | ### BEGIN YOUR SOLUTION 217 | raise NotImplementedError() 218 | ### END YOUR SOLUTION 219 | 220 | 221 | def summation(a, axes=None): 222 | return Summation(axes)(a) 223 | 224 | 225 | class MatMul(TensorOp): 226 | def compute(self, a, b): 227 | ### BEGIN YOUR SOLUTION 228 | raise NotImplementedError() 229 | ### END YOUR SOLUTION 230 | 231 | def gradient(self, out_grad, node): 232 | ### BEGIN YOUR SOLUTION 233 | raise NotImplementedError() 234 | ### END YOUR SOLUTION 235 | 236 | 237 | def matmul(a, b): 238 | return MatMul()(a, b) 239 | 240 | 241 | class Negate(TensorOp): 242 | def compute(self, a): 243 | ### BEGIN YOUR SOLUTION 244 | raise NotImplementedError() 245 | ### END YOUR SOLUTION 246 | 247 | def gradient(self, out_grad, node): 248 | ### BEGIN YOUR SOLUTION 249 | raise NotImplementedError() 250 | ### END YOUR SOLUTION 251 | 252 | 253 | def negate(a): 254 | return Negate()(a) 255 | 256 | 257 | class Log(TensorOp): 258 | def compute(self, a): 259 | ### BEGIN YOUR SOLUTION 260 | raise NotImplementedError() 261 | ### END YOUR SOLUTION 262 | 263 | def gradient(self, out_grad, node): 264 | ### BEGIN YOUR SOLUTION 265 | raise NotImplementedError() 266 | ### END YOUR SOLUTION 267 | 268 | 269 | def log(a): 270 | return Log()(a) 271 | 272 | 273 | class Exp(TensorOp): 274 | def compute(self, a): 275 | ### BEGIN YOUR SOLUTION 276 | raise NotImplementedError() 277 | ### END YOUR SOLUTION 278 | 279 | def gradient(self, out_grad, node): 280 | ### BEGIN YOUR SOLUTION 281 | raise NotImplementedError() 282 | ### END YOUR SOLUTION 283 | 284 | 285 | def exp(a): 286 | return Exp()(a) 287 | 288 | 289 | class ReLU(TensorOp): 290 | def compute(self, a): 291 | ### BEGIN YOUR SOLUTION 292 | raise NotImplementedError() 293 | ### END YOUR SOLUTION 294 | 295 | def gradient(self, out_grad, node): 296 | ### BEGIN YOUR SOLUTION 297 | raise NotImplementedError() 298 | ### END YOUR SOLUTION 299 | 300 | 301 | def relu(a): 302 | return ReLU()(a) 303 | 304 | 305 | class Tanh(TensorOp): 306 | def compute(self, a): 307 | ### BEGIN YOUR SOLUTION 308 | raise NotImplementedError() 309 | ### END YOUR SOLUTION 310 | 311 | def gradient(self, out_grad, node): 312 | ### BEGIN YOUR SOLUTION 313 | raise NotImplementedError() 314 | ### END YOUR SOLUTION 315 | 316 | 317 | def tanh(a): 318 | return Tanh()(a) 319 | 320 | 321 | class Stack(TensorOp): 322 | def __init__(self, axis: int): 323 | """ 324 | Concatenates a sequence of arrays along a new dimension. 325 | Parameters: 326 | axis - dimension to concatenate along 327 | All arrays need to be of the same size. 
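An aside before Stack's implementation details: the gradient rules for several of the ops above follow directly from their forward definitions. For matmul, dA = dOut @ B^T and dB = A^T @ dOut, with any extra broadcast batch dimensions summed back out; for broadcast_to, the gradient is summed over the broadcast axes and reshaped to the input shape; for summation, the gradient is reshaped to keep the reduced axes as size 1 and then broadcast back. A sketch of the matmul case, assuming needle's convention that `transpose()` with no axes swaps the last two dimensions:

```python
# A sketch of MatMul.gradient (not the reference solution).
def matmul_gradient_sketch(self, out_grad, node):
    a, b = node.inputs
    grad_a = matmul(out_grad, transpose(b))
    grad_b = matmul(transpose(a), out_grad)
    # if a or b carried fewer (batch) dims than the output, sum the extras away
    if len(grad_a.shape) > len(a.shape):
        grad_a = summation(grad_a, axes=tuple(range(len(grad_a.shape) - len(a.shape))))
    if len(grad_b.shape) > len(b.shape):
        grad_b = summation(grad_b, axes=tuple(range(len(grad_b.shape) - len(b.shape))))
    return grad_a, grad_b
```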
328 | """ 329 | self.axis = axis 330 | 331 | def compute(self, args: TensorTuple) -> Tensor: 332 | ### BEGIN YOUR SOLUTION 333 | raise NotImplementedError() 334 | ### END YOUR SOLUTION 335 | 336 | def gradient(self, out_grad, node): 337 | ### BEGIN YOUR SOLUTION 338 | raise NotImplementedError() 339 | ### END YOUR SOLUTION 340 | 341 | 342 | def stack(args, axis): 343 | return Stack(axis)(make_tuple(*args)) 344 | 345 | 346 | class Split(TensorTupleOp): 347 | def __init__(self, axis: int): 348 | """ 349 | Splits a tensor along an axis into a tuple of tensors. 350 | (The "inverse" of Stack) 351 | Parameters: 352 | axis - dimension to split 353 | """ 354 | self.axis = axis 355 | 356 | def compute(self, A): 357 | ### BEGIN YOUR SOLUTION 358 | raise NotImplementedError() 359 | ### END YOUR SOLUTION 360 | 361 | def gradient(self, out_grad, node): 362 | ### BEGIN YOUR SOLUTION 363 | raise NotImplementedError() 364 | ### END YOUR SOLUTION 365 | 366 | 367 | def split(a, axis): 368 | return Split(axis)(a) 369 | 370 | 371 | class Flip(TensorOp): 372 | def __init__(self, axes: Optional[tuple] = None): 373 | self.axes = axes 374 | 375 | def compute(self, a): 376 | ### BEGIN YOUR SOLUTION 377 | raise NotImplementedError() 378 | ### END YOUR SOLUTION 379 | 380 | def gradient(self, out_grad, node): 381 | ### BEGIN YOUR SOLUTION 382 | raise NotImplementedError() 383 | ### END YOUR SOLUTION 384 | 385 | 386 | def flip(a, axes): 387 | return Flip(axes)(a) 388 | 389 | 390 | class Dilate(TensorOp): 391 | def __init__(self, axes: tuple, dilation: int): 392 | self.axes = axes 393 | self.dilation = dilation 394 | 395 | def compute(self, a): 396 | ### BEGIN YOUR SOLUTION 397 | raise NotImplementedError() 398 | ### END YOUR SOLUTION 399 | 400 | def gradient(self, out_grad, node): 401 | ### BEGIN YOUR SOLUTION 402 | raise NotImplementedError() 403 | ### END YOUR SOLUTION 404 | 405 | 406 | def dilate(a, axes, dilation): 407 | return Dilate(axes, dilation)(a) 408 | 409 | 410 | class UnDilate(TensorOp): 411 | def __init__(self, axes: tuple, dilation: int): 412 | self.axes = axes 413 | self.dilation = dilation 414 | 415 | def compute(self, a): 416 | ### BEGIN YOUR SOLUTION 417 | raise NotImplementedError() 418 | ### END YOUR SOLUTION 419 | 420 | def gradient(self, out_grad, node): 421 | ### BEGIN YOUR SOLUTION 422 | raise NotImplementedError() 423 | ### END YOUR SOLUTION 424 | 425 | 426 | def undilate(a, axes, dilation): 427 | return UnDilate(axes, dilation)(a) 428 | 429 | 430 | class Conv(TensorOp): 431 | def __init__(self, stride: Optional[int] = 1, padding: Optional[int] = 0): 432 | self.stride = stride 433 | self.padding = padding 434 | 435 | def compute(self, A, B): 436 | ### BEGIN YOUR SOLUTION 437 | raise NotImplementedError() 438 | ### END YOUR SOLUTION 439 | 440 | def gradient(self, out_grad, node): 441 | ### BEGIN YOUR SOLUTION 442 | raise NotImplementedError() 443 | ### END YOUR SOLUTION 444 | 445 | 446 | def conv(a, b, stride=1, padding=1): 447 | return Conv(stride, padding)(a, b) 448 | 449 | 450 | -------------------------------------------------------------------------------- /python/needle/ops/ops_tuple.py: -------------------------------------------------------------------------------- 1 | from ..autograd import Op, Tensor, TensorTuple, Value, TensorOp, TensorTupleOp 2 | import needle.init as init 3 | 4 | class MakeTensorTuple(TensorTupleOp): 5 | def compute(self, *args) -> tuple: 6 | return tuple(args) 7 | 8 | def gradient(self, out_grad, node): 9 | assert isinstance(out_grad, TensorTuple) 10 | return 
tuple([out_grad[i] for i in range(len(out_grad))]) 11 | 12 | 13 | def make_tuple(*args): 14 | return MakeTensorTuple()(*args) 15 | 16 | 17 | class TupleGetItem(TensorOp): 18 | def __init__(self, index): 19 | self.index = index 20 | 21 | def __call__(self, a: TensorTuple, fold_const=True) -> Value: 22 | assert isinstance(a, TensorTuple) 23 | # constant folding 24 | if fold_const and isinstance(a.op, MakeTensorTuple): 25 | return a.inputs[self.index] 26 | return Tensor.make_from_op(self, [a]) 27 | 28 | def compute(self, a): 29 | return a[self.index] 30 | 31 | def gradient(self, out_grad, node): 32 | index = self.index 33 | in_grad = [] 34 | for i, value in enumerate(node.inputs[0]): 35 | if i != index: 36 | in_grad.append(init.zeros_like(value)) 37 | else: 38 | in_grad.append(out_grad) 39 | return MakeTensorTuple()(*in_grad) 40 | 41 | 42 | def tuple_get_item(value, index): 43 | return TupleGetItem(index)(value) 44 | 45 | 46 | class FusedAddScalars(TensorTupleOp): 47 | def __init__(self, c0: float, c1: float): 48 | self.c0 = c0 49 | self.c1 = c1 50 | 51 | def compute(self, a): 52 | return a + self.c0, a + self.c1 53 | 54 | def gradient(self, out_grad, node): 55 | return out_grad[0] + out_grad[1] 56 | 57 | 58 | def fused_add_scalars(x, c0, c1): 59 | return FusedAddScalars(c0, c1)(x) 60 | -------------------------------------------------------------------------------- /python/needle/optim.py: -------------------------------------------------------------------------------- 1 | """Optimization module""" 2 | import needle as ndl 3 | import numpy as np 4 | 5 | 6 | class Optimizer: 7 | def __init__(self, params): 8 | self.params = params 9 | 10 | def step(self): 11 | raise NotImplementedError() 12 | 13 | def reset_grad(self): 14 | for p in self.params: 15 | p.grad = None 16 | 17 | 18 | class SGD(Optimizer): 19 | def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0): 20 | super().__init__(params) 21 | self.lr = lr 22 | self.momentum = momentum 23 | self.u = {} 24 | self.weight_decay = weight_decay 25 | 26 | def step(self): 27 | ### BEGIN YOUR SOLUTION 28 | raise NotImplementedError() 29 | ### END YOUR SOLUTION 30 | 31 | def clip_grad_norm(self, max_norm=0.25): 32 | """ 33 | Clips gradient norm of parameters. 34 | """ 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION 38 | 39 | 40 | class Adam(Optimizer): 41 | def __init__( 42 | self, 43 | params, 44 | lr=0.01, 45 | beta1=0.9, 46 | beta2=0.999, 47 | eps=1e-8, 48 | weight_decay=0.0, 49 | ): 50 | super().__init__(params) 51 | self.lr = lr 52 | self.beta1 = beta1 53 | self.beta2 = beta2 54 | self.eps = eps 55 | self.weight_decay = weight_decay 56 | self.t = 0 57 | 58 | self.m = {} 59 | self.v = {} 60 | 61 | def step(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | -------------------------------------------------------------------------------- /src/ndarray_backend_cpu.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace needle { 10 | namespace cpu { 11 | 12 | #define ALIGNMENT 256 13 | #define TILE 8 14 | typedef float scalar_t; 15 | const size_t ELEM_SIZE = sizeof(scalar_t); 16 | 17 | 18 | /** 19 | * This is a utility structure for maintaining an array aligned to ALIGNMENT boundaries in 20 | * memory. This alignment should be at least TILE * ELEM_SIZE, though we make it even larger 21 | * here by default. 
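Stepping back to the optimizer stubs in optim.py above: the `step()` methods are meant to implement the usual SGD-with-momentum and Adam updates. The sketch below uses EMA-style momentum, folds weight decay into the gradient, and applies Adam bias correction; those conventions, and the assumption that `PowerScalar` accepts a fractional exponent for the square root, are choices of this sketch rather than facts from the stub.

```python
# Hedged sketches of SGD.step and Adam.step (not the reference solution).
def sgd_step_sketch(self):
    for i, p in enumerate(self.params):
        if p.grad is None:
            continue
        grad = p.grad.detach() + p.detach() * self.weight_decay
        u = grad * (1 - self.momentum) + self.u.get(i, 0) * self.momentum
        self.u[i] = u
        p.data = p.data - u * self.lr

def adam_step_sketch(self):
    self.t += 1
    for i, p in enumerate(self.params):
        if p.grad is None:
            continue
        grad = p.grad.detach() + p.detach() * self.weight_decay
        m = grad * (1 - self.beta1) + self.m.get(i, 0) * self.beta1
        v = grad * grad * (1 - self.beta2) + self.v.get(i, 0) * self.beta2
        self.m[i], self.v[i] = m, v
        m_hat = m / (1 - self.beta1 ** self.t)        # bias correction
        v_hat = v / (1 - self.beta2 ** self.t)
        p.data = p.data - m_hat * self.lr / (v_hat ** 0.5 + self.eps)
```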
22 | */ 23 | struct AlignedArray { 24 | AlignedArray(const size_t size) { 25 | int ret = posix_memalign((void**)&ptr, ALIGNMENT, size * ELEM_SIZE); 26 | if (ret != 0) throw std::bad_alloc(); 27 | this->size = size; 28 | } 29 | ~AlignedArray() { free(ptr); } 30 | size_t ptr_as_int() {return (size_t)ptr; } 31 | scalar_t* ptr; 32 | size_t size; 33 | }; 34 | 35 | 36 | 37 | void Fill(AlignedArray* out, scalar_t val) { 38 | /** 39 | * Fill the values of an aligned array with val 40 | */ 41 | for (int i = 0; i < out->size; i++) { 42 | out->ptr[i] = val; 43 | } 44 | } 45 | 46 | 47 | 48 | void Compact(const AlignedArray& a, AlignedArray* out, std::vector shape, 49 | std::vector strides, size_t offset) { 50 | /** 51 | * Compact an array in memory 52 | * 53 | * Args: 54 | * a: non-compact representation of the array, given as input 55 | * out: compact version of the array to be written 56 | * shape: shapes of each dimension for a and out 57 | * strides: strides of the *a* array (not out, which has compact strides) 58 | * offset: offset of the *a* array (not out, which has zero offset, being compact) 59 | * 60 | * Returns: 61 | * void (you need to modify out directly, rather than returning anything; this is true for all the 62 | * function will implement here, so we won't repeat this note.) 63 | */ 64 | /// BEGIN SOLUTION 65 | assert(false && "Not Implemented"); 66 | /// END SOLUTION 67 | } 68 | 69 | void EwiseSetitem(const AlignedArray& a, AlignedArray* out, std::vector shape, 70 | std::vector strides, size_t offset) { 71 | /** 72 | * Set items in a (non-compact) array 73 | * 74 | * Args: 75 | * a: _compact_ array whose items will be written to out 76 | * out: non-compact array whose items are to be written 77 | * shape: shapes of each dimension for a and out 78 | * strides: strides of the *out* array (not a, which has compact strides) 79 | * offset: offset of the *out* array (not a, which has zero offset, being compact) 80 | */ 81 | /// BEGIN SOLUTION 82 | assert(false && "Not Implemented"); 83 | /// END SOLUTION 84 | } 85 | 86 | void ScalarSetitem(const size_t size, scalar_t val, AlignedArray* out, std::vector shape, 87 | std::vector strides, size_t offset) { 88 | /** 89 | * Set items is a (non-compact) array 90 | * 91 | * Args: 92 | * size: number of elements to write in out array (note that this will note be the same as 93 | * out.size, because out is a non-compact subset array); it _will_ be the same as the 94 | * product of items in shape, but convenient to just pass it here. 95 | * val: scalar value to write to 96 | * out: non-compact array whose items are to be written 97 | * shape: shapes of each dimension of out 98 | * strides: strides of the out array 99 | * offset: offset of the out array 100 | */ 101 | 102 | /// BEGIN SOLUTION 103 | assert(false && "Not Implemented"); 104 | /// END SOLUTION 105 | } 106 | 107 | void EwiseAdd(const AlignedArray& a, const AlignedArray& b, AlignedArray* out) { 108 | /** 109 | * Set entries in out to be the sum of correspondings entires in a and b. 110 | */ 111 | for (size_t i = 0; i < a.size; i++) { 112 | out->ptr[i] = a.ptr[i] + b.ptr[i]; 113 | } 114 | } 115 | 116 | void ScalarAdd(const AlignedArray& a, scalar_t val, AlignedArray* out) { 117 | /** 118 | * Set entries in out to be the sum of corresponding entry in a plus the scalar val. 
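The three strided functions above (Compact, EwiseSetitem, ScalarSetitem) all hinge on the same calculation: enumerating the elements of a strided view in row-major order and mapping each one to a flat memory offset. A Python sketch of that logic, kept in NumPy terms for clarity; the C++ loops would mirror it, and ScalarSetitem simply writes `val` at every destination offset instead of copying from `a`.

```python
import numpy as np

def strided_offsets(shape, strides, offset):
    # yield the flat memory offset of each element of the strided view,
    # enumerated in row-major (compact) order
    for idx in np.ndindex(*shape):
        yield offset + sum(i * s for i, s in zip(idx, strides))

def compact_sketch(a, out, shape, strides, offset):
    # out is compact: its j-th element is the j-th element of the strided view of a
    for j, src in enumerate(strided_offsets(shape, strides, offset)):
        out[j] = a[src]

def ewise_setitem_sketch(a, out, shape, strides, offset):
    # the mirror image: read compactly from a, write strided into out
    for j, dst in enumerate(strided_offsets(shape, strides, offset)):
        out[dst] = a[j]
```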
119 | */ 120 | for (size_t i = 0; i < a.size; i++) { 121 | out->ptr[i] = a.ptr[i] + val; 122 | } 123 | } 124 | 125 | 126 | /** 127 | * In the code the follows, use the above template to create analogous element-wise 128 | * and and scalar operators for the following functions. See the numpy backend for 129 | * examples of how they should work. 130 | * - EwiseMul, ScalarMul 131 | * - EwiseDiv, ScalarDiv 132 | * - ScalarPower 133 | * - EwiseMaximum, ScalarMaximum 134 | * - EwiseEq, ScalarEq 135 | * - EwiseGe, ScalarGe 136 | * - EwiseLog 137 | * - EwiseExp 138 | * - EwiseTanh 139 | * 140 | * If you implement all these naively, there will be a lot of repeated code, so 141 | * you are welcome (but not required), to use macros or templates to define these 142 | * functions (however you want to do so, as long as the functions match the proper) 143 | * signatures above. 144 | */ 145 | 146 | 147 | void Matmul(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m, uint32_t n, 148 | uint32_t p) { 149 | /** 150 | * Multiply two (compact) matrices into an output (also compact) matrix. For this implementation 151 | * you can use the "naive" three-loop algorithm. 152 | * 153 | * Args: 154 | * a: compact 2D array of size m x n 155 | * b: compact 2D array of size n x p 156 | * out: compact 2D array of size m x p to write the output to 157 | * m: rows of a / out 158 | * n: columns of a / rows of b 159 | * p: columns of b / out 160 | */ 161 | 162 | /// BEGIN SOLUTION 163 | assert(false && "Not Implemented"); 164 | /// END SOLUTION 165 | } 166 | 167 | inline void AlignedDot(const float* __restrict__ a, 168 | const float* __restrict__ b, 169 | float* __restrict__ out) { 170 | 171 | /** 172 | * Multiply together two TILE x TILE matrices, and _add _the result to out (it is important to add 173 | * the result to the existing out, which you should not set to zero beforehand). We are including 174 | * the compiler flags here that enable the compile to properly use vector operators to implement 175 | * this function. Specifically, the __restrict__ keyword indicates to the compile that a, b, and 176 | * out don't have any overlapping memory (which is necessary in order for vector operations to be 177 | * equivalent to their non-vectorized counterparts (imagine what could happen otherwise if a, b, 178 | * and out had overlapping memory). Similarly the __builtin_assume_aligned keyword tells the 179 | * compiler that the input array will be aligned to the appropriate blocks in memory, which also 180 | * helps the compiler vectorize the code. 181 | * 182 | * Args: 183 | * a: compact 2D array of size TILE x TILE 184 | * b: compact 2D array of size TILE x TILE 185 | * out: compact 2D array of size TILE x TILE to write to 186 | */ 187 | 188 | a = (const float*)__builtin_assume_aligned(a, TILE * ELEM_SIZE); 189 | b = (const float*)__builtin_assume_aligned(b, TILE * ELEM_SIZE); 190 | out = (float*)__builtin_assume_aligned(out, TILE * ELEM_SIZE); 191 | 192 | /// BEGIN SOLUTION 193 | assert(false && "Not Implemented"); 194 | /// END SOLUTION 195 | } 196 | 197 | void MatmulTiled(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m, 198 | uint32_t n, uint32_t p) { 199 | /** 200 | * Matrix multiplication on tiled representations of array. In this setting, a, b, and out 201 | * are all *4D* compact arrays of the appropriate size, e.g. 
a is an array of size 202 | * a[m/TILE][n/TILE][TILE][TILE] 203 | * You should do the multiplication tile-by-tile to improve performance of the array (i.e., this 204 | * function should call `AlignedDot()` implemented above). 205 | * 206 | * Note that this function will only be called when m, n, p are all multiples of TILE, so you can 207 | * assume that this division happens without any remainder. 208 | * 209 | * Args: 210 | * a: compact 4D array of size m/TILE x n/TILE x TILE x TILE 211 | * b: compact 4D array of size n/TILE x p/TILE x TILE x TILE 212 | * out: compact 4D array of size m/TILE x p/TILE x TILE x TILE to write to 213 | * m: rows of a / out 214 | * n: columns of a / rows of b 215 | * p: columns of b / out 216 | * 217 | */ 218 | /// BEGIN SOLUTION 219 | assert(false && "Not Implemented"); 220 | /// END SOLUTION 221 | } 222 | 223 | void ReduceMax(const AlignedArray& a, AlignedArray* out, size_t reduce_size) { 224 | /** 225 | * Reduce by taking maximum over `reduce_size` contiguous blocks. 226 | * 227 | * Args: 228 | * a: compact array of size a.size = out.size * reduce_size to reduce over 229 | * out: compact array to write into 230 | * reduce_size: size of the dimension to reduce over 231 | */ 232 | 233 | /// BEGIN SOLUTION 234 | assert(false && "Not Implemented"); 235 | /// END SOLUTION 236 | } 237 | 238 | void ReduceSum(const AlignedArray& a, AlignedArray* out, size_t reduce_size) { 239 | /** 240 | * Reduce by taking sum over `reduce_size` contiguous blocks. 241 | * 242 | * Args: 243 | * a: compact array of size a.size = out.size * reduce_size to reduce over 244 | * out: compact array to write into 245 | * reduce_size: size of the dimension to reduce over 246 | */ 247 | 248 | /// BEGIN SOLUTION 249 | assert(false && "Not Implemented"); 250 | /// END SOLUTION 251 | } 252 | 253 | } // namespace cpu 254 | } // namespace needle 255 | 256 | PYBIND11_MODULE(ndarray_backend_cpu, m) { 257 | namespace py = pybind11; 258 | using namespace needle; 259 | using namespace cpu; 260 | 261 | m.attr("__device_name__") = "cpu"; 262 | m.attr("__tile_size__") = TILE; 263 | 264 | py::class_(m, "Array") 265 | .def(py::init(), py::return_value_policy::take_ownership) 266 | .def("ptr", &AlignedArray::ptr_as_int) 267 | .def_readonly("size", &AlignedArray::size); 268 | 269 | // return numpy array (with copying for simplicity, otherwise garbage 270 | // collection is a pain) 271 | m.def("to_numpy", [](const AlignedArray& a, std::vector shape, 272 | std::vector strides, size_t offset) { 273 | std::vector numpy_strides = strides; 274 | std::transform(numpy_strides.begin(), numpy_strides.end(), numpy_strides.begin(), 275 | [](size_t& c) { return c * ELEM_SIZE; }); 276 | return py::array_t(shape, numpy_strides, a.ptr + offset); 277 | }); 278 | 279 | // convert from numpy (with copying) 280 | m.def("from_numpy", [](py::array_t a, AlignedArray* out) { 281 | std::memcpy(out->ptr, a.request().ptr, out->size * ELEM_SIZE); 282 | }); 283 | 284 | m.def("fill", Fill); 285 | m.def("compact", Compact); 286 | m.def("ewise_setitem", EwiseSetitem); 287 | m.def("scalar_setitem", ScalarSetitem); 288 | m.def("ewise_add", EwiseAdd); 289 | m.def("scalar_add", ScalarAdd); 290 | 291 | m.def("ewise_mul", EwiseMul); 292 | m.def("scalar_mul", ScalarMul); 293 | m.def("ewise_div", EwiseDiv); 294 | m.def("scalar_div", ScalarDiv); 295 | m.def("scalar_power", ScalarPower); 296 | 297 | m.def("ewise_maximum", EwiseMaximum); 298 | m.def("scalar_maximum", ScalarMaximum); 299 | m.def("ewise_eq", EwiseEq); 300 | m.def("scalar_eq", 
ScalarEq); 301 | m.def("ewise_ge", EwiseGe); 302 | m.def("scalar_ge", ScalarGe); 303 | 304 | m.def("ewise_log", EwiseLog); 305 | m.def("ewise_exp", EwiseExp); 306 | m.def("ewise_tanh", EwiseTanh); 307 | 308 | m.def("matmul", Matmul); 309 | m.def("matmul_tiled", MatmulTiled); 310 | 311 | m.def("reduce_max", ReduceMax); 312 | m.def("reduce_sum", ReduceSum); 313 | } 314 | -------------------------------------------------------------------------------- /src/ndarray_backend_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | namespace needle { 10 | namespace cuda { 11 | 12 | #define BASE_THREAD_NUM 256 13 | 14 | #define TILE 4 15 | typedef float scalar_t; 16 | const size_t ELEM_SIZE = sizeof(scalar_t); 17 | 18 | struct CudaArray { 19 | CudaArray(const size_t size) { 20 | cudaError_t err = cudaMalloc(&ptr, size * ELEM_SIZE); 21 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); 22 | this->size = size; 23 | } 24 | ~CudaArray() { cudaFree(ptr); } 25 | size_t ptr_as_int() { return (size_t)ptr; } 26 | 27 | scalar_t* ptr; 28 | size_t size; 29 | }; 30 | 31 | struct CudaDims { 32 | dim3 block, grid; 33 | }; 34 | 35 | CudaDims CudaOneDim(size_t size) { 36 | /** 37 | * Utility function to get cuda dimensions for 1D call 38 | */ 39 | CudaDims dim; 40 | size_t num_blocks = (size + BASE_THREAD_NUM - 1) / BASE_THREAD_NUM; 41 | dim.block = dim3(BASE_THREAD_NUM, 1, 1); 42 | dim.grid = dim3(num_blocks, 1, 1); 43 | return dim; 44 | } 45 | 46 | #define MAX_VEC_SIZE 8 47 | struct CudaVec { 48 | uint32_t size; 49 | int32_t data[MAX_VEC_SIZE]; 50 | }; 51 | 52 | CudaVec VecToCuda(const std::vector& x) { 53 | CudaVec shape; 54 | if (x.size() > MAX_VEC_SIZE) throw std::runtime_error("Exceeded CUDA supported max dimesions"); 55 | shape.size = x.size(); 56 | for (size_t i = 0; i < x.size(); i++) { 57 | shape.data[i] = x[i]; 58 | } 59 | return shape; 60 | } 61 | 62 | //////////////////////////////////////////////////////////////////////////////// 63 | // Fill call 64 | //////////////////////////////////////////////////////////////////////////////// 65 | 66 | __global__ void FillKernel(scalar_t* out, scalar_t val, size_t size) { 67 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 68 | if (gid < size) out[gid] = val; 69 | } 70 | 71 | void Fill(CudaArray* out, scalar_t val) { 72 | CudaDims dim = CudaOneDim(out->size); 73 | FillKernel<<>>(out->ptr, val, out->size); 74 | } 75 | 76 | //////////////////////////////////////////////////////////////////////////////// 77 | // Compact and setitem cals 78 | //////////////////////////////////////////////////////////////////////////////// 79 | 80 | // Untility function to convert contiguous index i to memory location from strides 81 | 82 | 83 | 84 | __global__ void CompactKernel(const scalar_t* a, scalar_t* out, size_t size, CudaVec shape, 85 | CudaVec strides, size_t offset) { 86 | /** 87 | * The CUDA kernel for the compact opeation. This should effectively map a single entry in the 88 | * non-compact input a, to the corresponding item (at location gid) in the compact array out. 
89 | * 90 | * Args: 91 | * a: CUDA pointer to the a array 92 | * out: CUDA pointer to the out array 93 | * size: size of the out array 94 | * shape: vector of shapes of a and out arrays (of type CudaVec, for passing to the CUDA kernel) 95 | * strides: vector of strides of the *a* array 96 | * offset: offset of the *a* array 97 | */ 98 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 99 | 100 | /// BEGIN SOLUTION 101 | assert(false && "Not Implemented"); 102 | /// END SOLUTION 103 | } 104 | 105 | void Compact(const CudaArray& a, CudaArray* out, std::vector<int32_t> shape, 106 | std::vector<int32_t> strides, size_t offset) { 107 | /** 108 | * Compact an array in memory. Unlike the C++ version, in CUDA this will primarily call the 109 | * relevant CUDA kernel. In this case, we illustrate how you should set this up (i.e., we give 110 | * you the code for this function, and also the prototype for the CompactKernel() function). For 111 | * the functions after this, however, you'll need to define these kernels as you see fit to 112 | * execute the underlying function. 113 | * 114 | * Args: 115 | * a: non-compact representation of the array, given as input 116 | * out: compact version of the array to be written 117 | * shape: shapes of each dimension for a and out 118 | * strides: strides of the *a* array (not out, which has compact strides) 119 | * offset: offset of the *a* array (not out, which has zero offset, being compact) 120 | */ 121 | 122 | // Nothing needs to be added here 123 | CudaDims dim = CudaOneDim(out->size); 124 | CompactKernel<<<dim.grid, dim.block>>>(a.ptr, out->ptr, out->size, VecToCuda(shape), 125 | VecToCuda(strides), offset); 126 | } 127 | 128 | 129 | 130 | void EwiseSetitem(const CudaArray& a, CudaArray* out, std::vector<int32_t> shape, 131 | std::vector<int32_t> strides, size_t offset) { 132 | /** 133 | * Set items in a (non-compact) array using CUDA. You will most likely want to implement an 134 | * EwiseSetitemKernel() function, similar to those above, that will do the actual work. 135 | * 136 | * Args: 137 | * a: _compact_ array whose items will be written to out 138 | * out: non-compact array whose items are to be written 139 | * shape: shapes of each dimension for a and out 140 | * strides: strides of the *out* array (not a, which has compact strides) 141 | * offset: offset of the *out* array (not a, which has zero offset, being compact) 142 | */ 143 | /// BEGIN SOLUTION 144 | assert(false && "Not Implemented"); 145 | /// END SOLUTION 146 | } 147 | 148 | 149 | 150 | void ScalarSetitem(size_t size, scalar_t val, CudaArray* out, std::vector<int32_t> shape, 151 | std::vector<int32_t> strides, size_t offset) { 152 | /** 153 | * Set items in a (non-compact) array 154 | * 155 | * Args: 156 | * size: number of elements to write in the out array (note that this will not be the same as 157 | * out.size, because out is a non-compact subset array); it _will_ be the same as the 158 | * product of items in shape, but it is convenient to just pass it here.
159 | * val: scalar value to write 160 | * out: non-compact array whose items are to be written 161 | * shape: shapes of each dimension of out 162 | * strides: strides of the out array 163 | * offset: offset of the out array 164 | */ 165 | /// BEGIN SOLUTION 166 | assert(false && "Not Implemented"); 167 | /// END SOLUTION 168 | } 169 | 170 | //////////////////////////////////////////////////////////////////////////////// 171 | // Elementwise and scalar operations 172 | //////////////////////////////////////////////////////////////////////////////// 173 | 174 | 175 | __global__ void EwiseAddKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) { 176 | // Calculate the global index of the thread. 177 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 178 | if (gid < size) out[gid] = a[gid] + b[gid]; 179 | } 180 | 181 | void EwiseAdd(const CudaArray& a, const CudaArray& b, CudaArray* out) { 182 | /** 183 | * Add together two CUDA arrays. 184 | * Args: 185 | * a: Input array 'a' to be added 186 | * b: Input array 'b' to be added 187 | * out: Output array to store the result of 'a + b' 188 | */ 189 | CudaDims dim = CudaOneDim(out->size); 190 | 191 | // Kernel will execute on 'dim.grid' blocks, each containing 'dim.block' threads. 192 | EwiseAddKernel<<<dim.grid, dim.block>>>(a.ptr, b.ptr, out->ptr, out->size); 193 | } 194 | 195 | __global__ void ScalarAddKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) { 196 | // Calculate the global index of the thread. 197 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 198 | if (gid < size) out[gid] = a[gid] + val; 199 | } 200 | 201 | void ScalarAdd(const CudaArray& a, scalar_t val, CudaArray* out) { 202 | /** 203 | * Add a scalar value to every element of a CUDA array. 204 | * Args: 205 | * a: Input array 'a' 206 | * val: Scalar value to be added 207 | * out: Output array to store the result of 'a + val' 208 | */ 209 | CudaDims dim = CudaOneDim(out->size); 210 | 211 | // Launch the ScalarAddKernel that will add the scalar 'val' to each element of array 'a', 212 | // and store the result in array 'out'. 213 | ScalarAddKernel<<<dim.grid, dim.block>>>(a.ptr, val, out->ptr, out->size); 214 | } 215 | 216 | /** 217 | * In the code that follows, use the above template to create analogous elementwise 218 | * and scalar operators for the following functions. See the numpy backend for 219 | * examples of how they should work. 220 | * - EwiseMul, ScalarMul 221 | * - EwiseDiv, ScalarDiv 222 | * - ScalarPower 223 | * - EwiseMaximum, ScalarMaximum 224 | * - EwiseEq, ScalarEq 225 | * - EwiseGe, ScalarGe 226 | * - EwiseLog 227 | * - EwiseExp 228 | * - EwiseTanh 229 | * 230 | * If you implement all these naively, there will be a lot of repeated code, so 231 | * you are welcome (but not required) to use macros or templates to define these 232 | * functions (however you want to do so, as long as the functions match the proper 233 | * signatures above). 234 | */ 235 | 236 | 237 | //////////////////////////////////////////////////////////////////////////////// 238 | // Matmul 239 | //////////////////////////////////////////////////////////////////////////////// 240 | 241 | 242 | void Matmul(const CudaArray& a, const CudaArray& b, CudaArray* out, uint32_t M, uint32_t N, 243 | uint32_t P) { 244 | /** 245 | * Multiply two (compact) matrices into an output (also compact) matrix. You will want to look 246 | * at the lecture and notes on GPU-based linear algebra to see how to do this.
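 *
 * For orientation only, one assumed (and deliberately unoptimized) possibility is a kernel that
 * assigns one thread to each output entry:
 *
 *   __global__ void MatmulKernel(const scalar_t* a, const scalar_t* b, scalar_t* out,
 *                                uint32_t M, uint32_t N, uint32_t P) {
 *     size_t i = blockIdx.y * blockDim.y + threadIdx.y;   // row of out
 *     size_t j = blockIdx.x * blockDim.x + threadIdx.x;   // column of out
 *     if (i < M && j < P) {
 *       scalar_t sum = 0;
 *       for (size_t k = 0; k < N; k++) sum += a[i * N + k] * b[k * P + j];
 *       out[i * P + j] = sum;
 *     }
 *   }
 *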
Since ultimately 247 | * mugrade is just evaluating correctness, you _can_ implement a version that simply parallelizes 248 | * over (i,j) entries in the output array. However, to really get the full benefit of this 249 | * problem, we would encourage you to use cooperative fetching, shared memory and register tiling, 250 | * and other ideas covered in the class notes. Note that unlike the tiled matmul function in 251 | * the CPU backend, here you should implement a single function that works across all matrix sizes, 252 | * whether or not they are a multiple of a tile size. As with previous CUDA 253 | * implementations, this function here will largely just set up the kernel call, and you should 254 | * implement the logic in a separate MatmulKernel() call. 255 | * 256 | * 257 | * Args: 258 | * a: compact 2D array of size m x n 259 | * b: compact 2D array of size n x p 260 | * out: compact 2D array of size m x p to write the output to 261 | * M: rows of a / out 262 | * N: columns of a / rows of b 263 | * P: columns of b / out 264 | */ 265 | 266 | /// BEGIN SOLUTION 267 | assert(false && "Not Implemented"); 268 | /// END SOLUTION 269 | } 270 | 271 | //////////////////////////////////////////////////////////////////////////////// 272 | // Max and sum reductions 273 | //////////////////////////////////////////////////////////////////////////////// 274 | 275 | 276 | void ReduceMax(const CudaArray& a, CudaArray* out, size_t reduce_size) { 277 | /** 278 | * Reduce by taking maximum over `reduce_size` contiguous blocks. Even though it is inefficient, 279 | * for simplicity you can perform each reduction in a single CUDA thread. 280 | * 281 | * Args: 282 | * a: compact array of size a.size = out.size * reduce_size to reduce over 283 | * out: compact array to write into 284 | * reduce_size: size of the dimension to reduce over 285 | */ 286 | /// BEGIN SOLUTION 287 | assert(false && "Not Implemented"); 288 | /// END SOLUTION 289 | } 290 | 291 | 292 | 293 | void ReduceSum(const CudaArray& a, CudaArray* out, size_t reduce_size) { 294 | /** 295 | * Reduce by taking summation over `reduce_size` contiguous blocks. Again, for simplicity you 296 | * can perform each reduction in a single CUDA thread.
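 *
 * As an assumed illustration of that one-thread-per-output approach (the analogous ReduceMax
 * kernel would track a running maximum instead of a running sum):
 *
 *   __global__ void ReduceSumKernel(const scalar_t* a, scalar_t* out, size_t reduce_size,
 *                                   size_t out_size) {
 *     size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (gid < out_size) {
 *       scalar_t sum = 0;
 *       for (size_t k = 0; k < reduce_size; k++) sum += a[gid * reduce_size + k];
 *       out[gid] = sum;
 *     }
 *   }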
297 | * 298 | * Args: 299 | * a: compact array of size a.size = out.size * reduce_size to reduce over 300 | * out: compact array to write into 301 | * reduce_size: size of the dimension to reduce over 302 | */ 303 | /// BEGIN SOLUTION 304 | assert(false && "Not Implemented"); 305 | /// END SOLUTION 306 | } 307 | 308 | } // namespace cuda 309 | } // namespace needle 310 | 311 | PYBIND11_MODULE(ndarray_backend_cuda, m) { 312 | namespace py = pybind11; 313 | using namespace needle; 314 | using namespace cuda; 315 | 316 | m.attr("__device_name__") = "cuda"; 317 | m.attr("__tile_size__") = TILE; 318 | 319 | py::class_<CudaArray>(m, "Array") 320 | .def(py::init<size_t>(), py::return_value_policy::take_ownership) 321 | .def_readonly("size", &CudaArray::size) 322 | .def("ptr", &CudaArray::ptr_as_int); 323 | 324 | // return numpy array, copying from CPU 325 | m.def("to_numpy", [](const CudaArray& a, std::vector<size_t> shape, std::vector<size_t> strides, 326 | size_t offset) { 327 | std::vector<size_t> numpy_strides = strides; 328 | std::transform(numpy_strides.begin(), numpy_strides.end(), numpy_strides.begin(), 329 | [](size_t& c) { return c * ELEM_SIZE; }); 330 | 331 | // copy memory to host 332 | scalar_t* host_ptr = (scalar_t*)std::malloc(a.size * ELEM_SIZE); 333 | if (host_ptr == 0) throw std::bad_alloc(); 334 | cudaError_t err = cudaMemcpy(host_ptr, a.ptr, a.size * ELEM_SIZE, cudaMemcpyDeviceToHost); 335 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); 336 | 337 | // return numpy array 338 | py::capsule deallocate_buffer(host_ptr, [](void* p) { free(p); }); 339 | return py::array_t<scalar_t>(shape, numpy_strides, host_ptr + offset, deallocate_buffer); 340 | }); 341 | 342 | // copy numpy array to GPU 343 | m.def("from_numpy", [](py::array_t<scalar_t> a, CudaArray* out) { 344 | cudaError_t err = 345 | cudaMemcpy(out->ptr, a.request().ptr, out->size * ELEM_SIZE, cudaMemcpyHostToDevice); 346 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); 347 | }); 348 | 349 | m.def("fill", Fill); 350 | m.def("compact", Compact); 351 | m.def("ewise_setitem", EwiseSetitem); 352 | m.def("scalar_setitem", ScalarSetitem); 353 | m.def("ewise_add", EwiseAdd); 354 | m.def("scalar_add", ScalarAdd); 355 | 356 | m.def("ewise_mul", EwiseMul); 357 | m.def("scalar_mul", ScalarMul); 358 | m.def("ewise_div", EwiseDiv); 359 | m.def("scalar_div", ScalarDiv); 360 | m.def("scalar_power", ScalarPower); 361 | 362 | m.def("ewise_maximum", EwiseMaximum); 363 | m.def("scalar_maximum", ScalarMaximum); 364 | m.def("ewise_eq", EwiseEq); 365 | m.def("scalar_eq", ScalarEq); 366 | m.def("ewise_ge", EwiseGe); 367 | m.def("scalar_ge", ScalarGe); 368 | 369 | m.def("ewise_log", EwiseLog); 370 | m.def("ewise_exp", EwiseExp); 371 | m.def("ewise_tanh", EwiseTanh); 372 | 373 | m.def("matmul", Matmul); 374 | 375 | m.def("reduce_max", ReduceMax); 376 | m.def("reduce_sum", ReduceSum); 377 | } 378 | -------------------------------------------------------------------------------- /tests/hw4/test_cifar_ptb_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | import itertools 4 | import numpy as np 5 | import pytest 6 | import mugrade 7 | 8 | import needle as ndl 9 | from needle import backend_ndarray as nd 10 | 11 | 12 | np.random.seed(2) 13 | 14 | 15 | _DEVICES = [ndl.cpu(), pytest.param(ndl.cuda(), 16 | marks=pytest.mark.skipif(not ndl.cuda().enabled(), reason="No GPU"))] 17 | 18 | 19 | TRAIN = [True, False] 20 | @pytest.mark.parametrize("train", TRAIN) 21 | def
test_cifar10_dataset(train): 22 | dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=train) 23 | if train: 24 | assert len(dataset) == 50000 25 | else: 26 | assert len(dataset) == 10000 27 | example = dataset[np.random.randint(len(dataset))] 28 | assert(isinstance(example, tuple)) 29 | X, y = example 30 | assert isinstance(X, np.ndarray) 31 | assert X.shape == (3, 32, 32) 32 | 33 | 34 | BATCH_SIZES = [1, 15] 35 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 36 | @pytest.mark.parametrize("train", TRAIN) 37 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 38 | def test_cifar10_loader(batch_size, train, device): 39 | cifar10_train_dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=True) 40 | train_loader = ndl.data.DataLoader(cifar10_train_dataset, batch_size) 41 | for (X, y) in train_loader: 42 | break 43 | assert isinstance(X.cached_data, nd.NDArray) 44 | assert isinstance(X, ndl.Tensor) 45 | assert isinstance(y, ndl.Tensor) 46 | assert X.dtype == 'float32' 47 | 48 | 49 | BPTT = [3, 32] 50 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 51 | @pytest.mark.parametrize("bptt", BPTT) 52 | @pytest.mark.parametrize("train", TRAIN) 53 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 54 | def test_ptb_dataset(batch_size, bptt, train, device): 55 | # TODO update with more tests? 56 | corpus = ndl.data.Corpus("data/ptb") 57 | if train: 58 | data = ndl.data.batchify(corpus.train, batch_size, device=device, dtype="float32") 59 | else: 60 | data = ndl.data.batchify(corpus.test, batch_size, device=device, dtype="float32") 61 | X, y = ndl.data.get_batch(data, np.random.randint(len(data)), bptt, device=device) 62 | assert X.shape == (bptt, batch_size) 63 | assert y.shape == (bptt * batch_size,) 64 | assert isinstance(X, ndl.Tensor) 65 | assert X.dtype == 'float32' 66 | assert X.device == device 67 | assert isinstance(X.cached_data, nd.NDArray) 68 | ntokens = len(corpus.dictionary) 69 | assert ntokens == 10000 70 | 71 | 72 | ### MUGRADE ### 73 | 74 | TEST_BATCH_SIZES = [3, 5] 75 | TEST_BPTT = [6, 10] 76 | 77 | def mugrade_submit(x): 78 | if isinstance(x, np.ndarray): 79 | x = x.flatten()[:128] 80 | #print(x) 81 | mugrade.submit(x) 82 | else: 83 | #print(x) 84 | mugrade.submit(x) 85 | 86 | 87 | def submit_cifar10(): 88 | if not ndl.cuda().enabled(): 89 | print('You need a GPU to run some of these tests.') 90 | devices = [ndl.cpu(), ndl.cuda()] 91 | for train in TRAIN: 92 | dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=train) 93 | mugrade_submit(len(dataset)) 94 | for (device, batch_size) in itertools.product(devices, TEST_BATCH_SIZES): 95 | loader = ndl.data.DataLoader(dataset, batch_size) 96 | for (X, y) in loader: 97 | break 98 | mugrade_submit(X.numpy()[0, :, :, :]) 99 | mugrade_submit(y.numpy()[0]) 100 | 101 | 102 | def submit_ptb(): 103 | # devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 104 | devices = [ndl.cpu(), ndl.cuda()] 105 | 106 | corpus = ndl.data.Corpus("data/ptb") 107 | mugrade_submit(np.array(len(corpus.dictionary))) 108 | for train in TRAIN: 109 | for (device, batch_size, bptt) in itertools.product(devices, TEST_BATCH_SIZES, TEST_BPTT): 110 | if train: 111 | data = ndl.data.batchify(corpus.train, batch_size, device=device, dtype="float32") 112 | else: 113 | data = ndl.data.batchify(corpus.test, batch_size, device=device, dtype="float32") 114 | X, y = ndl.data.get_batch(data, np.random.randint(len(data)), bptt) 115 | mugrade_submit(np.array(len(data))) 116 | 
mugrade_submit(X.numpy()[0, :]) 117 | mugrade_submit(y.numpy()[0]) 118 | 119 | 120 | if __name__ == "__main__": 121 | submit_cifar10() 122 | submit_ptb() -------------------------------------------------------------------------------- /tests/hw4/test_conv.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | import numpy as np 4 | import pytest 5 | from needle import backend_ndarray as nd 6 | import needle as ndl 7 | import mugrade 8 | import itertools 9 | 10 | 11 | _DEVICES = [ndl.cpu(), pytest.param(ndl.cuda(), 12 | marks=pytest.mark.skipif(not ndl.cuda().enabled(), reason="No GPU"))] 13 | 14 | def backward_check(f, *args, **kwargs): 15 | eps = 1e-3 16 | out = f(*args, **kwargs) 17 | c = np.random.randn(*out.shape) 18 | is_stacked = False 19 | if isinstance(args[0], list): 20 | args = args[0] 21 | is_stacked = True 22 | numerical_grad = [np.zeros(a.shape) for a in args] 23 | num_args = len(args) 24 | for i in range(num_args): 25 | for j in range(args[i].realize_cached_data().size): 26 | args[i].realize_cached_data().flat[j] += eps 27 | if is_stacked: 28 | f1 = (f(args, **kwargs).numpy() * c).sum() 29 | else: 30 | f1 = (f(*args, **kwargs).numpy() * c).sum() 31 | args[i].realize_cached_data().flat[j] -= 2 * eps 32 | if is_stacked: 33 | f2 = (f(args, **kwargs).numpy() * c).sum() 34 | else: 35 | f2 = (f(*args, **kwargs).numpy() * c).sum() 36 | args[i].realize_cached_data().flat[j] += eps 37 | numerical_grad[i].flat[j] = (f1 - f2) / (2 * eps) 38 | backward_grad = out.op.gradient_as_tuple(ndl.Tensor(c, device=args[0].device), out) 39 | if isinstance(backward_grad[0], ndl.TensorTuple): # TODO keep this? 40 | backward_grad = backward_grad[0].tuple() 41 | error = sum( 42 | np.linalg.norm(backward_grad[i].numpy() - numerical_grad[i]) 43 | for i in range(len(args)) 44 | ) 45 | assert error < 1e-2 46 | return [g.numpy() for g in backward_grad] 47 | 48 | 49 | stack_back_params = [ 50 | ( (3, 4), 3, 0), 51 | ( (3, 4), 3, 1), 52 | ( (3, 4), 3, 2), 53 | ( (3, 4), 5, 2), 54 | ( (3, 4), 1, 2), 55 | ] 56 | @pytest.mark.parametrize("device", _DEVICES) 57 | @pytest.mark.parametrize("shape, n, axis", stack_back_params) 58 | def test_stack_backward(shape, n, axis, device): 59 | np.random.seed(0) 60 | get_tensor = lambda shape: ndl.Tensor(np.random.randn(*shape)*5, device=device) 61 | backward_check(ndl.stack, [get_tensor(shape) for _ in range(n)], axis=axis) 62 | 63 | 64 | stack_params = [ 65 | {"shape": (10,3), "n": 4, "axis": 0}, 66 | {"shape": (4, 5, 6), "n": 5, "axis": 0}, 67 | {"shape": (4, 5, 6), "n": 3, "axis": 1}, 68 | {"shape": (4, 5, 6), "n": 2, "axis": 2} 69 | ] 70 | @pytest.mark.parametrize("device", _DEVICES) 71 | @pytest.mark.parametrize("params", stack_params) 72 | def test_stack_forward(params, device): 73 | np.random.seed(0) 74 | shape, n, axis = params['shape'], params['n'], params['axis'] 75 | to_stack_ndl = [] 76 | to_stack_npy = [] 77 | for i in range(n): 78 | _A = np.random.randn(*shape) 79 | to_stack_ndl += [ndl.Tensor(_A, device=device)] 80 | to_stack_npy += [_A] 81 | 82 | lhs = np.stack(to_stack_npy, axis=axis) 83 | rhs = ndl.stack(to_stack_ndl, axis=axis) 84 | 85 | 86 | pad_params = [ 87 | {"shape": (10, 32, 32, 8), "padding": ( (0, 0), (2, 2), (2, 2), (0, 0) )}, 88 | {"shape": (10, 32, 32, 8), "padding": ( (0, 0), (0, 0), (0, 0), (0, 0) )}, 89 | ] 90 | @pytest.mark.parametrize("device", [nd.cpu()]) 91 | @pytest.mark.parametrize("params", pad_params) 92 | def test_pad_forward(params, device): 93 | 
np.random.seed(0) 94 | shape, padding = params['shape'], params['padding'] 95 | _A = np.random.randn(*shape) 96 | _B = np.pad(_A, padding) 97 | A = nd.NDArray(_A, device=device) 98 | B = A.pad(padding) 99 | 100 | assert np.linalg.norm(A.numpy() - _A) < 1e-4 101 | 102 | 103 | flip_forward_params = [ 104 | {"shape": (10, 5), "axes": (0,)}, 105 | {"shape": (10, 5), "axes": (1,)}, 106 | {"shape": (10, 5), "axes": (0,1)}, 107 | {"shape": (10, 32, 32, 8), "axes": (0,1)}, 108 | {"shape": (3, 3, 6, 8), "axes": (0,1)}, 109 | {"shape": (10, 32, 32, 8), "axes": (1,2)}, 110 | {"shape": (3, 3, 6, 8), "axes": (1,2)}, 111 | {"shape": (10, 32, 32, 8), "axes": (2,3)}, 112 | {"shape": (3, 3, 6, 8), "axes": (2,3)}, 113 | {"shape": (10, 32, 32, 8), "axes": (0,1,2,3)}, 114 | ] 115 | @pytest.mark.parametrize("device", _DEVICES) 116 | @pytest.mark.parametrize("params", flip_forward_params) 117 | def test_flip_forward(params, device): 118 | np.random.seed(0) 119 | shape, axes = params['shape'], params['axes'] 120 | _A = np.random.randn(*shape) 121 | _B = np.flip(_A, axes) 122 | A = ndl.Tensor(_A, device=device) 123 | B = ndl.flip(A, axes=axes) 124 | 125 | assert np.linalg.norm(A.numpy() - _A) < 1e-4 126 | 127 | 128 | flip_backward_params = [ 129 | {"shape": (10, 5), "axes": (0,)}, 130 | {"shape": (10, 5), "axes": (1,)}, 131 | {"shape": (10, 5), "axes": (0,1)}, 132 | {"shape": (2, 3, 3, 8), "axes": (0,1)}, 133 | {"shape": (3, 3, 6, 4), "axes": (0,1)}, 134 | {"shape": (2, 3, 3, 4), "axes": (1,2)}, 135 | {"shape": (3, 3, 6, 4), "axes": (1,2)}, 136 | {"shape": (2, 3, 3, 4), "axes": (2,3)}, 137 | {"shape": (3, 3, 6, 4), "axes": (2,3)}, 138 | {"shape": (2, 3, 3, 4), "axes": (0,1,2,3)}, 139 | ] 140 | @pytest.mark.parametrize("device", _DEVICES) 141 | @pytest.mark.parametrize("params", flip_backward_params) 142 | def test_flip_backward(params, device): 143 | np.random.seed(0) 144 | shape, axes = params['shape'], params['axes'] 145 | backward_check(ndl.flip, ndl.Tensor(np.random.randn(*shape), device=device), axes=axes) 146 | 147 | 148 | # @pytest.mark.parametrize("device", _DEVICES) 149 | # def test_init_calculate_fans(device): 150 | # _A = np.random.randn(3, 3, 16, 8) 151 | # A = ndl.Tensor(_A, device=device) 152 | # assert ndl.init._calculate_fans(A) == (144, 72) 153 | 154 | # _A = np.random.randn(3, 3, 16, 8) 155 | # A = ndl.Tensor(_A, device=device) 156 | # assert ndl.init._calculate_fans(A) == (144, 72) 157 | 158 | 159 | # _A = np.random.randn(16, 8) 160 | # A = ndl.Tensor(_A, device=device) 161 | # assert ndl.init._calculate_fans(A) == (16, 8) 162 | 163 | 164 | @pytest.mark.parametrize("device", _DEVICES) 165 | def test_init_kaiming_uniform(device): 166 | _A = np.random.randn(3, 3, 16, 8) 167 | A = ndl.Tensor(_A, device=device) 168 | np.random.seed(0) 169 | A = ndl.init.kaiming_uniform(16*9, 8*9, shape=A.shape) 170 | assert abs(A.sum().numpy() - -2.5719218) < 1e-4 171 | 172 | 173 | @pytest.mark.parametrize("device", _DEVICES) 174 | def test_resnet9(device): 175 | def num_params(model): 176 | return np.sum([np.prod(x.shape) for x in model.parameters()]) 177 | 178 | from apps.models import ResNet9 179 | np.random.seed(0) 180 | model = ResNet9(device=device) 181 | 182 | assert num_params(model) == 431946 183 | 184 | _A = np.random.randn(2, 3, 32, 32) 185 | A = ndl.Tensor(_A, device=device) 186 | y = model(A) 187 | 188 | assert np.linalg.norm(np.array([[-1.8912625 , 0.64833605, 1.9400386 , 1.1435282 , 1.89777 , 189 | 2.9039745 , -0.10433993, 0.35458302, -0.5684191 , 2.6178317 ], 190 | [-0.2905612 , -0.4147861 , 
0.90268034, 0.46530387, 1.3335679 , 191 | 1.8534894 , -0.1867125 , -2.4298222 , -0.5344223 , 4.362149 ]]) - y.numpy()) < 1e-2 192 | 193 | 194 | 195 | @pytest.mark.parametrize("device", _DEVICES) 196 | def test_dilate_forward(device): 197 | np.random.seed(0) 198 | device = ndl.cpu() 199 | 200 | _A = np.random.randint(1, 10, size=(2, 5)) 201 | A = ndl.Tensor(_A, device=device) 202 | assert np.linalg.norm(ndl.dilate(A, dilation=0, axes=(0,)).numpy() - np.array([[6., 1., 4., 4., 8.], 203 | [4., 6., 3., 5., 8.]])) < 1e-5 204 | 205 | _A = np.random.randint(1, 10, size=(2, 5)) 206 | A = ndl.Tensor(_A, device=device) 207 | assert np.linalg.norm(ndl.dilate(A, dilation=1, axes=(0,)).numpy() - np.array([[7., 9., 9., 2., 7.], 208 | [0., 0., 0., 0., 0.], 209 | [8., 8., 9., 2., 6.], 210 | [0., 0., 0., 0., 0.]])) < 1e-5 211 | 212 | _A = np.random.randint(1, 10, size=(2, 5)) 213 | A = ndl.Tensor(_A, device=device) 214 | assert np.linalg.norm(ndl.dilate(A, dilation=1, axes=(1,)).numpy() - np.array([[9., 0., 5., 0., 4., 0., 1., 0., 4., 0.], 215 | [6., 0., 1., 0., 3., 0., 4., 0., 9., 0.]])) < 1e-5 216 | 217 | _A = np.random.randint(1, 10, size=(2, 5)) 218 | A = ndl.Tensor(_A, device=device) 219 | assert np.linalg.norm(ndl.dilate(A, dilation=1, axes=(0,1)).numpy() - np.array([[2., 0., 4., 0., 4., 0., 4., 0., 8., 0.], 220 | [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], 221 | [1., 0., 2., 0., 1., 0., 5., 0., 8., 0.], 222 | [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])) < 1e-5 223 | 224 | _A = np.random.randint(1, 10, size=(2, 2)) 225 | A = ndl.Tensor(_A, device=device) 226 | assert np.linalg.norm(ndl.dilate(A, dilation=2, axes=(0,1)).numpy() - np.array([[4., 0., 0., 3., 0., 0.], 227 | [0., 0., 0., 0., 0., 0.], 228 | [0., 0., 0., 0., 0., 0.], 229 | [8., 0., 0., 3., 0., 0.], 230 | [0., 0., 0., 0., 0., 0.], 231 | [0., 0., 0., 0., 0., 0.]])) < 1e-5 232 | 233 | _A = np.random.randint(1, 10, size=(2, 2, 2, 2)) 234 | A = ndl.Tensor(_A, device=device) 235 | assert np.linalg.norm(ndl.dilate(A, dilation=1, axes=(1,2)).numpy() - np.array([[[[1., 1.], 236 | [0., 0.], 237 | [5., 6.], 238 | [0., 0.]], 239 | 240 | [[0., 0.], 241 | [0., 0.], 242 | [0., 0.], 243 | [0., 0.]], 244 | 245 | [[6., 7.], 246 | [0., 0.], 247 | [9., 5.], 248 | [0., 0.]], 249 | 250 | [[0., 0.], 251 | [0., 0.], 252 | [0., 0.], 253 | [0., 0.]]], 254 | 255 | 256 | [[[2., 5.], 257 | [0., 0.], 258 | [9., 2.], 259 | [0., 0.]], 260 | 261 | [[0., 0.], 262 | [0., 0.], 263 | [0., 0.], 264 | [0., 0.]], 265 | 266 | [[2., 8.], 267 | [0., 0.], 268 | [4., 7.], 269 | [0., 0.]], 270 | 271 | [[0., 0.], 272 | [0., 0.], 273 | [0., 0.], 274 | [0., 0.]]]])) < 1e-5 275 | 276 | 277 | dilate_backward_params = [ 278 | {"shape": (2, 5), "d": 1, "axes": (0,)}, 279 | {"shape": (2, 5), "d": 2, "axes": (1,)}, 280 | {"shape": (2, 5), "d": 1, "axes": (0,1)}, 281 | {"shape": (2, 5), "d": 0, "axes": (0,1)}, 282 | {"shape": (2, 3, 3, 4), "d": 2, "axes": (0,1)}, 283 | {"shape": (3, 3, 6, 4), "d": 3, "axes": (0,1)}, 284 | {"shape": (2, 3, 3, 4), "d": 0, "axes": (1,2)}, 285 | {"shape": (2, 3, 3, 4), "d": 1, "axes": (1,2)}, 286 | {"shape": (3, 3, 6, 4), "d": 1, "axes": (1,2)}, 287 | {"shape": (2, 3, 3, 4), "d": 1, "axes": (2,3)}, 288 | {"shape": (3, 3, 6, 4), "d": 1, "axes": (2,3)}, 289 | {"shape": (2, 3, 3, 4), "d": 1, "axes": (0,1,2,3)}, 290 | ] 291 | @pytest.mark.parametrize("device", _DEVICES) 292 | @pytest.mark.parametrize("params", dilate_backward_params) 293 | def test_dilate_backward(params, device): 294 | np.random.seed(0) 295 | shape, d, axes = params['shape'], params['d'], params['axes'] 
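# backward_check (defined near the top of this file) perturbs each input entry by +/- eps,
# forms a centered finite-difference estimate of the gradient, and compares it against the
# analytic gradient returned by the op's backward pass.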
296 | backward_check(ndl.dilate, ndl.Tensor(np.random.randn(*shape), device=device), dilation=d, axes=axes) 297 | 298 | 299 | def test_stack_vs_pytorch(): 300 | np.random.seed(0) 301 | import torch 302 | A = np.random.randn(5, 5) 303 | B = np.random.randn(5, 5) 304 | C = np.random.randn(5, 5) 305 | D = np.random.randn(15, 5) 306 | 307 | Andl = ndl.Tensor(A, requires_grad=True) 308 | Bndl = ndl.Tensor(B, requires_grad=True) 309 | Cndl = ndl.Tensor(C, requires_grad=True) 310 | Dndl = ndl.Tensor(D, requires_grad=True) 311 | 312 | Atch = torch.tensor(A, requires_grad=True) 313 | Btch = torch.tensor(B, requires_grad=True) 314 | Ctch = torch.tensor(C, requires_grad=True) 315 | Dtch = torch.tensor(D, requires_grad=True) 316 | 317 | Xndl = ndl.stack([Andl, Cndl @ Bndl, Cndl], axis=1) 318 | Xtch = torch.stack([Atch, Ctch @ Btch, Ctch], dim=1) 319 | 320 | assert Xndl.shape == Xtch.shape 321 | assert np.linalg.norm(Xndl.numpy() - Xtch.detach().numpy()) < 1e-3 322 | 323 | Yndl = (Dndl @ Xndl.reshape((5, 15)) @ Dndl).sum() 324 | Ytch = (Dtch @ Xtch.reshape(5, 15) @ Dtch).sum() 325 | 326 | assert np.linalg.norm(Yndl.numpy() - Ytch.detach().numpy()) < 1e-3 327 | 328 | Yndl.backward() 329 | Ytch.backward() 330 | 331 | assert np.linalg.norm(Andl.grad.cached_data.numpy() - Atch.grad.detach().numpy()) < 1e-3 332 | assert np.linalg.norm(Bndl.grad.cached_data.numpy() - Btch.grad.detach().numpy()) < 1e-3 333 | assert np.linalg.norm(Cndl.grad.cached_data.numpy() - Ctch.grad.detach().numpy()) < 1e-3 334 | 335 | 336 | 337 | conv_forward_params = [ 338 | (4, 8, 16, 3, 1), 339 | (32, 8, 16, 3, 2), 340 | (32, 8, 8, 3, 2), 341 | (32, 16, 8, 3, 1), 342 | (32, 16, 8, 3, 2) 343 | ] 344 | @pytest.mark.parametrize("s,cin,cout,k,stride", conv_forward_params) 345 | @pytest.mark.parametrize("device", _DEVICES) 346 | def test_nn_conv_forward(s, cin, cout, k, stride, device): 347 | np.random.seed(0) 348 | import torch 349 | f = ndl.nn.Conv(cin, cout, k, stride=stride, device=device) 350 | x = ndl.init.rand(10, cin, s, s, device=device) 351 | 352 | g = torch.nn.Conv2d(cin, cout, k, stride=stride, padding=k//2) 353 | g.weight.data = torch.tensor(f.weight.cached_data.numpy().transpose(3, 2, 0, 1)) 354 | g.bias.data = torch.tensor(f.bias.cached_data.numpy()) 355 | z = torch.tensor(x.cached_data.numpy()) 356 | 357 | assert np.linalg.norm(f(x).cached_data.numpy() - g(z).data.numpy()) < 1e-3 358 | 359 | 360 | conv_back_params = [ 361 | (4, 1, 1, 3, 1), 362 | (14, 8, 16, 3, 1), 363 | (14, 8, 16, 3, 2), 364 | (14, 8, 8, 3, 1), 365 | (14, 8, 8, 3, 2), 366 | (14, 16, 8, 3, 1), 367 | (14, 16, 8, 3, 2), 368 | ] 369 | @pytest.mark.parametrize("s,cin,cout,k,stride", conv_back_params) 370 | @pytest.mark.parametrize("device", _DEVICES) 371 | def test_nn_conv_backward(s, cin, cout, k, stride, device): 372 | np.random.seed(0) 373 | import torch 374 | f = ndl.nn.Conv(cin, cout, k, stride=stride, device=device) 375 | x = ndl.init.rand(1, cin, s, s, device=device, requires_grad=True) 376 | 377 | g = torch.nn.Conv2d(cin, cout, k, stride=stride, padding=k//2) 378 | g.weight.data = torch.tensor(f.weight.cached_data.numpy().transpose(3, 2, 0, 1)) 379 | g.bias.data = torch.tensor(f.bias.cached_data.numpy()) 380 | z = torch.tensor(x.cached_data.numpy(), requires_grad=True) 381 | z.requires_grad = True 382 | 383 | res1 = f(x) 384 | y1 = res1.sum() 385 | 386 | y2 = g(z).sum() 387 | 388 | y1.backward() 389 | y2.backward() 390 | 391 | assert np.linalg.norm(g.weight.grad.data.numpy() - f.weight.grad.cached_data.numpy().transpose(3, 2, 0, 1)) < 1e-3, "weight 
gradients match" 392 | assert np.linalg.norm(g.bias.grad.data.numpy() - f.bias.grad.cached_data.numpy()) < 1e-3, "bias gradients match" 393 | assert np.linalg.norm(z.grad.data.numpy() - x.grad.cached_data.numpy()) < 1e-3, "input gradients match" 394 | 395 | 396 | op_conv_shapes = [ 397 | ( (3, 14, 14, 8), (3, 3, 8, 16), 1, 0 ), 398 | ( (3, 14, 14, 8), (3, 3, 8, 16), 1, 1 ), 399 | ( (3, 16, 16, 8), (3, 3, 8, 16), 1, 2 ), 400 | ( (3, 16, 16, 8), (3, 3, 8, 14), 1, 0 ), 401 | ( (3, 16, 16, 2), (3, 3, 2, 14), 1, 0 ), 402 | 403 | ( (3, 14, 14, 8), (3, 3, 8, 16), 2, 0 ), 404 | ( (3, 14, 14, 8), (3, 3, 8, 16), 2, 1 ), 405 | ( (3, 16, 16, 8), (3, 3, 8, 16), 2, 2 ), 406 | ( (3, 16, 16, 8), (3, 3, 8, 14), 2, 0 ), 407 | ( (3, 16, 16, 2), (3, 3, 2, 14), 2, 0 ), 408 | 409 | ( (3, 16, 16, 24), (3, 3, 24, 14), 1, 0 ), 410 | ( (3, 14, 14, 8), (5, 5, 8, 16), 1, 0 ), 411 | ( (3, 17, 17, 8), (5, 5, 8, 16), 1, 0 ), 412 | ( (3, 17, 17, 1), (5, 5, 1, 16) , 1, 0), 413 | ( (3, 17, 17, 16), (5, 5, 16, 1), 1, 0 ), 414 | ( (3, 17, 17, 16), (1, 1, 16, 1), 1, 0 ), 415 | ( (1, 14, 14, 2), (3, 3, 2, 2), 1, 0 ), 416 | ] 417 | @pytest.mark.parametrize("Z_shape, W_shape, stride, padding", op_conv_shapes) 418 | @pytest.mark.parametrize("device", _DEVICES) 419 | @pytest.mark.parametrize("backward", [True, False], ids=["backward", "forward"]) 420 | def test_op_conv(Z_shape, W_shape, stride, padding, backward, device): 421 | np.random.seed(0) 422 | import torch 423 | _Z = np.random.randn(*Z_shape)*5 424 | _Z = _Z.astype(np.float32) 425 | _W = np.random.randn(*W_shape)*5 426 | _W = _W.astype(np.float32) 427 | Z = ndl.Tensor(_Z, device=device) 428 | W = ndl.Tensor(_W, device=device) 429 | y = ndl.conv(Z, W, padding=padding, stride=stride) 430 | y2 = y.sum() 431 | if backward: 432 | y2.backward() 433 | Ztch = torch.Tensor(_Z).float() 434 | Ztch.requires_grad=True 435 | Wtch = torch.Tensor(_W).float() 436 | Wtch.requires_grad=True 437 | out = torch.nn.functional.conv2d(Ztch.permute(0, 3, 1, 2), Wtch.permute(3, 2, 0, 1), padding=padding, stride=stride) 438 | out2 = out.sum() 439 | if backward: 440 | out2.backward() 441 | if backward: 442 | err1 = np.linalg.norm(Ztch.grad.numpy() - Z.grad.numpy()) 443 | err2 = np.linalg.norm(Wtch.grad.numpy() - W.grad.numpy()) 444 | err3 = np.linalg.norm(out2.detach().numpy() - y2.numpy()) 445 | if backward: 446 | assert err1 < 1e-2, "input grads match" 447 | assert err2 < 1e-2, "weight grads match" 448 | assert err3 < 1e-1, "outputs match %s, %s" % (y2, out2) 449 | 450 | 451 | @pytest.mark.parametrize("device", _DEVICES) 452 | def test_train_cifar10(device): 453 | np.random.seed(0) 454 | dataset = ndl.data.CIFAR10Dataset("./data/cifar-10-batches-py", train=True) 455 | dataloader = ndl.data.DataLoader(\ 456 | dataset=dataset, 457 | batch_size=128, 458 | shuffle=False 459 | # collate_fn=ndl.data.collate_ndarray, 460 | # drop_last=False, 461 | # device=device, 462 | # dtype="float32" 463 | ) 464 | from apps.models import ResNet9 465 | np.random.seed(0) 466 | model = ResNet9(device=device, dtype="float32") 467 | out = one_iter_of_cifar10_training(dataloader, model, opt=ndl.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001), device=device) 468 | assert np.linalg.norm(np.array(list(out), dtype=object) - np.array([0.09375, 3.5892258])) < 1e-2 469 | 470 | 471 | def one_iter_of_cifar10_training(dataloader, model, niter=1, loss_fn=ndl.nn.SoftmaxLoss(), opt=None, device=None): 472 | np.random.seed(4) 473 | model.train() 474 | correct, total_loss = 0, 0 475 | i = 1 476 | for batch in dataloader: 477 | 
opt.reset_grad() 478 | X, y = batch 479 | X,y = ndl.Tensor(X, device=device), ndl.Tensor(y, device=device) 480 | out = model(X) 481 | correct += np.sum(np.argmax(out.numpy(), axis=1) == y.numpy()) 482 | loss = loss_fn(out, y) 483 | total_loss += loss.data.numpy() * y.shape[0] 484 | loss.backward() 485 | opt.step() 486 | if i >= niter: 487 | break 488 | i += 1 489 | return correct/(y.shape[0]*niter), total_loss/(y.shape[0]*niter) 490 | 491 | 492 | ###################### | ###################### 493 | ###################### MUGRADE ###################### 494 | ###################### v ###################### 495 | 496 | def Prepare(A): 497 | return (A.numpy().flatten()[:64], A.shape) 498 | 499 | 500 | def Rand(*shape, device=ndl.cpu(), entropy=1): 501 | np.random.seed(np.prod(shape) * len(shape) * entropy) 502 | _A = np.random.randint(low=1, high=10, size=shape) 503 | return ndl.Tensor(_A, device=device) 504 | 505 | 506 | def RandC(*shape, entropy=1): 507 | if ndl.cuda().enabled(): 508 | return Rand(*shape, device=ndl.cuda(), entropy=2) 509 | else: 510 | raise NotImplementedError("You need a GPU to run these tests.") 511 | 512 | 513 | def MugradeSubmit(things): 514 | mugrade.submit(Prepare(things)) 515 | # print(Prepare(things)) 516 | 517 | 518 | def submit_conv_forward(): 519 | def DoConvOp(batches, cin, cout, n, k=3, stride=1, padding=0, device=ndl.cpu()): 520 | X = Rand(batches, n, n, cin, device=device) 521 | W = Rand(k, k, cin, cout, device=device) 522 | y = ndl.conv(X, W, stride=stride, padding=padding) 523 | return y 524 | 525 | def DoConvLayer(batches, cin, cout, n, k=3, stride=1, bias=True, device=ndl.cpu()): 526 | X = Rand(batches, cin, n, n, device=device) 527 | f = ndl.nn.Conv(cin, cout, k, stride=stride, bias=bias, device=device) 528 | return f(X) 529 | 530 | MugradeSubmit(DoConvOp(2, 1, 2, 4, k=1, stride=1, padding=0)) 531 | MugradeSubmit(DoConvOp(2, 1, 2, 4, k=1, stride=1, padding=2)) 532 | MugradeSubmit(DoConvOp(2, 3, 1, 6, k=1, stride=2, padding=2)) 533 | 534 | 535 | MugradeSubmit(DoConvOp(2, 1, 2, 4, k=3, stride=1, padding=0)) 536 | MugradeSubmit(DoConvOp(3, 1, 2, 4, k=3, stride=1, padding=2)) 537 | MugradeSubmit(DoConvOp(1, 1, 3, 6, k=5, stride=2, padding=2)) 538 | 539 | MugradeSubmit(DoConvLayer(3, 2, 4, 6, k=3, stride=1, bias=True)) 540 | MugradeSubmit(DoConvLayer(3, 4, 2, 6, k=3, stride=1, bias=True)) 541 | MugradeSubmit(DoConvLayer(1, 1, 1, 12, k=3, stride=2, bias=True)) 542 | MugradeSubmit(DoConvLayer(1, 1, 1, 12, k=1, stride=1, bias=False)) 543 | MugradeSubmit(DoConvLayer(1, 2, 1, 12, k=7, stride=1, bias=False)) 544 | MugradeSubmit(DoConvLayer(1, 1, 3, 12, k=7, stride=4, bias=False)) 545 | 546 | 547 | if ndl.cuda().enabled(): 548 | MugradeSubmit(DoConvLayer(3, 2, 4, 6, k=3, stride=1, bias=False, device=ndl.cuda())) 549 | MugradeSubmit(DoConvLayer(3, 4, 2, 6, k=3, stride=1, bias=False, device=ndl.cuda())) 550 | else: 551 | print('You need a GPU to run these tests!') 552 | 553 | 554 | def submit_conv_backward(): 555 | 556 | def DoConvOpBackward(batches, cin, cout, n, k=3, stride=1, padding=0, device=ndl.cpu(), wrtX=True): 557 | X = Rand(batches, n, n, cin, device=device) 558 | X.requires_grad = True 559 | W = Rand(k, k, cin, cout, device=device) 560 | W.requires_grad = True 561 | y = ndl.conv(X, W, stride=stride, padding=padding).sum() 562 | y.backward() 563 | if wrtX: 564 | return W.grad 565 | else: 566 | return X.grad 567 | 568 | def DoConvLayerBackward(batches, cin, cout, n, k=3, stride=1, bias=True, device=ndl.cpu(), wrtX=True): 569 | X = Rand(batches, cin, n, n, 
device=device) 570 | X.requires_grad = True 571 | f = ndl.nn.Conv(cin, cout, k, stride=stride, bias=bias, device=device) 572 | y = f(X).sum() 573 | y.backward() 574 | if wrtX: 575 | return f.weight.grad 576 | else: 577 | return X.grad 578 | 579 | MugradeSubmit(DoConvOpBackward(2, 1, 2, 4, k=1, stride=1, padding=0, wrtX=True)) 580 | MugradeSubmit(DoConvOpBackward(2, 3, 1, 6, k=1, stride=2, padding=0, wrtX=True)) 581 | MugradeSubmit(DoConvOpBackward(2, 1, 2, 10, k=3, stride=1, padding=1, wrtX=True)) 582 | MugradeSubmit(DoConvOpBackward(2, 3, 1, 8, k=3, stride=2, padding=2, wrtX=True)) 583 | MugradeSubmit(DoConvOpBackward(2, 1, 3, 8, k=5, stride=1, padding=2, wrtX=True)) 584 | 585 | MugradeSubmit(DoConvOpBackward(2, 1, 2, 4, k=1, stride=1, padding=0, wrtX=False)) 586 | MugradeSubmit(DoConvOpBackward(2, 3, 1, 6, k=1, stride=2, padding=0, wrtX=False)) 587 | MugradeSubmit(DoConvOpBackward(2, 1, 2, 6, k=3, stride=1, padding=1, wrtX=False)) 588 | MugradeSubmit(DoConvOpBackward(2, 3, 1, 6, k=3, stride=2, padding=2, wrtX=False)) 589 | MugradeSubmit(DoConvOpBackward(2, 1, 3, 8, k=5, stride=1, padding=2, wrtX=False)) 590 | 591 | MugradeSubmit(DoConvLayerBackward(3, 2, 4, 6, k=3, stride=1, bias=True, wrtX=True)) 592 | MugradeSubmit(DoConvLayerBackward(1, 2, 1, 12, k=7, stride=1, bias=False, wrtX=True)) 593 | MugradeSubmit(DoConvLayerBackward(1, 1, 3, 12, k=7, stride=4, bias=False, wrtX=True)) 594 | MugradeSubmit(DoConvLayerBackward(3, 2, 4, 6, k=3, stride=1, bias=True, wrtX=False)) 595 | MugradeSubmit(DoConvLayerBackward(1, 2, 1, 12, k=7, stride=1, bias=False, wrtX=False)) 596 | MugradeSubmit(DoConvLayerBackward(1, 1, 3, 12, k=7, stride=4, bias=False, wrtX=False)) 597 | 598 | if ndl.cuda().enabled(): 599 | MugradeSubmit(DoConvLayerBackward(3, 2, 4, 6, k=3, stride=1, bias=False, wrtX=True, device=ndl.cuda())) 600 | MugradeSubmit(DoConvLayerBackward(3, 4, 2, 6, k=3, stride=1, bias=False, wrtX=False, device=ndl.cuda())) 601 | else: 602 | print('You need a GPU to run these tests!') 603 | 604 | 605 | def submit_new_ops(): 606 | # pad 607 | np.random.seed(1337) 608 | _A = np.random.randint(low=1, high=10, size=(2, 2, 2, 2)) 609 | A = nd.NDArray(_A, device=nd.cpu()) 610 | MugradeSubmit(A.pad(( (0, 0), (1, 1), (2, 2), (0, 0)))) 611 | 612 | def DoFlip(shape, axes, backward=False, device=ndl.cpu()): 613 | X = Rand(*shape, device=device) 614 | X.requires_grad = True 615 | Y = ndl.flip(X, axes=axes) 616 | if backward: 617 | V = Rand(*shape, device=device, entropy=2) 618 | Z = (V*Y).sum() 619 | Z.backward() 620 | return X.grad 621 | else: 622 | return Y 623 | 624 | def DoDilate(shape, axes, dilation, backward=False, device=ndl.cpu()): 625 | X = Rand(*shape, device=device) 626 | X.requires_grad = True 627 | Y = ndl.dilate(X, dilation=dilation, axes=axes) 628 | if backward: 629 | V = Rand(*Y.shape, device=device, entropy=2) 630 | Z = (V*Y).sum() 631 | Z.backward() 632 | return X.grad 633 | else: 634 | return Y 635 | 636 | # flip 637 | MugradeSubmit(DoFlip((2, 2, 3, 1), (1,2))) 638 | MugradeSubmit(DoFlip((2, 1, 3, 2), (0,1,2,3))) 639 | MugradeSubmit(DoFlip((8, 4), (1,))) 640 | MugradeSubmit(DoFlip((4, 8), (0,))) 641 | MugradeSubmit(DoFlip((2, 2, 3, 1), (2,3), backward=True)) 642 | MugradeSubmit(DoFlip((2, 1, 3, 2), (1,2,3), backward=True)) 643 | 644 | # dilate 645 | MugradeSubmit(DoDilate((2, 2, 3, 1), (1,2), 1)) 646 | MugradeSubmit(DoDilate((2, 2), (2,), 1)) 647 | MugradeSubmit(DoDilate((2, 2, 3, 1), (1,2), 1, backward=True)) 648 | MugradeSubmit(DoDilate((2, 2), (2,), 1, backward=True)) 649 | 650 | 651 | 652 | def 
submit_resnet9(): 653 | def num_params(model): 654 | return np.sum([np.prod(x.shape) for x in model.parameters()]) 655 | 656 | device = ndl.cpu() 657 | import sys 658 | sys.path.append('.') 659 | from apps.models import ResNet9 660 | np.random.seed(1) 661 | model = ResNet9(device=device) 662 | 663 | MugradeSubmit(ndl.Tensor(num_params(model))) 664 | 665 | np.random.seed(1) 666 | dataset = ndl.data.CIFAR10Dataset("./data/cifar-10-batches-py", train=True) 667 | dataloader = ndl.data.DataLoader(\ 668 | dataset=dataset, 669 | batch_size=128, 670 | shuffle=True 671 | ) 672 | np.random.seed(1) 673 | model = ResNet9(device=device, dtype="float32") 674 | out = one_iter_of_cifar10_training(dataloader, model, niter=2, opt=ndl.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001), device=device) 675 | MugradeSubmit(ndl.Tensor(np.array(list(out), dtype=object))) 676 | 677 | 678 | if __name__ == "__main__": 679 | submit_conv_forward() 680 | submit_conv_backward() 681 | submit_new_ops() 682 | submit_resnet9() -------------------------------------------------------------------------------- /tests/hw4/test_nd_backend.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | import itertools 4 | import numpy as np 5 | import pytest 6 | import mugrade 7 | import torch 8 | 9 | import needle as ndl 10 | from needle import backend_ndarray as nd 11 | 12 | np.random.seed(1) 13 | 14 | def backward_check(f, *args, **kwargs): 15 | eps = 1e-5 16 | out = f(*args, **kwargs) 17 | c = np.random.randn(*out.shape) 18 | numerical_grad = [np.zeros(a.shape) for a in args] 19 | num_args = len(args) 20 | for i in range(num_args): 21 | for j in range(args[i].realize_cached_data().size): 22 | args[i].realize_cached_data().flat[j] += eps 23 | f1 = (f(*args, **kwargs).numpy() * c).sum() 24 | args[i].realize_cached_data().flat[j] -= 2 * eps 25 | f2 = (f(*args, **kwargs).numpy() * c).sum() 26 | args[i].realize_cached_data().flat[j] += eps 27 | numerical_grad[i].flat[j] = (f1 - f2) / (2 * eps) 28 | backward_grad = out.op.gradient_as_tuple(ndl.Tensor(c, device=args[0].device), out) 29 | error = sum( 30 | np.linalg.norm(backward_grad[i].numpy() - numerical_grad[i]) 31 | for i in range(len(args)) 32 | ) 33 | assert error < 4.2e-1 34 | return [g.numpy() for g in backward_grad] 35 | 36 | 37 | _DEVICES = [ndl.cpu(), pytest.param(ndl.cuda(), 38 | marks=pytest.mark.skipif(not ndl.cuda().enabled(), reason="No GPU"))] 39 | 40 | 41 | EWISE_OPS = { 42 | "divide": lambda a, b: a / b, 43 | "subtract": lambda a, b: a - b 44 | } 45 | EWISE_OP_FNS = [EWISE_OPS[k] for k in EWISE_OPS] 46 | EWISE_OP_NAMES = [k for k in EWISE_OPS] 47 | GENERAL_SHAPES = [(1, 1, 1), (4, 5, 6)] 48 | @pytest.mark.parametrize("fn", EWISE_OP_FNS, ids=EWISE_OP_NAMES) 49 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 50 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 51 | def test_ewise_fn(fn, shape, device): 52 | _A = np.random.randn(*shape).astype(np.float32) 53 | _B = np.random.randn(*shape).astype(np.float32) 54 | A = ndl.Tensor(nd.array(_A), device=device) 55 | B = ndl.Tensor(nd.array(_B), device=device) 56 | np.testing.assert_allclose(fn(_A, _B), fn(A, B).numpy(), atol=1e-5, rtol=1e-5) 57 | 58 | 59 | SCALAR_OPS = { 60 | "divide": lambda a, b: a / b, 61 | "subtract": lambda a, b: a - b 62 | } 63 | SCALAR_OP_FNS = [SCALAR_OPS[k] for k in SCALAR_OPS] 64 | SCALAR_OP_NAMES = [k for k in SCALAR_OPS] 65 | @pytest.mark.parametrize("fn", SCALAR_OP_FNS, ids=SCALAR_OP_NAMES) 
66 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 67 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 68 | def test_scalar_fn(fn, shape, device): 69 | _A = np.random.randn(*shape).astype(np.float32) 70 | _B = np.random.randn(1).astype(np.float32).item() 71 | A = ndl.Tensor(nd.array(_A), device=device) 72 | np.testing.assert_allclose(fn(_A, _B), fn(A, _B).numpy(), atol=1e-5, rtol=1e-5) 73 | 74 | 75 | MATMUL_DIMS = [(16, 16, 16), 76 | (8, 8, 8), 77 | (1, 2, 3), 78 | (3, 4, 5), 79 | (5, 4, 3), 80 | (16, 16, 32), 81 | (64, 64, 64), 82 | (72, 72, 72), 83 | (72, 73, 74), 84 | (74, 73, 72), 85 | (128, 128, 128)] 86 | @pytest.mark.parametrize("m,n,p", MATMUL_DIMS) 87 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 88 | def test_matmul(m, n, p, device): 89 | _A = np.random.randn(m, n).astype(np.float32) 90 | _B = np.random.randn(n, p).astype(np.float32) 91 | A = ndl.Tensor(nd.array(_A), device=device) 92 | B = ndl.Tensor(nd.array(_B), device=device) 93 | np.testing.assert_allclose(_A @ _B, (A @ B).numpy(), atol=1e-5, rtol=1e-5) 94 | 95 | 96 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 97 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 98 | def test_power(shape, device): 99 | _A = np.random.randn(*shape).astype(np.float32) 100 | _B = np.random.randint(1) 101 | A = ndl.Tensor(nd.array(_A), device=device) 102 | np.testing.assert_allclose(_A**_B, (A**_B).numpy(), atol=1e-5, rtol=1e-5) 103 | 104 | 105 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 106 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 107 | def test_log(shape, device): 108 | _A = np.random.randn(*shape).astype(np.float32) + 5. 109 | A = ndl.Tensor(nd.array(_A), device=device) 110 | np.testing.assert_allclose(np.log(_A), ndl.log(A).numpy(), atol=1e-5, rtol=1e-5) 111 | 112 | 113 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 114 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 115 | def test_exp(shape, device): 116 | _A = np.random.randn(*shape).astype(np.float32) 117 | A = ndl.Tensor(nd.array(_A), device=device) 118 | np.testing.assert_allclose(np.exp(_A), ndl.exp(A).numpy(), atol=1e-5, rtol=1e-5) 119 | 120 | 121 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 122 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 123 | def test_relu(shape, device): 124 | _A = np.random.randn(*shape).astype(np.float32) 125 | A = ndl.Tensor(nd.array(_A), device=device) 126 | np.testing.assert_allclose(np.maximum(_A, 0), ndl.relu(A).numpy(), atol=1e-5, rtol=1e-5) 127 | 128 | 129 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 130 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 131 | def test_tanh(shape, device): 132 | _A = np.random.randn(*shape).astype(np.float32) 133 | A = ndl.Tensor(nd.array(_A), device=device) 134 | np.testing.assert_allclose(np.tanh(_A), ndl.tanh(A).numpy(), atol=1e-5, rtol=1e-5) 135 | 136 | 137 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 138 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 139 | def test_tanh_backward(shape, device): 140 | _A = np.random.randn(*shape).astype(np.float32) 141 | A = ndl.Tensor(nd.array(_A), device=device) 142 | backward_check(ndl.tanh, A) 143 | 144 | 145 | STACK_PARAMETERS = [((5, 5), 0, 1), 146 | ((5, 5), 0, 2), 147 | ((1,5,7), 2, 5)] 148 | @pytest.mark.parametrize("shape, axis, l", STACK_PARAMETERS) 149 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 150 | def test_stack(shape, axis, l, device): 
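# Build l random arrays, stack them with ndl.stack along the given axis, and check the
# result against torch.stack on the same inputs.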
151 | _A = [np.random.randn(*shape).astype(np.float32) for i in range(l)] 152 | A = [ndl.Tensor(nd.array(_A[i]), device=device) for i in range(l)] 153 | A_t = [torch.Tensor(_A[i]) for i in range(l)] 154 | out = ndl.stack(A, axis=axis) 155 | out_t = torch.stack(A_t, dim=axis) 156 | np.testing.assert_allclose(out_t.numpy(), out.numpy(), atol=1e-5, rtol=1e-5) 157 | 158 | 159 | @pytest.mark.parametrize("shape, axis, l", STACK_PARAMETERS) 160 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 161 | def test_stack_backward(shape, axis, l, device): 162 | _A = [np.random.randn(*shape).astype(np.float32) for i in range(l)] 163 | A = [ndl.Tensor(nd.array(_A[i]), device=device) for i in range(l)] 164 | A_t = [torch.Tensor(_A[i]) for i in range(l)] 165 | for i in range(l): 166 | A_t[i].requires_grad = True 167 | ndl.stack(A, axis=axis).sum().backward() 168 | torch.stack(A_t, dim=axis).sum().backward() 169 | for i in range(l): 170 | np.testing.assert_allclose(A_t[i].grad.numpy(), A[i].grad.numpy(), atol=1e-5, rtol=1e-5) 171 | 172 | 173 | SUMMATION_PARAMETERS = [((1, 1, 1), None), 174 | ((5, 3), 0), 175 | ((8, 3, 2), 1), 176 | ((8, 3, 2), 2) 177 | ] 178 | @pytest.mark.parametrize("shape, axes", SUMMATION_PARAMETERS) 179 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 180 | def test_summation(shape, axes, device): 181 | _A = np.random.randn(*shape).astype(np.float32) 182 | A = ndl.Tensor(nd.array(_A), device=device) 183 | np.testing.assert_allclose(np.sum(_A, axes), ndl.summation(A, axes=axes).numpy(), atol=1e-5, rtol=1e-5) 184 | 185 | 186 | @pytest.mark.parametrize("shape, axes", SUMMATION_PARAMETERS) 187 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 188 | def test_summation_backward(shape, axes, device): 189 | _A = np.random.randn(*shape).astype(np.float32) 190 | A = ndl.Tensor(nd.array(_A), device=device) 191 | backward_check(ndl.summation, A, axes=axes) 192 | 193 | 194 | BROADCAST_SHAPES = [((1, 1, 1), (3, 3, 3)), 195 | ((4, 1, 6), (4, 3, 6))] 196 | @pytest.mark.parametrize("shape,shape_to", BROADCAST_SHAPES) 197 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 198 | def test_broadcast_to(shape, shape_to, device): 199 | _A = np.random.randn(*shape).astype(np.float32) 200 | A = ndl.Tensor(nd.array(_A), device=device) 201 | np.testing.assert_allclose(np.broadcast_to(_A, shape_to), ndl.broadcast_to(A, shape_to).numpy(), atol=1e-5, rtol=1e-5) 202 | 203 | 204 | RESHAPE_SHAPES = [((1, 1, 1), (1,)), 205 | ((4, 1, 6), (6, 4, 1))] 206 | @pytest.mark.parametrize("shape,shape_to", RESHAPE_SHAPES) 207 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 208 | def test_reshape(shape, shape_to, device): 209 | _A = np.random.randn(*shape).astype(np.float32) 210 | A = ndl.Tensor(nd.array(_A), device=device) 211 | np.testing.assert_allclose(np.reshape(_A, shape_to), ndl.reshape(A, shape_to).numpy(), atol=1e-5, rtol=1e-5) 212 | 213 | 214 | TRANSPOSE_SHAPES = [(1, 1, 1), (4, 5, 6)] 215 | TRANSPOSE_AXES = [(0, 1), (0, 2), None] 216 | @pytest.mark.parametrize("shape", TRANSPOSE_SHAPES) 217 | @pytest.mark.parametrize("axes", TRANSPOSE_AXES) 218 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 219 | def test_transpose(shape, axes, device): 220 | _A = np.random.randn(*shape).astype(np.float32) 221 | A = ndl.Tensor(nd.array(_A), device=device) 222 | if axes is None: 223 | np_axes = (_A.ndim - 2, _A.ndim - 1) 224 | else: 225 | np_axes = axes 226 | np.testing.assert_allclose(np.swapaxes(_A, np_axes[0], np_axes[1]), 
ndl.transpose(A, axes=axes).numpy(), atol=1e-5, rtol=1e-5) 227 | 228 | 229 | @pytest.mark.parametrize("shape, axes", SUMMATION_PARAMETERS) 230 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 231 | def test_logsumexp(shape, axes, device): 232 | _A = np.random.randn(*shape).astype(np.float32) 233 | A = ndl.Tensor(nd.array(_A), device=device) 234 | A_t = torch.Tensor(_A) 235 | if axes is None: 236 | t_axes = tuple(list(range(len(shape)))) 237 | else: 238 | t_axes = axes 239 | np.testing.assert_allclose(torch.logsumexp(A_t, dim=t_axes).numpy(), ndl.logsumexp(A, axes=axes).numpy(), atol=1e-5, rtol=1e-5) 240 | 241 | 242 | 243 | ### MUGRADE ### 244 | 245 | TEST_GENERAL_SHAPES = [(3, 1, 2)] 246 | TEST_MATMUL_DIMS = [(3, 4, 2), (8, 16, 16)] 247 | TEST_STACK_PARAMETERS = [((2, 3), 0, 3)] 248 | TEST_SUMMATION_PARAMETERS = [((3, 2), 0), ((2, 1, 2, 3), 3)] 249 | TEST_LOGSUMEXP_PARAMETERS = [((3, 2), 0), ((2, 1, 2, 3), 3)] 250 | TEST_BROADCAST_SHAPES = [((2, 1), (2, 4)), ((2, 1, 5), (2, 3, 5))] 251 | TEST_RESHAPE_SHAPES = [((3, 1, 2), (3, 2, 1))] 252 | TEST_TRANSPOSE_SHAPES = [(3, 5, 1)] 253 | TEST_TRANSPOSE_AXES = [(0, 1), (0, 2), None] 254 | TEST_GETSETITEM_PARAMS = [((3, 2), (2, 1)), ((3, 3, 4), (2, np.s_[2:], np.s_[:3]))] 255 | 256 | 257 | def mugrade_submit(x): 258 | if isinstance(x, np.ndarray): 259 | x = x.flatten()[:64] 260 | # print(x) 261 | mugrade.submit(x) 262 | else: 263 | # print(x) 264 | mugrade.submit(x) 265 | 266 | 267 | def submit_new_nd_backend(): 268 | devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 269 | #devices = [ndl.cpu(), ndl.cuda()] 270 | 271 | if not ndl.cuda().enabled(): 272 | print('You need a GPU to run some of these tests.') 273 | 274 | # ewise fn 275 | for (device, shape, fn_name) in itertools.product(devices, TEST_GENERAL_SHAPES, EWISE_OP_NAMES): 276 | _A = np.random.randn(*shape).astype(np.float32) 277 | _B = np.random.randn(*shape).astype(np.float32) 278 | A = ndl.Tensor(nd.array(_A), device=device) 279 | B = ndl.Tensor(nd.array(_B), device=device) 280 | mugrade_submit(EWISE_OPS[fn_name](A, B).numpy()) 281 | 282 | # scalar fn 283 | for (device, shape, fn_name) in itertools.product(devices, TEST_GENERAL_SHAPES, SCALAR_OP_NAMES): 284 | _A = np.random.randn(*shape).astype(np.float32) 285 | _B = np.random.randn(1).astype(np.float32).item() 286 | A = ndl.Tensor(nd.array(_A), device=device) 287 | mugrade_submit(EWISE_OPS[fn_name](A, _B).numpy()) 288 | 289 | # matmul 290 | for (device, matmul_dim) in itertools.product(devices, TEST_MATMUL_DIMS): 291 | m, n, p = matmul_dim 292 | _A = np.random.randn(m, n).astype(np.float32) 293 | _B = np.random.randn(n, p).astype(np.float32) 294 | A = ndl.Tensor(nd.array(_A), device=device) 295 | B = ndl.Tensor(nd.array(_B), device=device) 296 | mugrade_submit((A @ B).numpy()) 297 | 298 | # power 299 | for (device, shape) in itertools.product(devices, TEST_GENERAL_SHAPES): 300 | _A = np.random.randn(*shape).astype(np.float32) 301 | _B = np.random.randint(1) 302 | A = ndl.Tensor(nd.array(_A), device=device) 303 | mugrade_submit((A**_B).numpy()) 304 | 305 | # log 306 | for (device, shape) in itertools.product(devices, TEST_GENERAL_SHAPES): 307 | _A = np.random.randn(*shape).astype(np.float32) + 5. 
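# shift the random inputs up by 5 so they are (essentially always) positive before taking log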
308 | A = ndl.Tensor(nd.array(_A), device=device) 309 | mugrade_submit(ndl.log(A).numpy()) 310 | 311 | # exp 312 | for (device, shape) in itertools.product(devices, TEST_GENERAL_SHAPES): 313 | _A = np.random.randn(*shape).astype(np.float32) 314 | A = ndl.Tensor(nd.array(_A), device=device) 315 | mugrade_submit(ndl.exp(A).numpy()) 316 | 317 | # tanh 318 | for (device, shape) in itertools.product(devices, TEST_GENERAL_SHAPES): 319 | _A = np.random.randn(*shape).astype(np.float32) 320 | A = ndl.Tensor(nd.array(_A), device=device) 321 | mugrade_submit(ndl.tanh(A).numpy()) 322 | mugrade_submit(backward_check(ndl.tanh, A)) 323 | 324 | # stack 325 | for (device, (shape, axis, l)) in itertools.product(devices, TEST_STACK_PARAMETERS): 326 | _A = [np.random.randn(*shape).astype(np.float32) for i in range(l)] 327 | A = [ndl.Tensor(nd.array(_A[i]), device=device) for i in range(l)] 328 | out = ndl.stack(A, axis=axis) 329 | mugrade_submit(out.numpy()) 330 | out.backward() 331 | mugrade_submit(A[0].grad.numpy()) 332 | 333 | # summation 334 | for (device, (shape, axes)) in itertools.product(devices, TEST_SUMMATION_PARAMETERS): 335 | _A = np.random.randn(*shape).astype(np.float32) 336 | A = ndl.Tensor(nd.array(_A), device=device) 337 | mugrade_submit(ndl.summation(A, axes).numpy()) 338 | mugrade_submit(backward_check(ndl.summation, A, axes=axes)) 339 | 340 | # broadcast 341 | for (device, (shape, shape_to)) in itertools.product(devices, TEST_BROADCAST_SHAPES): 342 | _A = np.random.randn(*shape).astype(np.float32) 343 | A = ndl.Tensor(nd.array(_A), device=device) 344 | mugrade_submit(ndl.broadcast_to(A, shape_to).numpy()) 345 | 346 | # reshape 347 | for (device, (shape, shape_to)) in itertools.product(devices, TEST_RESHAPE_SHAPES): 348 | _A = np.random.randn(*shape).astype(np.float32) 349 | A = ndl.Tensor(nd.array(_A), device=device) 350 | mugrade_submit(ndl.reshape(A, shape_to).numpy()) 351 | 352 | # transpose 353 | for (device, shape, axes) in itertools.product(devices, TEST_TRANSPOSE_SHAPES, TEST_TRANSPOSE_AXES): 354 | _A = np.random.randn(*shape).astype(np.float32) 355 | A = ndl.Tensor(nd.array(_A), device=device) 356 | mugrade_submit(ndl.transpose(A, axes=axes).numpy()) 357 | 358 | # logsumexp 359 | for (device, (shape, axes)) in itertools.product(devices, TEST_LOGSUMEXP_PARAMETERS): 360 | _A = np.random.randn(*shape).astype(np.float32) 361 | A = ndl.Tensor(nd.array(_A), device=device) 362 | mugrade_submit(ndl.logsumexp(A, axes).numpy()) 363 | mugrade_submit(backward_check(ndl.logsumexp, A, axes=axes)) 364 | 365 | 366 | if __name__ == "__main__": 367 | submit_new_nd_backend() -------------------------------------------------------------------------------- /tests/hw4/test_sequence_models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | sys.path.append('./apps') 4 | import numpy as np 5 | import pytest 6 | import torch 7 | import itertools 8 | import mugrade 9 | 10 | import needle as ndl 11 | import needle.nn as nn 12 | 13 | from simple_ml import * 14 | from models import LanguageModel 15 | 16 | 17 | np.random.seed(3) 18 | 19 | 20 | _DEVICES = [ndl.cpu(), pytest.param(ndl.cuda(), 21 | marks=pytest.mark.skipif(not ndl.cuda().enabled(), reason="No GPU"))] 22 | 23 | 24 | BATCH_SIZES = [1, 15] 25 | INPUT_SIZES = [1, 11] 26 | HIDDEN_SIZES = [1, 12] 27 | BIAS = [True, False] 28 | INIT_HIDDEN = [True, False] 29 | NONLINEARITIES = ['tanh', 'relu'] 30 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 31 | 
@pytest.mark.parametrize("input_size", INPUT_SIZES) 32 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 33 | @pytest.mark.parametrize("bias", BIAS) 34 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 35 | @pytest.mark.parametrize("nonlinearity", NONLINEARITIES) 36 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 37 | def test_rnn_cell(batch_size, input_size, hidden_size, bias, init_hidden, nonlinearity, device): 38 | x = np.random.randn(batch_size, input_size).astype(np.float32) 39 | h0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 40 | 41 | model_ = torch.nn.RNNCell(input_size, hidden_size, nonlinearity=nonlinearity, bias=bias) 42 | if init_hidden: 43 | h_ = model_(torch.tensor(x), torch.tensor(h0)) 44 | else: 45 | h_ = model_(torch.tensor(x), None) 46 | 47 | model = nn.RNNCell(input_size, hidden_size, device=device, bias=bias, nonlinearity=nonlinearity) 48 | model.W_ih = ndl.Tensor(model_.weight_ih.detach().numpy().transpose(), device=device) 49 | model.W_hh = ndl.Tensor(model_.weight_hh.detach().numpy().transpose(), device=device) 50 | if bias: 51 | model.bias_ih = ndl.Tensor(model_.bias_ih.detach().numpy(), device=device) 52 | model.bias_hh = ndl.Tensor(model_.bias_hh.detach().numpy(), device=device) 53 | if init_hidden: 54 | h = model(ndl.Tensor(x, device=device), ndl.Tensor(h0, device=device)) 55 | else: 56 | h = model(ndl.Tensor(x, device=device), None) 57 | assert h.device == device 58 | np.testing.assert_allclose(h_.detach().numpy(), h.numpy(), atol=1e-5, rtol=1e-5) 59 | h.sum().backward() 60 | h_.sum().backward() 61 | np.testing.assert_allclose(model_.weight_ih.grad.detach().numpy().transpose(), model.W_ih.grad.numpy(), atol=1e-5, rtol=1e-5) 62 | 63 | 64 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 65 | @pytest.mark.parametrize("input_size", INPUT_SIZES) 66 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 67 | @pytest.mark.parametrize("bias", BIAS) 68 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 69 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 70 | def test_lstm_cell(batch_size, input_size, hidden_size, bias, init_hidden, device): 71 | x = np.random.randn(batch_size, input_size).astype(np.float32) 72 | h0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 73 | c0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 74 | 75 | model_ = torch.nn.LSTMCell(input_size, hidden_size, bias=bias) 76 | if init_hidden: 77 | h_, c_ = model_(torch.tensor(x), (torch.tensor(h0), torch.tensor(c0))) 78 | else: 79 | h_, c_ = model_(torch.tensor(x), None) 80 | 81 | model = nn.LSTMCell(input_size, hidden_size, device=device, bias=bias) 82 | 83 | model.W_ih = ndl.Tensor(model_.weight_ih.detach().numpy().transpose(), device=device) 84 | model.W_hh = ndl.Tensor(model_.weight_hh.detach().numpy().transpose(), device=device) 85 | if bias: 86 | model.bias_ih = ndl.Tensor(model_.bias_ih.detach().numpy(), device=device) 87 | model.bias_hh = ndl.Tensor(model_.bias_hh.detach().numpy(), device=device) 88 | 89 | if init_hidden: 90 | h, c = model(ndl.Tensor(x, device=device), (ndl.Tensor(h0, device=device), ndl.Tensor(c0, device=device))) 91 | else: 92 | h, c = model(ndl.Tensor(x, device=device), None) 93 | np.testing.assert_allclose(h_.detach().numpy(), h.numpy(), atol=1e-5, rtol=1e-5) 94 | np.testing.assert_allclose(c_.detach().numpy(), c.numpy(), atol=1e-5, rtol=1e-5) 95 | 96 | h.sum().backward() 97 | h_.sum().backward() 98 | 
np.testing.assert_allclose(model_.weight_ih.grad.detach().numpy().transpose(), model.W_ih.grad.numpy(), atol=1e-5, rtol=1e-5) 99 | 100 | 101 | SEQ_LENGTHS = [1, 13] 102 | NUM_LAYERS = [1, 2] 103 | @pytest.mark.parametrize("seq_length", SEQ_LENGTHS) 104 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 105 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 106 | @pytest.mark.parametrize("input_size", INPUT_SIZES) 107 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 108 | @pytest.mark.parametrize("bias", BIAS) 109 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 110 | @pytest.mark.parametrize("nonlinearity", NONLINEARITIES) 111 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 112 | def test_rnn(seq_length, num_layers, batch_size, input_size, hidden_size, bias, init_hidden, nonlinearity, device): 113 | x = np.random.randn(seq_length, batch_size, input_size).astype(np.float32) 114 | h0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 115 | 116 | model_ = torch.nn.RNN(input_size, hidden_size, num_layers=num_layers, bias=bias, nonlinearity=nonlinearity) 117 | if init_hidden: 118 | output_, h_ = model_(torch.tensor(x), torch.tensor(h0)) 119 | else: 120 | output_, h_ = model_(torch.tensor(x), None) 121 | 122 | model = nn.RNN(input_size, hidden_size, num_layers, bias, device=device, nonlinearity=nonlinearity) 123 | for k in range(num_layers): 124 | model.rnn_cells[k].W_ih = ndl.Tensor(getattr(model_, f'weight_ih_l{k}').detach().numpy().transpose(), device=device) 125 | model.rnn_cells[k].W_hh = ndl.Tensor(getattr(model_, f'weight_hh_l{k}').detach().numpy().transpose(), device=device) 126 | if bias: 127 | model.rnn_cells[k].bias_ih = ndl.Tensor(getattr(model_, f'bias_ih_l{k}').detach().numpy(), device=device) 128 | model.rnn_cells[k].bias_hh = ndl.Tensor(getattr(model_, f'bias_hh_l{k}').detach().numpy(), device=device) 129 | if init_hidden: 130 | output, h = model(ndl.Tensor(x, device=device), ndl.Tensor(h0, device=device)) 131 | else: 132 | output, h = model(ndl.Tensor(x, device=device), None) 133 | 134 | np.testing.assert_allclose(h_.detach().numpy(), h.numpy(), atol=1e-5, rtol=1e-5) 135 | np.testing.assert_allclose(output_.detach().numpy(), output.numpy(), atol=1e-5, rtol=1e-5) 136 | 137 | output.sum().backward() 138 | output_.sum().backward() 139 | np.testing.assert_allclose(model.rnn_cells[0].W_ih.grad.detach().numpy(), model_.weight_ih_l0.grad.numpy().transpose(), atol=1e-5, rtol=1e-5) 140 | 141 | 142 | @pytest.mark.parametrize("seq_length", SEQ_LENGTHS) 143 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 144 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 145 | @pytest.mark.parametrize("input_size", INPUT_SIZES) 146 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 147 | @pytest.mark.parametrize("bias", BIAS) 148 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 149 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 150 | def test_lstm(seq_length, num_layers, batch_size, input_size, hidden_size, bias, init_hidden, device): 151 | x = np.random.randn(seq_length, batch_size, input_size).astype(np.float32) 152 | h0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 153 | c0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 154 | 155 | model_ = torch.nn.LSTM(input_size, hidden_size, bias=bias, num_layers=num_layers) 156 | if init_hidden: 157 | output_, (h_, c_) = model_(torch.tensor(x), (torch.tensor(h0), torch.tensor(c0))) 158 | else: 159 | output_, 
(h_, c_) = model_(torch.tensor(x), None) 160 | 161 | model = nn.LSTM(input_size, hidden_size, num_layers, bias, device=device) 162 | for k in range(num_layers): 163 | model.lstm_cells[k].W_ih = ndl.Tensor(getattr(model_, f'weight_ih_l{k}').detach().numpy().transpose(), device=device) 164 | model.lstm_cells[k].W_hh = ndl.Tensor(getattr(model_, f'weight_hh_l{k}').detach().numpy().transpose(), device=device) 165 | if bias: 166 | model.lstm_cells[k].bias_ih = ndl.Tensor(getattr(model_, f'bias_ih_l{k}').detach().numpy(), device=device) 167 | model.lstm_cells[k].bias_hh = ndl.Tensor(getattr(model_, f'bias_hh_l{k}').detach().numpy(), device=device) 168 | if init_hidden: 169 | output, (h, c) = model(ndl.Tensor(x, device=device), (ndl.Tensor(h0, device=device), ndl.Tensor(c0, device=device))) 170 | else: 171 | output, (h, c) = model(ndl.Tensor(x, device=device), None) 172 | 173 | np.testing.assert_allclose(h_.detach().numpy(), h.numpy(), atol=1e-5, rtol=1e-5) 174 | np.testing.assert_allclose(c_.detach().numpy(), c.numpy(), atol=1e-5, rtol=1e-5) 175 | np.testing.assert_allclose(output_.detach().numpy(), output.numpy(), atol=1e-5, rtol=1e-5) 176 | 177 | output.sum().backward() 178 | output_.sum().backward() 179 | np.testing.assert_allclose(model.lstm_cells[0].W_ih.grad.detach().numpy(), model_.weight_ih_l0.grad.numpy().transpose(), atol=1e-5, rtol=1e-5) 180 | 181 | 182 | OUTPUT_SIZES = [1, 1000] 183 | EMBEDDING_SIZES = [1, 34] 184 | SEQ_MODEL = ['rnn', 'lstm'] 185 | @pytest.mark.parametrize("seq_length", SEQ_LENGTHS) 186 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 187 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 188 | @pytest.mark.parametrize("embedding_size", EMBEDDING_SIZES) 189 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 190 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 191 | @pytest.mark.parametrize("output_size", OUTPUT_SIZES) 192 | @pytest.mark.parametrize("seq_model", SEQ_MODEL) 193 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 194 | def test_language_model_implementation(seq_length, num_layers, batch_size, embedding_size, hidden_size, 195 | init_hidden, output_size, seq_model, device): 196 | #TODO add test for just nn.embedding? 
197 | x = np.random.randint(0, output_size, (seq_length, batch_size)).astype(np.float32) 198 | h0 = ndl.Tensor(np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32), device=device) 199 | c0 = ndl.Tensor(np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32), device=device) 200 | 201 | model = LanguageModel(embedding_size, output_size, hidden_size, num_layers, seq_model, device=device) 202 | if init_hidden: 203 | if seq_model == 'lstm': 204 | h = (h0, c0) 205 | elif seq_model == 'rnn': 206 | h = h0 207 | output, h_ = model(ndl.Tensor(x, device=device), h) 208 | else: 209 | output, h_ = model(ndl.Tensor(x, device=device), None) 210 | 211 | if seq_model == 'lstm': 212 | assert isinstance(h_, tuple) 213 | h0_, c0_ = h_ 214 | assert c0_.shape == (num_layers, batch_size, hidden_size) 215 | elif seq_model == 'rnn': 216 | h0_ = h_ 217 | assert h0_.shape == (num_layers, batch_size, hidden_size) 218 | assert output.shape == (batch_size * seq_length, output_size) 219 | #TODO actually test values 220 | output.backward() 221 | for p in model.parameters(): 222 | assert p.grad is not None 223 | 224 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 225 | def test_language_model_training(device): 226 | corpus = ndl.data.Corpus("data/ptb", max_lines=20) 227 | seq_len = 10 228 | num_examples = 100 229 | batch_size = 16 230 | seq_model = 'rnn' 231 | num_layers = 2 232 | hidden_size = 10 233 | n_epochs=2 234 | train_data = ndl.data.batchify(corpus.train, batch_size=batch_size, device=device, dtype="float32") 235 | model = LanguageModel(30, len(corpus.dictionary), hidden_size=hidden_size, num_layers=num_layers, seq_model=seq_model, device=device) 236 | train_acc, train_loss = train_ptb(model, train_data, seq_len=seq_len, n_epochs=n_epochs, device=device) 237 | test_acc, test_loss = evaluate_ptb(model, train_data, seq_len=seq_len, device=device) 238 | if str(device) == "cpu(0)": 239 | np.testing.assert_allclose(5.4136161980805575, train_loss, atol=1e-5, rtol=1e-5) 240 | np.testing.assert_allclose(5.214852703942193, test_loss, atol=1e-5, rtol=1e-5) 241 | elif str(device) == "cuda(0)": 242 | np.testing.assert_allclose(5.424638041743526, train_loss, atol=1e-5, rtol=1e-5) 243 | np.testing.assert_allclose(5.23579544491238, test_loss, atol=1e-5, rtol=1e-5) 244 | 245 | 246 | ### MUGRADE ### 247 | 248 | TEST_BATCH_SIZES = [6] 249 | TEST_INPUT_SIZES = [3] 250 | TEST_HIDDEN_SIZES = [5] 251 | TEST_SEQ_LENGTHS = [7] 252 | TEST_NUM_LAYERS = [3] 253 | TEST_OUTPUT_SIZES = [16] 254 | TEST_EMBEDDING_SIZES = [8] 255 | TEST_SEQ_MODEL = ['rnn', 'lstm'] 256 | 257 | def mugrade_submit(x): 258 | if isinstance(x, np.ndarray): 259 | x = x.flatten()[:64] 260 | # print(x) 261 | mugrade.submit(x) 262 | else: 263 | # print(x) 264 | mugrade.submit(x) 265 | 266 | 267 | def submit_rnn(): 268 | devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 269 | # devices = [ndl.cpu(), ndl.cuda()] 270 | 271 | if not ndl.cuda().enabled(): 272 | print('You need a GPU to run some of these tests.') 273 | 274 | for (device, batch_size, input_size, hidden_size) in itertools.product( 275 | devices, TEST_BATCH_SIZES, TEST_INPUT_SIZES, TEST_HIDDEN_SIZES): 276 | x = np.random.randn(batch_size, input_size).astype(np.float32) 277 | h0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 278 | model = nn.RNNCell(input_size, hidden_size, device=device) 279 | mugrade_submit(model.W_ih.numpy()) 280 | h = model(ndl.Tensor(x, device=device), ndl.Tensor(h0, device=device)) 281 | 
mugrade_submit(h.numpy()) 282 | h.sum().backward() 283 | mugrade_submit(model.W_hh.grad.numpy()) 284 | 285 | for (device, seq_length, num_layers, batch_size, input_size, hidden_size) in itertools.product( 286 | devices, TEST_SEQ_LENGTHS, TEST_NUM_LAYERS, TEST_BATCH_SIZES, TEST_INPUT_SIZES, TEST_HIDDEN_SIZES): 287 | x = np.random.randn(seq_length, batch_size, input_size).astype(np.float32) 288 | h0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 289 | model = nn.RNN(input_size, hidden_size, num_layers, device=device) 290 | output, h = model(ndl.Tensor(x, device=device), ndl.Tensor(h0, device=device)) 291 | mugrade_submit(h.numpy()) 292 | mugrade_submit(output.numpy()) 293 | output.sum().backward() 294 | mugrade_submit(model.rnn_cells[-1].W_hh.grad.numpy()) 295 | 296 | 297 | def submit_lstm(): 298 | devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 299 | #devices = [ndl.cpu(), ndl.cuda()] 300 | if not ndl.cuda().enabled(): 301 | print('You need a GPU to run some of these tests.') 302 | for (device, batch_size, input_size, hidden_size) in itertools.product( 303 | devices, TEST_BATCH_SIZES, TEST_INPUT_SIZES, TEST_HIDDEN_SIZES): 304 | x = np.random.randn(batch_size, input_size).astype(np.float32) 305 | h0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 306 | c0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 307 | model = nn.LSTMCell(input_size, hidden_size, device=device) 308 | mugrade_submit(model.W_hh.numpy()) 309 | (h, c) = model(ndl.Tensor(x, device=device), (ndl.Tensor(h0, device=device), ndl.Tensor(c0, device=device))) 310 | mugrade_submit(h.numpy()) 311 | mugrade_submit(c.numpy()) 312 | h.sum().backward() 313 | mugrade_submit(model.W_hh.grad.numpy()) 314 | 315 | for (device, seq_length, num_layers, batch_size, input_size, hidden_size) in itertools.product( 316 | devices, TEST_SEQ_LENGTHS, TEST_NUM_LAYERS, TEST_BATCH_SIZES, TEST_INPUT_SIZES, TEST_HIDDEN_SIZES): 317 | x = np.random.randn(seq_length, batch_size, input_size).astype(np.float32) 318 | h0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 319 | c0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 320 | model = nn.LSTM(input_size, hidden_size, num_layers, device=device) 321 | output, (h, c) = model(ndl.Tensor(x, device=device), (ndl.Tensor(h0, device=device), ndl.Tensor(c0, device=device))) 322 | mugrade_submit(h.numpy()) 323 | mugrade_submit(c.numpy()) 324 | mugrade_submit(output.numpy()) 325 | output.sum().backward() 326 | mugrade_submit(model.lstm_cells[-1].W_hh.grad.numpy()) 327 | 328 | 329 | def submit_language_model(): 330 | devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 331 | # devices = [ndl.cpu(), ndl.cuda()] 332 | if not ndl.cuda().enabled(): 333 | print('You need a GPU to run some of these tests.') 334 | for (device, seq_length, num_layers, batch_size, embedding_size, hidden_size, seq_model, output_size) in itertools.product( 335 | devices, TEST_SEQ_LENGTHS, TEST_NUM_LAYERS, TEST_BATCH_SIZES, TEST_EMBEDDING_SIZES, TEST_HIDDEN_SIZES, TEST_SEQ_MODEL, TEST_OUTPUT_SIZES): 336 | x = np.random.randint(0, output_size, (seq_length, batch_size)).astype(np.float32) 337 | h0 = ndl.Tensor(np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32), device=device) 338 | c0 = ndl.Tensor(np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32), device=device) 339 | model = LanguageModel(embedding_size, output_size, hidden_size, num_layers, seq_model, 
device=device) 340 | if seq_model == 'lstm': 341 | h = (h0, c0) 342 | elif seq_model == 'rnn': 343 | h = h0 344 | output, h_ = model(ndl.Tensor(x, device=device), h) 345 | if seq_model == 'lstm': 346 | h0_, c0_ = h_ 347 | mugrade_submit(c0_.numpy()) 348 | elif seq_model == 'rnn': 349 | h0_ = h_ 350 | mugrade_submit(h0_.numpy()) 351 | mugrade_submit(output.numpy()) 352 | 353 | device = ndl.cpu() # TODO CHANGE BACK 354 | # device = ndl.cpu() 355 | corpus = ndl.data.Corpus("data/ptb", max_lines=20) 356 | seq_len = 8 357 | num_examples = 88 358 | batch_size = 12 359 | seq_model = 'lstm' 360 | num_layers = 2 361 | hidden_size = 12 362 | n_epochs=2 363 | train_data = ndl.data.batchify(corpus.train, batch_size=batch_size, device=device, dtype="float32") 364 | model = LanguageModel(28, len(corpus.dictionary), hidden_size=hidden_size, num_layers=num_layers, 365 | seq_model=seq_model, device=device) 366 | train_acc, train_loss = train_ptb(model, train_data, seq_len=seq_len, n_epochs=n_epochs, device=device) 367 | test_acc, test_loss = evaluate_ptb(model, train_data, seq_len=seq_len, device=device) 368 | mugrade_submit(train_loss) 369 | mugrade_submit(test_loss) 370 | 371 | 372 | if __name__ == "__main__": 373 | submit_rnn() 374 | submit_lstm() 375 | submit_language_model() --------------------------------------------------------------------------------
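
The sequence-model tests above validate needle's RNN/LSTM modules by copying PyTorch's parameters into the needle model and then comparing outputs and gradients. PyTorch stores `weight_ih` with shape (hidden_size, input_size), which is why every test transposes it before assigning it to `W_ih`. The short sketch below is a standalone illustration of that layout convention (all names are local to the sketch and it is not part of the graded code): it reproduces `torch.nn.RNNCell` with plain NumPy using the transposed weights, the same form the tests hand to needle.

# Standalone sketch (illustration only): why the tests transpose PyTorch's
# weights. PyTorch's RNNCell computes h' = tanh(x @ weight_ih.T + h @ weight_hh.T);
# transposing once up front lets the update be written as x @ W_ih + h @ W_hh.
import numpy as np
import torch

input_size, hidden_size, batch_size = 3, 5, 6
cell = torch.nn.RNNCell(input_size, hidden_size, bias=False)  # tanh nonlinearity by default

x = np.random.randn(batch_size, input_size).astype(np.float32)
h0 = np.random.randn(batch_size, hidden_size).astype(np.float32)

# Reference hidden state from PyTorch.
h_ref = cell(torch.tensor(x), torch.tensor(h0)).detach().numpy()

# Same computation with the transposed layout the tests assign to W_ih / W_hh.
W_ih = cell.weight_ih.detach().numpy().transpose()  # (input_size, hidden_size)
W_hh = cell.weight_hh.detach().numpy().transpose()  # (hidden_size, hidden_size)
h_manual = np.tanh(x @ W_ih + h0 @ W_hh)

np.testing.assert_allclose(h_ref, h_manual, atol=1e-5, rtol=1e-5)

The same transpose convention is applied to `weight_hh`, and for LSTM cells to the stacked gate weights of shape (4 * hidden_size, input_size), before they are copied into the needle modules under test.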