├── CMakeLists.txt ├── Makefile ├── README.md ├── ResNet9.png ├── apps ├── models.py └── simple_ml.py ├── hw4.ipynb ├── python └── needle │ ├── __init__.py │ ├── autograd.py │ ├── backend_ndarray │ ├── __init__.py │ ├── ndarray.py │ └── ndarray_backend_numpy.py │ ├── backend_numpy.py │ ├── backend_selection.py │ ├── data │ ├── __init__.py │ ├── data_basic.py │ ├── data_transforms.py │ └── datasets │ │ ├── __init__.py │ │ ├── cifar10_dataset.py │ │ ├── mnist_dataset.py │ │ ├── ndarray_dataset.py │ │ └── ptb_dataset.py │ ├── init │ ├── __init__.py │ ├── init_basic.py │ └── init_initializers.py │ ├── nn │ ├── __init__.py │ ├── nn_basic.py │ ├── nn_conv.py │ └── nn_sequence.py │ ├── ops │ ├── __init__.py │ ├── ops_logarithmic.py │ ├── ops_mathematic.py │ └── ops_tuple.py │ └── optim.py ├── src ├── ndarray_backend_cpu.cc └── ndarray_backend_cuda.cu └── tests └── hw4 ├── test_cifar_ptb_data.py ├── test_conv.py ├── test_nd_backend.py └── test_sequence_models.py /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(needle C CXX) 3 | cmake_policy(SET CMP0146 OLD) 4 | 5 | # find correct version of Python 6 | execute_process(COMMAND python3-config --prefix 7 | OUTPUT_VARIABLE Python_ROOT_DIR) 8 | find_package(Python COMPONENTS Development Interpreter REQUIRED) 9 | include_directories(${Python_INCLUDE_DIRS}) 10 | 11 | # find pybind 12 | execute_process(COMMAND python3 -m pybind11 --cmakedir 13 | RESULT_VARIABLE __pybind_exit_code 14 | OUTPUT_VARIABLE __pybind_path 15 | OUTPUT_STRIP_TRAILING_WHITESPACE) 16 | find_package(pybind11 PATHS ${__pybind_path}) 17 | 18 | 19 | if(NOT MSVC) 20 | set(CMAKE_CXX_FLAGS "-std=c++11 -O2 -march=native ${CMAKE_CXX_FLAGS}") 21 | set(CMAKE_CUDA_STANDARD 14) 22 | else() 23 | set(CMAKE_CXX_FLAGS "/std:c++11 -O2 -march=native ${CMAKE_CXX_FLAGS}") 24 | set(CMAKE_CUDA_STANDARD 14) 25 | endif() 26 | 27 | include_directories(SYSTEM ${pybind11_INCLUDE_DIRS}) 28 | list(APPEND LINKER_LIBS ${pybind11_LIBRARIES}) 29 | 30 | 31 | ################### 32 | ### CPU BACKEND ### 33 | ################### 34 | add_library(ndarray_backend_cpu MODULE src/ndarray_backend_cpu.cc) 35 | target_link_libraries(ndarray_backend_cpu PUBLIC ${LINKER_LIBS}) 36 | pybind11_extension(ndarray_backend_cpu) 37 | pybind11_strip(ndarray_backend_cpu) 38 | 39 | 40 | # directly output to ffi folder 41 | set_target_properties(ndarray_backend_cpu 42 | PROPERTIES 43 | LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/needle/backend_ndarray 44 | CXX_VISIBILITY_PRESET "hidden" 45 | ) 46 | 47 | if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") 48 | set_property(TARGET ndarray_backend_cpu PROPERTY LINK_OPTIONS -undefined dynamic_lookup) 49 | endif() 50 | 51 | 52 | 53 | #################### 54 | ### CUDA BACKEND ### 55 | #################### 56 | find_package(CUDA) 57 | if(CUDA_FOUND) 58 | message(STATUS "Found cuda, building cuda backend") 59 | 60 | include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) 61 | list(APPEND LINKER_LIBS ${CUDA_CUDART_LIBRARY}) 62 | 63 | # invoke nvidia smi to detect if we really have a GPU 64 | execute_process(COMMAND "nvidia-smi" ERROR_QUIET RESULT_VARIABLE NV_RET) 65 | if(NV_RET EQUAL "0") 66 | CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Auto) 67 | else() 68 | # set to 3.7 the flag of K80 69 | CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.7) 70 | endif() 71 | 72 | # set arch flags properly 73 | CUDA_ADD_LIBRARY(ndarray_backend_cuda MODULE src/ndarray_backend_cuda.cu OPTIONS ${ARCH_FLAGS}) 74 | 75 | 
target_link_libraries(ndarray_backend_cuda ${LINKER_LIBS}) 76 | pybind11_extension(ndarray_backend_cuda) 77 | pybind11_strip(ndarray_backend_cuda) 78 | 79 | # directly output to ffi folder 80 | set_target_properties(ndarray_backend_cuda 81 | PROPERTIES 82 | LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/needle/backend_ndarray 83 | CXX_VISIBILITY_PRESET "hidden" 84 | CUDA_VISIBILITY_PRESET "hidden" 85 | ) 86 | 87 | endif() 88 | 89 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: lib, pybind, clean, format, all 2 | 3 | all: lib 4 | 5 | 6 | lib: 7 | @mkdir -p build 8 | @cd build; cmake .. 9 | @cd build; $(MAKE) 10 | 11 | format: 12 | python3 -m black . 13 | clang-format -i src/*.cc src/*.cu 14 | 15 | clean: 16 | rm -rf build python/needle/backend_ndarray/ndarray_backend*.so 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Homework 4 2 | Public repository and stub/testing code for Homework 4 of 10-714. 3 | -------------------------------------------------------------------------------- /ResNet9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlsyscourse/hw4/a1fefc20753cb30afeb222fb92351153c522dea5/ResNet9.png -------------------------------------------------------------------------------- /apps/models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | import needle as ndl 4 | import needle.nn as nn 5 | import math 6 | import numpy as np 7 | np.random.seed(0) 8 | 9 | 10 | class ResNet9(ndl.nn.Module): 11 | def __init__(self, device=None, dtype="float32"): 12 | super().__init__() 13 | ### BEGIN YOUR SOLUTION ### 14 | raise NotImplementedError() ### 15 | ### END YOUR SOLUTION 16 | 17 | def forward(self, x): 18 | ### BEGIN YOUR SOLUTION 19 | raise NotImplementedError() 20 | ### END YOUR SOLUTION 21 | 22 | 23 | class LanguageModel(nn.Module): 24 | def __init__(self, embedding_size, output_size, hidden_size, num_layers=1, 25 | seq_model='rnn', seq_len=40, device=None, dtype="float32"): 26 | """ 27 | Consists of an embedding layer, a sequence model (either RNN or LSTM), and a 28 | linear layer. 29 | Parameters: 30 | output_size: Size of dictionary 31 | embedding_size: Size of embeddings 32 | hidden_size: The number of features in the hidden state of LSTM or RNN 33 | seq_model: 'rnn' or 'lstm', whether to use RNN or LSTM 34 | num_layers: Number of layers in RNN or LSTM 35 | """ 36 | super(LanguageModel, self).__init__() 37 | ### BEGIN YOUR SOLUTION 38 | raise NotImplementedError() 39 | ### END YOUR SOLUTION 40 | 41 | def forward(self, x, h=None): 42 | """ 43 | Given sequence (and the previous hidden state if given), returns probabilities of next word 44 | (along with the last hidden state from the sequence model). 
45 | Inputs: 46 | x of shape (seq_len, bs) 47 | h of shape (num_layers, bs, hidden_size) if using RNN, 48 | else h is tuple of (h0, c0), each of shape (num_layers, bs, hidden_size) 49 | Returns (out, h) 50 | out of shape (seq_len*bs, output_size) 51 | h of shape (num_layers, bs, hidden_size) if using RNN, 52 | else h is tuple of (h0, c0), each of shape (num_layers, bs, hidden_size) 53 | """ 54 | ### BEGIN YOUR SOLUTION 55 | raise NotImplementedError() 56 | ### END YOUR SOLUTION 57 | 58 | 59 | if __name__ == "__main__": 60 | model = ResNet9() 61 | x = ndl.ops.randu((1, 32, 32, 3), requires_grad=True) 62 | model(x) 63 | cifar10_train_dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=True) 64 | train_loader = ndl.data.DataLoader(cifar10_train_dataset, 128, ndl.cpu(), dtype="float32") 65 | print(cifar10_train_dataset[1][0].shape) 66 | -------------------------------------------------------------------------------- /apps/simple_ml.py: -------------------------------------------------------------------------------- 1 | """hw1/apps/simple_ml.py""" 2 | 3 | import struct 4 | import gzip 5 | import numpy as np 6 | 7 | import sys 8 | 9 | sys.path.append("python/") 10 | import needle as ndl 11 | 12 | import needle.nn as nn 13 | from apps.models import * 14 | import time 15 | device = ndl.cpu() 16 | 17 | def parse_mnist(image_filesname, label_filename): 18 | """Read an images file and a labels file in MNIST format. See this page: 19 | http://yann.lecun.com/exdb/mnist/ for a description of the file format. 20 | 21 | Args: 22 | image_filename (str): name of gzipped images file in MNIST format 23 | label_filename (str): name of gzipped labels file in MNIST format 24 | 25 | Returns: 26 | Tuple (X,y): 27 | X (numpy.ndarray[np.float32]): 2D numpy array containing the loaded 28 | data. The dimensionality of the data should be 29 | (num_examples x input_dim) where 'input_dim' is the full 30 | dimension of the data, e.g., since MNIST images are 28x28, it 31 | will be 784. Values should be of type np.float32, and the data 32 | should be normalized to have a minimum value of 0.0 and a 33 | maximum value of 1.0. 34 | 35 | y (numpy.ndarray[dtype=np.int8]): 1D numpy array containing the 36 | labels of the examples. Values should be of type np.int8 and 37 | for MNIST will contain the values 0-9. 38 | """ 39 | ### BEGIN YOUR SOLUTION 40 | raise NotImplementedError() 41 | ### END YOUR SOLUTION 42 | 43 | 44 | def softmax_loss(Z, y_one_hot): 45 | """Return softmax loss. Note that for the purposes of this assignment, 46 | you don't need to worry about "nicely" scaling the numerical properties 47 | of the log-sum-exp computation, but can just compute this directly. 48 | 49 | Args: 50 | Z (ndl.Tensor[np.float32]): 2D Tensor of shape 51 | (batch_size, num_classes), containing the logit predictions for 52 | each class. 53 | y (ndl.Tensor[np.int8]): 2D Tensor of shape (batch_size, num_classes) 54 | containing a 1 at the index of the true label of each example and 55 | zeros elsewhere. 56 | 57 | Returns: 58 | Average softmax loss over the sample. (ndl.Tensor[np.float32]) 59 | """ 60 | ### BEGIN YOUR SOLUTION 61 | raise NotImplementedError() 62 | ### END YOUR SOLUTION 63 | 64 | 65 | def nn_epoch(X, y, W1, W2, lr=0.1, batch=100): 66 | """Run a single epoch of SGD for a two-layer neural network defined by the 67 | weights W1 and W2 (with no bias terms): 68 | logits = ReLU(X * W1) * W2 69 | The function should use the step size lr, and the specified batch size (and 70 | again, without randomizing the order of X).
71 | 72 | Args: 73 | X (np.ndarray[np.float32]): 2D input array of size 74 | (num_examples x input_dim). 75 | y (np.ndarray[np.uint8]): 1D class label array of size (num_examples,) 76 | W1 (ndl.Tensor[np.float32]): 2D array of first layer weights, of shape 77 | (input_dim, hidden_dim) 78 | W2 (ndl.Tensor[np.float32]): 2D array of second layer weights, of shape 79 | (hidden_dim, num_classes) 80 | lr (float): step size (learning rate) for SGD 81 | batch (int): size of SGD mini-batch 82 | 83 | Returns: 84 | Tuple: (W1, W2) 85 | W1: ndl.Tensor[np.float32] 86 | W2: ndl.Tensor[np.float32] 87 | """ 88 | 89 | ### BEGIN YOUR SOLUTION 90 | raise NotImplementedError() 91 | ### END YOUR SOLUTION 92 | 93 | ### CIFAR-10 training ### 94 | def epoch_general_cifar10(dataloader, model, loss_fn=nn.SoftmaxLoss(), opt=None): 95 | """ 96 | Iterates over the dataloader. If optimizer is not None, sets the 97 | model to train mode, and for each batch updates the model parameters. 98 | If optimizer is None, sets the model to eval mode, and simply computes 99 | the loss/accuracy. 100 | 101 | Args: 102 | dataloader: Dataloader instance 103 | model: nn.Module instance 104 | loss_fn: nn.Module instance 105 | opt: Optimizer instance (optional) 106 | 107 | Returns: 108 | avg_acc: average accuracy over dataset 109 | avg_loss: average loss over dataset 110 | """ 111 | np.random.seed(4) 112 | ### BEGIN YOUR SOLUTION 113 | raise NotImplementedError() 114 | ### END YOUR SOLUTION 115 | 116 | 117 | def train_cifar10(model, dataloader, n_epochs=1, optimizer=ndl.optim.Adam, 118 | lr=0.001, weight_decay=0.001, loss_fn=nn.SoftmaxLoss): 119 | """ 120 | Performs {n_epochs} epochs of training. 121 | 122 | Args: 123 | dataloader: Dataloader instance 124 | model: nn.Module instance 125 | n_epochs: number of epochs (int) 126 | optimizer: Optimizer class 127 | lr: learning rate (float) 128 | weight_decay: weight decay (float) 129 | loss_fn: nn.Module class 130 | 131 | Returns: 132 | avg_acc: average accuracy over dataset from last epoch of training 133 | avg_loss: average loss over dataset from last epoch of training 134 | """ 135 | np.random.seed(4) 136 | ### BEGIN YOUR SOLUTION 137 | raise NotImplementedError() 138 | ### END YOUR SOLUTION 139 | 140 | 141 | def evaluate_cifar10(model, dataloader, loss_fn=nn.SoftmaxLoss): 142 | """ 143 | Computes the test accuracy and loss of the model. 144 | 145 | Args: 146 | dataloader: Dataloader instance 147 | model: nn.Module instance 148 | loss_fn: nn.Module class 149 | 150 | Returns: 151 | avg_acc: average accuracy over dataset 152 | avg_loss: average loss over dataset 153 | """ 154 | np.random.seed(4) 155 | ### BEGIN YOUR SOLUTION 156 | raise NotImplementedError() 157 | ### END YOUR SOLUTION 158 | 159 | 160 | ### PTB training ### 161 | def epoch_general_ptb(data, model, seq_len=40, loss_fn=nn.SoftmaxLoss(), opt=None, 162 | clip=None, device=None, dtype="float32"): 163 | """ 164 | Iterates over the data. If optimizer is not None, sets the 165 | model to train mode, and for each batch updates the model parameters. 166 | If optimizer is None, sets the model to eval mode, and simply computes 167 | the loss/accuracy. 168 | 169 | Args: 170 | data: data of shape (nbatch, batch_size) given from batchify function 171 | model: LanguageModel instance 172 | seq_len: i.e. 
bptt, sequence length 173 | loss_fn: nn.Module instance 174 | opt: Optimizer instance (optional) 175 | clip: max norm of gradients (optional) 176 | 177 | Returns: 178 | avg_acc: average accuracy over dataset 179 | avg_loss: average loss over dataset 180 | """ 181 | np.random.seed(4) 182 | ### BEGIN YOUR SOLUTION 183 | raise NotImplementedError() 184 | ### END YOUR SOLUTION 185 | 186 | 187 | def train_ptb(model, data, seq_len=40, n_epochs=1, optimizer=ndl.optim.SGD, 188 | lr=4.0, weight_decay=0.0, loss_fn=nn.SoftmaxLoss, clip=None, 189 | device=None, dtype="float32"): 190 | """ 191 | Performs {n_epochs} epochs of training. 192 | 193 | Args: 194 | model: LanguageModel instance 195 | data: data of shape (nbatch, batch_size) given from batchify function 196 | seq_len: i.e. bptt, sequence length 197 | n_epochs: number of epochs (int) 198 | optimizer: Optimizer class 199 | lr: learning rate (float) 200 | weight_decay: weight decay (float) 201 | loss_fn: nn.Module class 202 | clip: max norm of gradients (optional) 203 | 204 | Returns: 205 | avg_acc: average accuracy over dataset from last epoch of training 206 | avg_loss: average loss over dataset from last epoch of training 207 | """ 208 | np.random.seed(4) 209 | ### BEGIN YOUR SOLUTION 210 | raise NotImplementedError() 211 | ### END YOUR SOLUTION 212 | 213 | def evaluate_ptb(model, data, seq_len=40, loss_fn=nn.SoftmaxLoss, 214 | device=None, dtype="float32"): 215 | """ 216 | Computes the test accuracy and loss of the model. 217 | 218 | Args: 219 | model: LanguageModel instance 220 | data: data of shape (nbatch, batch_size) given from batchify function 221 | seq_len: i.e. bptt, sequence length 222 | loss_fn: nn.Module class 223 | 224 | Returns: 225 | avg_acc: average accuracy over dataset 226 | avg_loss: average loss over dataset 227 | """ 228 | np.random.seed(4) 229 | ### BEGIN YOUR SOLUTION 230 | raise NotImplementedError() 231 | ### END YOUR SOLUTION 232 | 233 | ### CODE BELOW IS FOR ILLUSTRATION, YOU DO NOT NEED TO EDIT 234 | 235 | 236 | def loss_err(h, y): 237 | """Helper function to compute both loss and error""" 238 | y_one_hot = np.zeros((y.shape[0], h.shape[-1])) 239 | y_one_hot[np.arange(y.size), y] = 1 240 | y_ = ndl.Tensor(y_one_hot) 241 | return softmax_loss(h, y_).numpy(), np.mean(h.numpy().argmax(axis=1) != y) 242 | -------------------------------------------------------------------------------- /python/needle/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ops 2 | from .ops import * 3 | from .autograd import Tensor, cpu, all_devices 4 | 5 | from . import init 6 | from .init import ones, zeros, zeros_like, ones_like 7 | 8 | from . import data 9 | from . import nn 10 | from . 
import optim 11 | from .backend_selection import * 12 | -------------------------------------------------------------------------------- /python/needle/autograd.py: -------------------------------------------------------------------------------- 1 | """Core data structures.""" 2 | import needle 3 | from .backend_numpy import Device, cpu, all_devices 4 | from typing import List, Optional, NamedTuple, Tuple, Union 5 | from collections import namedtuple 6 | import numpy 7 | 8 | from needle import init 9 | 10 | # needle version 11 | LAZY_MODE = False 12 | TENSOR_COUNTER = 0 13 | 14 | # NOTE: we will import numpy as the array_api 15 | # as the backend for our computations, this line will change in later homeworks 16 | 17 | import numpy as array_api 18 | NDArray = numpy.ndarray 19 | 20 | from .backend_selection import array_api, NDArray, default_device 21 | 22 | class Op: 23 | """Operator definition.""" 24 | 25 | def __call__(self, *args): 26 | raise NotImplementedError() 27 | 28 | def compute(self, *args: Tuple[NDArray]): 29 | """Calculate forward pass of operator. 30 | 31 | Parameters 32 | ---------- 33 | input: np.ndarray 34 | A list of input arrays to the function 35 | 36 | Returns 37 | ------- 38 | output: nd.array 39 | Array output of the operation 40 | 41 | """ 42 | raise NotImplementedError() 43 | 44 | def gradient( 45 | self, out_grad: "Value", node: "Value" 46 | ) -> Union["Value", Tuple["Value"]]: 47 | """Compute partial adjoint for each input value for a given output adjoint. 48 | 49 | Parameters 50 | ---------- 51 | out_grad: Value 52 | The adjoint wrt to the output value. 53 | 54 | node: Value 55 | The value node of forward evaluation. 56 | 57 | Returns 58 | ------- 59 | input_grads: Value or Tuple[Value] 60 | A list containing partial gradient adjoints to be propagated to 61 | each of the input node. 
62 | """ 63 | raise NotImplementedError() 64 | 65 | def gradient_as_tuple(self, out_grad: "Value", node: "Value") -> Tuple["Value"]: 66 | """Convenience method to always return a tuple from gradient call""" 67 | output = self.gradient(out_grad, node) 68 | if isinstance(output, tuple): 69 | return output 70 | elif isinstance(output, list): 71 | return tuple(output) 72 | else: 73 | return (output,) 74 | 75 | 76 | class TensorOp(Op): 77 | """Op class specialized to output tensors, will be alternate subclasses for other structures""" 78 | 79 | def __call__(self, *args): 80 | return Tensor.make_from_op(self, args) 81 | 82 | 83 | class TensorTupleOp(Op): 84 | """Op class specialized to output TensorTuple""" 85 | 86 | def __call__(self, *args): 87 | return TensorTuple.make_from_op(self, args) 88 | 89 | 90 | class Value: 91 | """A value in the computational graph.""" 92 | 93 | # trace of computational graph 94 | op: Optional[Op] 95 | inputs: List["Value"] 96 | # The following fields are cached fields for 97 | # dynamic computation 98 | cached_data: NDArray 99 | requires_grad: bool 100 | 101 | def realize_cached_data(self): 102 | """Run compute to realize the cached data""" 103 | # avoid recomputation 104 | if self.cached_data is not None: 105 | return self.cached_data 106 | # note: data implicitly calls realized cached data 107 | self.cached_data = self.op.compute( 108 | *[x.realize_cached_data() for x in self.inputs] 109 | ) 110 | return self.cached_data 111 | 112 | def is_leaf(self): 113 | return self.op is None 114 | 115 | def __del__(self): 116 | global TENSOR_COUNTER 117 | TENSOR_COUNTER -= 1 118 | 119 | def _init( 120 | self, 121 | op: Optional[Op], 122 | inputs: List["Tensor"], 123 | *, 124 | num_outputs: int = 1, 125 | cached_data: List[object] = None, 126 | requires_grad: Optional[bool] = None 127 | ): 128 | global TENSOR_COUNTER 129 | TENSOR_COUNTER += 1 130 | if requires_grad is None: 131 | requires_grad = any(x.requires_grad for x in inputs) 132 | self.op = op 133 | self.inputs = inputs 134 | self.num_outputs = num_outputs 135 | self.cached_data = cached_data 136 | self.requires_grad = requires_grad 137 | 138 | @classmethod 139 | def make_const(cls, data, *, requires_grad=False): 140 | value = cls.__new__(cls) 141 | value._init( 142 | None, 143 | [], 144 | cached_data=data, 145 | requires_grad=requires_grad, 146 | ) 147 | return value 148 | 149 | @classmethod 150 | def make_from_op(cls, op: Op, inputs: List["Value"]): 151 | value = cls.__new__(cls) 152 | value._init(op, inputs) 153 | 154 | if not LAZY_MODE: 155 | if not value.requires_grad: 156 | return value.detach() 157 | value.realize_cached_data() 158 | return value 159 | 160 | 161 | ### Not needed in HW1 162 | class TensorTuple(Value): 163 | """Represent a tuple of tensors. 164 | 165 | To keep things simple, we do not support nested tuples. 
166 | """ 167 | 168 | def __len__(self): 169 | cdata = self.realize_cached_data() 170 | return len(cdata) 171 | 172 | def __getitem__(self, index: int): 173 | return needle.ops.tuple_get_item(self, index) 174 | 175 | def tuple(self): 176 | return tuple([x for x in self]) 177 | 178 | def __repr__(self): 179 | return "needle.TensorTuple" + str(self.tuple()) 180 | 181 | def __str__(self): 182 | return self.__repr__() 183 | 184 | def __add__(self, other): 185 | assert isinstance(other, TensorTuple) 186 | assert len(self) == len(other) 187 | return needle.ops.make_tuple(*[self[i] + other[i] for i in range(len(self))]) 188 | 189 | def detach(self): 190 | """Create a new tensor that shares the data but detaches from the graph.""" 191 | return TensorTuple.make_const(self.realize_cached_data()) 192 | 193 | 194 | class Tensor(Value): 195 | grad: "Tensor" 196 | 197 | def __init__( 198 | self, 199 | array, 200 | *, 201 | device: Optional[Device] = None, 202 | dtype=None, 203 | requires_grad=True, 204 | **kwargs 205 | ): 206 | if isinstance(array, Tensor): 207 | if device is None: 208 | device = array.device 209 | if dtype is None: 210 | dtype = array.dtype 211 | if device == array.device and dtype == array.dtype: 212 | cached_data = array.realize_cached_data() 213 | else: 214 | # fall back, copy through numpy conversion 215 | cached_data = Tensor._array_from_numpy( 216 | array.numpy(), device=device, dtype=dtype 217 | ) 218 | else: 219 | device = device if device else default_device() 220 | cached_data = Tensor._array_from_numpy(array, device=device, dtype=dtype) 221 | 222 | self._init( 223 | None, 224 | [], 225 | cached_data=cached_data, 226 | requires_grad=requires_grad, 227 | ) 228 | 229 | @staticmethod 230 | def _array_from_numpy(numpy_array, device, dtype): 231 | if array_api is numpy: 232 | return numpy.array(numpy_array, dtype=dtype) 233 | return array_api.array(numpy_array, device=device, dtype=dtype) 234 | 235 | @staticmethod 236 | def make_from_op(op: Op, inputs: List["Value"]): 237 | tensor = Tensor.__new__(Tensor) 238 | tensor._init(op, inputs) 239 | if not LAZY_MODE: 240 | if not tensor.requires_grad: 241 | return tensor.detach() 242 | tensor.realize_cached_data() 243 | return tensor 244 | 245 | @staticmethod 246 | def make_const(data, requires_grad=False): 247 | tensor = Tensor.__new__(Tensor) 248 | tensor._init( 249 | None, 250 | [], 251 | cached_data=data 252 | if not isinstance(data, Tensor) 253 | else data.realize_cached_data(), 254 | requires_grad=requires_grad, 255 | ) 256 | return tensor 257 | 258 | @property 259 | def data(self): 260 | return self.detach() 261 | 262 | @data.setter 263 | def data(self, value): 264 | assert isinstance(value, Tensor) 265 | assert value.dtype == self.dtype, "%s %s" % ( 266 | value.dtype, 267 | self.dtype, 268 | ) 269 | self.cached_data = value.realize_cached_data() 270 | 271 | def detach(self): 272 | """Create a new tensor that shares the data but detaches from the graph.""" 273 | return Tensor.make_const(self.realize_cached_data()) 274 | 275 | @property 276 | def shape(self): 277 | return self.realize_cached_data().shape 278 | 279 | @property 280 | def dtype(self): 281 | return self.realize_cached_data().dtype 282 | 283 | @property 284 | def device(self): 285 | data = self.realize_cached_data() 286 | # numpy array always sits on cpu 287 | if array_api is numpy: 288 | return cpu() 289 | return data.device 290 | 291 | def backward(self, out_grad=None): 292 | out_grad = ( 293 | out_grad 294 | if out_grad 295 | else init.ones(*self.shape, 
dtype=self.dtype, device=self.device) 296 | ) 297 | compute_gradient_of_variables(self, out_grad) 298 | 299 | def __repr__(self): 300 | return "needle.Tensor(" + str(self.realize_cached_data()) + ")" 301 | 302 | def __str__(self): 303 | return self.realize_cached_data().__str__() 304 | 305 | def numpy(self): 306 | data = self.realize_cached_data() 307 | if array_api is numpy: 308 | return data 309 | return data.numpy() 310 | 311 | def __add__(self, other): 312 | if isinstance(other, Tensor): 313 | return needle.ops.EWiseAdd()(self, other) 314 | else: 315 | return needle.ops.AddScalar(other)(self) 316 | 317 | def __mul__(self, other): 318 | if isinstance(other, Tensor): 319 | return needle.ops.EWiseMul()(self, other) 320 | else: 321 | return needle.ops.MulScalar(other)(self) 322 | 323 | def __pow__(self, other): 324 | if isinstance(other, Tensor): 325 | return needle.ops.EWisePow()(self, other) 326 | else: 327 | return needle.ops.PowerScalar(other)(self) 328 | 329 | def __sub__(self, other): 330 | if isinstance(other, Tensor): 331 | return needle.ops.EWiseAdd()(self, needle.ops.Negate()(other)) 332 | else: 333 | return needle.ops.AddScalar(-other)(self) 334 | 335 | def __truediv__(self, other): 336 | if isinstance(other, Tensor): 337 | return needle.ops.EWiseDiv()(self, other) 338 | else: 339 | return needle.ops.DivScalar(other)(self) 340 | 341 | def __matmul__(self, other): 342 | return needle.ops.MatMul()(self, other) 343 | 344 | def matmul(self, other): 345 | return needle.ops.MatMul()(self, other) 346 | 347 | def sum(self, axes=None): 348 | return needle.ops.Summation(axes)(self) 349 | 350 | def broadcast_to(self, shape): 351 | return needle.ops.BroadcastTo(shape)(self) 352 | 353 | def reshape(self, shape): 354 | return needle.ops.Reshape(shape)(self) 355 | 356 | def __neg__(self): 357 | return needle.ops.Negate()(self) 358 | 359 | def transpose(self, axes=None): 360 | return needle.ops.Transpose(axes)(self) 361 | 362 | 363 | 364 | 365 | __radd__ = __add__ 366 | __rmul__ = __mul__ 367 | 368 | def compute_gradient_of_variables(output_tensor, out_grad): 369 | """Take gradient of output node with respect to each node in node_list. 370 | 371 | Store the computed result in the grad field of each Variable. 372 | """ 373 | # a map from node to a list of gradient contributions from each output node 374 | node_to_output_grads_list: Dict[Tensor, List[Tensor]] = {} 375 | # Special note on initializing gradient of 376 | # We are really taking a derivative of the scalar reduce_sum(output_node) 377 | # instead of the vector output_node. But this is the common case for loss function. 378 | node_to_output_grads_list[output_tensor] = [out_grad] 379 | 380 | # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt. 381 | reverse_topo_order = list(reversed(find_topo_sort([output_tensor]))) 382 | 383 | ### BEGIN YOUR SOLUTION 384 | raise NotImplementedError() 385 | ### END YOUR SOLUTION 386 | 387 | 388 | def find_topo_sort(node_list: List[Value]) -> List[Value]: 389 | """Given a list of nodes, return a topological sort list of nodes ending in them. 390 | 391 | A simple algorithm is to do a post-order DFS traversal on the given nodes, 392 | going backwards based on input edges. Since a node is added to the ordering 393 | after all its predecessors are traversed due to post-order DFS, we get a topological 394 | sort. 
395 | """ 396 | ### BEGIN YOUR SOLUTION 397 | raise NotImplementedError() 398 | ### END YOUR SOLUTION 399 | 400 | 401 | def topo_sort_dfs(node, visited, topo_order): 402 | """Post-order DFS""" 403 | ### BEGIN YOUR SOLUTION 404 | raise NotImplementedError() 405 | ### END YOUR SOLUTION 406 | 407 | 408 | ############################## 409 | ####### Helper Methods ####### 410 | ############################## 411 | 412 | 413 | def sum_node_list(node_list): 414 | """Custom sum function in order to avoid create redundant nodes in Python sum implementation.""" 415 | from operator import add 416 | from functools import reduce 417 | 418 | return reduce(add, node_list) 419 | -------------------------------------------------------------------------------- /python/needle/backend_ndarray/__init__.py: -------------------------------------------------------------------------------- 1 | from .ndarray import * 2 | -------------------------------------------------------------------------------- /python/needle/backend_ndarray/ndarray.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | from functools import reduce 4 | import numpy as np 5 | from . import ndarray_backend_numpy 6 | from . import ndarray_backend_cpu 7 | 8 | 9 | # math.prod not in Python 3.7 10 | def prod(x): 11 | return reduce(operator.mul, x, 1) 12 | 13 | 14 | class BackendDevice: 15 | """A backend device, wrapps the implementation module.""" 16 | 17 | def __init__(self, name, mod): 18 | self.name = name 19 | self.mod = mod 20 | 21 | def __eq__(self, other): 22 | return self.name == other.name 23 | 24 | def __repr__(self): 25 | return self.name + "()" 26 | 27 | def __getattr__(self, name): 28 | return getattr(self.mod, name) 29 | 30 | def enabled(self): 31 | return self.mod is not None 32 | 33 | def randn(self, *shape, dtype="float32"): 34 | # note: numpy doesn't support types within standard random routines, and 35 | # .astype("float32") does work if we're generating a singleton 36 | return NDArray(np.random.randn(*shape).astype(dtype), device=self) 37 | 38 | def rand(self, *shape, dtype="float32"): 39 | # note: numpy doesn't support types within standard random routines, and 40 | # .astype("float32") does work if we're generating a singleton 41 | return NDArray(np.random.rand(*shape).astype(dtype), device=self) 42 | 43 | def one_hot(self, n, i, dtype="float32"): 44 | return NDArray(np.eye(n, dtype=dtype)[i], device=self) 45 | 46 | def empty(self, shape, dtype="float32"): 47 | dtype = "float32" if dtype is None else dtype 48 | assert dtype == "float32" 49 | return NDArray.make(shape, device=self) 50 | 51 | def full(self, shape, fill_value, dtype="float32"): 52 | dtype = "float32" if dtype is None else dtype 53 | assert dtype == "float32" 54 | arr = self.empty(shape, dtype) 55 | arr.fill(fill_value) 56 | return arr 57 | 58 | 59 | def cuda(): 60 | """Return cuda device""" 61 | try: 62 | from . 
import ndarray_backend_cuda 63 | 64 | return BackendDevice("cuda", ndarray_backend_cuda) 65 | except ImportError: 66 | return BackendDevice("cuda", None) 67 | 68 | 69 | def cpu_numpy(): 70 | """Return numpy device""" 71 | return BackendDevice("cpu_numpy", ndarray_backend_numpy) 72 | 73 | 74 | def cpu(): 75 | """Return cpu device""" 76 | return BackendDevice("cpu", ndarray_backend_cpu) 77 | 78 | 79 | def default_device(): 80 | return cpu_numpy() 81 | 82 | 83 | def all_devices(): 84 | """return a list of all available devices""" 85 | return [cpu(), cuda(), cpu_numpy()] 86 | 87 | 88 | class NDArray: 89 | """A generic ND array class that may contain multipe different backends 90 | i.e., a Numpy backend, a native CPU backend, or a GPU backend. 91 | 92 | This class will only contains those functions that you need to implement 93 | to actually get the desired functionality for the programming examples 94 | in the homework, and no more. 95 | 96 | For now, for simplicity the class only supports float32 types, though 97 | this can be extended if desired. 98 | """ 99 | 100 | def __init__(self, other, device=None): 101 | """Create by copying another NDArray, or from numpy""" 102 | if isinstance(other, NDArray): 103 | # create a copy of existing NDArray 104 | if device is None: 105 | device = other.device 106 | self._init(other.to(device) + 0.0) # this creates a copy 107 | elif isinstance(other, np.ndarray): 108 | # create copy from numpy array 109 | device = device if device is not None else default_device() 110 | array = self.make(other.shape, device=device) 111 | array.device.from_numpy(np.ascontiguousarray(other), array._handle) 112 | self._init(array) 113 | else: 114 | # see if we can create a numpy array from input 115 | array = NDArray(np.array(other), device=device) 116 | self._init(array) 117 | 118 | def _init(self, other): 119 | self._shape = other._shape 120 | self._strides = other._strides 121 | self._offset = other._offset 122 | self._device = other._device 123 | self._handle = other._handle 124 | 125 | @staticmethod 126 | def compact_strides(shape): 127 | """Utility function to compute compact strides""" 128 | stride = 1 129 | res = [] 130 | for i in range(1, len(shape) + 1): 131 | res.append(stride) 132 | stride *= shape[-i] 133 | return tuple(res[::-1]) 134 | 135 | @staticmethod 136 | def make(shape, strides=None, device=None, handle=None, offset=0): 137 | """Create a new NDArray with the given properties. 
This will allocate the 138 | memory if handle=None, otherwise it will use the handle of an existing 139 | array.""" 140 | array = NDArray.__new__(NDArray) 141 | array._shape = tuple(shape) 142 | array._strides = NDArray.compact_strides(shape) if strides is None else strides 143 | array._offset = offset 144 | array._device = device if device is not None else default_device() 145 | if handle is None: 146 | array._handle = array.device.Array(prod(shape)) 147 | else: 148 | array._handle = handle 149 | return array 150 | 151 | ### Properties and string representations 152 | @property 153 | def shape(self): 154 | return self._shape 155 | 156 | @property 157 | def strides(self): 158 | return self._strides 159 | 160 | @property 161 | def device(self): 162 | return self._device 163 | 164 | @property 165 | def dtype(self): 166 | # only support float32 for now 167 | return "float32" 168 | 169 | @property 170 | def ndim(self): 171 | """Return number of dimensions.""" 172 | return len(self._shape) 173 | 174 | @property 175 | def size(self): 176 | return prod(self._shape) 177 | 178 | def __repr__(self): 179 | return "NDArray(" + self.numpy().__str__() + f", device={self.device})" 180 | 181 | def __str__(self): 182 | return self.numpy().__str__() 183 | 184 | ### Basic array manipulation 185 | def fill(self, value): 186 | """Fill (in place) with a constant value.""" 187 | self._device.fill(self._handle, value) 188 | 189 | def to(self, device): 190 | """Convert between devices, using to/from numpy calls as the unifying bridge.""" 191 | if device == self.device: 192 | return self 193 | else: 194 | return NDArray(self.numpy(), device=device) 195 | 196 | def numpy(self): 197 | """convert to a numpy array""" 198 | return self.device.to_numpy( 199 | self._handle, self.shape, self.strides, self._offset 200 | ) 201 | 202 | def is_compact(self): 203 | """Return true if array is compact in memory and internal size equals product 204 | of the shape dimensions""" 205 | return ( 206 | self._strides == self.compact_strides(self._shape) 207 | and prod(self.shape) == self._handle.size 208 | ) 209 | 210 | def compact(self): 211 | """Convert a matrix to be compact""" 212 | if self.is_compact(): 213 | return self 214 | else: 215 | out = NDArray.make(self.shape, device=self.device) 216 | self.device.compact( 217 | self._handle, out._handle, self.shape, self.strides, self._offset 218 | ) 219 | return out 220 | 221 | def as_strided(self, shape, strides): 222 | """Restride the matrix without copying memory.""" 223 | assert len(shape) == len(strides) 224 | return NDArray.make( 225 | shape, strides=strides, device=self.device, handle=self._handle, offset=self._offset 226 | ) 227 | 228 | @property 229 | def flat(self): 230 | return self.reshape((self.size,)) 231 | 232 | def reshape(self, new_shape): 233 | """ 234 | Reshape the matrix without copying memory. This will return a matrix 235 | that corresponds to a reshaped array but points to the same memory as 236 | the original array. 237 | 238 | Raises: 239 | ValueError if product of current shape is not equal to the product 240 | of the new shape, or if the matrix is not compact. 241 | 242 | Args: 243 | new_shape (tuple): new shape of the array 244 | 245 | Returns: 246 | NDArray : reshaped array; this will point to the same memory as the original array. 247 | """ 248 | 249 | ### BEGIN YOUR SOLUTION 250 | raise NotImplementedError() 251 | ### END YOUR SOLUTION 252 | 253 | def permute(self, new_axes): 254 | """ 255 | Permute order of the dimensions.
new_axes describes a permutation of the 256 | existing axes, so e.g.: 257 | - If we have an array with dimension "BHWC" then .permute((0,3,1,2)) 258 | would convert this to "BCHW" order. 259 | - For a 2D array, .permute((1,0)) would transpose the array. 260 | Like reshape, this operation should not copy memory, but achieves the 261 | permuting by just adjusting the shape/strides of the array. That is, 262 | it returns a new array that has the dimensions permuted as desired, but 263 | which points to the same memory as the original array. 264 | 265 | Args: 266 | new_axes (tuple): permutation order of the dimensions 267 | 268 | Returns: 269 | NDArray : new NDArray object with permuted dimensions, pointing 270 | to the same memory as the original NDArray (i.e., just shape and 271 | strides changed). 272 | """ 273 | 274 | ### BEGIN YOUR SOLUTION 275 | raise NotImplementedError() 276 | ### END YOUR SOLUTION 277 | 278 | def broadcast_to(self, new_shape): 279 | """ 280 | Broadcast an array to a new shape. new_shape's elements must be the 281 | same as the original shape, except for dimensions in self where 282 | the size = 1 (which can then be broadcast to any size). As with the 283 | previous calls, this will not copy memory, and just achieves 284 | broadcasting by manipulating the strides. 285 | 286 | Raises: 287 | assertion error if new_shape[i] != shape[i] for any i where 288 | shape[i] != 1 289 | 290 | Args: 291 | new_shape (tuple): shape to broadcast to 292 | 293 | Returns: 294 | NDArray: the new NDArray object with the new broadcast shape; should 295 | point to the same memory as the original array. 296 | """ 297 | 298 | ### BEGIN YOUR SOLUTION 299 | raise NotImplementedError() 300 | ### END YOUR SOLUTION 301 | 302 | ### Get and set elements 303 | 304 | def process_slice(self, sl, dim): 305 | """Convert a slice to an explicit start/stop/step""" 306 | start, stop, step = sl.start, sl.stop, sl.step 307 | if start == None: 308 | start = 0 309 | if start < 0: 310 | start = self.shape[dim] 311 | if stop == None: 312 | stop = self.shape[dim] 313 | if stop < 0: 314 | stop = self.shape[dim] + stop 315 | if step == None: 316 | step = 1 317 | 318 | # we're not gonna handle negative strides and that kind of thing 319 | assert stop > start, "Start must be less than stop" 320 | assert step > 0, "No support for negative increments" 321 | return slice(start, stop, step) 322 | 323 | def __getitem__(self, idxs): 324 | """ 325 | The __getitem__ operator in Python allows us to access elements of our 326 | array. When passed notation such as a[1:5,:-1:2,4,:] etc, Python will 327 | convert this to a tuple of slices and integers (for singletons like the 328 | '4' in this example). Slices can be a bit odd to work with (they have 329 | three elements .start .stop .step), which can be None or have negative 330 | entries, so for simplicity we wrote the code for you to convert these 331 | to always be a tuple of slices, one for each dimension. 332 | 333 | For this tuple of slices, return an array that subsets the desired 334 | elements. As before, this can be done entirely by computing a new 335 | shape, stride, and offset for the new "view" into the original array, 336 | pointing to the same memory. 337 | 338 | Raises: 339 | AssertionError if a slice has negative size or step, or if number 340 | of slices is not equal to the number of dimensions (the stub code 341 | already raises all these errors).
342 | 343 | Args: 344 | idxs tuple: (after stub code processes), a tuple of slice elements 345 | coresponding to the subset of the matrix to get 346 | 347 | Returns: 348 | NDArray: a new NDArray object corresponding to the selected 349 | subset of elements. As before, this should not copy memroy but just 350 | manipulate the shape/strides/offset of the new array, referecing 351 | the same array as the original one. 352 | """ 353 | 354 | # handle singleton as tuple, everything as slices 355 | if not isinstance(idxs, tuple): 356 | idxs = (idxs,) 357 | idxs = tuple( 358 | [ 359 | self.process_slice(s, i) if isinstance(s, slice) else slice(s, s + 1, 1) 360 | for i, s in enumerate(idxs) 361 | ] 362 | ) 363 | assert len(idxs) == self.ndim, "Need indexes equal to number of dimensions" 364 | 365 | ### BEGIN YOUR SOLUTION 366 | raise NotImplementedError() 367 | ### END YOUR SOLUTION 368 | 369 | def __setitem__(self, idxs, other): 370 | """Set the values of a view into an array, using the same semantics 371 | as __getitem__().""" 372 | view = self.__getitem__(idxs) 373 | if isinstance(other, NDArray): 374 | assert prod(view.shape) == prod(other.shape) 375 | self.device.ewise_setitem( 376 | other.compact()._handle, 377 | view._handle, 378 | view.shape, 379 | view.strides, 380 | view._offset, 381 | ) 382 | else: 383 | self.device.scalar_setitem( 384 | prod(view.shape), 385 | other, 386 | view._handle, 387 | view.shape, 388 | view.strides, 389 | view._offset, 390 | ) 391 | 392 | ### Collection of elementwise and scalar function: add, multiply, boolean, etc 393 | 394 | def ewise_or_scalar(self, other, ewise_func, scalar_func): 395 | """Run either an elementwise or scalar version of a function, 396 | depending on whether "other" is an NDArray or scalar 397 | """ 398 | out = NDArray.make(self.shape, device=self.device) 399 | if isinstance(other, NDArray): 400 | assert self.shape == other.shape, "operation needs two equal-sized arrays" 401 | ewise_func(self.compact()._handle, other.compact()._handle, out._handle) 402 | else: 403 | scalar_func(self.compact()._handle, other, out._handle) 404 | return out 405 | 406 | def __add__(self, other): 407 | return self.ewise_or_scalar( 408 | other, self.device.ewise_add, self.device.scalar_add 409 | ) 410 | 411 | __radd__ = __add__ 412 | 413 | def __sub__(self, other): 414 | return self + (-other) 415 | 416 | def __rsub__(self, other): 417 | return other + (-self) 418 | 419 | def __mul__(self, other): 420 | return self.ewise_or_scalar( 421 | other, self.device.ewise_mul, self.device.scalar_mul 422 | ) 423 | 424 | __rmul__ = __mul__ 425 | 426 | def __truediv__(self, other): 427 | return self.ewise_or_scalar( 428 | other, self.device.ewise_div, self.device.scalar_div 429 | ) 430 | 431 | def __neg__(self): 432 | return self * (-1) 433 | 434 | def __pow__(self, other): 435 | out = NDArray.make(self.shape, device=self.device) 436 | self.device.scalar_power(self.compact()._handle, other, out._handle) 437 | return out 438 | 439 | def maximum(self, other): 440 | return self.ewise_or_scalar( 441 | other, self.device.ewise_maximum, self.device.scalar_maximum 442 | ) 443 | 444 | ### Binary operators all return (0.0, 1.0) floating point values, could of course be optimized 445 | def __eq__(self, other): 446 | return self.ewise_or_scalar(other, self.device.ewise_eq, self.device.scalar_eq) 447 | 448 | def __ge__(self, other): 449 | return self.ewise_or_scalar(other, self.device.ewise_ge, self.device.scalar_ge) 450 | 451 | def __ne__(self, other): 452 | return 1 - (self == other) 
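    # Only ewise_eq / ewise_ge (and their scalar variants) are backend primitives;
    # the remaining comparisons below are composed arithmetically from the 0.0/1.0
    # masks they return: a != b is 1 - (a == b), a > b is (a >= b) * (a != b),
    # a < b is 1 - (a >= b), and a <= b is 1 - (a > b).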
453 | 454 | def __gt__(self, other): 455 | return (self >= other) * (self != other) 456 | 457 | def __lt__(self, other): 458 | return 1 - (self >= other) 459 | 460 | def __le__(self, other): 461 | return 1 - (self > other) 462 | 463 | ### Elementwise functions 464 | 465 | def log(self): 466 | out = NDArray.make(self.shape, device=self.device) 467 | self.device.ewise_log(self.compact()._handle, out._handle) 468 | return out 469 | 470 | def exp(self): 471 | out = NDArray.make(self.shape, device=self.device) 472 | self.device.ewise_exp(self.compact()._handle, out._handle) 473 | return out 474 | 475 | def tanh(self): 476 | out = NDArray.make(self.shape, device=self.device) 477 | self.device.ewise_tanh(self.compact()._handle, out._handle) 478 | return out 479 | 480 | ### Matrix multiplication 481 | def __matmul__(self, other): 482 | """Matrix multplication of two arrays. This requires that both arrays 483 | be 2D (i.e., we don't handle batch matrix multiplication), and that the 484 | sizes match up properly for matrix multiplication. 485 | 486 | In the case of the CPU backend, you will implement an efficient "tiled" 487 | version of matrix multiplication for the case when all dimensions of 488 | the array are divisible by self.device.__tile_size__. In this case, 489 | the code below will restride and compact the matrix into tiled form, 490 | and then pass to the relevant CPU backend. For the CPU version we will 491 | just fall back to the naive CPU implementation if the array shape is not 492 | a multiple of the tile size 493 | 494 | The GPU (and numpy) versions don't have any tiled version (or rather, 495 | the GPU version will just work natively by tiling any input size). 496 | """ 497 | 498 | assert self.ndim == 2 and other.ndim == 2 499 | assert self.shape[1] == other.shape[0] 500 | 501 | m, n, p = self.shape[0], self.shape[1], other.shape[1] 502 | 503 | # if the matrix is aligned, use tiled matrix multiplication 504 | if hasattr(self.device, "matmul_tiled") and all( 505 | d % self.device.__tile_size__ == 0 for d in (m, n, p) 506 | ): 507 | 508 | def tile(a, tile): 509 | return a.as_strided( 510 | (a.shape[0] // tile, a.shape[1] // tile, tile, tile), 511 | (a.shape[1] * tile, tile, a.shape[1], 1), 512 | ) 513 | 514 | t = self.device.__tile_size__ 515 | a = tile(self.compact(), t).compact() 516 | b = tile(other.compact(), t).compact() 517 | out = NDArray.make((a.shape[0], b.shape[1], t, t), device=self.device) 518 | self.device.matmul_tiled(a._handle, b._handle, out._handle, m, n, p) 519 | 520 | return ( 521 | out.permute((0, 2, 1, 3)) 522 | .compact() 523 | .reshape((self.shape[0], other.shape[1])) 524 | ) 525 | 526 | else: 527 | out = NDArray.make((m, p), device=self.device) 528 | self.device.matmul( 529 | self.compact()._handle, other.compact()._handle, out._handle, m, n, p 530 | ) 531 | return out 532 | 533 | ### Reductions, i.e., sum/max over all element or over given axis 534 | def reduce_view_out(self, axis, keepdims=False): 535 | """ Return a view to the array set up for reduction functions and output array. 
""" 536 | if isinstance(axis, tuple) and not axis: 537 | raise ValueError("Empty axis in reduce") 538 | 539 | if axis is None: 540 | view = self.compact().reshape((1,) * (self.ndim - 1) + (prod(self.shape),)) 541 | #out = NDArray.make((1,) * self.ndim, device=self.device) 542 | out = NDArray.make((1,), device=self.device) 543 | 544 | else: 545 | if isinstance(axis, (tuple, list)): 546 | assert len(axis) == 1, "Only support reduction over a single axis" 547 | axis = axis[0] 548 | 549 | view = self.permute( 550 | tuple([a for a in range(self.ndim) if a != axis]) + (axis,) 551 | ) 552 | out = NDArray.make( 553 | tuple([1 if i == axis else s for i, s in enumerate(self.shape)]) 554 | if keepdims else 555 | tuple([s for i, s in enumerate(self.shape) if i != axis]), 556 | device=self.device, 557 | ) 558 | return view, out 559 | 560 | def sum(self, axis=None, keepdims=False): 561 | view, out = self.reduce_view_out(axis, keepdims=keepdims) 562 | self.device.reduce_sum(view.compact()._handle, out._handle, view.shape[-1]) 563 | return out 564 | 565 | def max(self, axis=None, keepdims=False): 566 | view, out = self.reduce_view_out(axis, keepdims=keepdims) 567 | self.device.reduce_max(view.compact()._handle, out._handle, view.shape[-1]) 568 | return out 569 | 570 | def flip(self, axes): 571 | """ 572 | Flip this ndarray along the specified axes. 573 | Note: compact() before returning. 574 | """ 575 | ### BEGIN YOUR SOLUTION 576 | raise NotImplementedError() 577 | ### END YOUR SOLUTION 578 | 579 | def pad(self, axes): 580 | """ 581 | Pad this ndarray by zeros by the specified amount in `axes`, 582 | which lists for _all_ axes the left and right padding amount, e.g., 583 | axes = ( (0, 0), (1, 1), (0, 0)) pads the middle axis with a 0 on the left and right side. 584 | """ 585 | ### BEGIN YOUR SOLUTION 586 | raise NotImplementedError() 587 | ### END YOUR SOLUTION 588 | 589 | def array(a, dtype="float32", device=None): 590 | """Convenience methods to match numpy a bit more closely.""" 591 | dtype = "float32" if dtype is None else dtype 592 | assert dtype == "float32" 593 | return NDArray(a, device=device) 594 | 595 | 596 | def empty(shape, dtype="float32", device=None): 597 | device = device if device is not None else default_device() 598 | return device.empty(shape, dtype) 599 | 600 | 601 | def full(shape, fill_value, dtype="float32", device=None): 602 | device = device if device is not None else default_device() 603 | return device.full(shape, fill_value, dtype) 604 | 605 | 606 | def broadcast_to(array, new_shape): 607 | return array.broadcast_to(new_shape) 608 | 609 | 610 | def reshape(array, new_shape): 611 | return array.reshape(new_shape) 612 | 613 | 614 | def maximum(a, b): 615 | return a.maximum(b) 616 | 617 | 618 | def log(a): 619 | return a.log() 620 | 621 | 622 | def exp(a): 623 | return a.exp() 624 | 625 | 626 | def tanh(a): 627 | return a.tanh() 628 | 629 | 630 | def sum(a, axis=None, keepdims=False): 631 | return a.sum(axis=axis, keepdims=keepdims) 632 | 633 | 634 | def flip(a, axes): 635 | return a.flip(axes) 636 | -------------------------------------------------------------------------------- /python/needle/backend_ndarray/ndarray_backend_numpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | __device_name__ = "numpy" 5 | _datatype = np.float32 6 | _datetype_size = np.dtype(_datatype).itemsize 7 | 8 | 9 | class Array: 10 | def __init__(self, size): 11 | self.array = np.empty(size, dtype=np.float32) 12 | 13 | @property 
14 | def size(self): 15 | return self.array.size 16 | 17 | 18 | def to_numpy(a, shape, strides, offset): 19 | return np.lib.stride_tricks.as_strided( 20 | a.array[offset:], shape, tuple([s * _datetype_size for s in strides]) 21 | ) 22 | 23 | 24 | def from_numpy(a, out): 25 | out.array[:] = a.flatten() 26 | 27 | 28 | def fill(out, val): 29 | out.array.fill(val) 30 | 31 | 32 | def compact(a, out, shape, strides, offset): 33 | out.array[:] = to_numpy(a, shape, strides, offset).flatten() 34 | 35 | 36 | def ewise_setitem(a, out, shape, strides, offset): 37 | to_numpy(out, shape, strides, offset)[:] = a.array.reshape(shape) 38 | 39 | 40 | def scalar_setitem(size, val, out, shape, strides, offset): 41 | to_numpy(out, shape, strides, offset)[:] = val 42 | 43 | 44 | def ewise_add(a, b, out): 45 | out.array[:] = a.array + b.array 46 | 47 | 48 | def scalar_add(a, val, out): 49 | out.array[:] = a.array + val 50 | 51 | 52 | def ewise_mul(a, b, out): 53 | out.array[:] = a.array * b.array 54 | 55 | 56 | def scalar_mul(a, val, out): 57 | out.array[:] = a.array * val 58 | 59 | 60 | def ewise_div(a, b, out): 61 | out.array[:] = a.array / b.array 62 | 63 | 64 | def scalar_div(a, val, out): 65 | out.array[:] = a.array / val 66 | 67 | 68 | def scalar_power(a, val, out): 69 | out.array[:] = a.array**val 70 | 71 | 72 | def ewise_maximum(a, b, out): 73 | out.array[:] = np.maximum(a.array, b.array) 74 | 75 | 76 | def scalar_maximum(a, val, out): 77 | out.array[:] = np.maximum(a.array, val) 78 | 79 | 80 | def ewise_eq(a, b, out): 81 | out.array[:] = (a.array == b.array).astype(np.float32) 82 | 83 | 84 | def scalar_eq(a, val, out): 85 | out.array[:] = (a.array == val).astype(np.float32) 86 | 87 | 88 | def ewise_ge(a, b, out): 89 | out.array[:] = (a.array >= b.array).astype(np.float32) 90 | 91 | 92 | def scalar_ge(a, val, out): 93 | out.array[:] = (a.array >= val).astype(np.float32) 94 | 95 | 96 | def ewise_log(a, out): 97 | out.array[:] = np.log(a.array) 98 | 99 | 100 | def ewise_exp(a, out): 101 | out.array[:] = np.exp(a.array) 102 | 103 | 104 | def ewise_tanh(a, out): 105 | out.array[:] = np.tanh(a.array) 106 | 107 | 108 | def matmul(a, b, out, m, n, p): 109 | out.array[:] = (a.array.reshape(m, n) @ b.array.reshape(n, p)).reshape(-1) 110 | 111 | 112 | def reduce_max(a, out, reduce_size): 113 | out.array[:] = a.array[:].reshape(-1, reduce_size).max(axis=1) 114 | 115 | 116 | def reduce_sum(a, out, reduce_size): 117 | out.array[:] = a.array[:].reshape(-1, reduce_size).sum(axis=1) 118 | -------------------------------------------------------------------------------- /python/needle/backend_numpy.py: -------------------------------------------------------------------------------- 1 | """This file defies specific implementations of devices when using numpy as NDArray backend. 
2 | """ 3 | import numpy 4 | 5 | 6 | class Device: 7 | """Baseclass of all device""" 8 | 9 | 10 | class CPUDevice(Device): 11 | """Represents data that sits in CPU""" 12 | 13 | def __repr__(self): 14 | return "needle.cpu()" 15 | 16 | def __hash__(self): 17 | return self.__repr__().__hash__() 18 | 19 | def __eq__(self, other): 20 | return isinstance(other, CPUDevice) 21 | 22 | def enabled(self): 23 | return True 24 | 25 | def zeros(self, *shape, dtype="float32"): 26 | return numpy.zeros(shape, dtype=dtype) 27 | 28 | def ones(self, *shape, dtype="float32"): 29 | return numpy.ones(shape, dtype=dtype) 30 | 31 | def randn(self, *shape): 32 | # note: numpy doesn't support types within standard random routines, and 33 | # .astype("float32") does work if we're generating a singleton 34 | return numpy.random.randn(*shape) 35 | 36 | def rand(self, *shape): 37 | # note: numpy doesn't support types within standard random routines, and 38 | # .astype("float32") does work if we're generating a singleton 39 | return numpy.random.rand(*shape) 40 | 41 | def one_hot(self, n, i, dtype="float32"): 42 | return numpy.eye(n, dtype=dtype)[i] 43 | 44 | def empty(self, shape, dtype="float32"): 45 | return numpy.empty(shape, dtype=dtype) 46 | 47 | def full(self, shape, fill_value, dtype="float32"): 48 | return numpy.full(shape, fill_value, dtype=dtype) 49 | 50 | 51 | def cpu(): 52 | """Return cpu device""" 53 | return CPUDevice() 54 | 55 | 56 | def default_device(): 57 | return cpu() 58 | 59 | 60 | def all_devices(): 61 | """return a list of all available devices""" 62 | return [cpu()] 63 | -------------------------------------------------------------------------------- /python/needle/backend_selection.py: -------------------------------------------------------------------------------- 1 | """Logic for backend selection""" 2 | import os 3 | 4 | 5 | BACKEND = os.environ.get("NEEDLE_BACKEND", "nd") 6 | 7 | 8 | if BACKEND == "nd": 9 | print("Using needle backend") 10 | from . import backend_ndarray as array_api 11 | from .backend_ndarray import ( 12 | all_devices, 13 | cuda, 14 | cpu, 15 | cpu_numpy, 16 | default_device, 17 | BackendDevice as Device, 18 | ) 19 | 20 | NDArray = array_api.NDArray 21 | elif BACKEND == "np": 22 | print("Using numpy backend") 23 | import numpy as array_api 24 | from .backend_numpy import all_devices, cpu, default_device, Device 25 | 26 | NDArray = array_api.ndarray 27 | else: 28 | raise RuntimeError("Unknown needle array backend %s" % BACKEND) 29 | -------------------------------------------------------------------------------- /python/needle/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_basic import * 2 | from .data_transforms import * 3 | from .datasets import * 4 | -------------------------------------------------------------------------------- /python/needle/data/data_basic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..autograd import Tensor 3 | 4 | from typing import Iterator, Optional, List, Sized, Union, Iterable, Any 5 | 6 | 7 | 8 | class Dataset: 9 | r"""An abstract class representing a `Dataset`. 10 | 11 | All subclasses should overwrite :meth:`__getitem__`, supporting fetching a 12 | data sample for a given key. Subclasses must also overwrite 13 | :meth:`__len__`, which is expected to return the size of the dataset. 
14 | """ 15 | 16 | def __init__(self, transforms: Optional[List] = None): 17 | self.transforms = transforms 18 | 19 | def __getitem__(self, index) -> object: 20 | raise NotImplementedError 21 | 22 | def __len__(self) -> int: 23 | raise NotImplementedError 24 | 25 | def apply_transforms(self, x): 26 | if self.transforms is not None: 27 | # apply the transforms 28 | for tform in self.transforms: 29 | x = tform(x) 30 | return x 31 | 32 | 33 | class DataLoader: 34 | r""" 35 | Data loader. Combines a dataset and a sampler, and provides an iterable over 36 | the given dataset. 37 | Args: 38 | dataset (Dataset): dataset from which to load the data. 39 | batch_size (int, optional): how many samples per batch to load 40 | (default: ``1``). 41 | shuffle (bool, optional): set to ``True`` to have the data reshuffled 42 | at every epoch (default: ``False``). 43 | """ 44 | dataset: Dataset 45 | batch_size: Optional[int] 46 | 47 | def __init__( 48 | self, 49 | dataset: Dataset, 50 | batch_size: Optional[int] = 1, 51 | shuffle: bool = False, 52 | ): 53 | 54 | self.dataset = dataset 55 | self.shuffle = shuffle 56 | self.batch_size = batch_size 57 | if not self.shuffle: 58 | self.ordering = np.array_split(np.arange(len(dataset)), 59 | range(batch_size, len(dataset), batch_size)) 60 | 61 | def __iter__(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | return self 66 | 67 | def __next__(self): 68 | ### BEGIN YOUR SOLUTION 69 | raise NotImplementedError() 70 | ### END YOUR SOLUTION 71 | 72 | -------------------------------------------------------------------------------- /python/needle/data/data_transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Transform: 4 | def __call__(self, x): 5 | raise NotImplementedError 6 | 7 | 8 | class RandomFlipHorizontal(Transform): 9 | def __init__(self, p = 0.5): 10 | self.p = p 11 | 12 | def __call__(self, img): 13 | """ 14 | Horizonally flip an image, specified as an H x W x C NDArray. 15 | Args: 16 | img: H x W x C NDArray of an image 17 | Returns: 18 | H x W x C ndarray corresponding to image flipped with probability self.p 19 | Note: use the provided code to provide randomness, for easier testing 20 | """ 21 | flip_img = np.random.rand() < self.p 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION 25 | 26 | 27 | class RandomCrop(Transform): 28 | def __init__(self, padding=3): 29 | self.padding = padding 30 | 31 | def __call__(self, img): 32 | """ Zero pad and then randomly crop an image. 
33 | Args: 34 | img: H x W x C NDArray of an image 35 | Return 36 | H x W x C NAArray of cliped image 37 | Note: generate the image shifted by shift_x, shift_y specified below 38 | """ 39 | shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding+1, size=2) 40 | ### BEGIN YOUR SOLUTION 41 | raise NotImplementedError() 42 | ### END YOUR SOLUTION 43 | -------------------------------------------------------------------------------- /python/needle/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .mnist_dataset import * 2 | from .ndarray_dataset import * 3 | from .cifar10_dataset import * 4 | from .ptb_dataset import * 5 | -------------------------------------------------------------------------------- /python/needle/data/datasets/cifar10_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from typing import Iterator, Optional, List, Sized, Union, Iterable, Any 4 | import numpy as np 5 | from ..data_basic import Dataset 6 | 7 | class CIFAR10Dataset(Dataset): 8 | def __init__( 9 | self, 10 | base_folder: str, 11 | train: bool, 12 | p: Optional[int] = 0.5, 13 | transforms: Optional[List] = None 14 | ): 15 | """ 16 | Parameters: 17 | base_folder - cifar-10-batches-py folder filepath 18 | train - bool, if True load training dataset, else load test dataset 19 | Divide pixel values by 255. so that images are in 0-1 range. 20 | Attributes: 21 | X - numpy array of images 22 | y - numpy array of labels 23 | """ 24 | ### BEGIN YOUR SOLUTION 25 | raise NotImplementedError() 26 | ### END YOUR SOLUTION 27 | 28 | def __getitem__(self, index) -> object: 29 | """ 30 | Returns the image, label at given index 31 | Image should be of shape (3, 32, 32) 32 | """ 33 | ### BEGIN YOUR SOLUTION 34 | raise NotImplementedError() 35 | ### END YOUR SOLUTION 36 | 37 | def __len__(self) -> int: 38 | """ 39 | Returns the total number of examples in the dataset 40 | """ 41 | ### BEGIN YOUR SOLUTION 42 | raise NotImplementedError() 43 | ### END YOUR SOLUTION 44 | -------------------------------------------------------------------------------- /python/needle/data/datasets/mnist_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from ..data_basic import Dataset 3 | import numpy as np 4 | 5 | class MNISTDataset(Dataset): 6 | def __init__( 7 | self, 8 | image_filename: str, 9 | label_filename: str, 10 | transforms: Optional[List] = None, 11 | ): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def __getitem__(self, index) -> object: 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | def __len__(self) -> int: 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/data/datasets/ndarray_dataset.py: -------------------------------------------------------------------------------- 1 | from ..data_basic import Dataset 2 | 3 | class NDArrayDataset(Dataset): 4 | def __init__(self, *arrays): 5 | self.arrays = arrays 6 | 7 | def __len__(self) -> int: 8 | return self.arrays[0].shape[0] 9 | 10 | def __getitem__(self, i) -> object: 11 | return tuple([a[i] for a in self.arrays]) -------------------------------------------------------------------------------- 
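The transform and `DataLoader` stubs above all follow the same pattern, so before moving on to the PTB dataset, here is one possible way to fill them in. This is a hedged sketch rather than the reference solution: it assumes `img` is an H x W x C NumPy array (as the docstrings state), that each dataset item is a tuple of per-field arrays such as `(image, label)`, and that a batch should come back as a tuple of `Tensor`s; the free-standing helper names (`flip_call`, `loader_iter`, ...) are purely illustrative stand-ins for the corresponding methods.

```python
import numpy as np
from needle.autograd import Tensor

# RandomFlipHorizontal.__call__: flip the width axis with probability p
# (`flip_img` is the randomness provided by the stub code).
def flip_call(self, img):
    flip_img = np.random.rand() < self.p
    return img[:, ::-1, :] if flip_img else img

# RandomCrop.__call__: zero-pad H and W by self.padding, then take the H x W x C
# window shifted by the (shift_x, shift_y) generated by the stub code.
def crop_call(self, img):
    shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding + 1, size=2)
    H, W, C = img.shape
    p = self.padding
    padded = np.zeros((H + 2 * p, W + 2 * p, C), dtype=img.dtype)
    padded[p:p + H, p:p + W, :] = img
    return padded[p + shift_x:p + shift_x + H, p + shift_y:p + shift_y + W, :]

# DataLoader.__iter__ / __next__: reshuffle the batch ordering each epoch when
# shuffle=True, then emit one batch per __next__ call.
def loader_iter(self):
    if self.shuffle:
        self.ordering = np.array_split(
            np.random.permutation(len(self.dataset)),
            range(self.batch_size, len(self.dataset), self.batch_size))
    self._batch_idx = 0
    return self

def loader_next(self):
    if self._batch_idx >= len(self.ordering):
        raise StopIteration
    idxs = self.ordering[self._batch_idx]
    self._batch_idx += 1
    samples = [self.dataset[int(i)] for i in idxs]
    # each sample is assumed to be a tuple of per-field arrays; stack field-wise
    return tuple(Tensor(np.stack(field)) for field in zip(*samples))
```

Whether a batch should be a tuple of `Tensor`s or plain arrays depends on the tests, so treat the last line as a placeholder for whatever format the test suite expects.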
/python/needle/data/datasets/ptb_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from needle import backend_ndarray as nd 5 | from needle import Tensor 6 | 7 | class Dictionary(object): 8 | """ 9 | Creates a dictionary from a list of words, mapping each word to a 10 | unique integer. 11 | Attributes: 12 | word2idx: dictionary mapping from a word to its unique ID 13 | idx2word: list of words in the dictionary, in the order they were added 14 | to the dictionary (i.e. each word only appears once in this list) 15 | """ 16 | def __init__(self): 17 | self.word2idx = {} 18 | self.idx2word = [] 19 | 20 | def add_word(self, word): 21 | """ 22 | Input: word of type str 23 | If the word is not in the dictionary, adds the word to the dictionary 24 | and appends to the list of words. 25 | Returns the word's unique ID. 26 | """ 27 | ### BEGIN YOUR SOLUTION 28 | raise NotImplementedError() 29 | ### END YOUR SOLUTION 30 | 31 | def __len__(self): 32 | """ 33 | Returns the number of unique words in the dictionary. 34 | """ 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION 38 | 39 | 40 | 41 | class Corpus(object): 42 | """ 43 | Creates corpus from train, and test txt files. 44 | """ 45 | def __init__(self, base_dir, max_lines=None): 46 | self.dictionary = Dictionary() 47 | self.train = self.tokenize(os.path.join(base_dir, 'train.txt'), max_lines) 48 | self.test = self.tokenize(os.path.join(base_dir, 'test.txt'), max_lines) 49 | 50 | def tokenize(self, path, max_lines=None): 51 | """ 52 | Input: 53 | path - path to text file 54 | max_lines - maximum number of lines to read in 55 | Tokenizes a text file, first adding each word in the file to the dictionary, 56 | and then tokenizing the text file to a list of IDs. When adding words to the 57 | dictionary (and tokenizing the file content) '' should be appended to 58 | the end of each line in order to properly account for the end of the sentence. 59 | Output: 60 | ids: List of ids 61 | """ 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | 66 | 67 | def batchify(data, batch_size, device, dtype): 68 | """ 69 | Starting from sequential data, batchify arranges the dataset into columns. 70 | For instance, with the alphabet as the sequence and batch size 4, we'd get 71 | ┌ a g m s ┐ 72 | │ b h n t │ 73 | │ c i o u │ 74 | │ d j p v │ 75 | │ e k q w │ 76 | └ f l r x ┘. 77 | These columns are treated as independent by the model, which means that the 78 | dependence of e. g. 'g' on 'f' cannot be learned, but allows more efficient 79 | batch processing. 80 | If the data cannot be evenly divided by the batch size, trim off the remainder. 81 | Returns the data as a numpy array of shape (nbatch, batch_size). 82 | """ 83 | ### BEGIN YOUR SOLUTION 84 | raise NotImplementedError() 85 | ### END YOUR SOLUTION 86 | 87 | 88 | def get_batch(batches, i, bptt, device=None, dtype=None): 89 | """ 90 | get_batch subdivides the source data into chunks of length bptt. 91 | If source is equal to the example output of the batchify function, with 92 | a bptt-limit of 2, we'd get the following two Variables for i = 0: 93 | ┌ a g m s ┐ ┌ b h n t ┐ 94 | └ b h n t ┘ └ c i o u ┘ 95 | Note that despite the name of the function, the subdivison of data is not 96 | done along the batch dimension (i.e. dimension 1), since that was handled 97 | by the batchify function. 
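(The description of `get_batch` continues below.) For the pieces specified so far, a short sketch of how `Dictionary.add_word` and `batchify` could satisfy their docstrings; the free-standing names and the unused `device`/`dtype` handling are illustrative only, not the reference solution.

```python
import numpy as np

# Dictionary.add_word: assign the next integer ID to unseen words.
def add_word_sketch(self, word):
    if word not in self.word2idx:
        self.word2idx[word] = len(self.idx2word)
        self.idx2word.append(word)
    return self.word2idx[word]

# batchify: trim the token stream to a multiple of batch_size, then lay the
# k-th contiguous chunk of the corpus down column k, giving (nbatch, batch_size).
def batchify_sketch(data, batch_size, device, dtype):
    nbatch = len(data) // batch_size
    data = np.array(data[:nbatch * batch_size], dtype=dtype)
    return data.reshape(batch_size, nbatch).T
```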
The chunks are along dimension 0, corresponding 98 | to the seq_len dimension in the LSTM or RNN. 99 | Inputs: 100 | batches - numpy array returned from batchify function 101 | i - index 102 | bptt - Sequence length 103 | Returns: 104 | data - Tensor of shape (bptt, bs) with cached data as NDArray 105 | target - Tensor of shape (bptt*bs,) with cached data as NDArray 106 | """ 107 | ### BEGIN YOUR SOLUTION 108 | raise NotImplementedError() 109 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/init/__init__.py: -------------------------------------------------------------------------------- 1 | from .init_basic import * 2 | 3 | from .init_initializers import * 4 | -------------------------------------------------------------------------------- /python/needle/init/init_basic.py: -------------------------------------------------------------------------------- 1 | import math 2 | import needle as ndl 3 | 4 | 5 | def rand(*shape, low=0.0, high=1.0, device=None, dtype="float32", requires_grad=False): 6 | """Generate random numbers uniform between low and high""" 7 | device = ndl.cpu() if device is None else device 8 | array = device.rand(*shape) * (high - low) + low 9 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 10 | 11 | 12 | def randn(*shape, mean=0.0, std=1.0, device=None, dtype="float32", requires_grad=False): 13 | """Generate random normal with specified mean and std deviation""" 14 | device = ndl.cpu() if device is None else device 15 | array = device.randn(*shape) * std + mean 16 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 17 | 18 | 19 | 20 | 21 | 22 | 23 | def constant(*shape, c=1.0, device=None, dtype="float32", requires_grad=False): 24 | """Generate constant Tensor""" 25 | device = ndl.cpu() if device is None else device 26 | array = device.full(shape, c, dtype=dtype) 27 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 28 | 29 | def ones(*shape, device=None, dtype="float32", requires_grad=False): 30 | """Generate all-ones Tensor""" 31 | return constant( 32 | *shape, c=1.0, device=device, dtype=dtype, requires_grad=requires_grad 33 | ) 34 | 35 | 36 | def zeros(*shape, device=None, dtype="float32", requires_grad=False): 37 | """Generate all-zeros Tensor""" 38 | return constant( 39 | *shape, c=0.0, device=device, dtype=dtype, requires_grad=requires_grad 40 | ) 41 | 42 | 43 | def randb(*shape, p=0.5, device=None, dtype="bool", requires_grad=False): 44 | """Generate binary random Tensor""" 45 | device = ndl.cpu() if device is None else device 46 | array = device.rand(*shape) <= p 47 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 48 | 49 | 50 | def one_hot(n, i, device=None, dtype="float32", requires_grad=False): 51 | """Generate one-hot encoding Tensor""" 52 | device = ndl.cpu() if device is None else device 53 | return ndl.Tensor( 54 | device.one_hot(n, i.numpy().astype("int32"), dtype=dtype), 55 | device=device, 56 | requires_grad=requires_grad, 57 | ) 58 | 59 | 60 | def zeros_like(array, *, device=None, requires_grad=False): 61 | device = device if device else array.device 62 | return zeros( 63 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad 64 | ) 65 | 66 | 67 | def ones_like(array, *, device=None, requires_grad=False): 68 | device = device if device else array.device 69 | return ones( 70 | *array.shape, dtype=array.dtype, device=device, 
requires_grad=requires_grad 71 | ) 72 | -------------------------------------------------------------------------------- /python/needle/init/init_initializers.py: -------------------------------------------------------------------------------- 1 | import math 2 | from .init_basic import * 3 | 4 | 5 | def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs): 6 | ### BEGIN YOUR SOLUTION 7 | raise NotImplementedError() 8 | ### END YOUR SOLUTION 9 | 10 | 11 | def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | 17 | 18 | def kaiming_uniform(fan_in, fan_out, shape=None, nonlinearity="relu", **kwargs): 19 | assert nonlinearity == "relu", "Only relu supported currently" 20 | ### BEGIN YOUR SOLUTION 21 | raise NotImplementedError() 22 | ### END YOUR SOLUTION 23 | 24 | def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs): 25 | assert nonlinearity == "relu", "Only relu supported currently" 26 | ### BEGIN YOUR SOLUTION 27 | raise NotImplementedError() 28 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .nn_basic import * 2 | from .nn_conv import * 3 | from .nn_sequence import * 4 | # from .nn_transformer import * 5 | -------------------------------------------------------------------------------- /python/needle/nn/nn_basic.py: -------------------------------------------------------------------------------- 1 | """The module. 2 | """ 3 | from typing import List, Callable, Any 4 | from needle.autograd import Tensor 5 | from needle import ops 6 | import needle.init as init 7 | import numpy as np 8 | 9 | 10 | class Parameter(Tensor): 11 | """A special kind of tensor that represents parameters.""" 12 | 13 | 14 | def _unpack_params(value: object) -> List[Tensor]: 15 | if isinstance(value, Parameter): 16 | return [value] 17 | elif isinstance(value, Module): 18 | return value.parameters() 19 | elif isinstance(value, dict): 20 | params = [] 21 | for k, v in value.items(): 22 | params += _unpack_params(v) 23 | return params 24 | elif isinstance(value, (list, tuple)): 25 | params = [] 26 | for v in value: 27 | params += _unpack_params(v) 28 | return params 29 | else: 30 | return [] 31 | 32 | 33 | def _child_modules(value: object) -> List["Module"]: 34 | if isinstance(value, Module): 35 | modules = [value] 36 | modules.extend(_child_modules(value.__dict__)) 37 | return modules 38 | if isinstance(value, dict): 39 | modules = [] 40 | for k, v in value.items(): 41 | modules += _child_modules(v) 42 | return modules 43 | elif isinstance(value, (list, tuple)): 44 | modules = [] 45 | for v in value: 46 | modules += _child_modules(v) 47 | return modules 48 | else: 49 | return [] 50 | 51 | 52 | class Module: 53 | def __init__(self): 54 | self.training = True 55 | 56 | def parameters(self) -> List[Tensor]: 57 | """Return the list of parameters in the module.""" 58 | return _unpack_params(self.__dict__) 59 | 60 | def _children(self) -> List["Module"]: 61 | return _child_modules(self.__dict__) 62 | 63 | def eval(self): 64 | self.training = False 65 | for m in self._children(): 66 | m.training = False 67 | 68 | def train(self): 69 | self.training = True 70 | for m in self._children(): 71 | m.training = True 72 | 73 | def __call__(self, *args, **kwargs): 74 | return self.forward(*args, **kwargs) 75 | 76 | 77 | class Identity(Module): 78 | 
def forward(self, x): 79 | return x 80 | 81 | 82 | class Linear(Module): 83 | def __init__( 84 | self, in_features, out_features, bias=True, device=None, dtype="float32" 85 | ): 86 | super().__init__() 87 | self.in_features = in_features 88 | self.out_features = out_features 89 | 90 | ### BEGIN YOUR SOLUTION 91 | raise NotImplementedError() 92 | ### END YOUR SOLUTION 93 | 94 | def forward(self, X: Tensor) -> Tensor: 95 | ### BEGIN YOUR SOLUTION 96 | raise NotImplementedError() 97 | ### END YOUR SOLUTION 98 | 99 | 100 | class Flatten(Module): 101 | def forward(self, X): 102 | ### BEGIN YOUR SOLUTION 103 | raise NotImplementedError() 104 | ### END YOUR SOLUTION 105 | 106 | 107 | class ReLU(Module): 108 | def forward(self, x: Tensor) -> Tensor: 109 | ### BEGIN YOUR SOLUTION 110 | raise NotImplementedError() 111 | ### END YOUR SOLUTION 112 | 113 | class Sequential(Module): 114 | def __init__(self, *modules): 115 | super().__init__() 116 | self.modules = modules 117 | 118 | def forward(self, x: Tensor) -> Tensor: 119 | ### BEGIN YOUR SOLUTION 120 | raise NotImplementedError() 121 | ### END YOUR SOLUTION 122 | 123 | 124 | class SoftmaxLoss(Module): 125 | def forward(self, logits: Tensor, y: Tensor): 126 | ### BEGIN YOUR SOLUTION 127 | raise NotImplementedError() 128 | ### END YOUR SOLUTION 129 | 130 | 131 | class BatchNorm1d(Module): 132 | def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"): 133 | super().__init__() 134 | self.dim = dim 135 | self.eps = eps 136 | self.momentum = momentum 137 | ### BEGIN YOUR SOLUTION 138 | raise NotImplementedError() 139 | ### END YOUR SOLUTION 140 | 141 | def forward(self, x: Tensor) -> Tensor: 142 | ### BEGIN YOUR SOLUTION 143 | raise NotImplementedError() 144 | ### END YOUR SOLUTION 145 | 146 | class BatchNorm2d(BatchNorm1d): 147 | def __init__(self, *args, **kwargs): 148 | super().__init__(*args, **kwargs) 149 | 150 | def forward(self, x: Tensor): 151 | # nchw -> nhcw -> nhwc 152 | s = x.shape 153 | _x = x.transpose((1, 2)).transpose((2, 3)).reshape((s[0] * s[2] * s[3], s[1])) 154 | y = super().forward(_x).reshape((s[0], s[2], s[3], s[1])) 155 | return y.transpose((2,3)).transpose((1,2)) 156 | 157 | 158 | class LayerNorm1d(Module): 159 | def __init__(self, dim, eps=1e-5, device=None, dtype="float32"): 160 | super().__init__() 161 | self.dim = dim 162 | self.eps = eps 163 | ### BEGIN YOUR SOLUTION 164 | raise NotImplementedError() 165 | ### END YOUR SOLUTION 166 | 167 | def forward(self, x: Tensor) -> Tensor: 168 | ### BEGIN YOUR SOLUTION 169 | raise NotImplementedError() 170 | ### END YOUR SOLUTION 171 | 172 | 173 | class Dropout(Module): 174 | def __init__(self, p=0.5): 175 | super().__init__() 176 | self.p = p 177 | 178 | def forward(self, x: Tensor) -> Tensor: 179 | ### BEGIN YOUR SOLUTION 180 | raise NotImplementedError() 181 | ### END YOUR SOLUTION 182 | 183 | 184 | class Residual(Module): 185 | def __init__(self, fn: Module): 186 | super().__init__() 187 | self.fn = fn 188 | 189 | def forward(self, x: Tensor) -> Tensor: 190 | ### BEGIN YOUR SOLUTION 191 | raise NotImplementedError() 192 | ### END YOUR SOLUTION 193 | -------------------------------------------------------------------------------- /python/needle/nn/nn_conv.py: -------------------------------------------------------------------------------- 1 | """The module. 
2 | """ 3 | from typing import List, Callable, Any 4 | from needle.autograd import Tensor 5 | from needle import ops 6 | import needle.init as init 7 | import numpy as np 8 | from .nn_basic import Parameter, Module 9 | 10 | 11 | class Conv(Module): 12 | """ 13 | Multi-channel 2D convolutional layer 14 | IMPORTANT: Accepts inputs in NCHW format, outputs also in NCHW format 15 | Only supports padding=same 16 | No grouped convolution or dilation 17 | Only supports square kernels 18 | """ 19 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=True, device=None, dtype="float32"): 20 | super().__init__() 21 | if isinstance(kernel_size, tuple): 22 | kernel_size = kernel_size[0] 23 | if isinstance(stride, tuple): 24 | stride = stride[0] 25 | self.in_channels = in_channels 26 | self.out_channels = out_channels 27 | self.kernel_size = kernel_size 28 | self.stride = stride 29 | 30 | ### BEGIN YOUR SOLUTION 31 | raise NotImplementedError() 32 | ### END YOUR SOLUTION 33 | 34 | def forward(self, x: Tensor) -> Tensor: 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/nn/nn_sequence.py: -------------------------------------------------------------------------------- 1 | """The module. 2 | """ 3 | from typing import List 4 | from needle.autograd import Tensor 5 | from needle import ops 6 | import needle.init as init 7 | import numpy as np 8 | from .nn_basic import Parameter, Module 9 | 10 | 11 | class Sigmoid(Module): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def forward(self, x: Tensor) -> Tensor: 16 | ### BEGIN YOUR SOLUTION 17 | raise NotImplementedError() 18 | ### END YOUR SOLUTION 19 | 20 | class RNNCell(Module): 21 | def __init__(self, input_size, hidden_size, bias=True, nonlinearity='tanh', device=None, dtype="float32"): 22 | """ 23 | Applies an RNN cell with tanh or ReLU nonlinearity. 24 | 25 | Parameters: 26 | input_size: The number of expected features in the input X 27 | hidden_size: The number of features in the hidden state h 28 | bias: If False, then the layer does not use bias weights 29 | nonlinearity: The non-linearity to use. Can be either 'tanh' or 'relu'. 30 | 31 | Variables: 32 | W_ih: The learnable input-hidden weights of shape (input_size, hidden_size). 33 | W_hh: The learnable hidden-hidden weights of shape (hidden_size, hidden_size). 34 | bias_ih: The learnable input-hidden bias of shape (hidden_size,). 35 | bias_hh: The learnable hidden-hidden bias of shape (hidden_size,). 36 | 37 | Weights and biases are initialized from U(-sqrt(k), sqrt(k)) where k = 1/hidden_size 38 | """ 39 | super().__init__() 40 | ### BEGIN YOUR SOLUTION 41 | raise NotImplementedError() 42 | ### END YOUR SOLUTION 43 | 44 | def forward(self, X, h=None): 45 | """ 46 | Inputs: 47 | X of shape (bs, input_size): Tensor containing input features 48 | h of shape (bs, hidden_size): Tensor containing the initial hidden state 49 | for each element in the batch. Defaults to zero if not provided. 50 | 51 | Outputs: 52 | h' of shape (bs, hidden_size): Tensor contianing the next hidden state 53 | for each element in the batch. 
54 | """ 55 | ### BEGIN YOUR SOLUTION 56 | raise NotImplementedError() 57 | ### END YOUR SOLUTION 58 | 59 | 60 | class RNN(Module): 61 | def __init__(self, input_size, hidden_size, num_layers=1, bias=True, nonlinearity='tanh', device=None, dtype="float32"): 62 | """ 63 | Applies a multi-layer RNN with tanh or ReLU non-linearity to an input sequence. 64 | 65 | Parameters: 66 | input_size - The number of expected features in the input x 67 | hidden_size - The number of features in the hidden state h 68 | num_layers - Number of recurrent layers. 69 | nonlinearity - The non-linearity to use. Can be either 'tanh' or 'relu'. 70 | bias - If False, then the layer does not use bias weights. 71 | 72 | Variables: 73 | rnn_cells[k].W_ih: The learnable input-hidden weights of the k-th layer, 74 | of shape (input_size, hidden_size) for k=0. Otherwise the shape is 75 | (hidden_size, hidden_size). 76 | rnn_cells[k].W_hh: The learnable hidden-hidden weights of the k-th layer, 77 | of shape (hidden_size, hidden_size). 78 | rnn_cells[k].bias_ih: The learnable input-hidden bias of the k-th layer, 79 | of shape (hidden_size,). 80 | rnn_cells[k].bias_hh: The learnable hidden-hidden bias of the k-th layer, 81 | of shape (hidden_size,). 82 | """ 83 | super().__init__() 84 | ### BEGIN YOUR SOLUTION 85 | raise NotImplementedError() 86 | ### END YOUR SOLUTION 87 | 88 | def forward(self, X, h0=None): 89 | """ 90 | Inputs: 91 | X of shape (seq_len, bs, input_size) containing the features of the input sequence. 92 | h_0 of shape (num_layers, bs, hidden_size) containing the initial 93 | hidden state for each element in the batch. Defaults to zeros if not provided. 94 | 95 | Outputs 96 | output of shape (seq_len, bs, hidden_size) containing the output features 97 | (h_t) from the last layer of the RNN, for each t. 98 | h_n of shape (num_layers, bs, hidden_size) containing the final hidden state for each element in the batch. 99 | """ 100 | ### BEGIN YOUR SOLUTION 101 | raise NotImplementedError() 102 | ### END YOUR SOLUTION 103 | 104 | 105 | class LSTMCell(Module): 106 | def __init__(self, input_size, hidden_size, bias=True, device=None, dtype="float32"): 107 | """ 108 | A long short-term memory (LSTM) cell. 109 | 110 | Parameters: 111 | input_size - The number of expected features in the input X 112 | hidden_size - The number of features in the hidden state h 113 | bias - If False, then the layer does not use bias weights 114 | 115 | Variables: 116 | W_ih - The learnable input-hidden weights, of shape (input_size, 4*hidden_size). 117 | W_hh - The learnable hidden-hidden weights, of shape (hidden_size, 4*hidden_size). 118 | bias_ih - The learnable input-hidden bias, of shape (4*hidden_size,). 119 | bias_hh - The learnable hidden-hidden bias, of shape (4*hidden_size,). 120 | 121 | Weights and biases are initialized from U(-sqrt(k), sqrt(k)) where k = 1/hidden_size 122 | """ 123 | super().__init__() 124 | ### BEGIN YOUR SOLUTION 125 | raise NotImplementedError() 126 | ### END YOUR SOLUTION 127 | 128 | 129 | def forward(self, X, h=None): 130 | """ 131 | Inputs: X, h 132 | X of shape (batch, input_size): Tensor containing input features 133 | h, tuple of (h0, c0), with 134 | h0 of shape (bs, hidden_size): Tensor containing the initial hidden state 135 | for each element in the batch. Defaults to zero if not provided. 136 | c0 of shape (bs, hidden_size): Tensor containing the initial cell state 137 | for each element in the batch. Defaults to zero if not provided. 
138 | 139 | Outputs: (h', c') 140 | h' of shape (bs, hidden_size): Tensor containing the next hidden state for each 141 | element in the batch. 142 | c' of shape (bs, hidden_size): Tensor containing the next cell state for each 143 | element in the batch. 144 | """ 145 | ### BEGIN YOUR SOLUTION 146 | raise NotImplementedError() 147 | ### END YOUR SOLUTION 148 | 149 | 150 | class LSTM(Module): 151 | def __init__(self, input_size, hidden_size, num_layers=1, bias=True, device=None, dtype="float32"): 152 | super().__init__() 153 | """ 154 | Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence. 155 | 156 | Parameters: 157 | input_size - The number of expected features in the input x 158 | hidden_size - The number of features in the hidden state h 159 | num_layers - Number of recurrent layers. 160 | bias - If False, then the layer does not use bias weights. 161 | 162 | Variables: 163 | lstm_cells[k].W_ih: The learnable input-hidden weights of the k-th layer, 164 | of shape (input_size, 4*hidden_size) for k=0. Otherwise the shape is 165 | (hidden_size, 4*hidden_size). 166 | lstm_cells[k].W_hh: The learnable hidden-hidden weights of the k-th layer, 167 | of shape (hidden_size, 4*hidden_size). 168 | lstm_cells[k].bias_ih: The learnable input-hidden bias of the k-th layer, 169 | of shape (4*hidden_size,). 170 | lstm_cells[k].bias_hh: The learnable hidden-hidden bias of the k-th layer, 171 | of shape (4*hidden_size,). 172 | """ 173 | ### BEGIN YOUR SOLUTION 174 | raise NotImplementedError() 175 | ### END YOUR SOLUTION 176 | 177 | def forward(self, X, h=None): 178 | """ 179 | Inputs: X, h 180 | X of shape (seq_len, bs, input_size) containing the features of the input sequence. 181 | h, tuple of (h0, c0) with 182 | h_0 of shape (num_layers, bs, hidden_size) containing the initial 183 | hidden state for each element in the batch. Defaults to zeros if not provided. 184 | c0 of shape (num_layers, bs, hidden_size) containing the initial 185 | hidden cell state for each element in the batch. Defaults to zeros if not provided. 186 | 187 | Outputs: (output, (h_n, c_n)) 188 | output of shape (seq_len, bs, hidden_size) containing the output features 189 | (h_t) from the last layer of the LSTM, for each t. 190 | tuple of (h_n, c_n) with 191 | h_n of shape (num_layers, bs, hidden_size) containing the final hidden state for each element in the batch. 192 | h_n of shape (num_layers, bs, hidden_size) containing the final hidden cell state for each element in the batch. 193 | """ 194 | ### BEGIN YOUR SOLUTION 195 | raise NotImplementedError() 196 | ### END YOUR SOLUTION 197 | 198 | class Embedding(Module): 199 | def __init__(self, num_embeddings, embedding_dim, device=None, dtype="float32"): 200 | super().__init__() 201 | """ 202 | Maps one-hot word vectors from a dictionary of fixed size to embeddings. 203 | 204 | Parameters: 205 | num_embeddings (int) - Size of the dictionary 206 | embedding_dim (int) - The size of each embedding vector 207 | 208 | Variables: 209 | weight - The learnable weights of shape (num_embeddings, embedding_dim) 210 | initialized from N(0, 1). 
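A sketch of how this initialization, and the forward pass described next, might look using the `init.randn` and `init.one_hot` helpers defined earlier. Flattening the indices to shape (seq_len*bs,) before the one-hot projection is an implementation choice of this sketch, not a requirement, and the free-standing names are illustrative.

```python
# A sketch of Embedding.__init__ / forward (not the reference solution).
def embedding_init_sketch(self, num_embeddings, embedding_dim, device=None, dtype="float32"):
    self.num_embeddings = num_embeddings
    self.embedding_dim = embedding_dim
    self.weight = Parameter(init.randn(num_embeddings, embedding_dim,
                                       device=device, dtype=dtype, requires_grad=True))

def embedding_forward_sketch(self, x):
    seq_len, bs = x.shape
    one_hot = init.one_hot(self.num_embeddings, x.reshape((seq_len * bs,)),
                           device=x.device, dtype=x.dtype)        # (seq_len*bs, num_embeddings)
    out = ops.matmul(one_hot, self.weight)                         # (seq_len*bs, embedding_dim)
    return out.reshape((seq_len, bs, self.embedding_dim))
```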
211 | """ 212 | ### BEGIN YOUR SOLUTION 213 | raise NotImplementedError() 214 | ### END YOUR SOLUTION 215 | 216 | def forward(self, x: Tensor) -> Tensor: 217 | """ 218 | Maps word indices to one-hot vectors, and projects to embedding vectors 219 | 220 | Input: 221 | x of shape (seq_len, bs) 222 | 223 | Output: 224 | output of shape (seq_len, bs, embedding_dim) 225 | """ 226 | ### BEGIN YOUR SOLUTION 227 | raise NotImplementedError() 228 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .ops_mathematic import * 2 | 3 | from .ops_logarithmic import * 4 | from .ops_tuple import * 5 | -------------------------------------------------------------------------------- /python/needle/ops/ops_logarithmic.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from ..autograd import NDArray 3 | from ..autograd import Op, Tensor, Value, TensorOp 4 | from ..autograd import TensorTuple, TensorTupleOp 5 | 6 | from .ops_mathematic import * 7 | 8 | from ..backend_selection import array_api, BACKEND 9 | 10 | class LogSoftmax(TensorOp): 11 | def compute(self, Z): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def gradient(self, out_grad, node): 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | 22 | def logsoftmax(a): 23 | return LogSoftmax()(a) 24 | 25 | 26 | class LogSumExp(TensorOp): 27 | def __init__(self, axes: Optional[tuple] = None): 28 | self.axes = axes 29 | 30 | def compute(self, Z): 31 | ### BEGIN YOUR SOLUTION 32 | raise NotImplementedError() 33 | ### END YOUR SOLUTION 34 | 35 | def gradient(self, out_grad, node): 36 | ### BEGIN YOUR SOLUTION 37 | raise NotImplementedError() 38 | ### END YOUR SOLUTION 39 | 40 | 41 | def logsumexp(a, axes=None): 42 | return LogSumExp(axes=axes)(a) 43 | 44 | -------------------------------------------------------------------------------- /python/needle/ops/ops_mathematic.py: -------------------------------------------------------------------------------- 1 | """Operator implementations.""" 2 | 3 | from numbers import Number 4 | from typing import Optional, List, Tuple, Union 5 | 6 | from ..autograd import NDArray 7 | from ..autograd import Op, Tensor, Value, TensorOp 8 | from ..autograd import TensorTuple, TensorTupleOp 9 | import numpy 10 | 11 | # NOTE: we will import numpy as the array_api 12 | # as the backend for our computations, this line will change in later homeworks 13 | 14 | from ..backend_selection import array_api, BACKEND 15 | from .ops_tuple import * 16 | 17 | 18 | class EWiseAdd(TensorOp): 19 | def compute(self, a: NDArray, b: NDArray): 20 | return a + b 21 | 22 | def gradient(self, out_grad: Tensor, node: Tensor): 23 | return out_grad, out_grad 24 | 25 | 26 | def add(a, b): 27 | return EWiseAdd()(a, b) 28 | 29 | 30 | class AddScalar(TensorOp): 31 | def __init__(self, scalar): 32 | self.scalar = scalar 33 | 34 | def compute(self, a: NDArray): 35 | return a + self.scalar 36 | 37 | def gradient(self, out_grad: Tensor, node: Tensor): 38 | return out_grad 39 | 40 | 41 | def add_scalar(a, scalar): 42 | return AddScalar(scalar)(a) 43 | 44 | 45 | class EWiseMul(TensorOp): 46 | def compute(self, a: NDArray, b: NDArray): 47 | return a * b 48 | 49 | def gradient(self, out_grad: Tensor, node: Tensor): 50 | lhs, rhs = 
node.inputs 51 | return out_grad * rhs, out_grad * lhs 52 | 53 | 54 | def multiply(a, b): 55 | return EWiseMul()(a, b) 56 | 57 | 58 | class MulScalar(TensorOp): 59 | def __init__(self, scalar): 60 | self.scalar = scalar 61 | 62 | def compute(self, a: NDArray): 63 | return a * self.scalar 64 | 65 | def gradient(self, out_grad: Tensor, node: Tensor): 66 | return (out_grad * self.scalar,) 67 | 68 | 69 | def mul_scalar(a, scalar): 70 | return MulScalar(scalar)(a) 71 | 72 | 73 | class EWisePow(TensorOp): 74 | """Op to element-wise raise a tensor to a power.""" 75 | 76 | def compute(self, a: NDArray, b: NDArray) -> NDArray: 77 | ### BEGIN YOUR SOLUTION 78 | raise NotImplementedError() 79 | ### END YOUR SOLUTION 80 | 81 | def gradient(self, out_grad, node): 82 | ### BEGIN YOUR SOLUTION 83 | raise NotImplementedError() 84 | ### END YOUR SOLUTION 85 | 86 | 87 | def power(a, b): 88 | return EWisePow()(a, b) 89 | 90 | 91 | class PowerScalar(TensorOp): 92 | """Op raise a tensor to an (integer) power.""" 93 | 94 | def __init__(self, scalar: int): 95 | self.scalar = scalar 96 | 97 | def compute(self, a: NDArray) -> NDArray: 98 | ### BEGIN YOUR SOLUTION 99 | raise NotImplementedError() 100 | ### END YOUR SOLUTION 101 | 102 | def gradient(self, out_grad, node): 103 | ### BEGIN YOUR SOLUTION 104 | raise NotImplementedError() 105 | ### END YOUR SOLUTION 106 | 107 | 108 | def power_scalar(a, scalar): 109 | return PowerScalar(scalar)(a) 110 | 111 | 112 | class EWiseDiv(TensorOp): 113 | """Op to element-wise divide two nodes.""" 114 | 115 | def compute(self, a, b): 116 | ### BEGIN YOUR SOLUTION 117 | raise NotImplementedError() 118 | ### END YOUR SOLUTION 119 | 120 | def gradient(self, out_grad, node): 121 | ### BEGIN YOUR SOLUTION 122 | raise NotImplementedError() 123 | ### END YOUR SOLUTION 124 | 125 | 126 | def divide(a, b): 127 | return EWiseDiv()(a, b) 128 | 129 | 130 | class DivScalar(TensorOp): 131 | def __init__(self, scalar): 132 | self.scalar = scalar 133 | 134 | def compute(self, a): 135 | ### BEGIN YOUR SOLUTION 136 | raise NotImplementedError() 137 | ### END YOUR SOLUTION 138 | 139 | def gradient(self, out_grad, node): 140 | ### BEGIN YOUR SOLUTION 141 | raise NotImplementedError() 142 | ### END YOUR SOLUTION 143 | 144 | 145 | def divide_scalar(a, scalar): 146 | return DivScalar(scalar)(a) 147 | 148 | 149 | class Transpose(TensorOp): 150 | def __init__(self, axes: Optional[tuple] = None): 151 | self.axes = axes 152 | 153 | def compute(self, a): 154 | ### BEGIN YOUR SOLUTION 155 | raise NotImplementedError() 156 | ### END YOUR SOLUTION 157 | 158 | def gradient(self, out_grad, node): 159 | ### BEGIN YOUR SOLUTION 160 | raise NotImplementedError() 161 | ### END YOUR SOLUTION 162 | 163 | 164 | def transpose(a, axes=None): 165 | return Transpose(axes)(a) 166 | 167 | 168 | class Reshape(TensorOp): 169 | def __init__(self, shape): 170 | self.shape = shape 171 | 172 | def compute(self, a): 173 | ### BEGIN YOUR SOLUTION 174 | raise NotImplementedError() 175 | ### END YOUR SOLUTION 176 | 177 | def gradient(self, out_grad, node): 178 | ### BEGIN YOUR SOLUTION 179 | raise NotImplementedError() 180 | ### END YOUR SOLUTION 181 | 182 | 183 | def reshape(a, shape): 184 | return Reshape(shape)(a) 185 | 186 | 187 | class BroadcastTo(TensorOp): 188 | def __init__(self, shape): 189 | self.shape = shape 190 | 191 | def compute(self, a): 192 | ### BEGIN YOUR SOLUTION 193 | raise NotImplementedError() 194 | ### END YOUR SOLUTION 195 | 196 | def gradient(self, out_grad, node): 197 | ### BEGIN YOUR SOLUTION 198 | raise 
NotImplementedError() 199 | ### END YOUR SOLUTION 200 | 201 | 202 | def broadcast_to(a, shape): 203 | return BroadcastTo(shape)(a) 204 | 205 | 206 | class Summation(TensorOp): 207 | def __init__(self, axes: Optional[tuple] = None): 208 | self.axes = axes 209 | 210 | def compute(self, a): 211 | ### BEGIN YOUR SOLUTION 212 | raise NotImplementedError() 213 | ### END YOUR SOLUTION 214 | 215 | def gradient(self, out_grad, node): 216 | ### BEGIN YOUR SOLUTION 217 | raise NotImplementedError() 218 | ### END YOUR SOLUTION 219 | 220 | 221 | def summation(a, axes=None): 222 | return Summation(axes)(a) 223 | 224 | 225 | class MatMul(TensorOp): 226 | def compute(self, a, b): 227 | ### BEGIN YOUR SOLUTION 228 | raise NotImplementedError() 229 | ### END YOUR SOLUTION 230 | 231 | def gradient(self, out_grad, node): 232 | ### BEGIN YOUR SOLUTION 233 | raise NotImplementedError() 234 | ### END YOUR SOLUTION 235 | 236 | 237 | def matmul(a, b): 238 | return MatMul()(a, b) 239 | 240 | 241 | class Negate(TensorOp): 242 | def compute(self, a): 243 | ### BEGIN YOUR SOLUTION 244 | raise NotImplementedError() 245 | ### END YOUR SOLUTION 246 | 247 | def gradient(self, out_grad, node): 248 | ### BEGIN YOUR SOLUTION 249 | raise NotImplementedError() 250 | ### END YOUR SOLUTION 251 | 252 | 253 | def negate(a): 254 | return Negate()(a) 255 | 256 | 257 | class Log(TensorOp): 258 | def compute(self, a): 259 | ### BEGIN YOUR SOLUTION 260 | raise NotImplementedError() 261 | ### END YOUR SOLUTION 262 | 263 | def gradient(self, out_grad, node): 264 | ### BEGIN YOUR SOLUTION 265 | raise NotImplementedError() 266 | ### END YOUR SOLUTION 267 | 268 | 269 | def log(a): 270 | return Log()(a) 271 | 272 | 273 | class Exp(TensorOp): 274 | def compute(self, a): 275 | ### BEGIN YOUR SOLUTION 276 | raise NotImplementedError() 277 | ### END YOUR SOLUTION 278 | 279 | def gradient(self, out_grad, node): 280 | ### BEGIN YOUR SOLUTION 281 | raise NotImplementedError() 282 | ### END YOUR SOLUTION 283 | 284 | 285 | def exp(a): 286 | return Exp()(a) 287 | 288 | 289 | class ReLU(TensorOp): 290 | def compute(self, a): 291 | ### BEGIN YOUR SOLUTION 292 | raise NotImplementedError() 293 | ### END YOUR SOLUTION 294 | 295 | def gradient(self, out_grad, node): 296 | ### BEGIN YOUR SOLUTION 297 | raise NotImplementedError() 298 | ### END YOUR SOLUTION 299 | 300 | 301 | def relu(a): 302 | return ReLU()(a) 303 | 304 | 305 | class Tanh(TensorOp): 306 | def compute(self, a): 307 | ### BEGIN YOUR SOLUTION 308 | raise NotImplementedError() 309 | ### END YOUR SOLUTION 310 | 311 | def gradient(self, out_grad, node): 312 | ### BEGIN YOUR SOLUTION 313 | raise NotImplementedError() 314 | ### END YOUR SOLUTION 315 | 316 | 317 | def tanh(a): 318 | return Tanh()(a) 319 | 320 | 321 | class Stack(TensorOp): 322 | def __init__(self, axis: int): 323 | """ 324 | Concatenates a sequence of arrays along a new dimension. 325 | Parameters: 326 | axis - dimension to concatenate along 327 | All arrays need to be of the same size. 
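An aside before Stack's implementation details: the gradient rules for several of the ops above follow directly from their forward definitions. For matmul, dA = dOut @ B^T and dB = A^T @ dOut, with any extra broadcast batch dimensions summed back out; for broadcast_to, the gradient is summed over the broadcast axes and reshaped to the input shape; for summation, the gradient is reshaped to keep the reduced axes as size 1 and then broadcast back. A sketch of the matmul case, assuming needle's convention that `transpose()` with no axes swaps the last two dimensions:

```python
# A sketch of MatMul.gradient (not the reference solution).
def matmul_gradient_sketch(self, out_grad, node):
    a, b = node.inputs
    grad_a = matmul(out_grad, transpose(b))
    grad_b = matmul(transpose(a), out_grad)
    # if a or b carried fewer (batch) dims than the output, sum the extras away
    if len(grad_a.shape) > len(a.shape):
        grad_a = summation(grad_a, axes=tuple(range(len(grad_a.shape) - len(a.shape))))
    if len(grad_b.shape) > len(b.shape):
        grad_b = summation(grad_b, axes=tuple(range(len(grad_b.shape) - len(b.shape))))
    return grad_a, grad_b
```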
328 | """ 329 | self.axis = axis 330 | 331 | def compute(self, args: TensorTuple) -> Tensor: 332 | ### BEGIN YOUR SOLUTION 333 | raise NotImplementedError() 334 | ### END YOUR SOLUTION 335 | 336 | def gradient(self, out_grad, node): 337 | ### BEGIN YOUR SOLUTION 338 | raise NotImplementedError() 339 | ### END YOUR SOLUTION 340 | 341 | 342 | def stack(args, axis): 343 | return Stack(axis)(make_tuple(*args)) 344 | 345 | 346 | class Split(TensorTupleOp): 347 | def __init__(self, axis: int): 348 | """ 349 | Splits a tensor along an axis into a tuple of tensors. 350 | (The "inverse" of Stack) 351 | Parameters: 352 | axis - dimension to split 353 | """ 354 | self.axis = axis 355 | 356 | def compute(self, A): 357 | ### BEGIN YOUR SOLUTION 358 | raise NotImplementedError() 359 | ### END YOUR SOLUTION 360 | 361 | def gradient(self, out_grad, node): 362 | ### BEGIN YOUR SOLUTION 363 | raise NotImplementedError() 364 | ### END YOUR SOLUTION 365 | 366 | 367 | def split(a, axis): 368 | return Split(axis)(a) 369 | 370 | 371 | class Flip(TensorOp): 372 | def __init__(self, axes: Optional[tuple] = None): 373 | self.axes = axes 374 | 375 | def compute(self, a): 376 | ### BEGIN YOUR SOLUTION 377 | raise NotImplementedError() 378 | ### END YOUR SOLUTION 379 | 380 | def gradient(self, out_grad, node): 381 | ### BEGIN YOUR SOLUTION 382 | raise NotImplementedError() 383 | ### END YOUR SOLUTION 384 | 385 | 386 | def flip(a, axes): 387 | return Flip(axes)(a) 388 | 389 | 390 | class Dilate(TensorOp): 391 | def __init__(self, axes: tuple, dilation: int): 392 | self.axes = axes 393 | self.dilation = dilation 394 | 395 | def compute(self, a): 396 | ### BEGIN YOUR SOLUTION 397 | raise NotImplementedError() 398 | ### END YOUR SOLUTION 399 | 400 | def gradient(self, out_grad, node): 401 | ### BEGIN YOUR SOLUTION 402 | raise NotImplementedError() 403 | ### END YOUR SOLUTION 404 | 405 | 406 | def dilate(a, axes, dilation): 407 | return Dilate(axes, dilation)(a) 408 | 409 | 410 | class UnDilate(TensorOp): 411 | def __init__(self, axes: tuple, dilation: int): 412 | self.axes = axes 413 | self.dilation = dilation 414 | 415 | def compute(self, a): 416 | ### BEGIN YOUR SOLUTION 417 | raise NotImplementedError() 418 | ### END YOUR SOLUTION 419 | 420 | def gradient(self, out_grad, node): 421 | ### BEGIN YOUR SOLUTION 422 | raise NotImplementedError() 423 | ### END YOUR SOLUTION 424 | 425 | 426 | def undilate(a, axes, dilation): 427 | return UnDilate(axes, dilation)(a) 428 | 429 | 430 | class Conv(TensorOp): 431 | def __init__(self, stride: Optional[int] = 1, padding: Optional[int] = 0): 432 | self.stride = stride 433 | self.padding = padding 434 | 435 | def compute(self, A, B): 436 | ### BEGIN YOUR SOLUTION 437 | raise NotImplementedError() 438 | ### END YOUR SOLUTION 439 | 440 | def gradient(self, out_grad, node): 441 | ### BEGIN YOUR SOLUTION 442 | raise NotImplementedError() 443 | ### END YOUR SOLUTION 444 | 445 | 446 | def conv(a, b, stride=1, padding=1): 447 | return Conv(stride, padding)(a, b) 448 | 449 | 450 | -------------------------------------------------------------------------------- /python/needle/ops/ops_tuple.py: -------------------------------------------------------------------------------- 1 | from ..autograd import Op, Tensor, TensorTuple, Value, TensorOp, TensorTupleOp 2 | import needle.init as init 3 | 4 | class MakeTensorTuple(TensorTupleOp): 5 | def compute(self, *args) -> tuple: 6 | return tuple(args) 7 | 8 | def gradient(self, out_grad, node): 9 | assert isinstance(out_grad, TensorTuple) 10 | return 
tuple([out_grad[i] for i in range(len(out_grad))]) 11 | 12 | 13 | def make_tuple(*args): 14 | return MakeTensorTuple()(*args) 15 | 16 | 17 | class TupleGetItem(TensorOp): 18 | def __init__(self, index): 19 | self.index = index 20 | 21 | def __call__(self, a: TensorTuple, fold_const=True) -> Value: 22 | assert isinstance(a, TensorTuple) 23 | # constant folding 24 | if fold_const and isinstance(a.op, MakeTensorTuple): 25 | return a.inputs[self.index] 26 | return Tensor.make_from_op(self, [a]) 27 | 28 | def compute(self, a): 29 | return a[self.index] 30 | 31 | def gradient(self, out_grad, node): 32 | index = self.index 33 | in_grad = [] 34 | for i, value in enumerate(node.inputs[0]): 35 | if i != index: 36 | in_grad.append(init.zeros_like(value)) 37 | else: 38 | in_grad.append(out_grad) 39 | return MakeTensorTuple()(*in_grad) 40 | 41 | 42 | def tuple_get_item(value, index): 43 | return TupleGetItem(index)(value) 44 | 45 | 46 | class FusedAddScalars(TensorTupleOp): 47 | def __init__(self, c0: float, c1: float): 48 | self.c0 = c0 49 | self.c1 = c1 50 | 51 | def compute(self, a): 52 | return a + self.c0, a + self.c1 53 | 54 | def gradient(self, out_grad, node): 55 | return out_grad[0] + out_grad[1] 56 | 57 | 58 | def fused_add_scalars(x, c0, c1): 59 | return FusedAddScalars(c0, c1)(x) 60 | -------------------------------------------------------------------------------- /python/needle/optim.py: -------------------------------------------------------------------------------- 1 | """Optimization module""" 2 | import needle as ndl 3 | import numpy as np 4 | 5 | 6 | class Optimizer: 7 | def __init__(self, params): 8 | self.params = params 9 | 10 | def step(self): 11 | raise NotImplementedError() 12 | 13 | def reset_grad(self): 14 | for p in self.params: 15 | p.grad = None 16 | 17 | 18 | class SGD(Optimizer): 19 | def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0): 20 | super().__init__(params) 21 | self.lr = lr 22 | self.momentum = momentum 23 | self.u = {} 24 | self.weight_decay = weight_decay 25 | 26 | def step(self): 27 | ### BEGIN YOUR SOLUTION 28 | raise NotImplementedError() 29 | ### END YOUR SOLUTION 30 | 31 | def clip_grad_norm(self, max_norm=0.25): 32 | """ 33 | Clips gradient norm of parameters. 34 | """ 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION 38 | 39 | 40 | class Adam(Optimizer): 41 | def __init__( 42 | self, 43 | params, 44 | lr=0.01, 45 | beta1=0.9, 46 | beta2=0.999, 47 | eps=1e-8, 48 | weight_decay=0.0, 49 | ): 50 | super().__init__(params) 51 | self.lr = lr 52 | self.beta1 = beta1 53 | self.beta2 = beta2 54 | self.eps = eps 55 | self.weight_decay = weight_decay 56 | self.t = 0 57 | 58 | self.m = {} 59 | self.v = {} 60 | 61 | def step(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | -------------------------------------------------------------------------------- /src/ndarray_backend_cpu.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace needle { 10 | namespace cpu { 11 | 12 | #define ALIGNMENT 256 13 | #define TILE 8 14 | typedef float scalar_t; 15 | const size_t ELEM_SIZE = sizeof(scalar_t); 16 | 17 | 18 | /** 19 | * This is a utility structure for maintaining an array aligned to ALIGNMENT boundaries in 20 | * memory. This alignment should be at least TILE * ELEM_SIZE, though we make it even larger 21 | * here by default. 
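Stepping back to the optimizer stubs in optim.py above: the `step()` methods are meant to implement the usual SGD-with-momentum and Adam updates. The sketch below uses EMA-style momentum, folds weight decay into the gradient, and applies Adam bias correction; those conventions, and the assumption that `PowerScalar` accepts a fractional exponent for the square root, are choices of this sketch rather than facts from the stub.

```python
# Hedged sketches of SGD.step and Adam.step (not the reference solution).
def sgd_step_sketch(self):
    for i, p in enumerate(self.params):
        if p.grad is None:
            continue
        grad = p.grad.detach() + p.detach() * self.weight_decay
        u = grad * (1 - self.momentum) + self.u.get(i, 0) * self.momentum
        self.u[i] = u
        p.data = p.data - u * self.lr

def adam_step_sketch(self):
    self.t += 1
    for i, p in enumerate(self.params):
        if p.grad is None:
            continue
        grad = p.grad.detach() + p.detach() * self.weight_decay
        m = grad * (1 - self.beta1) + self.m.get(i, 0) * self.beta1
        v = grad * grad * (1 - self.beta2) + self.v.get(i, 0) * self.beta2
        self.m[i], self.v[i] = m, v
        m_hat = m / (1 - self.beta1 ** self.t)        # bias correction
        v_hat = v / (1 - self.beta2 ** self.t)
        p.data = p.data - m_hat * self.lr / (v_hat ** 0.5 + self.eps)
```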
22 | */ 23 | struct AlignedArray { 24 | AlignedArray(const size_t size) { 25 | int ret = posix_memalign((void**)&ptr, ALIGNMENT, size * ELEM_SIZE); 26 | if (ret != 0) throw std::bad_alloc(); 27 | this->size = size; 28 | } 29 | ~AlignedArray() { free(ptr); } 30 | size_t ptr_as_int() {return (size_t)ptr; } 31 | scalar_t* ptr; 32 | size_t size; 33 | }; 34 | 35 | 36 | 37 | void Fill(AlignedArray* out, scalar_t val) { 38 | /** 39 | * Fill the values of an aligned array with val 40 | */ 41 | for (int i = 0; i < out->size; i++) { 42 | out->ptr[i] = val; 43 | } 44 | } 45 | 46 | 47 | 48 | void Compact(const AlignedArray& a, AlignedArray* out, std::vector shape, 49 | std::vector strides, size_t offset) { 50 | /** 51 | * Compact an array in memory 52 | * 53 | * Args: 54 | * a: non-compact representation of the array, given as input 55 | * out: compact version of the array to be written 56 | * shape: shapes of each dimension for a and out 57 | * strides: strides of the *a* array (not out, which has compact strides) 58 | * offset: offset of the *a* array (not out, which has zero offset, being compact) 59 | * 60 | * Returns: 61 | * void (you need to modify out directly, rather than returning anything; this is true for all the 62 | * function will implement here, so we won't repeat this note.) 63 | */ 64 | /// BEGIN SOLUTION 65 | assert(false && "Not Implemented"); 66 | /// END SOLUTION 67 | } 68 | 69 | void EwiseSetitem(const AlignedArray& a, AlignedArray* out, std::vector shape, 70 | std::vector strides, size_t offset) { 71 | /** 72 | * Set items in a (non-compact) array 73 | * 74 | * Args: 75 | * a: _compact_ array whose items will be written to out 76 | * out: non-compact array whose items are to be written 77 | * shape: shapes of each dimension for a and out 78 | * strides: strides of the *out* array (not a, which has compact strides) 79 | * offset: offset of the *out* array (not a, which has zero offset, being compact) 80 | */ 81 | /// BEGIN SOLUTION 82 | assert(false && "Not Implemented"); 83 | /// END SOLUTION 84 | } 85 | 86 | void ScalarSetitem(const size_t size, scalar_t val, AlignedArray* out, std::vector shape, 87 | std::vector strides, size_t offset) { 88 | /** 89 | * Set items is a (non-compact) array 90 | * 91 | * Args: 92 | * size: number of elements to write in out array (note that this will note be the same as 93 | * out.size, because out is a non-compact subset array); it _will_ be the same as the 94 | * product of items in shape, but convenient to just pass it here. 95 | * val: scalar value to write to 96 | * out: non-compact array whose items are to be written 97 | * shape: shapes of each dimension of out 98 | * strides: strides of the out array 99 | * offset: offset of the out array 100 | */ 101 | 102 | /// BEGIN SOLUTION 103 | assert(false && "Not Implemented"); 104 | /// END SOLUTION 105 | } 106 | 107 | void EwiseAdd(const AlignedArray& a, const AlignedArray& b, AlignedArray* out) { 108 | /** 109 | * Set entries in out to be the sum of correspondings entires in a and b. 110 | */ 111 | for (size_t i = 0; i < a.size; i++) { 112 | out->ptr[i] = a.ptr[i] + b.ptr[i]; 113 | } 114 | } 115 | 116 | void ScalarAdd(const AlignedArray& a, scalar_t val, AlignedArray* out) { 117 | /** 118 | * Set entries in out to be the sum of corresponding entry in a plus the scalar val. 
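The three strided functions above (Compact, EwiseSetitem, ScalarSetitem) all hinge on the same calculation: enumerating the elements of a strided view in row-major order and mapping each one to a flat memory offset. A Python sketch of that logic, kept in NumPy terms for clarity; the C++ loops would mirror it, and ScalarSetitem simply writes `val` at every destination offset instead of copying from `a`.

```python
import numpy as np

def strided_offsets(shape, strides, offset):
    # yield the flat memory offset of each element of the strided view,
    # enumerated in row-major (compact) order
    for idx in np.ndindex(*shape):
        yield offset + sum(i * s for i, s in zip(idx, strides))

def compact_sketch(a, out, shape, strides, offset):
    # out is compact: its j-th element is the j-th element of the strided view of a
    for j, src in enumerate(strided_offsets(shape, strides, offset)):
        out[j] = a[src]

def ewise_setitem_sketch(a, out, shape, strides, offset):
    # the mirror image: read compactly from a, write strided into out
    for j, dst in enumerate(strided_offsets(shape, strides, offset)):
        out[dst] = a[j]
```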
119 | */ 120 | for (size_t i = 0; i < a.size; i++) { 121 | out->ptr[i] = a.ptr[i] + val; 122 | } 123 | } 124 | 125 | 126 | /** 127 | * In the code the follows, use the above template to create analogous element-wise 128 | * and and scalar operators for the following functions. See the numpy backend for 129 | * examples of how they should work. 130 | * - EwiseMul, ScalarMul 131 | * - EwiseDiv, ScalarDiv 132 | * - ScalarPower 133 | * - EwiseMaximum, ScalarMaximum 134 | * - EwiseEq, ScalarEq 135 | * - EwiseGe, ScalarGe 136 | * - EwiseLog 137 | * - EwiseExp 138 | * - EwiseTanh 139 | * 140 | * If you implement all these naively, there will be a lot of repeated code, so 141 | * you are welcome (but not required), to use macros or templates to define these 142 | * functions (however you want to do so, as long as the functions match the proper) 143 | * signatures above. 144 | */ 145 | 146 | 147 | void Matmul(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m, uint32_t n, 148 | uint32_t p) { 149 | /** 150 | * Multiply two (compact) matrices into an output (also compact) matrix. For this implementation 151 | * you can use the "naive" three-loop algorithm. 152 | * 153 | * Args: 154 | * a: compact 2D array of size m x n 155 | * b: compact 2D array of size n x p 156 | * out: compact 2D array of size m x p to write the output to 157 | * m: rows of a / out 158 | * n: columns of a / rows of b 159 | * p: columns of b / out 160 | */ 161 | 162 | /// BEGIN SOLUTION 163 | assert(false && "Not Implemented"); 164 | /// END SOLUTION 165 | } 166 | 167 | inline void AlignedDot(const float* __restrict__ a, 168 | const float* __restrict__ b, 169 | float* __restrict__ out) { 170 | 171 | /** 172 | * Multiply together two TILE x TILE matrices, and _add _the result to out (it is important to add 173 | * the result to the existing out, which you should not set to zero beforehand). We are including 174 | * the compiler flags here that enable the compile to properly use vector operators to implement 175 | * this function. Specifically, the __restrict__ keyword indicates to the compile that a, b, and 176 | * out don't have any overlapping memory (which is necessary in order for vector operations to be 177 | * equivalent to their non-vectorized counterparts (imagine what could happen otherwise if a, b, 178 | * and out had overlapping memory). Similarly the __builtin_assume_aligned keyword tells the 179 | * compiler that the input array will be aligned to the appropriate blocks in memory, which also 180 | * helps the compiler vectorize the code. 181 | * 182 | * Args: 183 | * a: compact 2D array of size TILE x TILE 184 | * b: compact 2D array of size TILE x TILE 185 | * out: compact 2D array of size TILE x TILE to write to 186 | */ 187 | 188 | a = (const float*)__builtin_assume_aligned(a, TILE * ELEM_SIZE); 189 | b = (const float*)__builtin_assume_aligned(b, TILE * ELEM_SIZE); 190 | out = (float*)__builtin_assume_aligned(out, TILE * ELEM_SIZE); 191 | 192 | /// BEGIN SOLUTION 193 | assert(false && "Not Implemented"); 194 | /// END SOLUTION 195 | } 196 | 197 | void MatmulTiled(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m, 198 | uint32_t n, uint32_t p) { 199 | /** 200 | * Matrix multiplication on tiled representations of array. In this setting, a, b, and out 201 | * are all *4D* compact arrays of the appropriate size, e.g. 
a is an array of size 202 | * a[m/TILE][n/TILE][TILE][TILE] 203 | * You should do the multiplication tile-by-tile to improve performance of the array (i.e., this 204 | * function should call `AlignedDot()` implemented above). 205 | * 206 | * Note that this function will only be called when m, n, p are all multiples of TILE, so you can 207 | * assume that this division happens without any remainder. 208 | * 209 | * Args: 210 | * a: compact 4D array of size m/TILE x n/TILE x TILE x TILE 211 | * b: compact 4D array of size n/TILE x p/TILE x TILE x TILE 212 | * out: compact 4D array of size m/TILE x p/TILE x TILE x TILE to write to 213 | * m: rows of a / out 214 | * n: columns of a / rows of b 215 | * p: columns of b / out 216 | * 217 | */ 218 | /// BEGIN SOLUTION 219 | assert(false && "Not Implemented"); 220 | /// END SOLUTION 221 | } 222 | 223 | void ReduceMax(const AlignedArray& a, AlignedArray* out, size_t reduce_size) { 224 | /** 225 | * Reduce by taking maximum over `reduce_size` contiguous blocks. 226 | * 227 | * Args: 228 | * a: compact array of size a.size = out.size * reduce_size to reduce over 229 | * out: compact array to write into 230 | * reduce_size: size of the dimension to reduce over 231 | */ 232 | 233 | /// BEGIN SOLUTION 234 | assert(false && "Not Implemented"); 235 | /// END SOLUTION 236 | } 237 | 238 | void ReduceSum(const AlignedArray& a, AlignedArray* out, size_t reduce_size) { 239 | /** 240 | * Reduce by taking sum over `reduce_size` contiguous blocks. 241 | * 242 | * Args: 243 | * a: compact array of size a.size = out.size * reduce_size to reduce over 244 | * out: compact array to write into 245 | * reduce_size: size of the dimension to reduce over 246 | */ 247 | 248 | /// BEGIN SOLUTION 249 | assert(false && "Not Implemented"); 250 | /// END SOLUTION 251 | } 252 | 253 | } // namespace cpu 254 | } // namespace needle 255 | 256 | PYBIND11_MODULE(ndarray_backend_cpu, m) { 257 | namespace py = pybind11; 258 | using namespace needle; 259 | using namespace cpu; 260 | 261 | m.attr("__device_name__") = "cpu"; 262 | m.attr("__tile_size__") = TILE; 263 | 264 | py::class_(m, "Array") 265 | .def(py::init(), py::return_value_policy::take_ownership) 266 | .def("ptr", &AlignedArray::ptr_as_int) 267 | .def_readonly("size", &AlignedArray::size); 268 | 269 | // return numpy array (with copying for simplicity, otherwise garbage 270 | // collection is a pain) 271 | m.def("to_numpy", [](const AlignedArray& a, std::vector shape, 272 | std::vector strides, size_t offset) { 273 | std::vector numpy_strides = strides; 274 | std::transform(numpy_strides.begin(), numpy_strides.end(), numpy_strides.begin(), 275 | [](size_t& c) { return c * ELEM_SIZE; }); 276 | return py::array_t(shape, numpy_strides, a.ptr + offset); 277 | }); 278 | 279 | // convert from numpy (with copying) 280 | m.def("from_numpy", [](py::array_t a, AlignedArray* out) { 281 | std::memcpy(out->ptr, a.request().ptr, out->size * ELEM_SIZE); 282 | }); 283 | 284 | m.def("fill", Fill); 285 | m.def("compact", Compact); 286 | m.def("ewise_setitem", EwiseSetitem); 287 | m.def("scalar_setitem", ScalarSetitem); 288 | m.def("ewise_add", EwiseAdd); 289 | m.def("scalar_add", ScalarAdd); 290 | 291 | m.def("ewise_mul", EwiseMul); 292 | m.def("scalar_mul", ScalarMul); 293 | m.def("ewise_div", EwiseDiv); 294 | m.def("scalar_div", ScalarDiv); 295 | m.def("scalar_power", ScalarPower); 296 | 297 | m.def("ewise_maximum", EwiseMaximum); 298 | m.def("scalar_maximum", ScalarMaximum); 299 | m.def("ewise_eq", EwiseEq); 300 | m.def("scalar_eq", 
ScalarEq); 301 | m.def("ewise_ge", EwiseGe); 302 | m.def("scalar_ge", ScalarGe); 303 | 304 | m.def("ewise_log", EwiseLog); 305 | m.def("ewise_exp", EwiseExp); 306 | m.def("ewise_tanh", EwiseTanh); 307 | 308 | m.def("matmul", Matmul); 309 | m.def("matmul_tiled", MatmulTiled); 310 | 311 | m.def("reduce_max", ReduceMax); 312 | m.def("reduce_sum", ReduceSum); 313 | } 314 | -------------------------------------------------------------------------------- /src/ndarray_backend_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | namespace needle { 10 | namespace cuda { 11 | 12 | #define BASE_THREAD_NUM 256 13 | 14 | #define TILE 4 15 | typedef float scalar_t; 16 | const size_t ELEM_SIZE = sizeof(scalar_t); 17 | 18 | struct CudaArray { 19 | CudaArray(const size_t size) { 20 | cudaError_t err = cudaMalloc(&ptr, size * ELEM_SIZE); 21 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); 22 | this->size = size; 23 | } 24 | ~CudaArray() { cudaFree(ptr); } 25 | size_t ptr_as_int() { return (size_t)ptr; } 26 | 27 | scalar_t* ptr; 28 | size_t size; 29 | }; 30 | 31 | struct CudaDims { 32 | dim3 block, grid; 33 | }; 34 | 35 | CudaDims CudaOneDim(size_t size) { 36 | /** 37 | * Utility function to get cuda dimensions for 1D call 38 | */ 39 | CudaDims dim; 40 | size_t num_blocks = (size + BASE_THREAD_NUM - 1) / BASE_THREAD_NUM; 41 | dim.block = dim3(BASE_THREAD_NUM, 1, 1); 42 | dim.grid = dim3(num_blocks, 1, 1); 43 | return dim; 44 | } 45 | 46 | #define MAX_VEC_SIZE 8 47 | struct CudaVec { 48 | uint32_t size; 49 | int32_t data[MAX_VEC_SIZE]; 50 | }; 51 | 52 | CudaVec VecToCuda(const std::vector& x) { 53 | CudaVec shape; 54 | if (x.size() > MAX_VEC_SIZE) throw std::runtime_error("Exceeded CUDA supported max dimesions"); 55 | shape.size = x.size(); 56 | for (size_t i = 0; i < x.size(); i++) { 57 | shape.data[i] = x[i]; 58 | } 59 | return shape; 60 | } 61 | 62 | //////////////////////////////////////////////////////////////////////////////// 63 | // Fill call 64 | //////////////////////////////////////////////////////////////////////////////// 65 | 66 | __global__ void FillKernel(scalar_t* out, scalar_t val, size_t size) { 67 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 68 | if (gid < size) out[gid] = val; 69 | } 70 | 71 | void Fill(CudaArray* out, scalar_t val) { 72 | CudaDims dim = CudaOneDim(out->size); 73 | FillKernel<<>>(out->ptr, val, out->size); 74 | } 75 | 76 | //////////////////////////////////////////////////////////////////////////////// 77 | // Compact and setitem cals 78 | //////////////////////////////////////////////////////////////////////////////// 79 | 80 | // Untility function to convert contiguous index i to memory location from strides 81 | 82 | 83 | 84 | __global__ void CompactKernel(const scalar_t* a, scalar_t* out, size_t size, CudaVec shape, 85 | CudaVec strides, size_t offset) { 86 | /** 87 | * The CUDA kernel for the compact opeation. This should effectively map a single entry in the 88 | * non-compact input a, to the corresponding item (at location gid) in the compact array out. 
89 | * 90 | * Args: 91 | * a: CUDA pointer to the a array 92 | * out: CUDA pointer to the out array 93 | * size: size of the out array 94 | * shape: vector of shapes of a and out arrays (of type CudaVec, for passing to the CUDA kernel) 95 | * strides: vector of strides of the *a* array 96 | * offset: offset of the *a* array 97 | */ 98 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 99 | 100 | /// BEGIN SOLUTION 101 | assert(false && "Not Implemented"); 102 | /// END SOLUTION 103 | } 104 | 105 | void Compact(const CudaArray& a, CudaArray* out, std::vector<int32_t> shape, 106 | std::vector<int32_t> strides, size_t offset) { 107 | /** 108 | * Compact an array in memory. Unlike the C++ version, in CUDA this will primarily call the 109 | * relevant CUDA kernel. In this case, we illustrate how you should set this up (i.e., we give 110 | * you the code for this function, and also the prototype for the CompactKernel() function). For 111 | * the functions after this, however, you'll need to define these kernels as you see fit to 112 | * execute the underlying function. 113 | * 114 | * Args: 115 | * a: non-compact representation of the array, given as input 116 | * out: compact version of the array to be written 117 | * shape: shapes of each dimension for a and out 118 | * strides: strides of the *a* array (not out, which has compact strides) 119 | * offset: offset of the *a* array (not out, which has zero offset, being compact) 120 | */ 121 | 122 | // Nothing needs to be added here 123 | CudaDims dim = CudaOneDim(out->size); 124 | CompactKernel<<<dim.grid, dim.block>>>(a.ptr, out->ptr, out->size, VecToCuda(shape), 125 | VecToCuda(strides), offset); 126 | } 127 | 128 | 129 | 130 | void EwiseSetitem(const CudaArray& a, CudaArray* out, std::vector<int32_t> shape, 131 | std::vector<int32_t> strides, size_t offset) { 132 | /** 133 | * Set items in a (non-compact) array using CUDA. You will most likely want to implement an 134 | * EwiseSetitemKernel() function, similar to those above, that will do the actual work. 135 | * 136 | * Args: 137 | * a: _compact_ array whose items will be written to out 138 | * out: non-compact array whose items are to be written 139 | * shape: shapes of each dimension for a and out 140 | * strides: strides of the *out* array (not a, which has compact strides) 141 | * offset: offset of the *out* array (not a, which has zero offset, being compact) 142 | */ 143 | /// BEGIN SOLUTION 144 | assert(false && "Not Implemented"); 145 | /// END SOLUTION 146 | } 147 | 148 | 149 | 150 | void ScalarSetitem(size_t size, scalar_t val, CudaArray* out, std::vector<int32_t> shape, 151 | std::vector<int32_t> strides, size_t offset) { 152 | /** 153 | * Set items in a (non-compact) array 154 | * 155 | * Args: 156 | * size: number of elements to write in the out array (note that this will not be the same as 157 | * out.size, because out is a non-compact subset array); it _will_ be the same as the 158 | * product of items in shape, but it is convenient to just pass it here.
159 | * val: scalar value to write 160 | * out: non-compact array whose items are to be written 161 | * shape: shapes of each dimension of out 162 | * strides: strides of the out array 163 | * offset: offset of the out array 164 | */ 165 | /// BEGIN SOLUTION 166 | assert(false && "Not Implemented"); 167 | /// END SOLUTION 168 | } 169 | 170 | //////////////////////////////////////////////////////////////////////////////// 171 | // Elementwise and scalar operations 172 | //////////////////////////////////////////////////////////////////////////////// 173 | 174 | 175 | __global__ void EwiseAddKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) { 176 | // Calculate the global index of the thread. 177 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 178 | if (gid < size) out[gid] = a[gid] + b[gid]; 179 | } 180 | 181 | void EwiseAdd(const CudaArray& a, const CudaArray& b, CudaArray* out) { 182 | /** 183 | * Add together two CUDA arrays. 184 | * Args: 185 | * a: Input array 'a' to be added 186 | * b: Input array 'b' to be added 187 | * out: Output array to store the result of 'a + b' 188 | */ 189 | CudaDims dim = CudaOneDim(out->size); 190 | 191 | // Kernel will execute on 'dim.grid' blocks, each containing 'dim.block' threads. 192 | EwiseAddKernel<<<dim.grid, dim.block>>>(a.ptr, b.ptr, out->ptr, out->size); 193 | } 194 | 195 | __global__ void ScalarAddKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) { 196 | // Calculate the global index of the thread. 197 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 198 | if (gid < size) out[gid] = a[gid] + val; 199 | } 200 | 201 | void ScalarAdd(const CudaArray& a, scalar_t val, CudaArray* out) { 202 | /** 203 | * Add a scalar value to every element of a CUDA array. 204 | * Args: 205 | * a: Input array 'a' 206 | * val: Scalar value to be added 207 | * out: Output array to store the result of 'a + val' 208 | */ 209 | CudaDims dim = CudaOneDim(out->size); 210 | 211 | // Launch the ScalarAddKernel that will add the scalar 'val' to each element of array 'a', 212 | // and store the result in array 'out'. 213 | ScalarAddKernel<<<dim.grid, dim.block>>>(a.ptr, val, out->ptr, out->size); 214 | } 215 | 216 | /** 217 | * In the code that follows, use the above template to create analogous elementwise 218 | * and scalar operators for the following functions. See the numpy backend for 219 | * examples of how they should work. 220 | * - EwiseMul, ScalarMul 221 | * - EwiseDiv, ScalarDiv 222 | * - ScalarPower 223 | * - EwiseMaximum, ScalarMaximum 224 | * - EwiseEq, ScalarEq 225 | * - EwiseGe, ScalarGe 226 | * - EwiseLog 227 | * - EwiseExp 228 | * - EwiseTanh 229 | * 230 | * If you implement all these naively, there will be a lot of repeated code, so 231 | * you are welcome (but not required) to use macros or templates to define these 232 | * functions (however you want to do so, as long as the functions match the proper 233 | * signatures above). 234 | */ 235 | 236 | 237 | //////////////////////////////////////////////////////////////////////////////// 238 | // Matmul 239 | //////////////////////////////////////////////////////////////////////////////// 240 | 241 | 242 | void Matmul(const CudaArray& a, const CudaArray& b, CudaArray* out, uint32_t M, uint32_t N, 243 | uint32_t P) { 244 | /** 245 | * Multiply two (compact) matrices into an output (also compact) matrix. You will want to look 246 | * at the lecture and notes on GPU-based linear algebra to see how to do this.
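 *
 * For orientation only, one assumed (and deliberately unoptimized) possibility is a kernel that
 * assigns one thread to each output entry:
 *
 *   __global__ void MatmulKernel(const scalar_t* a, const scalar_t* b, scalar_t* out,
 *                                uint32_t M, uint32_t N, uint32_t P) {
 *     size_t i = blockIdx.y * blockDim.y + threadIdx.y;   // row of out
 *     size_t j = blockIdx.x * blockDim.x + threadIdx.x;   // column of out
 *     if (i < M && j < P) {
 *       scalar_t sum = 0;
 *       for (size_t k = 0; k < N; k++) sum += a[i * N + k] * b[k * P + j];
 *       out[i * P + j] = sum;
 *     }
 *   }
 *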
Since ultimately 247 | * mugrade is just evaluating correctness, you _can_ implement a version that simply parallelizes 248 | * over (i,j) entries in the output array. However, to really get the full benefit of this 249 | * problem, we would encourage you to use cooperative fetching, shared memory and register tiling, 250 | * and other ideas covered in the class notes. Note that unlike the tiled matmul function in 251 | * the CPU backend, here you should implement a single function that works across all matrix sizes, 252 | * whether or not they are a multiple of a tile size. As with previous CUDA 253 | * implementations, this function here will largely just set up the kernel call, and you should 254 | * implement the logic in a separate MatmulKernel() call. 255 | * 256 | * 257 | * Args: 258 | * a: compact 2D array of size m x n 259 | * b: compact 2D array of size n x p 260 | * out: compact 2D array of size m x p to write the output to 261 | * M: rows of a / out 262 | * N: columns of a / rows of b 263 | * P: columns of b / out 264 | */ 265 | 266 | /// BEGIN SOLUTION 267 | assert(false && "Not Implemented"); 268 | /// END SOLUTION 269 | } 270 | 271 | //////////////////////////////////////////////////////////////////////////////// 272 | // Max and sum reductions 273 | //////////////////////////////////////////////////////////////////////////////// 274 | 275 | 276 | void ReduceMax(const CudaArray& a, CudaArray* out, size_t reduce_size) { 277 | /** 278 | * Reduce by taking maximum over `reduce_size` contiguous blocks. Even though it is inefficient, 279 | * for simplicity you can perform each reduction in a single CUDA thread. 280 | * 281 | * Args: 282 | * a: compact array of size a.size = out.size * reduce_size to reduce over 283 | * out: compact array to write into 284 | * reduce_size: size of the dimension to reduce over 285 | */ 286 | /// BEGIN SOLUTION 287 | assert(false && "Not Implemented"); 288 | /// END SOLUTION 289 | } 290 | 291 | 292 | 293 | void ReduceSum(const CudaArray& a, CudaArray* out, size_t reduce_size) { 294 | /** 295 | * Reduce by taking summation over `reduce_size` contiguous blocks. Again, for simplicity you 296 | * can perform each reduction in a single CUDA thread.
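 *
 * As an assumed illustration of that one-thread-per-output approach (the analogous ReduceMax
 * kernel would track a running maximum instead of a running sum):
 *
 *   __global__ void ReduceSumKernel(const scalar_t* a, scalar_t* out, size_t reduce_size,
 *                                   size_t out_size) {
 *     size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (gid < out_size) {
 *       scalar_t sum = 0;
 *       for (size_t k = 0; k < reduce_size; k++) sum += a[gid * reduce_size + k];
 *       out[gid] = sum;
 *     }
 *   }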
297 | * 298 | * Args: 299 | * a: compact array of size a.size = out.size * reduce_size to reduce over 300 | * out: compact array to write into 301 | * reduce_size: size of the dimension to reduce over 302 | */ 303 | /// BEGIN SOLUTION 304 | assert(false && "Not Implemented"); 305 | /// END SOLUTION 306 | } 307 | 308 | } // namespace cuda 309 | } // namespace needle 310 | 311 | PYBIND11_MODULE(ndarray_backend_cuda, m) { 312 | namespace py = pybind11; 313 | using namespace needle; 314 | using namespace cuda; 315 | 316 | m.attr("__device_name__") = "cuda"; 317 | m.attr("__tile_size__") = TILE; 318 | 319 | py::class_<CudaArray>(m, "Array") 320 | .def(py::init<size_t>(), py::return_value_policy::take_ownership) 321 | .def_readonly("size", &CudaArray::size) 322 | .def("ptr", &CudaArray::ptr_as_int); 323 | 324 | // return numpy array, copying from CPU 325 | m.def("to_numpy", [](const CudaArray& a, std::vector<size_t> shape, std::vector<size_t> strides, 326 | size_t offset) { 327 | std::vector<size_t> numpy_strides = strides; 328 | std::transform(numpy_strides.begin(), numpy_strides.end(), numpy_strides.begin(), 329 | [](size_t& c) { return c * ELEM_SIZE; }); 330 | 331 | // copy memory to host 332 | scalar_t* host_ptr = (scalar_t*)std::malloc(a.size * ELEM_SIZE); 333 | if (host_ptr == 0) throw std::bad_alloc(); 334 | cudaError_t err = cudaMemcpy(host_ptr, a.ptr, a.size * ELEM_SIZE, cudaMemcpyDeviceToHost); 335 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); 336 | 337 | // return numpy array 338 | py::capsule deallocate_buffer(host_ptr, [](void* p) { free(p); }); 339 | return py::array_t<scalar_t>(shape, numpy_strides, host_ptr + offset, deallocate_buffer); 340 | }); 341 | 342 | // copy numpy array to GPU 343 | m.def("from_numpy", [](py::array_t<scalar_t> a, CudaArray* out) { 344 | cudaError_t err = 345 | cudaMemcpy(out->ptr, a.request().ptr, out->size * ELEM_SIZE, cudaMemcpyHostToDevice); 346 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); 347 | }); 348 | 349 | m.def("fill", Fill); 350 | m.def("compact", Compact); 351 | m.def("ewise_setitem", EwiseSetitem); 352 | m.def("scalar_setitem", ScalarSetitem); 353 | m.def("ewise_add", EwiseAdd); 354 | m.def("scalar_add", ScalarAdd); 355 | 356 | m.def("ewise_mul", EwiseMul); 357 | m.def("scalar_mul", ScalarMul); 358 | m.def("ewise_div", EwiseDiv); 359 | m.def("scalar_div", ScalarDiv); 360 | m.def("scalar_power", ScalarPower); 361 | 362 | m.def("ewise_maximum", EwiseMaximum); 363 | m.def("scalar_maximum", ScalarMaximum); 364 | m.def("ewise_eq", EwiseEq); 365 | m.def("scalar_eq", ScalarEq); 366 | m.def("ewise_ge", EwiseGe); 367 | m.def("scalar_ge", ScalarGe); 368 | 369 | m.def("ewise_log", EwiseLog); 370 | m.def("ewise_exp", EwiseExp); 371 | m.def("ewise_tanh", EwiseTanh); 372 | 373 | m.def("matmul", Matmul); 374 | 375 | m.def("reduce_max", ReduceMax); 376 | m.def("reduce_sum", ReduceSum); 377 | } 378 | -------------------------------------------------------------------------------- /tests/hw4/test_cifar_ptb_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | import itertools 4 | import numpy as np 5 | import pytest 6 | import mugrade 7 | 8 | import needle as ndl 9 | from needle import backend_ndarray as nd 10 | 11 | 12 | np.random.seed(2) 13 | 14 | 15 | _DEVICES = [ndl.cpu(), pytest.param(ndl.cuda(), 16 | marks=pytest.mark.skipif(not ndl.cuda().enabled(), reason="No GPU"))] 17 | 18 | 19 | TRAIN = [True, False] 20 | @pytest.mark.parametrize("train", TRAIN) 21 | def
test_cifar10_dataset(train): 22 | dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=train) 23 | if train: 24 | assert len(dataset) == 50000 25 | else: 26 | assert len(dataset) == 10000 27 | example = dataset[np.random.randint(len(dataset))] 28 | assert(isinstance(example, tuple)) 29 | X, y = example 30 | assert isinstance(X, np.ndarray) 31 | assert X.shape == (3, 32, 32) 32 | 33 | 34 | BATCH_SIZES = [1, 15] 35 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 36 | @pytest.mark.parametrize("train", TRAIN) 37 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 38 | def test_cifar10_loader(batch_size, train, device): 39 | cifar10_train_dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=True) 40 | train_loader = ndl.data.DataLoader(cifar10_train_dataset, batch_size) 41 | for (X, y) in train_loader: 42 | break 43 | assert isinstance(X.cached_data, nd.NDArray) 44 | assert isinstance(X, ndl.Tensor) 45 | assert isinstance(y, ndl.Tensor) 46 | assert X.dtype == 'float32' 47 | 48 | 49 | BPTT = [3, 32] 50 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 51 | @pytest.mark.parametrize("bptt", BPTT) 52 | @pytest.mark.parametrize("train", TRAIN) 53 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 54 | def test_ptb_dataset(batch_size, bptt, train, device): 55 | # TODO update with more tests? 56 | corpus = ndl.data.Corpus("data/ptb") 57 | if train: 58 | data = ndl.data.batchify(corpus.train, batch_size, device=device, dtype="float32") 59 | else: 60 | data = ndl.data.batchify(corpus.test, batch_size, device=device, dtype="float32") 61 | X, y = ndl.data.get_batch(data, np.random.randint(len(data)), bptt, device=device) 62 | assert X.shape == (bptt, batch_size) 63 | assert y.shape == (bptt * batch_size,) 64 | assert isinstance(X, ndl.Tensor) 65 | assert X.dtype == 'float32' 66 | assert X.device == device 67 | assert isinstance(X.cached_data, nd.NDArray) 68 | ntokens = len(corpus.dictionary) 69 | assert ntokens == 10000 70 | 71 | 72 | ### MUGRADE ### 73 | 74 | TEST_BATCH_SIZES = [3, 5] 75 | TEST_BPTT = [6, 10] 76 | 77 | def mugrade_submit(x): 78 | if isinstance(x, np.ndarray): 79 | x = x.flatten()[:128] 80 | #print(x) 81 | mugrade.submit(x) 82 | else: 83 | #print(x) 84 | mugrade.submit(x) 85 | 86 | 87 | def submit_cifar10(): 88 | if not ndl.cuda().enabled(): 89 | print('You need a GPU to run some of these tests.') 90 | devices = [ndl.cpu(), ndl.cuda()] 91 | for train in TRAIN: 92 | dataset = ndl.data.CIFAR10Dataset("data/cifar-10-batches-py", train=train) 93 | mugrade_submit(len(dataset)) 94 | for (device, batch_size) in itertools.product(devices, TEST_BATCH_SIZES): 95 | loader = ndl.data.DataLoader(dataset, batch_size) 96 | for (X, y) in loader: 97 | break 98 | mugrade_submit(X.numpy()[0, :, :, :]) 99 | mugrade_submit(y.numpy()[0]) 100 | 101 | 102 | def submit_ptb(): 103 | # devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 104 | devices = [ndl.cpu(), ndl.cuda()] 105 | 106 | corpus = ndl.data.Corpus("data/ptb") 107 | mugrade_submit(np.array(len(corpus.dictionary))) 108 | for train in TRAIN: 109 | for (device, batch_size, bptt) in itertools.product(devices, TEST_BATCH_SIZES, TEST_BPTT): 110 | if train: 111 | data = ndl.data.batchify(corpus.train, batch_size, device=device, dtype="float32") 112 | else: 113 | data = ndl.data.batchify(corpus.test, batch_size, device=device, dtype="float32") 114 | X, y = ndl.data.get_batch(data, np.random.randint(len(data)), bptt) 115 | mugrade_submit(np.array(len(data))) 116 | 
mugrade_submit(X.numpy()[0, :]) 117 | mugrade_submit(y.numpy()[0]) 118 | 119 | 120 | if __name__ == "__main__": 121 | submit_cifar10() 122 | submit_ptb() -------------------------------------------------------------------------------- /tests/hw4/test_conv.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | import numpy as np 4 | import pytest 5 | from needle import backend_ndarray as nd 6 | import needle as ndl 7 | import mugrade 8 | import itertools 9 | 10 | 11 | _DEVICES = [ndl.cpu(), pytest.param(ndl.cuda(), 12 | marks=pytest.mark.skipif(not ndl.cuda().enabled(), reason="No GPU"))] 13 | 14 | def backward_check(f, *args, **kwargs): 15 | eps = 1e-3 16 | out = f(*args, **kwargs) 17 | c = np.random.randn(*out.shape) 18 | is_stacked = False 19 | if isinstance(args[0], list): 20 | args = args[0] 21 | is_stacked = True 22 | numerical_grad = [np.zeros(a.shape) for a in args] 23 | num_args = len(args) 24 | for i in range(num_args): 25 | for j in range(args[i].realize_cached_data().size): 26 | args[i].realize_cached_data().flat[j] += eps 27 | if is_stacked: 28 | f1 = (f(args, **kwargs).numpy() * c).sum() 29 | else: 30 | f1 = (f(*args, **kwargs).numpy() * c).sum() 31 | args[i].realize_cached_data().flat[j] -= 2 * eps 32 | if is_stacked: 33 | f2 = (f(args, **kwargs).numpy() * c).sum() 34 | else: 35 | f2 = (f(*args, **kwargs).numpy() * c).sum() 36 | args[i].realize_cached_data().flat[j] += eps 37 | numerical_grad[i].flat[j] = (f1 - f2) / (2 * eps) 38 | backward_grad = out.op.gradient_as_tuple(ndl.Tensor(c, device=args[0].device), out) 39 | if isinstance(backward_grad[0], ndl.TensorTuple): # TODO keep this? 40 | backward_grad = backward_grad[0].tuple() 41 | error = sum( 42 | np.linalg.norm(backward_grad[i].numpy() - numerical_grad[i]) 43 | for i in range(len(args)) 44 | ) 45 | assert error < 1e-2 46 | return [g.numpy() for g in backward_grad] 47 | 48 | 49 | stack_back_params = [ 50 | ( (3, 4), 3, 0), 51 | ( (3, 4), 3, 1), 52 | ( (3, 4), 3, 2), 53 | ( (3, 4), 5, 2), 54 | ( (3, 4), 1, 2), 55 | ] 56 | @pytest.mark.parametrize("device", _DEVICES) 57 | @pytest.mark.parametrize("shape, n, axis", stack_back_params) 58 | def test_stack_backward(shape, n, axis, device): 59 | np.random.seed(0) 60 | get_tensor = lambda shape: ndl.Tensor(np.random.randn(*shape)*5, device=device) 61 | backward_check(ndl.stack, [get_tensor(shape) for _ in range(n)], axis=axis) 62 | 63 | 64 | stack_params = [ 65 | {"shape": (10,3), "n": 4, "axis": 0}, 66 | {"shape": (4, 5, 6), "n": 5, "axis": 0}, 67 | {"shape": (4, 5, 6), "n": 3, "axis": 1}, 68 | {"shape": (4, 5, 6), "n": 2, "axis": 2} 69 | ] 70 | @pytest.mark.parametrize("device", _DEVICES) 71 | @pytest.mark.parametrize("params", stack_params) 72 | def test_stack_forward(params, device): 73 | np.random.seed(0) 74 | shape, n, axis = params['shape'], params['n'], params['axis'] 75 | to_stack_ndl = [] 76 | to_stack_npy = [] 77 | for i in range(n): 78 | _A = np.random.randn(*shape) 79 | to_stack_ndl += [ndl.Tensor(_A, device=device)] 80 | to_stack_npy += [_A] 81 | 82 | lhs = np.stack(to_stack_npy, axis=axis) 83 | rhs = ndl.stack(to_stack_ndl, axis=axis) 84 | 85 | 86 | pad_params = [ 87 | {"shape": (10, 32, 32, 8), "padding": ( (0, 0), (2, 2), (2, 2), (0, 0) )}, 88 | {"shape": (10, 32, 32, 8), "padding": ( (0, 0), (0, 0), (0, 0), (0, 0) )}, 89 | ] 90 | @pytest.mark.parametrize("device", [nd.cpu()]) 91 | @pytest.mark.parametrize("params", pad_params) 92 | def test_pad_forward(params, device): 93 | 
np.random.seed(0) 94 | shape, padding = params['shape'], params['padding'] 95 | _A = np.random.randn(*shape) 96 | _B = np.pad(_A, padding) 97 | A = nd.NDArray(_A, device=device) 98 | B = A.pad(padding) 99 | 100 | assert np.linalg.norm(A.numpy() - _A) < 1e-4 101 | 102 | 103 | flip_forward_params = [ 104 | {"shape": (10, 5), "axes": (0,)}, 105 | {"shape": (10, 5), "axes": (1,)}, 106 | {"shape": (10, 5), "axes": (0,1)}, 107 | {"shape": (10, 32, 32, 8), "axes": (0,1)}, 108 | {"shape": (3, 3, 6, 8), "axes": (0,1)}, 109 | {"shape": (10, 32, 32, 8), "axes": (1,2)}, 110 | {"shape": (3, 3, 6, 8), "axes": (1,2)}, 111 | {"shape": (10, 32, 32, 8), "axes": (2,3)}, 112 | {"shape": (3, 3, 6, 8), "axes": (2,3)}, 113 | {"shape": (10, 32, 32, 8), "axes": (0,1,2,3)}, 114 | ] 115 | @pytest.mark.parametrize("device", _DEVICES) 116 | @pytest.mark.parametrize("params", flip_forward_params) 117 | def test_flip_forward(params, device): 118 | np.random.seed(0) 119 | shape, axes = params['shape'], params['axes'] 120 | _A = np.random.randn(*shape) 121 | _B = np.flip(_A, axes) 122 | A = ndl.Tensor(_A, device=device) 123 | B = ndl.flip(A, axes=axes) 124 | 125 | assert np.linalg.norm(A.numpy() - _A) < 1e-4 126 | 127 | 128 | flip_backward_params = [ 129 | {"shape": (10, 5), "axes": (0,)}, 130 | {"shape": (10, 5), "axes": (1,)}, 131 | {"shape": (10, 5), "axes": (0,1)}, 132 | {"shape": (2, 3, 3, 8), "axes": (0,1)}, 133 | {"shape": (3, 3, 6, 4), "axes": (0,1)}, 134 | {"shape": (2, 3, 3, 4), "axes": (1,2)}, 135 | {"shape": (3, 3, 6, 4), "axes": (1,2)}, 136 | {"shape": (2, 3, 3, 4), "axes": (2,3)}, 137 | {"shape": (3, 3, 6, 4), "axes": (2,3)}, 138 | {"shape": (2, 3, 3, 4), "axes": (0,1,2,3)}, 139 | ] 140 | @pytest.mark.parametrize("device", _DEVICES) 141 | @pytest.mark.parametrize("params", flip_backward_params) 142 | def test_flip_backward(params, device): 143 | np.random.seed(0) 144 | shape, axes = params['shape'], params['axes'] 145 | backward_check(ndl.flip, ndl.Tensor(np.random.randn(*shape), device=device), axes=axes) 146 | 147 | 148 | # @pytest.mark.parametrize("device", _DEVICES) 149 | # def test_init_calculate_fans(device): 150 | # _A = np.random.randn(3, 3, 16, 8) 151 | # A = ndl.Tensor(_A, device=device) 152 | # assert ndl.init._calculate_fans(A) == (144, 72) 153 | 154 | # _A = np.random.randn(3, 3, 16, 8) 155 | # A = ndl.Tensor(_A, device=device) 156 | # assert ndl.init._calculate_fans(A) == (144, 72) 157 | 158 | 159 | # _A = np.random.randn(16, 8) 160 | # A = ndl.Tensor(_A, device=device) 161 | # assert ndl.init._calculate_fans(A) == (16, 8) 162 | 163 | 164 | @pytest.mark.parametrize("device", _DEVICES) 165 | def test_init_kaiming_uniform(device): 166 | _A = np.random.randn(3, 3, 16, 8) 167 | A = ndl.Tensor(_A, device=device) 168 | np.random.seed(0) 169 | A = ndl.init.kaiming_uniform(16*9, 8*9, shape=A.shape) 170 | assert abs(A.sum().numpy() - -2.5719218) < 1e-4 171 | 172 | 173 | @pytest.mark.parametrize("device", _DEVICES) 174 | def test_resnet9(device): 175 | def num_params(model): 176 | return np.sum([np.prod(x.shape) for x in model.parameters()]) 177 | 178 | from apps.models import ResNet9 179 | np.random.seed(0) 180 | model = ResNet9(device=device) 181 | 182 | assert num_params(model) == 431946 183 | 184 | _A = np.random.randn(2, 3, 32, 32) 185 | A = ndl.Tensor(_A, device=device) 186 | y = model(A) 187 | 188 | assert np.linalg.norm(np.array([[-1.8912625 , 0.64833605, 1.9400386 , 1.1435282 , 1.89777 , 189 | 2.9039745 , -0.10433993, 0.35458302, -0.5684191 , 2.6178317 ], 190 | [-0.2905612 , -0.4147861 , 
0.90268034, 0.46530387, 1.3335679 , 191 | 1.8534894 , -0.1867125 , -2.4298222 , -0.5344223 , 4.362149 ]]) - y.numpy()) < 1e-2 192 | 193 | 194 | 195 | @pytest.mark.parametrize("device", _DEVICES) 196 | def test_dilate_forward(device): 197 | np.random.seed(0) 198 | device = ndl.cpu() 199 | 200 | _A = np.random.randint(1, 10, size=(2, 5)) 201 | A = ndl.Tensor(_A, device=device) 202 | assert np.linalg.norm(ndl.dilate(A, dilation=0, axes=(0,)).numpy() - np.array([[6., 1., 4., 4., 8.], 203 | [4., 6., 3., 5., 8.]])) < 1e-5 204 | 205 | _A = np.random.randint(1, 10, size=(2, 5)) 206 | A = ndl.Tensor(_A, device=device) 207 | assert np.linalg.norm(ndl.dilate(A, dilation=1, axes=(0,)).numpy() - np.array([[7., 9., 9., 2., 7.], 208 | [0., 0., 0., 0., 0.], 209 | [8., 8., 9., 2., 6.], 210 | [0., 0., 0., 0., 0.]])) < 1e-5 211 | 212 | _A = np.random.randint(1, 10, size=(2, 5)) 213 | A = ndl.Tensor(_A, device=device) 214 | assert np.linalg.norm(ndl.dilate(A, dilation=1, axes=(1,)).numpy() - np.array([[9., 0., 5., 0., 4., 0., 1., 0., 4., 0.], 215 | [6., 0., 1., 0., 3., 0., 4., 0., 9., 0.]])) < 1e-5 216 | 217 | _A = np.random.randint(1, 10, size=(2, 5)) 218 | A = ndl.Tensor(_A, device=device) 219 | assert np.linalg.norm(ndl.dilate(A, dilation=1, axes=(0,1)).numpy() - np.array([[2., 0., 4., 0., 4., 0., 4., 0., 8., 0.], 220 | [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], 221 | [1., 0., 2., 0., 1., 0., 5., 0., 8., 0.], 222 | [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])) < 1e-5 223 | 224 | _A = np.random.randint(1, 10, size=(2, 2)) 225 | A = ndl.Tensor(_A, device=device) 226 | assert np.linalg.norm(ndl.dilate(A, dilation=2, axes=(0,1)).numpy() - np.array([[4., 0., 0., 3., 0., 0.], 227 | [0., 0., 0., 0., 0., 0.], 228 | [0., 0., 0., 0., 0., 0.], 229 | [8., 0., 0., 3., 0., 0.], 230 | [0., 0., 0., 0., 0., 0.], 231 | [0., 0., 0., 0., 0., 0.]])) < 1e-5 232 | 233 | _A = np.random.randint(1, 10, size=(2, 2, 2, 2)) 234 | A = ndl.Tensor(_A, device=device) 235 | assert np.linalg.norm(ndl.dilate(A, dilation=1, axes=(1,2)).numpy() - np.array([[[[1., 1.], 236 | [0., 0.], 237 | [5., 6.], 238 | [0., 0.]], 239 | 240 | [[0., 0.], 241 | [0., 0.], 242 | [0., 0.], 243 | [0., 0.]], 244 | 245 | [[6., 7.], 246 | [0., 0.], 247 | [9., 5.], 248 | [0., 0.]], 249 | 250 | [[0., 0.], 251 | [0., 0.], 252 | [0., 0.], 253 | [0., 0.]]], 254 | 255 | 256 | [[[2., 5.], 257 | [0., 0.], 258 | [9., 2.], 259 | [0., 0.]], 260 | 261 | [[0., 0.], 262 | [0., 0.], 263 | [0., 0.], 264 | [0., 0.]], 265 | 266 | [[2., 8.], 267 | [0., 0.], 268 | [4., 7.], 269 | [0., 0.]], 270 | 271 | [[0., 0.], 272 | [0., 0.], 273 | [0., 0.], 274 | [0., 0.]]]])) < 1e-5 275 | 276 | 277 | dilate_backward_params = [ 278 | {"shape": (2, 5), "d": 1, "axes": (0,)}, 279 | {"shape": (2, 5), "d": 2, "axes": (1,)}, 280 | {"shape": (2, 5), "d": 1, "axes": (0,1)}, 281 | {"shape": (2, 5), "d": 0, "axes": (0,1)}, 282 | {"shape": (2, 3, 3, 4), "d": 2, "axes": (0,1)}, 283 | {"shape": (3, 3, 6, 4), "d": 3, "axes": (0,1)}, 284 | {"shape": (2, 3, 3, 4), "d": 0, "axes": (1,2)}, 285 | {"shape": (2, 3, 3, 4), "d": 1, "axes": (1,2)}, 286 | {"shape": (3, 3, 6, 4), "d": 1, "axes": (1,2)}, 287 | {"shape": (2, 3, 3, 4), "d": 1, "axes": (2,3)}, 288 | {"shape": (3, 3, 6, 4), "d": 1, "axes": (2,3)}, 289 | {"shape": (2, 3, 3, 4), "d": 1, "axes": (0,1,2,3)}, 290 | ] 291 | @pytest.mark.parametrize("device", _DEVICES) 292 | @pytest.mark.parametrize("params", dilate_backward_params) 293 | def test_dilate_backward(params, device): 294 | np.random.seed(0) 295 | shape, d, axes = params['shape'], params['d'], params['axes'] 
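# backward_check (defined near the top of this file) perturbs each input entry by +/- eps,
# forms a centered finite-difference estimate of the gradient, and compares it against the
# analytic gradient returned by the op's backward pass.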
296 | backward_check(ndl.dilate, ndl.Tensor(np.random.randn(*shape), device=device), dilation=d, axes=axes) 297 | 298 | 299 | def test_stack_vs_pytorch(): 300 | np.random.seed(0) 301 | import torch 302 | A = np.random.randn(5, 5) 303 | B = np.random.randn(5, 5) 304 | C = np.random.randn(5, 5) 305 | D = np.random.randn(15, 5) 306 | 307 | Andl = ndl.Tensor(A, requires_grad=True) 308 | Bndl = ndl.Tensor(B, requires_grad=True) 309 | Cndl = ndl.Tensor(C, requires_grad=True) 310 | Dndl = ndl.Tensor(D, requires_grad=True) 311 | 312 | Atch = torch.tensor(A, requires_grad=True) 313 | Btch = torch.tensor(B, requires_grad=True) 314 | Ctch = torch.tensor(C, requires_grad=True) 315 | Dtch = torch.tensor(D, requires_grad=True) 316 | 317 | Xndl = ndl.stack([Andl, Cndl @ Bndl, Cndl], axis=1) 318 | Xtch = torch.stack([Atch, Ctch @ Btch, Ctch], dim=1) 319 | 320 | assert Xndl.shape == Xtch.shape 321 | assert np.linalg.norm(Xndl.numpy() - Xtch.detach().numpy()) < 1e-3 322 | 323 | Yndl = (Dndl @ Xndl.reshape((5, 15)) @ Dndl).sum() 324 | Ytch = (Dtch @ Xtch.reshape(5, 15) @ Dtch).sum() 325 | 326 | assert np.linalg.norm(Yndl.numpy() - Ytch.detach().numpy()) < 1e-3 327 | 328 | Yndl.backward() 329 | Ytch.backward() 330 | 331 | assert np.linalg.norm(Andl.grad.cached_data.numpy() - Atch.grad.detach().numpy()) < 1e-3 332 | assert np.linalg.norm(Bndl.grad.cached_data.numpy() - Btch.grad.detach().numpy()) < 1e-3 333 | assert np.linalg.norm(Cndl.grad.cached_data.numpy() - Ctch.grad.detach().numpy()) < 1e-3 334 | 335 | 336 | 337 | conv_forward_params = [ 338 | (4, 8, 16, 3, 1), 339 | (32, 8, 16, 3, 2), 340 | (32, 8, 8, 3, 2), 341 | (32, 16, 8, 3, 1), 342 | (32, 16, 8, 3, 2) 343 | ] 344 | @pytest.mark.parametrize("s,cin,cout,k,stride", conv_forward_params) 345 | @pytest.mark.parametrize("device", _DEVICES) 346 | def test_nn_conv_forward(s, cin, cout, k, stride, device): 347 | np.random.seed(0) 348 | import torch 349 | f = ndl.nn.Conv(cin, cout, k, stride=stride, device=device) 350 | x = ndl.init.rand(10, cin, s, s, device=device) 351 | 352 | g = torch.nn.Conv2d(cin, cout, k, stride=stride, padding=k//2) 353 | g.weight.data = torch.tensor(f.weight.cached_data.numpy().transpose(3, 2, 0, 1)) 354 | g.bias.data = torch.tensor(f.bias.cached_data.numpy()) 355 | z = torch.tensor(x.cached_data.numpy()) 356 | 357 | assert np.linalg.norm(f(x).cached_data.numpy() - g(z).data.numpy()) < 1e-3 358 | 359 | 360 | conv_back_params = [ 361 | (4, 1, 1, 3, 1), 362 | (14, 8, 16, 3, 1), 363 | (14, 8, 16, 3, 2), 364 | (14, 8, 8, 3, 1), 365 | (14, 8, 8, 3, 2), 366 | (14, 16, 8, 3, 1), 367 | (14, 16, 8, 3, 2), 368 | ] 369 | @pytest.mark.parametrize("s,cin,cout,k,stride", conv_back_params) 370 | @pytest.mark.parametrize("device", _DEVICES) 371 | def test_nn_conv_backward(s, cin, cout, k, stride, device): 372 | np.random.seed(0) 373 | import torch 374 | f = ndl.nn.Conv(cin, cout, k, stride=stride, device=device) 375 | x = ndl.init.rand(1, cin, s, s, device=device, requires_grad=True) 376 | 377 | g = torch.nn.Conv2d(cin, cout, k, stride=stride, padding=k//2) 378 | g.weight.data = torch.tensor(f.weight.cached_data.numpy().transpose(3, 2, 0, 1)) 379 | g.bias.data = torch.tensor(f.bias.cached_data.numpy()) 380 | z = torch.tensor(x.cached_data.numpy(), requires_grad=True) 381 | z.requires_grad = True 382 | 383 | res1 = f(x) 384 | y1 = res1.sum() 385 | 386 | y2 = g(z).sum() 387 | 388 | y1.backward() 389 | y2.backward() 390 | 391 | assert np.linalg.norm(g.weight.grad.data.numpy() - f.weight.grad.cached_data.numpy().transpose(3, 2, 0, 1)) < 1e-3, "weight 
gradients match" 392 | assert np.linalg.norm(g.bias.grad.data.numpy() - f.bias.grad.cached_data.numpy()) < 1e-3, "bias gradients match" 393 | assert np.linalg.norm(z.grad.data.numpy() - x.grad.cached_data.numpy()) < 1e-3, "input gradients match" 394 | 395 | 396 | op_conv_shapes = [ 397 | ( (3, 14, 14, 8), (3, 3, 8, 16), 1, 0 ), 398 | ( (3, 14, 14, 8), (3, 3, 8, 16), 1, 1 ), 399 | ( (3, 16, 16, 8), (3, 3, 8, 16), 1, 2 ), 400 | ( (3, 16, 16, 8), (3, 3, 8, 14), 1, 0 ), 401 | ( (3, 16, 16, 2), (3, 3, 2, 14), 1, 0 ), 402 | 403 | ( (3, 14, 14, 8), (3, 3, 8, 16), 2, 0 ), 404 | ( (3, 14, 14, 8), (3, 3, 8, 16), 2, 1 ), 405 | ( (3, 16, 16, 8), (3, 3, 8, 16), 2, 2 ), 406 | ( (3, 16, 16, 8), (3, 3, 8, 14), 2, 0 ), 407 | ( (3, 16, 16, 2), (3, 3, 2, 14), 2, 0 ), 408 | 409 | ( (3, 16, 16, 24), (3, 3, 24, 14), 1, 0 ), 410 | ( (3, 14, 14, 8), (5, 5, 8, 16), 1, 0 ), 411 | ( (3, 17, 17, 8), (5, 5, 8, 16), 1, 0 ), 412 | ( (3, 17, 17, 1), (5, 5, 1, 16) , 1, 0), 413 | ( (3, 17, 17, 16), (5, 5, 16, 1), 1, 0 ), 414 | ( (3, 17, 17, 16), (1, 1, 16, 1), 1, 0 ), 415 | ( (1, 14, 14, 2), (3, 3, 2, 2), 1, 0 ), 416 | ] 417 | @pytest.mark.parametrize("Z_shape, W_shape, stride, padding", op_conv_shapes) 418 | @pytest.mark.parametrize("device", _DEVICES) 419 | @pytest.mark.parametrize("backward", [True, False], ids=["backward", "forward"]) 420 | def test_op_conv(Z_shape, W_shape, stride, padding, backward, device): 421 | np.random.seed(0) 422 | import torch 423 | _Z = np.random.randn(*Z_shape)*5 424 | _Z = _Z.astype(np.float32) 425 | _W = np.random.randn(*W_shape)*5 426 | _W = _W.astype(np.float32) 427 | Z = ndl.Tensor(_Z, device=device) 428 | W = ndl.Tensor(_W, device=device) 429 | y = ndl.conv(Z, W, padding=padding, stride=stride) 430 | y2 = y.sum() 431 | if backward: 432 | y2.backward() 433 | Ztch = torch.Tensor(_Z).float() 434 | Ztch.requires_grad=True 435 | Wtch = torch.Tensor(_W).float() 436 | Wtch.requires_grad=True 437 | out = torch.nn.functional.conv2d(Ztch.permute(0, 3, 1, 2), Wtch.permute(3, 2, 0, 1), padding=padding, stride=stride) 438 | out2 = out.sum() 439 | if backward: 440 | out2.backward() 441 | if backward: 442 | err1 = np.linalg.norm(Ztch.grad.numpy() - Z.grad.numpy()) 443 | err2 = np.linalg.norm(Wtch.grad.numpy() - W.grad.numpy()) 444 | err3 = np.linalg.norm(out2.detach().numpy() - y2.numpy()) 445 | if backward: 446 | assert err1 < 1e-2, "input grads match" 447 | assert err2 < 1e-2, "weight grads match" 448 | assert err3 < 1e-1, "outputs match %s, %s" % (y2, out2) 449 | 450 | 451 | @pytest.mark.parametrize("device", _DEVICES) 452 | def test_train_cifar10(device): 453 | np.random.seed(0) 454 | dataset = ndl.data.CIFAR10Dataset("./data/cifar-10-batches-py", train=True) 455 | dataloader = ndl.data.DataLoader(\ 456 | dataset=dataset, 457 | batch_size=128, 458 | shuffle=False 459 | # collate_fn=ndl.data.collate_ndarray, 460 | # drop_last=False, 461 | # device=device, 462 | # dtype="float32" 463 | ) 464 | from apps.models import ResNet9 465 | np.random.seed(0) 466 | model = ResNet9(device=device, dtype="float32") 467 | out = one_iter_of_cifar10_training(dataloader, model, opt=ndl.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001), device=device) 468 | assert np.linalg.norm(np.array(list(out), dtype=object) - np.array([0.09375, 3.5892258])) < 1e-2 469 | 470 | 471 | def one_iter_of_cifar10_training(dataloader, model, niter=1, loss_fn=ndl.nn.SoftmaxLoss(), opt=None, device=None): 472 | np.random.seed(4) 473 | model.train() 474 | correct, total_loss = 0, 0 475 | i = 1 476 | for batch in dataloader: 477 | 
opt.reset_grad() 478 | X, y = batch 479 | X,y = ndl.Tensor(X, device=device), ndl.Tensor(y, device=device) 480 | out = model(X) 481 | correct += np.sum(np.argmax(out.numpy(), axis=1) == y.numpy()) 482 | loss = loss_fn(out, y) 483 | total_loss += loss.data.numpy() * y.shape[0] 484 | loss.backward() 485 | opt.step() 486 | if i >= niter: 487 | break 488 | i += 1 489 | return correct/(y.shape[0]*niter), total_loss/(y.shape[0]*niter) 490 | 491 | 492 | ###################### | ###################### 493 | ###################### MUGRADE ###################### 494 | ###################### v ###################### 495 | 496 | def Prepare(A): 497 | return (A.numpy().flatten()[:64], A.shape) 498 | 499 | 500 | def Rand(*shape, device=ndl.cpu(), entropy=1): 501 | np.random.seed(np.prod(shape) * len(shape) * entropy) 502 | _A = np.random.randint(low=1, high=10, size=shape) 503 | return ndl.Tensor(_A, device=device) 504 | 505 | 506 | def RandC(*shape, entropy=1): 507 | if ndl.cuda().enabled(): 508 | return Rand(*shape, device=ndl.cuda(), entropy=2) 509 | else: 510 | raise NotImplementedError("You need a GPU to run these tests.") 511 | 512 | 513 | def MugradeSubmit(things): 514 | mugrade.submit(Prepare(things)) 515 | # print(Prepare(things)) 516 | 517 | 518 | def submit_conv_forward(): 519 | def DoConvOp(batches, cin, cout, n, k=3, stride=1, padding=0, device=ndl.cpu()): 520 | X = Rand(batches, n, n, cin, device=device) 521 | W = Rand(k, k, cin, cout, device=device) 522 | y = ndl.conv(X, W, stride=stride, padding=padding) 523 | return y 524 | 525 | def DoConvLayer(batches, cin, cout, n, k=3, stride=1, bias=True, device=ndl.cpu()): 526 | X = Rand(batches, cin, n, n, device=device) 527 | f = ndl.nn.Conv(cin, cout, k, stride=stride, bias=bias, device=device) 528 | return f(X) 529 | 530 | MugradeSubmit(DoConvOp(2, 1, 2, 4, k=1, stride=1, padding=0)) 531 | MugradeSubmit(DoConvOp(2, 1, 2, 4, k=1, stride=1, padding=2)) 532 | MugradeSubmit(DoConvOp(2, 3, 1, 6, k=1, stride=2, padding=2)) 533 | 534 | 535 | MugradeSubmit(DoConvOp(2, 1, 2, 4, k=3, stride=1, padding=0)) 536 | MugradeSubmit(DoConvOp(3, 1, 2, 4, k=3, stride=1, padding=2)) 537 | MugradeSubmit(DoConvOp(1, 1, 3, 6, k=5, stride=2, padding=2)) 538 | 539 | MugradeSubmit(DoConvLayer(3, 2, 4, 6, k=3, stride=1, bias=True)) 540 | MugradeSubmit(DoConvLayer(3, 4, 2, 6, k=3, stride=1, bias=True)) 541 | MugradeSubmit(DoConvLayer(1, 1, 1, 12, k=3, stride=2, bias=True)) 542 | MugradeSubmit(DoConvLayer(1, 1, 1, 12, k=1, stride=1, bias=False)) 543 | MugradeSubmit(DoConvLayer(1, 2, 1, 12, k=7, stride=1, bias=False)) 544 | MugradeSubmit(DoConvLayer(1, 1, 3, 12, k=7, stride=4, bias=False)) 545 | 546 | 547 | if ndl.cuda().enabled(): 548 | MugradeSubmit(DoConvLayer(3, 2, 4, 6, k=3, stride=1, bias=False, device=ndl.cuda())) 549 | MugradeSubmit(DoConvLayer(3, 4, 2, 6, k=3, stride=1, bias=False, device=ndl.cuda())) 550 | else: 551 | print('You need a GPU to run these tests!') 552 | 553 | 554 | def submit_conv_backward(): 555 | 556 | def DoConvOpBackward(batches, cin, cout, n, k=3, stride=1, padding=0, device=ndl.cpu(), wrtX=True): 557 | X = Rand(batches, n, n, cin, device=device) 558 | X.requires_grad = True 559 | W = Rand(k, k, cin, cout, device=device) 560 | W.requires_grad = True 561 | y = ndl.conv(X, W, stride=stride, padding=padding).sum() 562 | y.backward() 563 | if wrtX: 564 | return W.grad 565 | else: 566 | return X.grad 567 | 568 | def DoConvLayerBackward(batches, cin, cout, n, k=3, stride=1, bias=True, device=ndl.cpu(), wrtX=True): 569 | X = Rand(batches, cin, n, n, 
device=device) 570 | X.requires_grad = True 571 | f = ndl.nn.Conv(cin, cout, k, stride=stride, bias=bias, device=device) 572 | y = f(X).sum() 573 | y.backward() 574 | if wrtX: 575 | return f.weight.grad 576 | else: 577 | return X.grad 578 | 579 | MugradeSubmit(DoConvOpBackward(2, 1, 2, 4, k=1, stride=1, padding=0, wrtX=True)) 580 | MugradeSubmit(DoConvOpBackward(2, 3, 1, 6, k=1, stride=2, padding=0, wrtX=True)) 581 | MugradeSubmit(DoConvOpBackward(2, 1, 2, 10, k=3, stride=1, padding=1, wrtX=True)) 582 | MugradeSubmit(DoConvOpBackward(2, 3, 1, 8, k=3, stride=2, padding=2, wrtX=True)) 583 | MugradeSubmit(DoConvOpBackward(2, 1, 3, 8, k=5, stride=1, padding=2, wrtX=True)) 584 | 585 | MugradeSubmit(DoConvOpBackward(2, 1, 2, 4, k=1, stride=1, padding=0, wrtX=False)) 586 | MugradeSubmit(DoConvOpBackward(2, 3, 1, 6, k=1, stride=2, padding=0, wrtX=False)) 587 | MugradeSubmit(DoConvOpBackward(2, 1, 2, 6, k=3, stride=1, padding=1, wrtX=False)) 588 | MugradeSubmit(DoConvOpBackward(2, 3, 1, 6, k=3, stride=2, padding=2, wrtX=False)) 589 | MugradeSubmit(DoConvOpBackward(2, 1, 3, 8, k=5, stride=1, padding=2, wrtX=False)) 590 | 591 | MugradeSubmit(DoConvLayerBackward(3, 2, 4, 6, k=3, stride=1, bias=True, wrtX=True)) 592 | MugradeSubmit(DoConvLayerBackward(1, 2, 1, 12, k=7, stride=1, bias=False, wrtX=True)) 593 | MugradeSubmit(DoConvLayerBackward(1, 1, 3, 12, k=7, stride=4, bias=False, wrtX=True)) 594 | MugradeSubmit(DoConvLayerBackward(3, 2, 4, 6, k=3, stride=1, bias=True, wrtX=False)) 595 | MugradeSubmit(DoConvLayerBackward(1, 2, 1, 12, k=7, stride=1, bias=False, wrtX=False)) 596 | MugradeSubmit(DoConvLayerBackward(1, 1, 3, 12, k=7, stride=4, bias=False, wrtX=False)) 597 | 598 | if ndl.cuda().enabled(): 599 | MugradeSubmit(DoConvLayerBackward(3, 2, 4, 6, k=3, stride=1, bias=False, wrtX=True, device=ndl.cuda())) 600 | MugradeSubmit(DoConvLayerBackward(3, 4, 2, 6, k=3, stride=1, bias=False, wrtX=False, device=ndl.cuda())) 601 | else: 602 | print('You need a GPU to run these tests!') 603 | 604 | 605 | def submit_new_ops(): 606 | # pad 607 | np.random.seed(1337) 608 | _A = np.random.randint(low=1, high=10, size=(2, 2, 2, 2)) 609 | A = nd.NDArray(_A, device=nd.cpu()) 610 | MugradeSubmit(A.pad(( (0, 0), (1, 1), (2, 2), (0, 0)))) 611 | 612 | def DoFlip(shape, axes, backward=False, device=ndl.cpu()): 613 | X = Rand(*shape, device=device) 614 | X.requires_grad = True 615 | Y = ndl.flip(X, axes=axes) 616 | if backward: 617 | V = Rand(*shape, device=device, entropy=2) 618 | Z = (V*Y).sum() 619 | Z.backward() 620 | return X.grad 621 | else: 622 | return Y 623 | 624 | def DoDilate(shape, axes, dilation, backward=False, device=ndl.cpu()): 625 | X = Rand(*shape, device=device) 626 | X.requires_grad = True 627 | Y = ndl.dilate(X, dilation=dilation, axes=axes) 628 | if backward: 629 | V = Rand(*Y.shape, device=device, entropy=2) 630 | Z = (V*Y).sum() 631 | Z.backward() 632 | return X.grad 633 | else: 634 | return Y 635 | 636 | # flip 637 | MugradeSubmit(DoFlip((2, 2, 3, 1), (1,2))) 638 | MugradeSubmit(DoFlip((2, 1, 3, 2), (0,1,2,3))) 639 | MugradeSubmit(DoFlip((8, 4), (1,))) 640 | MugradeSubmit(DoFlip((4, 8), (0,))) 641 | MugradeSubmit(DoFlip((2, 2, 3, 1), (2,3), backward=True)) 642 | MugradeSubmit(DoFlip((2, 1, 3, 2), (1,2,3), backward=True)) 643 | 644 | # dilate 645 | MugradeSubmit(DoDilate((2, 2, 3, 1), (1,2), 1)) 646 | MugradeSubmit(DoDilate((2, 2), (2,), 1)) 647 | MugradeSubmit(DoDilate((2, 2, 3, 1), (1,2), 1, backward=True)) 648 | MugradeSubmit(DoDilate((2, 2), (2,), 1, backward=True)) 649 | 650 | 651 | 652 | def 
submit_resnet9(): 653 | def num_params(model): 654 | return np.sum([np.prod(x.shape) for x in model.parameters()]) 655 | 656 | device = ndl.cpu() 657 | import sys 658 | sys.path.append('.') 659 | from apps.models import ResNet9 660 | np.random.seed(1) 661 | model = ResNet9(device=device) 662 | 663 | MugradeSubmit(ndl.Tensor(num_params(model))) 664 | 665 | np.random.seed(1) 666 | dataset = ndl.data.CIFAR10Dataset("./data/cifar-10-batches-py", train=True) 667 | dataloader = ndl.data.DataLoader(\ 668 | dataset=dataset, 669 | batch_size=128, 670 | shuffle=True 671 | ) 672 | np.random.seed(1) 673 | model = ResNet9(device=device, dtype="float32") 674 | out = one_iter_of_cifar10_training(dataloader, model, niter=2, opt=ndl.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001), device=device) 675 | MugradeSubmit(ndl.Tensor(np.array(list(out), dtype=object))) 676 | 677 | 678 | if __name__ == "__main__": 679 | submit_conv_forward() 680 | submit_conv_backward() 681 | submit_new_ops() 682 | submit_resnet9() -------------------------------------------------------------------------------- /tests/hw4/test_nd_backend.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | import itertools 4 | import numpy as np 5 | import pytest 6 | import mugrade 7 | import torch 8 | 9 | import needle as ndl 10 | from needle import backend_ndarray as nd 11 | 12 | np.random.seed(1) 13 | 14 | def backward_check(f, *args, **kwargs): 15 | eps = 1e-5 16 | out = f(*args, **kwargs) 17 | c = np.random.randn(*out.shape) 18 | numerical_grad = [np.zeros(a.shape) for a in args] 19 | num_args = len(args) 20 | for i in range(num_args): 21 | for j in range(args[i].realize_cached_data().size): 22 | args[i].realize_cached_data().flat[j] += eps 23 | f1 = (f(*args, **kwargs).numpy() * c).sum() 24 | args[i].realize_cached_data().flat[j] -= 2 * eps 25 | f2 = (f(*args, **kwargs).numpy() * c).sum() 26 | args[i].realize_cached_data().flat[j] += eps 27 | numerical_grad[i].flat[j] = (f1 - f2) / (2 * eps) 28 | backward_grad = out.op.gradient_as_tuple(ndl.Tensor(c, device=args[0].device), out) 29 | error = sum( 30 | np.linalg.norm(backward_grad[i].numpy() - numerical_grad[i]) 31 | for i in range(len(args)) 32 | ) 33 | assert error < 4.2e-1 34 | return [g.numpy() for g in backward_grad] 35 | 36 | 37 | _DEVICES = [ndl.cpu(), pytest.param(ndl.cuda(), 38 | marks=pytest.mark.skipif(not ndl.cuda().enabled(), reason="No GPU"))] 39 | 40 | 41 | EWISE_OPS = { 42 | "divide": lambda a, b: a / b, 43 | "subtract": lambda a, b: a - b 44 | } 45 | EWISE_OP_FNS = [EWISE_OPS[k] for k in EWISE_OPS] 46 | EWISE_OP_NAMES = [k for k in EWISE_OPS] 47 | GENERAL_SHAPES = [(1, 1, 1), (4, 5, 6)] 48 | @pytest.mark.parametrize("fn", EWISE_OP_FNS, ids=EWISE_OP_NAMES) 49 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 50 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 51 | def test_ewise_fn(fn, shape, device): 52 | _A = np.random.randn(*shape).astype(np.float32) 53 | _B = np.random.randn(*shape).astype(np.float32) 54 | A = ndl.Tensor(nd.array(_A), device=device) 55 | B = ndl.Tensor(nd.array(_B), device=device) 56 | np.testing.assert_allclose(fn(_A, _B), fn(A, B).numpy(), atol=1e-5, rtol=1e-5) 57 | 58 | 59 | SCALAR_OPS = { 60 | "divide": lambda a, b: a / b, 61 | "subtract": lambda a, b: a - b 62 | } 63 | SCALAR_OP_FNS = [SCALAR_OPS[k] for k in SCALAR_OPS] 64 | SCALAR_OP_NAMES = [k for k in SCALAR_OPS] 65 | @pytest.mark.parametrize("fn", SCALAR_OP_FNS, ids=SCALAR_OP_NAMES) 
66 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 67 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 68 | def test_scalar_fn(fn, shape, device): 69 | _A = np.random.randn(*shape).astype(np.float32) 70 | _B = np.random.randn(1).astype(np.float32).item() 71 | A = ndl.Tensor(nd.array(_A), device=device) 72 | np.testing.assert_allclose(fn(_A, _B), fn(A, _B).numpy(), atol=1e-5, rtol=1e-5) 73 | 74 | 75 | MATMUL_DIMS = [(16, 16, 16), 76 | (8, 8, 8), 77 | (1, 2, 3), 78 | (3, 4, 5), 79 | (5, 4, 3), 80 | (16, 16, 32), 81 | (64, 64, 64), 82 | (72, 72, 72), 83 | (72, 73, 74), 84 | (74, 73, 72), 85 | (128, 128, 128)] 86 | @pytest.mark.parametrize("m,n,p", MATMUL_DIMS) 87 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 88 | def test_matmul(m, n, p, device): 89 | _A = np.random.randn(m, n).astype(np.float32) 90 | _B = np.random.randn(n, p).astype(np.float32) 91 | A = ndl.Tensor(nd.array(_A), device=device) 92 | B = ndl.Tensor(nd.array(_B), device=device) 93 | np.testing.assert_allclose(_A @ _B, (A @ B).numpy(), atol=1e-5, rtol=1e-5) 94 | 95 | 96 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 97 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 98 | def test_power(shape, device): 99 | _A = np.random.randn(*shape).astype(np.float32) 100 | _B = np.random.randint(1) 101 | A = ndl.Tensor(nd.array(_A), device=device) 102 | np.testing.assert_allclose(_A**_B, (A**_B).numpy(), atol=1e-5, rtol=1e-5) 103 | 104 | 105 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 106 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 107 | def test_log(shape, device): 108 | _A = np.random.randn(*shape).astype(np.float32) + 5. 109 | A = ndl.Tensor(nd.array(_A), device=device) 110 | np.testing.assert_allclose(np.log(_A), ndl.log(A).numpy(), atol=1e-5, rtol=1e-5) 111 | 112 | 113 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 114 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 115 | def test_exp(shape, device): 116 | _A = np.random.randn(*shape).astype(np.float32) 117 | A = ndl.Tensor(nd.array(_A), device=device) 118 | np.testing.assert_allclose(np.exp(_A), ndl.exp(A).numpy(), atol=1e-5, rtol=1e-5) 119 | 120 | 121 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 122 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 123 | def test_relu(shape, device): 124 | _A = np.random.randn(*shape).astype(np.float32) 125 | A = ndl.Tensor(nd.array(_A), device=device) 126 | np.testing.assert_allclose(np.maximum(_A, 0), ndl.relu(A).numpy(), atol=1e-5, rtol=1e-5) 127 | 128 | 129 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 130 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 131 | def test_tanh(shape, device): 132 | _A = np.random.randn(*shape).astype(np.float32) 133 | A = ndl.Tensor(nd.array(_A), device=device) 134 | np.testing.assert_allclose(np.tanh(_A), ndl.tanh(A).numpy(), atol=1e-5, rtol=1e-5) 135 | 136 | 137 | @pytest.mark.parametrize("shape", GENERAL_SHAPES) 138 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 139 | def test_tanh_backward(shape, device): 140 | _A = np.random.randn(*shape).astype(np.float32) 141 | A = ndl.Tensor(nd.array(_A), device=device) 142 | backward_check(ndl.tanh, A) 143 | 144 | 145 | STACK_PARAMETERS = [((5, 5), 0, 1), 146 | ((5, 5), 0, 2), 147 | ((1,5,7), 2, 5)] 148 | @pytest.mark.parametrize("shape, axis, l", STACK_PARAMETERS) 149 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 150 | def test_stack(shape, axis, l, device): 
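# Build l random arrays, stack them with ndl.stack along the given axis, and check the
# result against torch.stack on the same inputs.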
151 | _A = [np.random.randn(*shape).astype(np.float32) for i in range(l)] 152 | A = [ndl.Tensor(nd.array(_A[i]), device=device) for i in range(l)] 153 | A_t = [torch.Tensor(_A[i]) for i in range(l)] 154 | out = ndl.stack(A, axis=axis) 155 | out_t = torch.stack(A_t, dim=axis) 156 | np.testing.assert_allclose(out_t.numpy(), out.numpy(), atol=1e-5, rtol=1e-5) 157 | 158 | 159 | @pytest.mark.parametrize("shape, axis, l", STACK_PARAMETERS) 160 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 161 | def test_stack_backward(shape, axis, l, device): 162 | _A = [np.random.randn(*shape).astype(np.float32) for i in range(l)] 163 | A = [ndl.Tensor(nd.array(_A[i]), device=device) for i in range(l)] 164 | A_t = [torch.Tensor(_A[i]) for i in range(l)] 165 | for i in range(l): 166 | A_t[i].requires_grad = True 167 | ndl.stack(A, axis=axis).sum().backward() 168 | torch.stack(A_t, dim=axis).sum().backward() 169 | for i in range(l): 170 | np.testing.assert_allclose(A_t[i].grad.numpy(), A[i].grad.numpy(), atol=1e-5, rtol=1e-5) 171 | 172 | 173 | SUMMATION_PARAMETERS = [((1, 1, 1), None), 174 | ((5, 3), 0), 175 | ((8, 3, 2), 1), 176 | ((8, 3, 2), 2) 177 | ] 178 | @pytest.mark.parametrize("shape, axes", SUMMATION_PARAMETERS) 179 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 180 | def test_summation(shape, axes, device): 181 | _A = np.random.randn(*shape).astype(np.float32) 182 | A = ndl.Tensor(nd.array(_A), device=device) 183 | np.testing.assert_allclose(np.sum(_A, axes), ndl.summation(A, axes=axes).numpy(), atol=1e-5, rtol=1e-5) 184 | 185 | 186 | @pytest.mark.parametrize("shape, axes", SUMMATION_PARAMETERS) 187 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 188 | def test_summation_backward(shape, axes, device): 189 | _A = np.random.randn(*shape).astype(np.float32) 190 | A = ndl.Tensor(nd.array(_A), device=device) 191 | backward_check(ndl.summation, A, axes=axes) 192 | 193 | 194 | BROADCAST_SHAPES = [((1, 1, 1), (3, 3, 3)), 195 | ((4, 1, 6), (4, 3, 6))] 196 | @pytest.mark.parametrize("shape,shape_to", BROADCAST_SHAPES) 197 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 198 | def test_broadcast_to(shape, shape_to, device): 199 | _A = np.random.randn(*shape).astype(np.float32) 200 | A = ndl.Tensor(nd.array(_A), device=device) 201 | np.testing.assert_allclose(np.broadcast_to(_A, shape_to), ndl.broadcast_to(A, shape_to).numpy(), atol=1e-5, rtol=1e-5) 202 | 203 | 204 | RESHAPE_SHAPES = [((1, 1, 1), (1,)), 205 | ((4, 1, 6), (6, 4, 1))] 206 | @pytest.mark.parametrize("shape,shape_to", RESHAPE_SHAPES) 207 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 208 | def test_reshape(shape, shape_to, device): 209 | _A = np.random.randn(*shape).astype(np.float32) 210 | A = ndl.Tensor(nd.array(_A), device=device) 211 | np.testing.assert_allclose(np.reshape(_A, shape_to), ndl.reshape(A, shape_to).numpy(), atol=1e-5, rtol=1e-5) 212 | 213 | 214 | TRANSPOSE_SHAPES = [(1, 1, 1), (4, 5, 6)] 215 | TRANSPOSE_AXES = [(0, 1), (0, 2), None] 216 | @pytest.mark.parametrize("shape", TRANSPOSE_SHAPES) 217 | @pytest.mark.parametrize("axes", TRANSPOSE_AXES) 218 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 219 | def test_transpose(shape, axes, device): 220 | _A = np.random.randn(*shape).astype(np.float32) 221 | A = ndl.Tensor(nd.array(_A), device=device) 222 | if axes is None: 223 | np_axes = (_A.ndim - 2, _A.ndim - 1) 224 | else: 225 | np_axes = axes 226 | np.testing.assert_allclose(np.swapaxes(_A, np_axes[0], np_axes[1]), 
ndl.transpose(A, axes=axes).numpy(), atol=1e-5, rtol=1e-5) 227 | 228 | 229 | @pytest.mark.parametrize("shape, axes", SUMMATION_PARAMETERS) 230 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 231 | def test_logsumexp(shape, axes, device): 232 | _A = np.random.randn(*shape).astype(np.float32) 233 | A = ndl.Tensor(nd.array(_A), device=device) 234 | A_t = torch.Tensor(_A) 235 | if axes is None: 236 | t_axes = tuple(list(range(len(shape)))) 237 | else: 238 | t_axes = axes 239 | np.testing.assert_allclose(torch.logsumexp(A_t, dim=t_axes).numpy(), ndl.logsumexp(A, axes=axes).numpy(), atol=1e-5, rtol=1e-5) 240 | 241 | 242 | 243 | ### MUGRADE ### 244 | 245 | TEST_GENERAL_SHAPES = [(3, 1, 2)] 246 | TEST_MATMUL_DIMS = [(3, 4, 2), (8, 16, 16)] 247 | TEST_STACK_PARAMETERS = [((2, 3), 0, 3)] 248 | TEST_SUMMATION_PARAMETERS = [((3, 2), 0), ((2, 1, 2, 3), 3)] 249 | TEST_LOGSUMEXP_PARAMETERS = [((3, 2), 0), ((2, 1, 2, 3), 3)] 250 | TEST_BROADCAST_SHAPES = [((2, 1), (2, 4)), ((2, 1, 5), (2, 3, 5))] 251 | TEST_RESHAPE_SHAPES = [((3, 1, 2), (3, 2, 1))] 252 | TEST_TRANSPOSE_SHAPES = [(3, 5, 1)] 253 | TEST_TRANSPOSE_AXES = [(0, 1), (0, 2), None] 254 | TEST_GETSETITEM_PARAMS = [((3, 2), (2, 1)), ((3, 3, 4), (2, np.s_[2:], np.s_[:3]))] 255 | 256 | 257 | def mugrade_submit(x): 258 | if isinstance(x, np.ndarray): 259 | x = x.flatten()[:64] 260 | # print(x) 261 | mugrade.submit(x) 262 | else: 263 | # print(x) 264 | mugrade.submit(x) 265 | 266 | 267 | def submit_new_nd_backend(): 268 | devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 269 | #devices = [ndl.cpu(), ndl.cuda()] 270 | 271 | if not ndl.cuda().enabled(): 272 | print('You need a GPU to run some of these tests.') 273 | 274 | # ewise fn 275 | for (device, shape, fn_name) in itertools.product(devices, TEST_GENERAL_SHAPES, EWISE_OP_NAMES): 276 | _A = np.random.randn(*shape).astype(np.float32) 277 | _B = np.random.randn(*shape).astype(np.float32) 278 | A = ndl.Tensor(nd.array(_A), device=device) 279 | B = ndl.Tensor(nd.array(_B), device=device) 280 | mugrade_submit(EWISE_OPS[fn_name](A, B).numpy()) 281 | 282 | # scalar fn 283 | for (device, shape, fn_name) in itertools.product(devices, TEST_GENERAL_SHAPES, SCALAR_OP_NAMES): 284 | _A = np.random.randn(*shape).astype(np.float32) 285 | _B = np.random.randn(1).astype(np.float32).item() 286 | A = ndl.Tensor(nd.array(_A), device=device) 287 | mugrade_submit(EWISE_OPS[fn_name](A, _B).numpy()) 288 | 289 | # matmul 290 | for (device, matmul_dim) in itertools.product(devices, TEST_MATMUL_DIMS): 291 | m, n, p = matmul_dim 292 | _A = np.random.randn(m, n).astype(np.float32) 293 | _B = np.random.randn(n, p).astype(np.float32) 294 | A = ndl.Tensor(nd.array(_A), device=device) 295 | B = ndl.Tensor(nd.array(_B), device=device) 296 | mugrade_submit((A @ B).numpy()) 297 | 298 | # power 299 | for (device, shape) in itertools.product(devices, TEST_GENERAL_SHAPES): 300 | _A = np.random.randn(*shape).astype(np.float32) 301 | _B = np.random.randint(1) 302 | A = ndl.Tensor(nd.array(_A), device=device) 303 | mugrade_submit((A**_B).numpy()) 304 | 305 | # log 306 | for (device, shape) in itertools.product(devices, TEST_GENERAL_SHAPES): 307 | _A = np.random.randn(*shape).astype(np.float32) + 5. 
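# shift the random inputs up by 5 so they are (essentially always) positive before taking log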
308 | A = ndl.Tensor(nd.array(_A), device=device) 309 | mugrade_submit(ndl.log(A).numpy()) 310 | 311 | # exp 312 | for (device, shape) in itertools.product(devices, TEST_GENERAL_SHAPES): 313 | _A = np.random.randn(*shape).astype(np.float32) 314 | A = ndl.Tensor(nd.array(_A), device=device) 315 | mugrade_submit(ndl.exp(A).numpy()) 316 | 317 | # tanh 318 | for (device, shape) in itertools.product(devices, TEST_GENERAL_SHAPES): 319 | _A = np.random.randn(*shape).astype(np.float32) 320 | A = ndl.Tensor(nd.array(_A), device=device) 321 | mugrade_submit(ndl.tanh(A).numpy()) 322 | mugrade_submit(backward_check(ndl.tanh, A)) 323 | 324 | # stack 325 | for (device, (shape, axis, l)) in itertools.product(devices, TEST_STACK_PARAMETERS): 326 | _A = [np.random.randn(*shape).astype(np.float32) for i in range(l)] 327 | A = [ndl.Tensor(nd.array(_A[i]), device=device) for i in range(l)] 328 | out = ndl.stack(A, axis=axis) 329 | mugrade_submit(out.numpy()) 330 | out.backward() 331 | mugrade_submit(A[0].grad.numpy()) 332 | 333 | # summation 334 | for (device, (shape, axes)) in itertools.product(devices, TEST_SUMMATION_PARAMETERS): 335 | _A = np.random.randn(*shape).astype(np.float32) 336 | A = ndl.Tensor(nd.array(_A), device=device) 337 | mugrade_submit(ndl.summation(A, axes).numpy()) 338 | mugrade_submit(backward_check(ndl.summation, A, axes=axes)) 339 | 340 | # broadcast 341 | for (device, (shape, shape_to)) in itertools.product(devices, TEST_BROADCAST_SHAPES): 342 | _A = np.random.randn(*shape).astype(np.float32) 343 | A = ndl.Tensor(nd.array(_A), device=device) 344 | mugrade_submit(ndl.broadcast_to(A, shape_to).numpy()) 345 | 346 | # reshape 347 | for (device, (shape, shape_to)) in itertools.product(devices, TEST_RESHAPE_SHAPES): 348 | _A = np.random.randn(*shape).astype(np.float32) 349 | A = ndl.Tensor(nd.array(_A), device=device) 350 | mugrade_submit(ndl.reshape(A, shape_to).numpy()) 351 | 352 | # transpose 353 | for (device, shape, axes) in itertools.product(devices, TEST_TRANSPOSE_SHAPES, TEST_TRANSPOSE_AXES): 354 | _A = np.random.randn(*shape).astype(np.float32) 355 | A = ndl.Tensor(nd.array(_A), device=device) 356 | mugrade_submit(ndl.transpose(A, axes=axes).numpy()) 357 | 358 | # logsumexp 359 | for (device, (shape, axes)) in itertools.product(devices, TEST_LOGSUMEXP_PARAMETERS): 360 | _A = np.random.randn(*shape).astype(np.float32) 361 | A = ndl.Tensor(nd.array(_A), device=device) 362 | mugrade_submit(ndl.logsumexp(A, axes).numpy()) 363 | mugrade_submit(backward_check(ndl.logsumexp, A, axes=axes)) 364 | 365 | 366 | if __name__ == "__main__": 367 | submit_new_nd_backend() -------------------------------------------------------------------------------- /tests/hw4/test_sequence_models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./python') 3 | sys.path.append('./apps') 4 | import numpy as np 5 | import pytest 6 | import torch 7 | import itertools 8 | import mugrade 9 | 10 | import needle as ndl 11 | import needle.nn as nn 12 | 13 | from simple_ml import * 14 | from models import LanguageModel 15 | 16 | 17 | np.random.seed(3) 18 | 19 | 20 | _DEVICES = [ndl.cpu(), pytest.param(ndl.cuda(), 21 | marks=pytest.mark.skipif(not ndl.cuda().enabled(), reason="No GPU"))] 22 | 23 | 24 | BATCH_SIZES = [1, 15] 25 | INPUT_SIZES = [1, 11] 26 | HIDDEN_SIZES = [1, 12] 27 | BIAS = [True, False] 28 | INIT_HIDDEN = [True, False] 29 | NONLINEARITIES = ['tanh', 'relu'] 30 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 31 | 
@pytest.mark.parametrize("input_size", INPUT_SIZES) 32 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 33 | @pytest.mark.parametrize("bias", BIAS) 34 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 35 | @pytest.mark.parametrize("nonlinearity", NONLINEARITIES) 36 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 37 | def test_rnn_cell(batch_size, input_size, hidden_size, bias, init_hidden, nonlinearity, device): 38 | x = np.random.randn(batch_size, input_size).astype(np.float32) 39 | h0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 40 | 41 | model_ = torch.nn.RNNCell(input_size, hidden_size, nonlinearity=nonlinearity, bias=bias) 42 | if init_hidden: 43 | h_ = model_(torch.tensor(x), torch.tensor(h0)) 44 | else: 45 | h_ = model_(torch.tensor(x), None) 46 | 47 | model = nn.RNNCell(input_size, hidden_size, device=device, bias=bias, nonlinearity=nonlinearity) 48 | model.W_ih = ndl.Tensor(model_.weight_ih.detach().numpy().transpose(), device=device) 49 | model.W_hh = ndl.Tensor(model_.weight_hh.detach().numpy().transpose(), device=device) 50 | if bias: 51 | model.bias_ih = ndl.Tensor(model_.bias_ih.detach().numpy(), device=device) 52 | model.bias_hh = ndl.Tensor(model_.bias_hh.detach().numpy(), device=device) 53 | if init_hidden: 54 | h = model(ndl.Tensor(x, device=device), ndl.Tensor(h0, device=device)) 55 | else: 56 | h = model(ndl.Tensor(x, device=device), None) 57 | assert h.device == device 58 | np.testing.assert_allclose(h_.detach().numpy(), h.numpy(), atol=1e-5, rtol=1e-5) 59 | h.sum().backward() 60 | h_.sum().backward() 61 | np.testing.assert_allclose(model_.weight_ih.grad.detach().numpy().transpose(), model.W_ih.grad.numpy(), atol=1e-5, rtol=1e-5) 62 | 63 | 64 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 65 | @pytest.mark.parametrize("input_size", INPUT_SIZES) 66 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 67 | @pytest.mark.parametrize("bias", BIAS) 68 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 69 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 70 | def test_lstm_cell(batch_size, input_size, hidden_size, bias, init_hidden, device): 71 | x = np.random.randn(batch_size, input_size).astype(np.float32) 72 | h0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 73 | c0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 74 | 75 | model_ = torch.nn.LSTMCell(input_size, hidden_size, bias=bias) 76 | if init_hidden: 77 | h_, c_ = model_(torch.tensor(x), (torch.tensor(h0), torch.tensor(c0))) 78 | else: 79 | h_, c_ = model_(torch.tensor(x), None) 80 | 81 | model = nn.LSTMCell(input_size, hidden_size, device=device, bias=bias) 82 | 83 | model.W_ih = ndl.Tensor(model_.weight_ih.detach().numpy().transpose(), device=device) 84 | model.W_hh = ndl.Tensor(model_.weight_hh.detach().numpy().transpose(), device=device) 85 | if bias: 86 | model.bias_ih = ndl.Tensor(model_.bias_ih.detach().numpy(), device=device) 87 | model.bias_hh = ndl.Tensor(model_.bias_hh.detach().numpy(), device=device) 88 | 89 | if init_hidden: 90 | h, c = model(ndl.Tensor(x, device=device), (ndl.Tensor(h0, device=device), ndl.Tensor(c0, device=device))) 91 | else: 92 | h, c = model(ndl.Tensor(x, device=device), None) 93 | np.testing.assert_allclose(h_.detach().numpy(), h.numpy(), atol=1e-5, rtol=1e-5) 94 | np.testing.assert_allclose(c_.detach().numpy(), c.numpy(), atol=1e-5, rtol=1e-5) 95 | 96 | h.sum().backward() 97 | h_.sum().backward() 98 | 
np.testing.assert_allclose(model_.weight_ih.grad.detach().numpy().transpose(), model.W_ih.grad.numpy(), atol=1e-5, rtol=1e-5) 99 | 100 | 101 | SEQ_LENGTHS = [1, 13] 102 | NUM_LAYERS = [1, 2] 103 | @pytest.mark.parametrize("seq_length", SEQ_LENGTHS) 104 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 105 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 106 | @pytest.mark.parametrize("input_size", INPUT_SIZES) 107 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 108 | @pytest.mark.parametrize("bias", BIAS) 109 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 110 | @pytest.mark.parametrize("nonlinearity", NONLINEARITIES) 111 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 112 | def test_rnn(seq_length, num_layers, batch_size, input_size, hidden_size, bias, init_hidden, nonlinearity, device): 113 | x = np.random.randn(seq_length, batch_size, input_size).astype(np.float32) 114 | h0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 115 | 116 | model_ = torch.nn.RNN(input_size, hidden_size, num_layers=num_layers, bias=bias, nonlinearity=nonlinearity) 117 | if init_hidden: 118 | output_, h_ = model_(torch.tensor(x), torch.tensor(h0)) 119 | else: 120 | output_, h_ = model_(torch.tensor(x), None) 121 | 122 | model = nn.RNN(input_size, hidden_size, num_layers, bias, device=device, nonlinearity=nonlinearity) 123 | for k in range(num_layers): 124 | model.rnn_cells[k].W_ih = ndl.Tensor(getattr(model_, f'weight_ih_l{k}').detach().numpy().transpose(), device=device) 125 | model.rnn_cells[k].W_hh = ndl.Tensor(getattr(model_, f'weight_hh_l{k}').detach().numpy().transpose(), device=device) 126 | if bias: 127 | model.rnn_cells[k].bias_ih = ndl.Tensor(getattr(model_, f'bias_ih_l{k}').detach().numpy(), device=device) 128 | model.rnn_cells[k].bias_hh = ndl.Tensor(getattr(model_, f'bias_hh_l{k}').detach().numpy(), device=device) 129 | if init_hidden: 130 | output, h = model(ndl.Tensor(x, device=device), ndl.Tensor(h0, device=device)) 131 | else: 132 | output, h = model(ndl.Tensor(x, device=device), None) 133 | 134 | np.testing.assert_allclose(h_.detach().numpy(), h.numpy(), atol=1e-5, rtol=1e-5) 135 | np.testing.assert_allclose(output_.detach().numpy(), output.numpy(), atol=1e-5, rtol=1e-5) 136 | 137 | output.sum().backward() 138 | output_.sum().backward() 139 | np.testing.assert_allclose(model.rnn_cells[0].W_ih.grad.detach().numpy(), model_.weight_ih_l0.grad.numpy().transpose(), atol=1e-5, rtol=1e-5) 140 | 141 | 142 | @pytest.mark.parametrize("seq_length", SEQ_LENGTHS) 143 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 144 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 145 | @pytest.mark.parametrize("input_size", INPUT_SIZES) 146 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 147 | @pytest.mark.parametrize("bias", BIAS) 148 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 149 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 150 | def test_lstm(seq_length, num_layers, batch_size, input_size, hidden_size, bias, init_hidden, device): 151 | x = np.random.randn(seq_length, batch_size, input_size).astype(np.float32) 152 | h0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 153 | c0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 154 | 155 | model_ = torch.nn.LSTM(input_size, hidden_size, bias=bias, num_layers=num_layers) 156 | if init_hidden: 157 | output_, (h_, c_) = model_(torch.tensor(x), (torch.tensor(h0), torch.tensor(c0))) 158 | else: 159 | output_, 
(h_, c_) = model_(torch.tensor(x), None) 160 | 161 | model = nn.LSTM(input_size, hidden_size, num_layers, bias, device=device) 162 | for k in range(num_layers): 163 | model.lstm_cells[k].W_ih = ndl.Tensor(getattr(model_, f'weight_ih_l{k}').detach().numpy().transpose(), device=device) 164 | model.lstm_cells[k].W_hh = ndl.Tensor(getattr(model_, f'weight_hh_l{k}').detach().numpy().transpose(), device=device) 165 | if bias: 166 | model.lstm_cells[k].bias_ih = ndl.Tensor(getattr(model_, f'bias_ih_l{k}').detach().numpy(), device=device) 167 | model.lstm_cells[k].bias_hh = ndl.Tensor(getattr(model_, f'bias_hh_l{k}').detach().numpy(), device=device) 168 | if init_hidden: 169 | output, (h, c) = model(ndl.Tensor(x, device=device), (ndl.Tensor(h0, device=device), ndl.Tensor(c0, device=device))) 170 | else: 171 | output, (h, c) = model(ndl.Tensor(x, device=device), None) 172 | 173 | np.testing.assert_allclose(h_.detach().numpy(), h.numpy(), atol=1e-5, rtol=1e-5) 174 | np.testing.assert_allclose(c_.detach().numpy(), c.numpy(), atol=1e-5, rtol=1e-5) 175 | np.testing.assert_allclose(output_.detach().numpy(), output.numpy(), atol=1e-5, rtol=1e-5) 176 | 177 | output.sum().backward() 178 | output_.sum().backward() 179 | np.testing.assert_allclose(model.lstm_cells[0].W_ih.grad.detach().numpy(), model_.weight_ih_l0.grad.numpy().transpose(), atol=1e-5, rtol=1e-5) 180 | 181 | 182 | OUTPUT_SIZES = [1, 1000] 183 | EMBEDDING_SIZES = [1, 34] 184 | SEQ_MODEL = ['rnn', 'lstm'] 185 | @pytest.mark.parametrize("seq_length", SEQ_LENGTHS) 186 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 187 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 188 | @pytest.mark.parametrize("embedding_size", EMBEDDING_SIZES) 189 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 190 | @pytest.mark.parametrize("init_hidden", INIT_HIDDEN) 191 | @pytest.mark.parametrize("output_size", OUTPUT_SIZES) 192 | @pytest.mark.parametrize("seq_model", SEQ_MODEL) 193 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 194 | def test_language_model_implementation(seq_length, num_layers, batch_size, embedding_size, hidden_size, 195 | init_hidden, output_size, seq_model, device): 196 | #TODO add test for just nn.embedding? 
197 | x = np.random.randint(0, output_size, (seq_length, batch_size)).astype(np.float32) 198 | h0 = ndl.Tensor(np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32), device=device) 199 | c0 = ndl.Tensor(np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32), device=device) 200 | 201 | model = LanguageModel(embedding_size, output_size, hidden_size, num_layers, seq_model, device=device) 202 | if init_hidden: 203 | if seq_model == 'lstm': 204 | h = (h0, c0) 205 | elif seq_model == 'rnn': 206 | h = h0 207 | output, h_ = model(ndl.Tensor(x, device=device), h) 208 | else: 209 | output, h_ = model(ndl.Tensor(x, device=device), None) 210 | 211 | if seq_model == 'lstm': 212 | assert isinstance(h_, tuple) 213 | h0_, c0_ = h_ 214 | assert c0_.shape == (num_layers, batch_size, hidden_size) 215 | elif seq_model == 'rnn': 216 | h0_ = h_ 217 | assert h0_.shape == (num_layers, batch_size, hidden_size) 218 | assert output.shape == (batch_size * seq_length, output_size) 219 | #TODO actually test values 220 | output.backward() 221 | for p in model.parameters(): 222 | assert p.grad is not None 223 | 224 | @pytest.mark.parametrize("device", _DEVICES, ids=["cpu", "cuda"]) 225 | def test_language_model_training(device): 226 | corpus = ndl.data.Corpus("data/ptb", max_lines=20) 227 | seq_len = 10 228 | num_examples = 100 229 | batch_size = 16 230 | seq_model = 'rnn' 231 | num_layers = 2 232 | hidden_size = 10 233 | n_epochs=2 234 | train_data = ndl.data.batchify(corpus.train, batch_size=batch_size, device=device, dtype="float32") 235 | model = LanguageModel(30, len(corpus.dictionary), hidden_size=hidden_size, num_layers=num_layers, seq_model=seq_model, device=device) 236 | train_acc, train_loss = train_ptb(model, train_data, seq_len=seq_len, n_epochs=n_epochs, device=device) 237 | test_acc, test_loss = evaluate_ptb(model, train_data, seq_len=seq_len, device=device) 238 | if str(device) == "cpu(0)": 239 | np.testing.assert_allclose(5.4136161980805575, train_loss, atol=1e-5, rtol=1e-5) 240 | np.testing.assert_allclose(5.214852703942193, test_loss, atol=1e-5, rtol=1e-5) 241 | elif str(device) == "cuda(0)": 242 | np.testing.assert_allclose(5.424638041743526, train_loss, atol=1e-5, rtol=1e-5) 243 | np.testing.assert_allclose(5.23579544491238, test_loss, atol=1e-5, rtol=1e-5) 244 | 245 | 246 | ### MUGRADE ### 247 | 248 | TEST_BATCH_SIZES = [6] 249 | TEST_INPUT_SIZES = [3] 250 | TEST_HIDDEN_SIZES = [5] 251 | TEST_SEQ_LENGTHS = [7] 252 | TEST_NUM_LAYERS = [3] 253 | TEST_OUTPUT_SIZES = [16] 254 | TEST_EMBEDDING_SIZES = [8] 255 | TEST_SEQ_MODEL = ['rnn', 'lstm'] 256 | 257 | def mugrade_submit(x): 258 | if isinstance(x, np.ndarray): 259 | x = x.flatten()[:64] 260 | # print(x) 261 | mugrade.submit(x) 262 | else: 263 | # print(x) 264 | mugrade.submit(x) 265 | 266 | 267 | def submit_rnn(): 268 | devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 269 | # devices = [ndl.cpu(), ndl.cuda()] 270 | 271 | if not ndl.cuda().enabled(): 272 | print('You need a GPU to run some of these tests.') 273 | 274 | for (device, batch_size, input_size, hidden_size) in itertools.product( 275 | devices, TEST_BATCH_SIZES, TEST_INPUT_SIZES, TEST_HIDDEN_SIZES): 276 | x = np.random.randn(batch_size, input_size).astype(np.float32) 277 | h0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 278 | model = nn.RNNCell(input_size, hidden_size, device=device) 279 | mugrade_submit(model.W_ih.numpy()) 280 | h = model(ndl.Tensor(x, device=device), ndl.Tensor(h0, device=device)) 281 | 
mugrade_submit(h.numpy()) 282 | h.sum().backward() 283 | mugrade_submit(model.W_hh.grad.numpy()) 284 | 285 | for (device, seq_length, num_layers, batch_size, input_size, hidden_size) in itertools.product( 286 | devices, TEST_SEQ_LENGTHS, TEST_NUM_LAYERS, TEST_BATCH_SIZES, TEST_INPUT_SIZES, TEST_HIDDEN_SIZES): 287 | x = np.random.randn(seq_length, batch_size, input_size).astype(np.float32) 288 | h0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 289 | model = nn.RNN(input_size, hidden_size, num_layers, device=device) 290 | output, h = model(ndl.Tensor(x, device=device), ndl.Tensor(h0, device=device)) 291 | mugrade_submit(h.numpy()) 292 | mugrade_submit(output.numpy()) 293 | output.sum().backward() 294 | mugrade_submit(model.rnn_cells[-1].W_hh.grad.numpy()) 295 | 296 | 297 | def submit_lstm(): 298 | devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 299 | #devices = [ndl.cpu(), ndl.cuda()] 300 | if not ndl.cuda().enabled(): 301 | print('You need a GPU to run some of these tests.') 302 | for (device, batch_size, input_size, hidden_size) in itertools.product( 303 | devices, TEST_BATCH_SIZES, TEST_INPUT_SIZES, TEST_HIDDEN_SIZES): 304 | x = np.random.randn(batch_size, input_size).astype(np.float32) 305 | h0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 306 | c0 = np.random.randn(batch_size, hidden_size).astype(np.float32) 307 | model = nn.LSTMCell(input_size, hidden_size, device=device) 308 | mugrade_submit(model.W_hh.numpy()) 309 | (h, c) = model(ndl.Tensor(x, device=device), (ndl.Tensor(h0, device=device), ndl.Tensor(c0, device=device))) 310 | mugrade_submit(h.numpy()) 311 | mugrade_submit(c.numpy()) 312 | h.sum().backward() 313 | mugrade_submit(model.W_hh.grad.numpy()) 314 | 315 | for (device, seq_length, num_layers, batch_size, input_size, hidden_size) in itertools.product( 316 | devices, TEST_SEQ_LENGTHS, TEST_NUM_LAYERS, TEST_BATCH_SIZES, TEST_INPUT_SIZES, TEST_HIDDEN_SIZES): 317 | x = np.random.randn(seq_length, batch_size, input_size).astype(np.float32) 318 | h0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 319 | c0 = np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32) 320 | model = nn.LSTM(input_size, hidden_size, num_layers, device=device) 321 | output, (h, c) = model(ndl.Tensor(x, device=device), (ndl.Tensor(h0, device=device), ndl.Tensor(c0, device=device))) 322 | mugrade_submit(h.numpy()) 323 | mugrade_submit(c.numpy()) 324 | mugrade_submit(output.numpy()) 325 | output.sum().backward() 326 | mugrade_submit(model.lstm_cells[-1].W_hh.grad.numpy()) 327 | 328 | 329 | def submit_language_model(): 330 | devices = [ndl.cpu(), ndl.cuda()] if ndl.cuda().enabled() else [ndl.cpu()] 331 | # devices = [ndl.cpu(), ndl.cuda()] 332 | if not ndl.cuda().enabled(): 333 | print('You need a GPU to run some of these tests.') 334 | for (device, seq_length, num_layers, batch_size, embedding_size, hidden_size, seq_model, output_size) in itertools.product( 335 | devices, TEST_SEQ_LENGTHS, TEST_NUM_LAYERS, TEST_BATCH_SIZES, TEST_EMBEDDING_SIZES, TEST_HIDDEN_SIZES, TEST_SEQ_MODEL, TEST_OUTPUT_SIZES): 336 | x = np.random.randint(0, output_size, (seq_length, batch_size)).astype(np.float32) 337 | h0 = ndl.Tensor(np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32), device=device) 338 | c0 = ndl.Tensor(np.random.randn(num_layers, batch_size, hidden_size).astype(np.float32), device=device) 339 | model = LanguageModel(embedding_size, output_size, hidden_size, num_layers, seq_model, 
device=device) 340 | if seq_model == 'lstm': 341 | h = (h0, c0) 342 | elif seq_model == 'rnn': 343 | h = h0 344 | output, h_ = model(ndl.Tensor(x, device=device), h) 345 | if seq_model == 'lstm': 346 | h0_, c0_ = h_ 347 | mugrade_submit(c0_.numpy()) 348 | elif seq_model == 'rnn': 349 | h0_ = h_ 350 | mugrade_submit(h0_.numpy()) 351 | mugrade_submit(output.numpy()) 352 | 353 | device = ndl.cpu() # TODO CHANGE BACK 354 | # device = ndl.cpu() 355 | corpus = ndl.data.Corpus("data/ptb", max_lines=20) 356 | seq_len = 8 357 | num_examples = 88 358 | batch_size = 12 359 | seq_model = 'lstm' 360 | num_layers = 2 361 | hidden_size = 12 362 | n_epochs=2 363 | train_data = ndl.data.batchify(corpus.train, batch_size=batch_size, device=device, dtype="float32") 364 | model = LanguageModel(28, len(corpus.dictionary), hidden_size=hidden_size, num_layers=num_layers, 365 | seq_model=seq_model, device=device) 366 | train_acc, train_loss = train_ptb(model, train_data, seq_len=seq_len, n_epochs=n_epochs, device=device) 367 | test_acc, test_loss = evaluate_ptb(model, train_data, seq_len=seq_len, device=device) 368 | mugrade_submit(train_loss) 369 | mugrade_submit(test_loss) 370 | 371 | 372 | if __name__ == "__main__": 373 | submit_rnn() 374 | submit_lstm() 375 | submit_language_model() --------------------------------------------------------------------------------
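
The sequence-model tests above validate needle's RNN/LSTM modules by copying PyTorch's parameters into the needle model and then comparing outputs and gradients. PyTorch stores `weight_ih` with shape (hidden_size, input_size), which is why every test transposes it before assigning it to `W_ih`. The short sketch below is a standalone illustration of that layout convention (all names are local to the sketch and it is not part of the graded code): it reproduces `torch.nn.RNNCell` with plain NumPy using the transposed weights, the same form the tests hand to needle.

# Standalone sketch (illustration only): why the tests transpose PyTorch's
# weights. PyTorch's RNNCell computes h' = tanh(x @ weight_ih.T + h @ weight_hh.T);
# transposing once up front lets the update be written as x @ W_ih + h @ W_hh.
import numpy as np
import torch

input_size, hidden_size, batch_size = 3, 5, 6
cell = torch.nn.RNNCell(input_size, hidden_size, bias=False)  # tanh nonlinearity by default

x = np.random.randn(batch_size, input_size).astype(np.float32)
h0 = np.random.randn(batch_size, hidden_size).astype(np.float32)

# Reference hidden state from PyTorch.
h_ref = cell(torch.tensor(x), torch.tensor(h0)).detach().numpy()

# Same computation with the transposed layout the tests assign to W_ih / W_hh.
W_ih = cell.weight_ih.detach().numpy().transpose()  # (input_size, hidden_size)
W_hh = cell.weight_hh.detach().numpy().transpose()  # (hidden_size, hidden_size)
h_manual = np.tanh(x @ W_ih + h0 @ W_hh)

np.testing.assert_allclose(h_ref, h_manual, atol=1e-5, rtol=1e-5)

The same transpose convention is applied to `weight_hh`, and for LSTM cells to the stacked gate weights of shape (4 * hidden_size, input_size), before they are copied into the needle modules under test.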