├── .gitignore
├── README.md
├── Tests
    ├── Data
    │   ├── t10k-images-idx3-ubyte.gz
    │   ├── t10k-labels-idx1-ubyte.gz
    │   ├── train-images-idx3-ubyte.gz
    │   └── train-labels-idx1-ubyte.gz
    ├── Helpers.py
    ├── TestActivationFunctions.py
    ├── TestBatchNormalization.py
    ├── TestConv.py
    ├── TestDropout.py
    ├── TestFullyConnected.py
    ├── TestInitializers.py
    ├── TestMaxPoolLayer.py
    └── TestSoftMaxCrossEntropyLoss.py
├── img
    ├── ann.png
    ├── conv_back_weights.png
    ├── conv_forward.png
    ├── fcn.png
    ├── lenet.jpg
    ├── numerical_maxpooling.gif
    ├── padding_strides.gif
    ├── pooling.gif
    ├── restacking_filters.gif
    └── same_padding_no_strides.gif
├── license.txt
├── src
    ├── base.py
    ├── layers
    │   ├── activation_functions.py
    │   ├── batch_normalization.py
    │   ├── conv.py
    │   ├── dropout.py
    │   ├── fully_connected.py
    │   ├── initializers.py
    │   ├── pooling.py
    │   └── softmax_crossentropy.py
    └── network.py
└── tutorial_dl.ipynb

/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | .ipynb_checkpoints/*
3 | *__pycache__*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tutorial: How to Build a Deep Learning Framework
2 | 
3 | by Katharina Breininger and Tobias Wuerfl
4 | 
5 | Pattern Recognition Lab, Friedrich-Alexander University Erlangen-Nuernberg, Erlangen, Germany
6 | 
7 | ## How to start the tutorial:
8 | 
9 | The tutorial is implemented as a Jupyter notebook.
10 | 
11 | To start the tutorial, open a terminal, change to the main folder (the folder this README is located in) and then type `jupyter notebook` to launch the Jupyter Notebook App. It will open in a new browser tab.
12 | 
13 | Click on "tutorial_dl.ipynb" in the list of files to run the notebook.
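In short, the two steps from above in a terminal (the folder name is only a placeholder for wherever you cloned or unpacked the tutorial):

```
cd tutorial-dlframework
jupyter notebook
```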
14 | 15 | More information can be found at http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/execute.html -------------------------------------------------------------------------------- /Tests/Data/t10k-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/Tests/Data/t10k-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /Tests/Data/t10k-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/Tests/Data/t10k-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /Tests/Data/train-images-idx3-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/Tests/Data/train-images-idx3-ubyte.gz -------------------------------------------------------------------------------- /Tests/Data/train-labels-idx1-ubyte.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/Tests/Data/train-labels-idx1-ubyte.gz -------------------------------------------------------------------------------- /Tests/Helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import gzip 5 | import struct 6 | from pathlib import Path 7 | from random import shuffle 8 | from sklearn.preprocessing import OneHotEncoder 9 | from sklearn.datasets import load_iris, load_digits 10 | 11 | class SoftMax: 12 | def __init__(self): 13 | self.prediction = None 14 | 15 | def forward(self, input_tensor, label_tensor): 16 | prediction = self.predict(input_tensor) 17 | indices = np.where(label_tensor == 1) 18 | loss = np.sum( - np.log(prediction[indices])) 19 | return loss 20 | 21 | def backward(self, label_tensor): 22 | indices = np.where(label_tensor == 1) 23 | error = self.prediction.copy() 24 | error[indices] = error[indices] - 1 25 | return error 26 | 27 | def predict(self, input_tensor): 28 | input_tensor = input_tensor - np.max(input_tensor) 29 | denominator = np.tile(np.sum(np.exp(input_tensor),axis = 1),(input_tensor.shape[1],1)).T 30 | prediction = np.exp(input_tensor)/denominator 31 | self.prediction = prediction 32 | return prediction 33 | 34 | def gradient_check(layers, input_tensor, label_tensor): 35 | epsilon = 1e-5 36 | difference = np.zeros_like(input_tensor) 37 | 38 | activation_tensor = input_tensor.copy() 39 | for layer in layers[:-1]: 40 | activation_tensor = layer.forward(activation_tensor) 41 | layers[-1].forward(activation_tensor, label_tensor) 42 | 43 | error_tensor = layers[-1].backward(label_tensor) 44 | for layer in reversed(layers[:-1]): 45 | error_tensor = layer.backward(error_tensor) 46 | 47 | it = np.nditer(input_tensor, flags=['multi_index']) 48 | while not it.finished: 49 | plus_epsilon = input_tensor.copy() 50 | plus_epsilon[it.multi_index] += epsilon 51 | minus_epsilon = input_tensor.copy() 52 | minus_epsilon[it.multi_index] -= epsilon 53 | 54 | analytical_derivative = error_tensor[it.multi_index] 55 | 56 | for 
layer in layers[:-1]: 57 | plus_epsilon = layer.forward(plus_epsilon) 58 | minus_epsilon = layer.forward(minus_epsilon) 59 | upper_error = layers[-1].forward(plus_epsilon, label_tensor) 60 | lower_error = layers[-1].forward(minus_epsilon, label_tensor) 61 | 62 | numerical_derivative = (upper_error - lower_error) / (2 * epsilon) 63 | 64 | #print('Analytical: ' + str(analytical_derivative) + ' vs Numerical :' + str(numerical_derivative)) 65 | normalizing_constant = max(np.abs(analytical_derivative), np.abs(numerical_derivative)) 66 | 67 | if normalizing_constant < 1e-15: 68 | difference[it.multi_index] = 0 69 | else: 70 | difference[it.multi_index] = np.abs(analytical_derivative - numerical_derivative) / normalizing_constant 71 | 72 | it.iternext() 73 | return difference 74 | 75 | 76 | def plot_difference(plot, description, shape, difference, directory): 77 | if plot: 78 | image = difference[0, :] 79 | image = image.reshape(shape) 80 | fig = plt.figure(description) 81 | plt.imshow(image) 82 | plt.colorbar() 83 | fig.savefig(os.path.join(directory, description + ".pdf"), transparent=True, bbox_inches='tight', pad_inches=0) 84 | plt.close('all') 85 | 86 | 87 | def gradient_check_weights(layers, input_tensor, label_tensor, bias): 88 | epsilon = 1e-5 89 | if bias: 90 | weights = layers[0].bias 91 | else: 92 | weights = layers[0].weights 93 | difference = np.zeros_like(weights) 94 | 95 | it = np.nditer(weights, flags=['multi_index']) 96 | while not it.finished: 97 | plus_epsilon = weights.copy() 98 | plus_epsilon[it.multi_index] += epsilon 99 | minus_epsilon = weights.copy() 100 | minus_epsilon[it.multi_index] -= epsilon 101 | 102 | activation_tensor = input_tensor.copy() 103 | if bias: 104 | layers[0].bias = weights 105 | else: 106 | layers[0].weights = weights 107 | for layer in layers[:-1]: 108 | activation_tensor = layer.forward(activation_tensor) 109 | layers[-1].forward(activation_tensor, label_tensor) 110 | 111 | error_tensor = layers[-1].backward(label_tensor) 112 | for layer in reversed(layers[:-1]): 113 | error_tensor = layer.backward(error_tensor) 114 | if bias: 115 | analytical_derivative = layers[0].get_gradient_bias() 116 | else: 117 | analytical_derivative = layers[0].get_gradient_weights() 118 | analytical_derivative = analytical_derivative[it.multi_index] 119 | 120 | if bias: 121 | layers[0].bias = plus_epsilon 122 | else: 123 | layers[0].weights = plus_epsilon 124 | plus_epsilon_activation = input_tensor.copy() 125 | for layer in layers[:-1]: 126 | plus_epsilon_activation = layer.forward(plus_epsilon_activation) 127 | 128 | if bias: 129 | layers[0].bias = minus_epsilon 130 | else: 131 | layers[0].weights = minus_epsilon 132 | minus_epsilon_activation = input_tensor.copy() 133 | for layer in layers[:-1]: 134 | minus_epsilon_activation = layer.forward(minus_epsilon_activation) 135 | 136 | upper_error = layers[-1].forward(plus_epsilon_activation, label_tensor) 137 | lower_error = layers[-1].forward(minus_epsilon_activation, label_tensor) 138 | 139 | numerical_derivative = (upper_error - lower_error) / (2 * epsilon) 140 | normalizing_constant = max(np.abs(analytical_derivative), np.abs(numerical_derivative)) 141 | 142 | if normalizing_constant < 1e-15: 143 | difference[it.multi_index] = 0 144 | else: 145 | difference[it.multi_index] = np.abs(analytical_derivative - numerical_derivative) / normalizing_constant 146 | 147 | 148 | it.iternext() 149 | return difference 150 | 151 | 152 | 153 | def calculate_accuracy(results, labels): 154 | 155 | index_maximum = np.argmax(results, axis=1) 
156 | one_hot_vector = np.zeros_like(results) 157 | for i in range(one_hot_vector.shape[0]): 158 | one_hot_vector[i, index_maximum[i]] = 1 159 | 160 | correct = 0. 161 | wrong = 0. 162 | for column_results, column_labels in zip(one_hot_vector, labels): 163 | if column_results[column_labels > 0.].all() > 0.: 164 | correct += 1. 165 | else: 166 | wrong += 1. 167 | 168 | return correct / (correct + wrong) 169 | 170 | 171 | def shuffle_data(input_tensor, label_tensor): 172 | index_shuffling = [i for i in range(input_tensor.shape[0])] 173 | shuffle(index_shuffling) 174 | shuffled_input = [input_tensor[i, :] for i in index_shuffling] 175 | shuffled_labels = [label_tensor[i, :] for i in index_shuffling] 176 | return (np.array(shuffled_input)), (np.array(shuffled_labels)) 177 | 178 | 179 | 180 | class RandomData: 181 | def __init__(self, input_size, batch_size, categories): 182 | self.input_size = input_size 183 | self.batch_size = batch_size 184 | self.categories = categories 185 | self.label_tensor = np.zeros([self.batch_size, self.categories]) 186 | 187 | def forward(self): 188 | input_tensor = np.random.random([self.batch_size, self.input_size]) 189 | 190 | self.label_tensor = np.zeros([self.batch_size, self.categories]) 191 | for i in range(self.batch_size): 192 | self.label_tensor[i, np.random.randint(0, self.categories)] = 1 193 | 194 | return input_tensor, self.label_tensor 195 | 196 | 197 | class IrisData: 198 | def __init__(self): 199 | self.data = load_iris() 200 | self.label_tensor = np.zeros([150, 3]) 201 | for i in range(150): 202 | self.label_tensor[i, self.data.target[i]] = 1 203 | 204 | self.input_tensor, self.label_tensor = shuffle_data((np.array(self.data.data)), self.label_tensor) 205 | self.input_tensor = self.input_tensor 206 | self.label_tensor = self.label_tensor 207 | 208 | def forward(self): 209 | return self.input_tensor[0:100, :], self.label_tensor[0:100, :] 210 | 211 | def get_test_set(self): 212 | return self.input_tensor[100:150, :], self.label_tensor[100:150, :] 213 | 214 | 215 | class DigitData: 216 | def __init__(self, batch_size): 217 | self.batch_size = batch_size 218 | self._data = load_digits(n_class=10) 219 | self._label_tensor = OneHotEncoder(sparse=False).fit_transform(self._data.target.reshape(-1, 1)) 220 | self._input_tensor = self._data.data 221 | self._input_tensor /= np.abs(self._input_tensor).max() 222 | 223 | self.split = int(self._input_tensor.shape[0]*(2/3)) # train / test split == number of samples in train set 224 | 225 | self._input_tensor, self._label_tensor = shuffle_data(self._input_tensor, self._label_tensor) 226 | self._input_tensor_train = self._input_tensor[:self.split, :] 227 | self._label_tensor_train = self._label_tensor[:self.split, :] 228 | self._input_tensor_test = self._input_tensor[self.split:, :] 229 | self._label_tensor_test = self._label_tensor[self.split:, :] 230 | 231 | self._current_forward_idx_iterator = self._forward_idx_iterator() 232 | 233 | def _forward_idx_iterator(self): 234 | num_iterations = int(np.ceil(self.split / self.batch_size)) 235 | idx = np.arange(self.split) 236 | while True: 237 | this_idx = np.random.choice(idx, self.split, replace=False) 238 | for i in range(num_iterations): 239 | yield this_idx[i * self.batch_size:(i + 1) * self.batch_size] 240 | 241 | def forward(self): 242 | idx = next(self._current_forward_idx_iterator) 243 | 244 | return self._input_tensor_train[idx, :], self._label_tensor_train[idx, :] 245 | 246 | def get_test_set(self): 247 | return self._input_tensor_test, self._label_tensor_test 
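# Note: the data providers above (RandomData, IrisData, DigitData) and MNISTData below
# share a small common interface: forward() returns one (input_tensor, label_tensor)
# batch and -- where a test split exists -- get_test_set() returns the held-out data.
# A rough usage sketch (illustrative only; the batch size and iteration count are
# arbitrary choices, and the network forward/backward pass is left out):
#
#     data = DigitData(batch_size=32)
#     for _ in range(200):
#         input_tensor, label_tensor = data.forward()   # one shuffled mini-batch
#         # ... run the network's forward and backward pass on this batch ...
#     test_input, test_labels = data.get_test_set()     # evaluation split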
248 | 249 | 250 | 251 | 252 | class MNISTData: 253 | def __init__(self, batch_size): 254 | self.batch_size = batch_size 255 | self.train, self.labels = self._read() 256 | self.test, self.testLabels = self._read(dataset="testing") 257 | 258 | self._current_forward_idx_iterator = self._forward_idx_iterator() 259 | 260 | def _forward_idx_iterator(self): 261 | num_iterations = int(self.train.shape[0] / self.batch_size) 262 | idx = np.arange(self.train.shape[0]) 263 | while True: 264 | this_idx = np.random.choice(idx, self.train.shape[0], replace=False) 265 | for i in range(num_iterations): 266 | yield this_idx[i * self.batch_size:(i + 1) * self.batch_size] 267 | 268 | def forward(self): 269 | idx = next(self._current_forward_idx_iterator) 270 | current = self.train[idx, :].reshape(-1,1,28,28) 271 | return current, self.labels[idx, :] 272 | 273 | def show_random_training_image(self): 274 | image = self.train[np.random.randint(0, self.train.shape[0]-1), :28 * 28] 275 | plt.imshow(image.reshape(28, 28), cmap='gray') 276 | plt.show() 277 | 278 | def show_image(self, index, test=True): 279 | if test: 280 | image = self.test[index, :28 * 28] 281 | else: 282 | image = self.train[index, :28 * 28] 283 | 284 | plt.imshow(image.reshape(28, 28), cmap='gray') 285 | plt.show() 286 | 287 | def get_test_set(self): 288 | return self.test, self.testLabels 289 | 290 | def get_random_test_sample(self): 291 | img_id = np.random.randint(0, self.test.shape[0]-1) 292 | image = self.test[img_id, :].reshape(-1,1,28,28) 293 | label = self.testLabels[img_id] 294 | return image, label 295 | 296 | 297 | @staticmethod 298 | def _read(dataset="training"): 299 | """ 300 | Python function for importing the MNIST data set. It returns an iterator 301 | of 2-tuples with the first element being the label and the second element 302 | being a numpy.uint8 2D array of pixel data for the given image. 
303 | """ 304 | 305 | root_dir = Path(__file__) 306 | 307 | if dataset is "training": 308 | fname_img = root_dir.parent.joinpath('Data', 'train-images-idx3-ubyte.gz') 309 | fname_lbl = root_dir.parent.joinpath('Data', 'train-labels-idx1-ubyte.gz') 310 | elif dataset is "testing": 311 | fname_img = root_dir.parent.joinpath('Data', 't10k-images-idx3-ubyte.gz') 312 | fname_lbl = root_dir.parent.joinpath('Data', 't10k-labels-idx1-ubyte.gz') 313 | else: 314 | raise ValueError("dataset must be 'testing' or 'training'") 315 | 316 | # Load everything in some numpy arrays 317 | with gzip.open(str(fname_lbl), 'rb') as flbl: 318 | magic, num = struct.unpack(">II", flbl.read(8)) 319 | 320 | s = flbl.read(num) 321 | lbl = np.frombuffer(s, dtype=np.int8) 322 | one_hot = np.zeros((lbl.shape[0],10)) 323 | for idx, l in enumerate(lbl): 324 | one_hot[idx, l] = 1 325 | 326 | with gzip.open(str(fname_img), 'rb') as fimg: 327 | magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16)) 328 | 329 | buffer = fimg.read(num * 28 * 28 * 8) 330 | img = np.frombuffer(buffer, dtype=np.uint8).reshape(len(lbl), rows * cols) 331 | img = img.astype(np.float64) 332 | img /= 255.0 333 | 334 | img = img[:num, :] 335 | one_hot = one_hot[:num, :] 336 | return img, one_hot 337 | 338 | if __name__ == "__main__": 339 | pass 340 | 341 | -------------------------------------------------------------------------------- /Tests/TestActivationFunctions.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import Helpers 4 | 5 | class TestReLU(unittest.TestCase): 6 | 7 | def setUp(self): 8 | self.input_size = 5 9 | self.batch_size = 10 10 | self.half_batch_size = int(self.batch_size / 2) 11 | self.input_tensor = np.ones([self.batch_size, self.input_size]) 12 | self.input_tensor[0:self.half_batch_size,:] -= 2 13 | 14 | self.label_tensor = np.zeros([self.batch_size, self.input_size]) 15 | for i in range(self.batch_size): 16 | self.label_tensor[i, np.random.randint(0, self.input_size)] = 1 17 | 18 | def test_forward(self): 19 | expected_tensor = np.zeros([self.batch_size, self.input_size]) 20 | expected_tensor[self.half_batch_size:self.batch_size, :] = 1 21 | 22 | layer = self.ReLU() 23 | output_tensor = layer.forward(self.input_tensor) 24 | self.assertEqual(np.sum(np.power(output_tensor-expected_tensor, 2)), 0) 25 | 26 | def test_backward(self): 27 | expected_tensor = np.zeros([self.batch_size, self.input_size]) 28 | expected_tensor[self.half_batch_size:self.batch_size, :] = 2 29 | 30 | layer = self.ReLU() 31 | layer.forward(self.input_tensor) 32 | output_tensor = layer.backward(self.input_tensor*2) 33 | self.assertEqual(np.sum(np.power(output_tensor - expected_tensor, 2)), 0) 34 | 35 | def test_gradient(self): 36 | input_tensor = np.abs(np.random.random((self.batch_size, self.input_size))) 37 | input_tensor *= 2. 38 | input_tensor -= 1. 39 | layers = list() 40 | layers.append(self.ReLU()) 41 | layers.append(Helpers.SoftMax()) 42 | difference = Helpers.gradient_check(layers, input_tensor, self.label_tensor) 43 | self.assertLessEqual(np.sum(difference), 1e-5) 44 | 45 | class TestSigmoid(unittest.TestCase): 46 | Sigmoid = None 47 | 48 | def setUp(self): 49 | self.input_size = 5 50 | self.batch_size = 10 51 | self.half_batch_size = int(self.batch_size / 2) 52 | self.input_tensor = np.abs(np.random.random((self.input_size, self.batch_size))).T 53 | self.input_tensor *= 2. 54 | self.input_tensor -= 1. 
55 | 56 | self.label_tensor = np.zeros([self.input_size, self.batch_size]).T 57 | for i in range(self.batch_size): 58 | self.label_tensor[i, np.random.randint(0, self.input_size)] = 1 59 | 60 | def test_forward(self): 61 | expected_tensor = 0.5 * (1. + np.tanh(self.input_tensor / 2.)) 62 | 63 | layer = self.Sigmoid() 64 | output_tensor = layer.forward(self.input_tensor) 65 | self.assertAlmostEqual(np.sum(np.power(output_tensor-expected_tensor, 2)), 0) 66 | 67 | def test_range(self): 68 | layer = self.Sigmoid() 69 | output_tensor = layer.forward(self.input_tensor*2) 70 | 71 | out_max = np.max(output_tensor) 72 | out_min = np.min(output_tensor) 73 | 74 | self.assertLessEqual(out_max, 1.) 75 | self.assertGreaterEqual(out_min, 0.) 76 | 77 | def test_gradient(self): 78 | layers = list() 79 | layers.append(self.Sigmoid()) 80 | layers.append(Helpers.SoftMax()) 81 | difference = Helpers.gradient_check(layers, self.input_tensor, self.label_tensor) 82 | self.assertLessEqual(np.sum(difference), 1e-5) 83 | 84 | if __name__ == "__main__": 85 | pass -------------------------------------------------------------------------------- /Tests/TestBatchNormalization.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import Helpers 4 | 5 | class TestBatchNorm(unittest.TestCase): 6 | 7 | def setUp(self): 8 | self.batch_size = 200 9 | self.channels = 2 10 | self.input_shape = (self.channels, 3, 3) 11 | self.input_size = np.prod(self.input_shape) 12 | 13 | np.random.seed(0) 14 | self.input_tensor = np.abs(np.random.random((self.input_size, self.batch_size))).T 15 | 16 | self.categories = 5 17 | self.label_tensor = np.zeros([self.categories, self.batch_size]).T 18 | for i in range(self.batch_size): 19 | self.label_tensor[i, np.random.randint(0, self.categories)] = 1 20 | 21 | self.layers = list() 22 | self.layers.append(None) 23 | self.layers.append(self.FullyConnected(self.input_size, self.categories,0.)) 24 | self.layers.append(Helpers.SoftMax()) 25 | 26 | @staticmethod 27 | def _channel_moments(tensor, channels): 28 | in_shape = tensor.shape 29 | tensor = tensor.reshape(tensor.shape[0], channels, -1) 30 | tensor = np.transpose(tensor, (0, 2, 1)) 31 | tensor = tensor.reshape(in_shape[1]//channels * in_shape[0], channels) 32 | mean = np.mean(tensor, axis=0) 33 | var = np.var(tensor, axis=0) 34 | return mean, var 35 | 36 | def test_forward_shape(self): 37 | layer = self.BatchNormalization(0.) 38 | output = layer.forward(self.input_tensor) 39 | 40 | self.assertEqual(output.shape[0], self.input_tensor.shape[0]) 41 | self.assertEqual(output.shape[1], self.input_tensor.shape[1]) 42 | 43 | def test_forward_shape_convolutional(self): 44 | layer = self.BatchNormalization(0., self.channels) 45 | output = layer.forward(self.input_tensor) 46 | 47 | self.assertEqual(output.shape[0], self.input_tensor.shape[0]) 48 | self.assertEqual(output.shape[1], self.input_tensor.shape[1]) 49 | 50 | def test_forward(self): 51 | layer = self.BatchNormalization(0.) 
52 | output = layer.forward(self.input_tensor) 53 | mean = np.mean(output, axis=0) 54 | var = np.var(output, axis=0) 55 | 56 | self.assertAlmostEqual(np.sum(np.square(mean - np.zeros(mean.shape[0]))), 0) 57 | self.assertAlmostEqual(np.sum(np.square(var - np.ones(var.shape[0]))), 0) 58 | 59 | def test_forward_convolutional(self): 60 | layer = self.BatchNormalization(0., self.channels) 61 | output = layer.forward(self.input_tensor) 62 | mean, var = TestBatchNorm._channel_moments(output, self.channels) 63 | 64 | self.assertAlmostEqual(np.sum(np.square(mean)), 0) 65 | self.assertAlmostEqual(np.sum(np.square(var - np.ones_like(var))), 0) 66 | 67 | def test_forward_train_phase(self): 68 | layer = self.BatchNormalization(0.) 69 | layer.forward(self.input_tensor) 70 | 71 | output = layer.forward((np.zeros_like(self.input_tensor))) 72 | 73 | mean = np.mean(output, axis=0) 74 | 75 | mean_input = np.mean(self.input_tensor, axis=0) 76 | var_input = np.var(self.input_tensor, axis=0) 77 | 78 | self.assertNotEqual(np.sum(np.square(mean + (mean_input/np.sqrt(var_input)))), 0) 79 | 80 | def test_forward_train_phase_convolutional(self): 81 | layer = self.BatchNormalization(0., self.channels) 82 | layer.forward(self.input_tensor) 83 | 84 | output = layer.forward((np.zeros_like(self.input_tensor))) 85 | 86 | mean, var = TestBatchNorm._channel_moments(output, self.channels) 87 | mean_input, var_input = TestBatchNorm._channel_moments(self.input_tensor, self.channels) 88 | 89 | self.assertNotEqual(np.sum(np.square(mean + (mean_input/np.sqrt(var_input)))), 0) 90 | 91 | def test_forward_test_phase(self): 92 | layer = self.BatchNormalization(0.) 93 | layer.forward(self.input_tensor) 94 | layer.phase = self.Phase.test 95 | 96 | output = layer.forward((np.zeros_like(self.input_tensor))) 97 | 98 | mean = np.mean(output, axis=0) 99 | var = np.var(output, axis=0) 100 | 101 | mean_input = np.mean(self.input_tensor, axis=0) 102 | var_input = np.var(self.input_tensor, axis=0) 103 | 104 | self.assertAlmostEqual(np.sum(np.square(mean + (mean_input/np.sqrt(var_input)))), 0) 105 | self.assertAlmostEqual(np.sum(np.square(var)), 0) 106 | 107 | def test_forward_test_phase_convolutional(self): 108 | layer = self.BatchNormalization(0., self.channels) 109 | layer.forward(self.input_tensor) 110 | layer.phase = self.Phase.test 111 | 112 | output = layer.forward((np.zeros_like(self.input_tensor))) 113 | 114 | mean, var = TestBatchNorm._channel_moments(output, self.channels) 115 | mean_input, var_input = TestBatchNorm._channel_moments(self.input_tensor, self.channels) 116 | 117 | self.assertAlmostEqual(np.sum(np.square(mean + (mean_input / np.sqrt(var_input)))), 0, places=4) 118 | self.assertAlmostEqual(np.sum(np.square(var)), 0, places=4) 119 | 120 | def test_gradient(self): 121 | self.layers[0] = self.BatchNormalization(0.) 122 | 123 | difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor) 124 | 125 | self.assertLessEqual(np.sum(difference), 1e-4) 126 | 127 | def test_gradient_weights(self): 128 | self.layers[0] = self.BatchNormalization(0.) 129 | self.layers[0].forward(self.input_tensor) 130 | 131 | difference = Helpers.gradient_check_weights(self.layers, self.input_tensor, self.label_tensor, False) 132 | 133 | self.assertLessEqual(np.sum(difference), 1e-6) 134 | 135 | def test_gradient_bias(self): 136 | self.layers[0] = self.BatchNormalization(0.) 
137 | self.layers[0].forward(self.input_tensor) 138 | 139 | difference = Helpers.gradient_check_weights(self.layers, self.input_tensor, self.label_tensor, True) 140 | 141 | self.assertLessEqual(np.sum(difference), 1e-6) 142 | 143 | def test_gradient_convolutional(self): 144 | self.layers[0] = self.BatchNormalization(0., self.channels) 145 | 146 | difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor) 147 | 148 | self.assertLessEqual(np.sum(difference), 1e-3) 149 | 150 | def test_gradient_weights_convolutional(self): 151 | self.layers[0] = self.BatchNormalization(0., self.channels) 152 | self.layers[0].forward(self.input_tensor) 153 | 154 | difference = Helpers.gradient_check_weights(self.layers, self.input_tensor, self.label_tensor, False) 155 | 156 | self.assertLessEqual(np.sum(difference), 1e-6) 157 | 158 | def test_gradient_bias_convolutional(self): 159 | self.layers[0] = self.BatchNormalization(0., self.channels) 160 | self.layers[0].forward(self.input_tensor) 161 | 162 | difference = Helpers.gradient_check_weights(self.layers, self.input_tensor, self.label_tensor, True) 163 | 164 | self.assertLessEqual(np.sum(difference), 1e-6) 165 | 166 | if __name__ == "__main__": 167 | pass -------------------------------------------------------------------------------- /Tests/TestConv.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import Helpers 4 | from scipy.ndimage.filters import gaussian_filter 5 | 6 | class TestConv(unittest.TestCase): 7 | 8 | class TestInitializer: 9 | 10 | @staticmethod 11 | def initialize(weights): 12 | weights = np.zeros((1, 3, 3, 3)) 13 | weights[0, 1, 1, 1] = 1 14 | return weights 15 | 16 | def setUp(self): 17 | self.batch_size = 2 18 | self.input_shape = (3, 10, 14) 19 | self.uneven_input_shape = (3, 11, 15) 20 | self.spatial_input_size = np.prod(self.input_shape[1:]) 21 | self.kernel_shape = (3, 5, 8) 22 | self.num_kernels = 4 23 | self.hidden_channels = 3 24 | 25 | self.categories = 5 26 | self.label_tensor = np.zeros([self.batch_size, self.categories]) 27 | for i in range(self.batch_size): 28 | self.label_tensor[i, np.random.randint(0, self.categories)] = 1 29 | 30 | def test_forward_size(self): 31 | conv = self.Conv( (1, 1), self.kernel_shape, self.num_kernels, 0.) 32 | input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float) 33 | input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape) 34 | output_tensor = conv.forward(input_tensor) 35 | self.assertEqual(output_tensor.shape, (self.batch_size, self.num_kernels, *self.input_shape[1:])) 36 | 37 | def test_forward_size_stride(self): 38 | conv = self.Conv((3, 2), self.kernel_shape, self.num_kernels, 0.) 39 | input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float) 40 | input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape) 41 | output_tensor = conv.forward(input_tensor) 42 | self.assertEqual(output_tensor.shape, (self.batch_size, self.num_kernels, 4, 7)) 43 | 44 | def test_forward_size_stride_uneven_image(self): 45 | conv = self.Conv((3, 2), self.kernel_shape, self.num_kernels + 1, 0.) 
46 | input_tensor = np.array(range(np.prod(self.uneven_input_shape) * (self.batch_size + 1)), dtype=np.float) 47 | input_tensor = input_tensor.reshape(self.batch_size + 1, *self.uneven_input_shape) 48 | output_tensor = conv.forward(input_tensor) 49 | self.assertEqual(output_tensor.shape, ( self.batch_size+1, self.num_kernels+1, 4, 8)) 50 | 51 | def test_forward(self): 52 | np.random.seed(1337) 53 | conv = self.Conv((1, 1), (1, 3, 3), 1, 0.) 54 | conv.weights = (1./15.) * np.array([[[1, 2, 1], [2, 3, 2], [1, 2, 1]]]) 55 | conv.bias = np.array([0]) 56 | conv.weights = np.expand_dims(conv.weights, 0) 57 | input_tensor = np.random.random((1, 1, 10, 14)) 58 | expected_output = gaussian_filter(input_tensor[0, 0, :, :], 0.85, mode='constant', cval=0.0, truncate=1.0) 59 | output_tensor = conv.forward(input_tensor).reshape((10, 14)) 60 | difference = np.max(np.abs(expected_output - output_tensor)) 61 | self.assertAlmostEqual(difference, 0., places=1) 62 | 63 | def test_forward_fully_connected_channels(self): 64 | np.random.seed(1337) 65 | conv = self.Conv((1, 1), (3, 3, 3), 1, 0.) 66 | conv.weights = (1. / 15.) * np.array([[[1, 2, 1], [2, 3, 2], [1, 2, 1]], [[1, 2, 1], [2, 3, 2], [1, 2, 1]], [[1, 2, 1], [2, 3, 2], [1, 2, 1]]]) 67 | conv.bias = np.array([0]) 68 | conv.weights = np.expand_dims(conv.weights, 0) 69 | tensor = np.random.random((1, 1, 10, 14)) 70 | input_tensor = np.zeros((1, 3 , 10, 14)) 71 | input_tensor[:,0] = tensor.copy() 72 | input_tensor[:,1] = tensor.copy() 73 | input_tensor[:,2] = tensor.copy() 74 | expected_output = 3 * gaussian_filter(input_tensor[0, 0, :, :], 0.85, mode='constant', cval=0.0, truncate=1.0) 75 | output_tensor = conv.forward(input_tensor).reshape((10, 14)) 76 | difference = np.max(np.abs(expected_output - output_tensor)) 77 | self.assertLess(difference, 0.2) 78 | 79 | def test_backward_size(self): 80 | conv = self.Conv((1, 1), self.kernel_shape, self.num_kernels, 0.) 81 | input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float) 82 | input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape) 83 | output_tensor = conv.forward(input_tensor) 84 | error_tensor = conv.backward(output_tensor) 85 | self.assertEqual(error_tensor.shape, (self.batch_size, *self.input_shape)) 86 | 87 | def test_backward_size_stride(self): 88 | conv = self.Conv((3, 2), self.kernel_shape, self.num_kernels, 0.) 89 | input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float) 90 | input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape) 91 | output_tensor = conv.forward(input_tensor) 92 | error_tensor = conv.backward(output_tensor) 93 | self.assertEqual(error_tensor.shape, (self.batch_size, *self.input_shape)) 94 | 95 | def test_layout_preservation(self): 96 | conv = self.Conv((1, 1), (3, 3, 3), 1, 0.) 97 | conv.initialize(TestConv.TestInitializer(), self.Constant(0.0)) 98 | input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float) 99 | input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape) 100 | output_tensor = conv.forward(input_tensor) 101 | self.assertAlmostEqual(np.sum(np.abs(np.squeeze(output_tensor)-input_tensor[:,1,:,:])), 0.) 
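# The gradient tests below rely on Helpers.gradient_check / gradient_check_weights:
# each entry of the input (or of the weights/bias) is perturbed by +/- eps (eps = 1e-5),
# the loss is re-evaluated, and the central-difference estimate
#     (loss(x + eps) - loss(x - eps)) / (2 * eps)
# is compared against the analytical derivative from backward(). The reported
# difference is the relative error |analytical - numerical| / max(|analytical|, |numerical|),
# summed over all entries and required to stay below a small tolerance.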
102 | 103 | def test_gradient(self): 104 | np.random.seed(1337) 105 | input_tensor = np.abs(np.random.random((2, 3, 5, 7))) 106 | layers = list() 107 | layers.append(self.Conv((1, 1), (3, 3, 3), self.hidden_channels, 0.)) 108 | layers.append(self.Flatten()) 109 | layers.append(self.FullyConnected(35 * self.hidden_channels, self.categories, 0)) 110 | layers.append(Helpers.SoftMax()) 111 | difference = Helpers.gradient_check(layers, input_tensor, self.label_tensor) 112 | self.assertLessEqual(np.sum(difference), 5e-2) 113 | 114 | def test_gradient_weights(self): 115 | np.random.seed(1337) 116 | input_tensor = np.abs(np.random.random((2, 3, 5, 7))) 117 | layers = list() 118 | layers.append(self.Conv((1, 1), (3, 3, 3), self.hidden_channels, 0.)) 119 | layers.append(self.Flatten()) 120 | layers.append(self.FullyConnected(35*self.hidden_channels, self.categories, 0)) 121 | layers.append(Helpers.SoftMax()) 122 | difference = Helpers.gradient_check_weights(layers, input_tensor, self.label_tensor, False) 123 | 124 | self.assertLessEqual(np.sum(difference), 1e-5) 125 | 126 | def test_gradient_weights_strided(self): 127 | np.random.seed(1337) 128 | input_tensor = np.abs(np.random.random((2, 3, 5, 7))) 129 | layers = list() 130 | layers.append(self.Conv((2, 2), (3, 3, 3), self.hidden_channels, 0.)) 131 | layers.append(self.Flatten()) 132 | layers.append(self.FullyConnected(12*self.hidden_channels, self.categories, 0)) 133 | layers.append(Helpers.SoftMax()) 134 | difference = Helpers.gradient_check_weights(layers, input_tensor, self.label_tensor, False) 135 | 136 | self.assertLessEqual(np.sum(difference), 1e-5) 137 | 138 | def test_gradient_bias(self): 139 | np.random.seed(1337) 140 | input_tensor = np.abs(np.random.random((2, 3, 5, 7))) 141 | layers = list() 142 | layers.append(self.Conv((1, 1), (3, 3, 3), self.hidden_channels, 0.)) 143 | layers.append(self.Flatten()) 144 | layers.append(self.FullyConnected(35 * self.hidden_channels, self.categories, 0)) 145 | layers.append(Helpers.SoftMax()) 146 | difference = Helpers.gradient_check_weights(layers, input_tensor, self.label_tensor, True) 147 | 148 | self.assertLessEqual(np.sum(difference), 1e-5) 149 | 150 | def test_gradient_stride(self): 151 | np.random.seed(1337) 152 | input_tensor = np.abs(np.random.random((2, 3, 5, 14))) 153 | layers = list() 154 | layers.append(self.Conv( (1, 2), (3, 3, 3), 1, 0.)) 155 | layers.append(self.Flatten()) 156 | layers.append(self.FullyConnected(35, self.categories, 0)) 157 | layers.append(Helpers.SoftMax()) 158 | difference = Helpers.gradient_check(layers, input_tensor, self.label_tensor) 159 | 160 | self.assertLessEqual(np.sum(difference), 1e-4) 161 | 162 | def test_update(self): 163 | input_tensor = np.abs(np.random.random((self.batch_size, *self.input_shape))) 164 | conv = self.Conv((3, 2), self.kernel_shape, self.num_kernels, 1.) 
165 | conv.initialize(self.He(), self.Constant(0.1)) 166 | for _ in range(10): 167 | output_tensor = conv.forward(input_tensor) 168 | error_tensor = np.zeros_like(output_tensor) 169 | error_tensor -= output_tensor 170 | conv.backward(error_tensor) 171 | new_output_tensor = conv.forward(input_tensor) 172 | self.assertLess(np.sum(np.power(output_tensor, 2)), np.sum(np.power(new_output_tensor, 2))) 173 | 174 | if __name__ == "__main__": 175 | pass -------------------------------------------------------------------------------- /Tests/TestDropout.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import Helpers 4 | 5 | class TestDropout(unittest.TestCase): 6 | def setUp(self): 7 | self.batch_size = 10000 8 | self.input_size = 10 9 | 10 | self.input_tensor = np.ones((self.batch_size, self.input_size)) 11 | 12 | def test_forward_trainTime(self): 13 | drop_layer = self.DropOut(0.5) 14 | output = drop_layer.forward(self.input_tensor) 15 | 16 | self.assertEqual(np.max(output), 2) 17 | self.assertEqual(np.min(output), 0) 18 | sum_over_mean = np.sum(np.mean(output, axis=0)) 19 | self.assertAlmostEqual(sum_over_mean, 1. * self.input_size, places=1) 20 | 21 | def test_forward_testTime(self): 22 | drop_layer = self.DropOut(0.5) 23 | drop_layer.phase = self.Phase.test 24 | output = drop_layer.forward(self.input_tensor) 25 | 26 | self.assertEqual(np.max(output), 1.) 27 | self.assertEqual(np.min(output), 1.) 28 | sum_over_mean = np.sum(np.mean(output, axis=0)) 29 | self.assertEqual(sum_over_mean, 1. * self.input_size) 30 | 31 | def test_backward(self): 32 | drop_layer = self.DropOut(0.5) 33 | drop_layer.forward(self.input_tensor) 34 | output = drop_layer.backward(self.input_tensor) 35 | 36 | self.assertEqual(np.max(output), 1) 37 | self.assertEqual(np.min(output), 0) 38 | sum_over_mean = np.sum(np.mean(output, axis=0)) 39 | self.assertAlmostEqual(sum_over_mean, .5 * self.input_size, places=1) 40 | 41 | if __name__ == "__main__": 42 | pass -------------------------------------------------------------------------------- /Tests/TestFullyConnected.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import Helpers 4 | 5 | class TestFullyConnected(unittest.TestCase): 6 | 7 | class TestInitializer: 8 | 9 | @staticmethod 10 | def initialize(weights_shape): 11 | return np.random.rand(*weights_shape) 12 | 13 | def setUp(self): 14 | self.batch_size = 9 15 | self.input_size = 4 16 | self.output_size = 3 17 | self.input_tensor = np.random.rand(self.batch_size, self.input_size) 18 | 19 | self.categories = 4 20 | self.label_tensor = np.zeros([self.batch_size, self.categories]) 21 | for i in range(self.batch_size): 22 | self.label_tensor[i, np.random.randint(0, self.categories)] = 1 23 | 24 | def test_forward_size(self): 25 | layer = self.FullyConnected(self.input_size, self.output_size, 0) 26 | layer.initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer) 27 | output_tensor = layer.forward(self.input_tensor) 28 | self.assertEqual(output_tensor.shape[1], self.output_size) 29 | self.assertEqual(output_tensor.shape[0], self.batch_size) 30 | 31 | def test_backward_size(self): 32 | layer = self.FullyConnected(self.input_size, self.output_size, 0) 33 | layer.initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer) 34 | output_tensor = layer.forward(self.input_tensor) 35 | error_tensor = 
layer.backward(output_tensor) 36 | self.assertEqual(error_tensor.shape[1], self.input_size) 37 | self.assertEqual(error_tensor.shape[0], self.batch_size) 38 | 39 | def test_update(self): 40 | layer = self.FullyConnected(self.input_size, self.output_size, 1) 41 | layer.initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer) 42 | for _ in range(10): 43 | output_tensor = layer.forward(self.input_tensor) 44 | error_tensor = np.zeros([ self.batch_size, self.output_size]) 45 | error_tensor -= output_tensor 46 | layer.backward(error_tensor) 47 | new_output_tensor = layer.forward(self.input_tensor) 48 | self.assertLess(np.sum(np.power(output_tensor, 2)), np.sum(np.power(new_output_tensor, 2))) 49 | 50 | def test_gradient(self): 51 | input_tensor = np.abs(np.random.random((self.batch_size, self.input_size))) 52 | layers = list() 53 | layers.append(self.FullyConnected(self.input_size, self.categories, 0)) 54 | layers[0].initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer) 55 | layers.append(Helpers.SoftMax()) 56 | difference = Helpers.gradient_check(layers, input_tensor, self.label_tensor) 57 | self.assertLessEqual(np.sum(difference), 1e-5) 58 | 59 | def test_gradient_weights(self): 60 | input_tensor = np.abs(np.random.random((self.batch_size, self.input_size))) 61 | layers = list() 62 | layers.append(self.FullyConnected(self.input_size, self.categories, 0)) 63 | layers[0].initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer) 64 | layers.append(Helpers.SoftMax()) 65 | difference = Helpers.gradient_check_weights(layers, input_tensor, self.label_tensor, False) 66 | self.assertLessEqual(np.sum(difference), 1e-5) 67 | 68 | def test_bias(self): 69 | input_tensor = np.zeros((1, 100000)) 70 | layer = self.FullyConnected(100000, 1, 0) 71 | layer.initialize(TestFullyConnected.TestInitializer(),TestFullyConnected.TestInitializer) 72 | result = layer.forward(input_tensor) 73 | self.assertGreater(np.sum(result), 0) 74 | 75 | if __name__ == "__main__": 76 | pass -------------------------------------------------------------------------------- /Tests/TestInitializers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import Helpers 4 | from scipy import stats 5 | 6 | class TestInitializers(unittest.TestCase): 7 | 8 | class DummyLayer: 9 | def __init__(self, input_size, output_size): 10 | self.weights = np.random.random_sample((output_size, input_size)) 11 | 12 | def initialize(self, initializer): 13 | self.weights = initializer.initialize(self.weights.shape) 14 | 15 | def setUp(self): 16 | self.batch_size = 9 17 | self.input_size = 200 18 | self.output_size = 50 19 | 20 | def _performInitialization(self, initializer): 21 | np.random.seed(1337) 22 | layer = TestInitializers.DummyLayer(self.input_size, self.output_size) 23 | weights_before_init = layer.weights.copy() 24 | layer.initialize(initializer) 25 | weights_after_init = layer.weights.copy() 26 | return weights_before_init, weights_after_init 27 | 28 | def test_const_shape(self): 29 | weights_before_init, weights_after_init = self._performInitialization(self.Const(0.1)) 30 | 31 | self.assertEqual(weights_before_init.shape, weights_after_init.shape) 32 | self.assertFalse(np.allclose(weights_before_init, weights_after_init)) 33 | 34 | def test_const_distribution(self): 35 | weights_before_init, weights_after_init = self._performInitialization(self.Const(0.1)) 36 | 
self.assertTrue(np.allclose(weights_after_init, 0.1)) 37 | 38 | def test_uniform_shape(self): 39 | weights_before_init, weights_after_init = self._performInitialization(self.Uniform()) 40 | 41 | self.assertEqual(weights_before_init.shape, weights_after_init.shape) 42 | self.assertFalse(np.allclose(weights_before_init, weights_after_init)) 43 | 44 | def test_uniform_distribution(self): 45 | weights_before_init, weights_after_init = self._performInitialization(self.Uniform()) 46 | 47 | p_value = stats.kstest(weights_after_init.flat, 'uniform', args=(0, 1)).pvalue 48 | self.assertGreater(p_value, 0.01) 49 | 50 | def test_he_shape(self): 51 | weights_before_init, weights_after_init = self._performInitialization(self.He()) 52 | 53 | self.assertEqual(weights_before_init.shape, weights_after_init.shape) 54 | self.assertFalse(np.allclose(weights_before_init, weights_after_init)) 55 | 56 | def test_he_distribution(self): 57 | weights_before_init, weights_after_init = self._performInitialization(self.He()) 58 | 59 | scale = np.sqrt(2.) / np.sqrt(self.input_size) 60 | p_value = stats.kstest(weights_after_init.flat, 'norm', args=(0, scale)).pvalue 61 | self.assertGreater(p_value, 0.01) 62 | 63 | if __name__ == "__main__": 64 | pass -------------------------------------------------------------------------------- /Tests/TestMaxPoolLayer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import Helpers 4 | 5 | class TestMaxPooling(unittest.TestCase): 6 | 7 | def setUp(self): 8 | self.batch_size = 2 9 | self.input_shape = (2, 4, 7) 10 | 11 | np.random.seed(1337) 12 | self.input_tensor = np.abs(np.random.random((self.batch_size, *self.input_shape))) 13 | 14 | self.categories = 5 15 | self.label_tensor = np.zeros([self.batch_size, self.categories]) 16 | for i in range(self.batch_size): 17 | self.label_tensor[i, np.random.randint(0, self.categories)] = 1 18 | 19 | self.layers = list() 20 | self.layers.append(None) 21 | self.layers.append(self.Flatten()) 22 | self.layers.append(None) 23 | self.layers.append(Helpers.SoftMax()) 24 | 25 | def test_shape(self): 26 | layer = self.MaxPooling(neighborhood=(2, 2), stride=(2, 2)) 27 | result = layer.forward(self.input_tensor) 28 | expected_shape = np.array([self.batch_size, 2, 2, 3]) 29 | self.assertEqual(np.abs(np.sum(np.array(result.shape) - expected_shape)), 0) 30 | 31 | def test_overlapping_shape(self): 32 | layer = self.MaxPooling(neighborhood=(2, 2), stride=(2, 1)) 33 | result = layer.forward(self.input_tensor) 34 | expected_shape = np.array([self.batch_size, 2, 2, 6]) 35 | self.assertEqual(np.abs(np.sum(np.array(result.shape) - expected_shape)), 0) 36 | 37 | def test_subsampling_shape(self): 38 | layer = self.MaxPooling(neighborhood=(2, 2), stride=(3, 2)) 39 | result = layer.forward(self.input_tensor) 40 | expected_shape = np.array([self.batch_size, 2, 1, 3]) 41 | self.assertEqual(np.abs(np.sum(np.array(result.shape) - expected_shape)), 0) 42 | 43 | def test_gradient_stride(self): 44 | self.layers[0] = self.MaxPooling(neighborhood=(2, 2), stride=(2, 2)) 45 | self.layers[2] = self.FullyConnected(12, self.categories, 0.) 46 | 47 | difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor) 48 | 49 | self.assertLessEqual(np.sum(difference), 1e-6) 50 | 51 | def test_gradient_overlapping_stride(self): 52 | self.layers[0] = self.MaxPooling(neighborhood=(2, 2), stride=(2, 1)) 53 | self.layers[2] = self.FullyConnected(24, self.categories, 0.) 
54 | 55 | difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor) 56 | 57 | self.assertLessEqual(np.sum(difference), 1e-6) 58 | 59 | def test_gradient_subsampling_stride(self): 60 | 61 | self.layers[0] = self.MaxPooling(neighborhood=(2, 2), stride=(3, 2)) 62 | self.layers[2] = self.FullyConnected(6, self.categories, 0.) 63 | 64 | difference = Helpers.gradient_check(self.layers, self.input_tensor, self.label_tensor) 65 | 66 | self.assertLessEqual(np.sum(difference), 1e-6) 67 | 68 | def test_layout_preservation(self): 69 | pool = self.MaxPooling(neighborhood=(1, 1), stride=(1, 1)) 70 | input_tensor = np.array(range(np.prod(self.input_shape) * self.batch_size), dtype=np.float) 71 | input_tensor = input_tensor.reshape(self.batch_size, *self.input_shape) 72 | output_tensor = pool.forward(input_tensor) 73 | self.assertAlmostEqual(np.sum(np.abs(output_tensor-input_tensor)), 0.) 74 | 75 | def test_expected_output_valid_edgecase(self): 76 | input_shape = (1, 3, 3) 77 | pool = self.MaxPooling(neighborhood=(2, 2), stride=(2, 2)) 78 | batch_size = 2 79 | input_tensor = np.array(range(np.prod(input_shape) * batch_size), dtype=np.float) 80 | input_tensor = input_tensor.reshape(batch_size, *input_shape) 81 | 82 | result = pool.forward(input_tensor) 83 | expected_result = np.array([[4], [13]]).T 84 | self.assertEqual(np.abs(np.sum(result - expected_result)), 0) 85 | 86 | def test_expected_output(self): 87 | input_shape = (1, 4, 4) 88 | pool = self.MaxPooling(neighborhood=(2, 2), stride=(2, 2)) 89 | batch_size = 2 90 | input_tensor = np.array(range(np.prod(input_shape) * batch_size), dtype=np.float) 91 | input_tensor = input_tensor.reshape(batch_size, *input_shape) 92 | 93 | result = pool.forward(input_tensor) 94 | expected_result = np.array([[[[ 5., 7.],[13., 15.]]],[[[21., 23.],[29., 31.]]]]).T 95 | self.assertEqual(np.abs(np.sum(result - expected_result)), 0) 96 | 97 | if __name__ == "__main__": 98 | pass -------------------------------------------------------------------------------- /Tests/TestSoftMaxCrossEntropyLoss.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import Helpers 4 | 5 | class TestSoftMaxCrossEntropyLoss(unittest.TestCase): 6 | 7 | def setUp(self): 8 | self.batch_size = 9 9 | self.categories = 4 10 | self.label_tensor = np.zeros([self.batch_size, self.categories]) 11 | for i in range(self.batch_size): 12 | self.label_tensor[i, np.random.randint(0, self.categories)] = 1 13 | 14 | def test_forward_zero_loss(self): 15 | input_tensor = self.label_tensor * 100. 16 | layer = self.SoftMaxCrossEntropyLoss() 17 | loss = layer.forward(input_tensor, self.label_tensor) 18 | 19 | self.assertLess(loss, 1e-10) 20 | 21 | def test_backward_zero_loss(self): 22 | input_tensor = self.label_tensor * 100. 23 | layer = self.SoftMaxCrossEntropyLoss() 24 | layer.forward(input_tensor, self.label_tensor) 25 | error = layer.backward(self.label_tensor) 26 | 27 | self.assertAlmostEqual(np.sum(error), 0) 28 | 29 | def test_regression_high_loss(self): 30 | input_tensor = self.label_tensor - 1. 31 | input_tensor *= -100. 32 | layer = self.SoftMaxCrossEntropyLoss() 33 | loss = layer.forward(input_tensor, self.label_tensor) 34 | 35 | # test a specific value here 36 | self.assertAlmostEqual(float(loss), 909.8875105980) 37 | 38 | def test_regression_backward_high_loss(self): 39 | input_tensor = self.label_tensor - 1. 40 | input_tensor *= -100. 
41 | layer = self.SoftMaxCrossEntropyLoss() 42 | layer.forward(input_tensor, self.label_tensor) 43 | error = layer.backward(self.label_tensor) 44 | 45 | # test if every wrong class confidence is decreased 46 | for element in error[self.label_tensor == 0]: 47 | self.assertGreaterEqual(element, 1 / 3) 48 | 49 | # test if every correct class confidence is increased 50 | for element in error[self.label_tensor == 1]: 51 | self.assertAlmostEqual(element, -1) 52 | 53 | def test_regression_forward(self): 54 | np.random.seed(1337) 55 | input_tensor = np.abs(np.random.random(self.label_tensor.shape)) 56 | layer = self.SoftMaxCrossEntropyLoss() 57 | loss = layer.forward(input_tensor, self.label_tensor) 58 | 59 | # just see if it's bigger then zero 60 | self.assertGreater(float(loss), 0.) 61 | 62 | def test_regression_backward(self): 63 | input_tensor = np.abs(np.random.random(self.label_tensor.shape)) 64 | layer = self.SoftMaxCrossEntropyLoss() 65 | layer.forward(input_tensor, self.label_tensor) 66 | error = layer.backward(self.label_tensor) 67 | 68 | # test if every wrong class confidence is decreased 69 | for element in error[self.label_tensor == 0]: 70 | self.assertGreaterEqual(element, 0) 71 | 72 | # test if every correct class confidence is increased 73 | for element in error[self.label_tensor == 1]: 74 | self.assertLessEqual(element, 0) 75 | 76 | def test_gradient(self): 77 | input_tensor = np.abs(np.random.random(self.label_tensor.shape)) 78 | layer = self.SoftMaxCrossEntropyLoss() 79 | difference = Helpers.gradient_check([layer], input_tensor, self.label_tensor) 80 | self.assertLessEqual(np.sum(difference), 1e-5) 81 | 82 | def test_predict(self): 83 | input_tensor = np.arange(self.categories * self.batch_size) 84 | input_tensor = input_tensor / 100. 85 | input_tensor = input_tensor.reshape((self.batch_size, self.categories)) 86 | layer = self.SoftMaxCrossEntropyLoss() 87 | prediction = layer.predict(input_tensor) 88 | expected_values = [[0.24626259, 0.24873757, 0.25123743, 0.25376241], 89 | [0.24626259, 0.24873757, 0.25123743, 0.25376241], 90 | [0.24626259, 0.24873757, 0.25123743, 0.25376241], 91 | [0.24626259, 0.24873757, 0.25123743, 0.25376241], 92 | [0.24626259, 0.24873757, 0.25123743, 0.25376241], 93 | [0.24626259, 0.24873757, 0.25123743, 0.25376241], 94 | [0.24626259, 0.24873757, 0.25123743, 0.25376241], 95 | [0.24626259, 0.24873757, 0.25123743, 0.25376241], 96 | [0.24626259, 0.24873757, 0.25123743, 0.25376241]] 97 | np.testing.assert_almost_equal(prediction, expected_values) 98 | 99 | if __name__ == "__main__": 100 | pass -------------------------------------------------------------------------------- /img/ann.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/ann.png -------------------------------------------------------------------------------- /img/conv_back_weights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/conv_back_weights.png -------------------------------------------------------------------------------- /img/conv_forward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/conv_forward.png 
-------------------------------------------------------------------------------- /img/fcn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/fcn.png -------------------------------------------------------------------------------- /img/lenet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/lenet.jpg -------------------------------------------------------------------------------- /img/numerical_maxpooling.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/numerical_maxpooling.gif -------------------------------------------------------------------------------- /img/padding_strides.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/padding_strides.gif -------------------------------------------------------------------------------- /img/pooling.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/pooling.gif -------------------------------------------------------------------------------- /img/restacking_filters.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/restacking_filters.gif -------------------------------------------------------------------------------- /img/same_padding_no_strides.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kbreininger/tutorial-dlframework/83df0a128cdc7a365396d07acbe9f38eaf9b6626/img/same_padding_no_strides.gif -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. 
The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. 
Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 
152 | 153 | a. License grant. 154 | 155 | 1. Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. 
To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. 
if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 
386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. To the extent possible, if any provision of this Public License is 403 | deemed unenforceable, it shall be automatically reformed to the 404 | minimum extent necessary to make it enforceable. If the provision 405 | cannot be reformed, it shall be severed from this Public License 406 | without affecting the enforceability of the remaining terms and 407 | conditions. 408 | 409 | c. No term or condition of this Public License will be waived and no 410 | failure to comply consented to unless expressly agreed to by the 411 | Licensor. 412 | 413 | d. Nothing in this Public License constitutes or may be interpreted 414 | as a limitation upon, or waiver of, any privileges and immunities 415 | that apply to the Licensor or You, including from the legal 416 | processes of any jurisdiction or authority. 417 | 418 | ======================================================================= 419 | 420 | Creative Commons is not a party to its public 421 | licenses. Notwithstanding, Creative Commons may elect to apply one of 422 | its public licenses to material it publishes and in those instances 423 | will be considered the “Licensor.” The text of the Creative Commons 424 | public licenses is dedicated to the public domain under the CC0 Public 425 | Domain Dedication. Except for the limited purpose of indicating that 426 | material is shared under a Creative Commons public license or as 427 | otherwise permitted by the Creative Commons policies published at 428 | creativecommons.org/policies, Creative Commons does not authorize the 429 | use of the trademark "Creative Commons" or any other trademark or logo 430 | of Creative Commons without its prior written consent including, 431 | without limitation, in connection with any unauthorized modifications 432 | to any of its public licenses or any other arrangements, 433 | understandings, or agreements concerning use of licensed material. For 434 | the avoidance of doubt, this paragraph does not form part of the 435 | public licenses. 436 | 437 | Creative Commons may be contacted at creativecommons.org. 438 | 439 | -------------------------------------------------------------------------------- /src/base.py: -------------------------------------------------------------------------------- 1 | def enum(*sequential, **named): 2 | # Enum definition for backcompatibility 3 | enums = dict(zip(sequential, range(len(sequential))), **named) 4 | return type('Enum', (), enums) 5 | 6 | # Enum to encode the which phase a layer is in at the moment. 7 | Phase = enum('train', 'test', 'validation') 8 | 9 | class BaseLayer: 10 | 11 | def __init__(self): 12 | self.phase = Phase.train 13 | 14 | def forward(self, x): 15 | """ Return the result of the forward pass of this layer. Save intermediate results 16 | necessary to compute the gradients in the backward pass. 
17 | """ 18 | raise NotImplementedError('Base class - method is not implemented') 19 | 20 | def backward(self, error): 21 | """ Update the parameters/weights of this layer (if applicable), 22 | and return the gradient with respect to the input. 23 | """ 24 | raise NotImplementedError('Base class - method is not implemented') -------------------------------------------------------------------------------- /src/layers/activation_functions.py: -------------------------------------------------------------------------------- 1 | class Sigmoid(BaseLayer): 2 | 3 | def forward(self, x): 4 | """ Return the element-wise sigmoid of the input. 5 | param: x (np.ndarray): input to the activation function, of arbitrary shape 6 | returns (np.ndarray): element-wise sigmoid(x), of the same shape as x 7 | """ 8 | # TODO: Implement forward pass of the Sigmoid 9 | pass 10 | 11 | def backward(self, error): 12 | """ Return the gradient with respect to the input. 13 | param: error (np.ndarray): the gradient passed down from the subsequent layer, of the same 14 | shape as x in the forward pass 15 | returns (np.ndarray): the gradient with respect to the previous layer, of the same shape as error 16 | """ 17 | # TODO: Implement backward pass of the Sigmoid 18 | pass 19 | 20 | 21 | class ReLU(BaseLayer): 22 | 23 | def forward(self, x): 24 | """ Return the result of a ReLU activation of the input. 25 | param: x (np.ndarray): input to the activation function, of arbitrary shape 26 | returns (np.ndarray): element-wise ReLU(x), of the same shape as x 27 | """ 28 | # TODO: Implement forward pass of the ReLU 29 | pass 30 | 31 | def backward(self, error): 32 | """ Return the gradient with respect to the input. 33 | param: error (np.ndarray): the gradient passed down from the previous layer, arbitrary shape (same as x) 34 | returns (np.ndarray): gradient with respect to the input, of the same shape as error 35 | """ 36 | # TODO: Implement backward pass of the ReLU 37 | pass -------------------------------------------------------------------------------- /src/layers/batch_normalization.py: -------------------------------------------------------------------------------- 1 | class BatchNorm(BaseLayer): 2 | def __init__(self, learning_rate, convolutional=False): 3 | """ Batch normalization layer. 4 | param: learning_rate (float): the learning rate of this layer 5 | param: convolutional(boolean): if true, only a scalar mean and a scalar variance is 6 | calculated for every channel, otherwise mean and variance have the same dimension 7 | as the input 8 | """ 9 | # TODO: Implement initialization 10 | pass 11 | 12 | def forward(self, x): 13 | """ Return the batch normalized input. 14 | param: x(np.ndarray): input, of arbitrary shape 15 | returns (np.ndarray): result of batch normalization, of the same shape as x 16 | """ 17 | # TODO: Implement forward pass of the batch normalization layer 18 | 19 | # Hint 1: Make sure to treat training and test phase accordingly. 20 | # Hint 2: If the network has never seen any training data, but is applied in "test mode", the network 21 | # should not change the distribution of the input. Initialize the respective variable after the 22 | # first training input is received. 23 | pass 24 | 25 | def backward(self, error): 26 | """ Return the gradient with respect to the previous layer. 27 | param: error(np.ndarray): error passed down from the subsequent layer, of the same shape as the input 28 | in the forward pass 29 | returns (np.ndarray): gradient w.r.t. 
the input, of the same shape as error 30 | """ 31 | # TODO: Implement backward pass of the batch normalization layer 32 | pass 33 | 34 | def get_gradient_weights(self): 35 | """ Returns the gradient with respect to the weights, i.e. \gamma, from the last call of backward() """ 36 | # TODO: Implement 37 | pass 38 | 39 | def get_gradient_bias(self): 40 | """ Returns the gradient with respect to the bias, i.e. \beta, from the last call of backward() """ 41 | # TODO: Implement 42 | pass 43 | -------------------------------------------------------------------------------- /src/layers/conv.py: -------------------------------------------------------------------------------- 1 | import Initializers 2 | 3 | class FlattenLayer(BaseLayer): 4 | def __init__(self): 5 | # TODO: define the necessary class variables 6 | pass 7 | 8 | def forward(self, x): 9 | """ Return a flattened version of the input. 10 | param: x (np.ndarray): input, of shape [b, n_channels, p, q] where b is the batch size, 11 | n_channels is the number of channels and p x q is the image size 12 | returns (np.ndarray): a flattened representation of x of shape [b, v] 13 | where b is the batch size and v is the output size = n_channels * p * q 14 | """ 15 | # TODO: Implement flattening of the image 16 | pass 17 | 18 | def backward(self, error): 19 | """ Return the gradient with respect to the input. 20 | param: error (np.ndarray): the gradient passed down from the subsequent layer, of shape [b, m], 21 | where b is the batch size and m is the output size with m = n_channels * p * q from 22 | the forward pass 23 | returns (np.ndarray): the error with restored dimensions from the forward pass, i.e. with 24 | shape [b, n_channels, p, q] where b is the batch size, n_channels is the number of 25 | channels and p x q is the image size 26 | """ 27 | # TODO: Restore the image dimensions 28 | pass 29 | 30 | 31 | class ConvolutionalLayer(BaseLayer): 32 | 33 | def __init__(self, stride_shape, kernel_shape, n_kernels, learning_rate, weights_initializer=UniformRandom(), bias_initializer=Const(0.1)): 34 | """ 35 | param: stride: tuple in the form of (np, nq) which denote the subsampling factor of the 36 | convolution operation in the spatial dimensions 37 | param: kernel_shape: integer tuple in the form of (n_channels, m, n) where n_channels is 38 | the number of input channels and m x n is the size of the filter kernels 39 | param: n_kernels (int): number of kernels and therefore the number of output channels 40 | param: learning_rate (float): learning rate of this layer 41 | param: weights_initializer: initializer object for the filter weights 42 | param: bias_initializer: initializer object for the bias 43 | """ 44 | # TODO: define the neccesary class variables, initialize the weights and bias 45 | self.weights = ... 46 | self.bias = ... 47 | pass 48 | 49 | def forward(self, x): 50 | """ Return the result of the forward pass of the convolutional layer. 
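# --- Editor's sketch (not part of the original skeleton): one possible way to realize the
# --- batch normalization forward pass for vector-shaped input [b, n], showing the train/test
# --- split hinted at above. The helper name, the momentum of 0.9 and the epsilon are
# --- illustrative assumptions, not the values expected by the unit tests.
import numpy as np

def batchnorm_forward(x, gamma, beta, running_mean, running_var,
                      train=True, momentum=0.9, eps=1e-8):
    """Normalize x per feature, then scale with gamma and shift with beta."""
    if train:
        mean = x.mean(axis=0)                     # per-feature batch mean
        var = x.var(axis=0)                       # per-feature batch variance
        # exponential moving averages, reused at test time
        running_mean = momentum * running_mean + (1.0 - momentum) * mean
        running_var = momentum * running_var + (1.0 - momentum) * var
    else:
        mean, var = running_mean, running_var     # use the stored statistics
    x_hat = (x - mean) / np.sqrt(var + eps)       # normalized activations
    return gamma * x_hat + beta, running_mean, running_var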
51 | param: x(np.ndarray): input, of shape [b, n_channels, p, q], where b is the batch size, 52 | n_channels is the number of input channels and p x q is the image size 53 | returns (np.ndarray): result of the forward pass, of shape (b, n_kernels, p', q') 54 | where b is the batch size, n_kernels is the number of kernels in this layer and 55 | p' x q' is the output image size (which depends on the stride) 56 | """ 57 | # TODO: Implement forward pass of the convolutional layer 58 | pass 59 | 60 | def backward(self, error): 61 | """ Update the weights of this layer and return the gradient with respect to the input. 62 | param: error (np.ndarray): of shape (b, n_kernels, p', q') where b is the batch size, n_kernels 63 | is the number of kernels and p' x q' is the spacial error size (depends on the stride) 64 | returns (np.ndarray): the gradient with respect to the input, of shape (b, n_channels, p, q) 65 | where b is the batch size, n_channels is the number of input channels to this layer and 66 | p x q is the image size. 67 | """ 68 | # TODO: Implement backward pass of the convolutional layer 69 | pass 70 | 71 | def get_gradient_weights(self): 72 | """ Returns the gradient with respect to the weights from the last call of backward() """ 73 | # TODO: Implement 74 | pass 75 | 76 | def get_gradient_bias(self): 77 | """ Returns the gradient with respect to the bias from the last call of backward() """ 78 | # TODO: Implement 79 | pass 80 | 81 | def initialize(self, weights_initializer, bias_initializer): 82 | """ Initializes the weights/bias of this layer with the given initializers. 83 | param: weights_initializer: object providing a method weights_initializer.initialize(weights_shape) 84 | which will return initialized weights with the given shape 85 | param: bias_initializer: object providing a method bias_initializer.initialize(bias_shape) 86 | which will return an initialized bias with the given shape 87 | """ 88 | # TODO: Implement. To make sure that He initialization works as intended, make sure the second dimension 89 | # of weights_shape contains the number of input nodes that can be computed as n_in = n_channels * m * n 90 | # and reshape the weights to the correct shape afterwards. 91 | pass -------------------------------------------------------------------------------- /src/layers/dropout.py: -------------------------------------------------------------------------------- 1 | class DropOut(BaseLayer): 2 | 3 | def __init__(self, probability): 4 | """ DropOut Layer. 5 | param: probability: probability of each individual activation to be set to zero, in range [0, 1] 6 | """ 7 | # TODO: Implement initialization 8 | 9 | pass 10 | 11 | def forward(self, x): 12 | """ Forward pass through the layer: Set activations of the input randomly to zero. 13 | param: x (np.ndarray): input 14 | returns (np.ndarray): a new array of the same shape as x, after dropping random elements 15 | """ 16 | # TODO: Implement forward pass of the Dropout layer 17 | # Hint: Make sure to treat training and test phase accordingly. 18 | pass 19 | 20 | def backward(self, error): 21 | """ Backward pass through the layer: Return the gradient with respect to the input. 
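# --- Editor's sketch (not part of the original skeleton): a deliberately naive forward pass of a
# --- convolutional layer, to make the sliding-window arithmetic explicit. Zero ("same") padding
# --- and the helper name are assumptions for illustration; like most DL frameworks, it actually
# --- computes a cross-correlation, and no attempt is made to be fast.
import numpy as np

def conv_forward_naive(x, weights, bias, stride=(1, 1)):
    """x: [b, c, p, q], weights: [k, c, m, n], bias: [k] -> output [b, k, p', q']."""
    b, c, p, q = x.shape
    k, _, m, n = weights.shape
    sp, sq = stride
    # zero-pad so that an unstrided pass preserves the spatial size
    pad_p, pad_q = m // 2, n // 2
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad_p, pad_p), (pad_q, pad_q)), mode='constant')
    out_p = int(np.ceil(p / sp))
    out_q = int(np.ceil(q / sq))
    out = np.zeros((b, k, out_p, out_q))
    for bi in range(b):
        for ki in range(k):
            for i in range(out_p):
                for j in range(out_q):
                    patch = x_pad[bi, :, i * sp:i * sp + m, j * sq:j * sq + n]
                    out[bi, ki, i, j] = np.sum(patch * weights[ki]) + bias[ki]
    return out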
22 | param: error (np.ndarray): error passed down from the subsequent layer, of the same shape as the 23 | output of the forward pass 24 | returns (np.ndarray): gradient with respect to the input, of the same shape as error 25 | """ 26 | # TODO: Implement backward pass of the Dropout layer 27 | pass -------------------------------------------------------------------------------- /src/layers/fully_connected.py: -------------------------------------------------------------------------------- 1 | class FullyConnectedLayer(BaseLayer): 2 | def __init__(self, input_size, output_size, learning_rate): 3 | """ A fully connected layer. 4 | param: input_size (int): dimension n of the input vector 5 | param: output_size (int): dimension m of the output vector 6 | param: learning_rate (float): the learning rate of this layer 7 | """ 8 | # TODO: define the neccesary class variables 9 | pass 10 | 11 | def forward(self, x): 12 | """ Compute the foward pass through the layer. 13 | param: x (np.ndarray): input with shape [b, n] where b is the batch size and n is the input size 14 | returns (np.ndarray): result of the forward pass, of shape [b, m] where b is the batch size and 15 | m is the output size 16 | """ 17 | # TODO: Implement forward pass of the fully connected layer 18 | # Hint: Think about what you need to store during the forward pass to be able to compute 19 | # the gradients in the backward pass 20 | pass 21 | 22 | def get_gradient_weights(self): 23 | """ 24 | returns (np.ndarray): the gradient with respect to the weights and biases from the last call of backward(...) 25 | """ 26 | # TODO: Implement 27 | pass 28 | 29 | def backward(self, error): 30 | """ Update the weights of this layer and return the gradient with respect to the previous layer. 31 | param: error (np.ndarray): of shape [b, m] where b is the batch size and m is the output size 32 | returns (np.ndarray): the gradient w.r.t. the previous layer, of shape [b, n] where b is the 33 | batch size and n is the input size 34 | """ 35 | # TODO: Implement backward pass of the fully connected layer 36 | # Hint: Be careful about the order of applying the update to the weights and the calculation of 37 | # the error with respect to the previous layer. 38 | pass 39 | 40 | def initialize(self, weights_initializer, bias_initializer): 41 | """ Initializes the weights/bias of this layer with the given initializers. 42 | param: weights_initializer: object providing a method weights_initializer.initialize(weights_shape) 43 | which will return initialized weights with the given shape 44 | param: bias_initializer: object providing a method bias_initializer.initialize(bias_shape) 45 | which will return an initialized bias with the given shape 46 | """ 47 | # TODO: Implement 48 | pass -------------------------------------------------------------------------------- /src/layers/initializers.py: -------------------------------------------------------------------------------- 1 | class Initializer: 2 | """ Base class for initializers. """ 3 | def initialize(self, weight_shape): 4 | """ Return weights initialized according to the subclass definition. 5 | Required to work for arbitrary weight shapes. 6 | Base class. 7 | """ 8 | 9 | # Raises an exeption in base class. 10 | raise NotImplementedError('Method is not implemented') 11 | 12 | 13 | class Const(Initializer): 14 | 15 | def __init__(self, value): 16 | """ Create a constant initializer. 
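# --- Editor's sketch (not part of the original skeleton): "inverted" dropout, one common way to
# --- realize the layer above. Activations are dropped with the given probability during training
# --- and the survivors are rescaled, so no rescaling is needed at test time. This convention and
# --- the helper names are illustrative assumptions, not necessarily what the unit tests expect.
import numpy as np

def dropout_forward(x, probability, train=True):
    """Return (output, mask); the mask is needed again in the backward pass."""
    if not train or probability == 0.0:
        return x, np.ones_like(x)
    keep = 1.0 - probability
    mask = (np.random.rand(*x.shape) < keep) / keep   # scaled binary mask
    return x * mask, mask

def dropout_backward(error, mask):
    """Gradient flows only through the units that were kept."""
    return error * mask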
17 | params: value (float): constant that is used for initialization of weights 18 | """ 19 | # TODO: Implement 20 | pass 21 | 22 | def initialize(self, weight_shape): 23 | """ Return a new array of weights initialized with a constant value provided by self.value. 24 | param: weight_shape: shape of the new array 25 | returns (np.ndarray): array of the given shape 26 | """ 27 | # TODO: Implement 28 | pass 29 | 30 | class UniformRandom(Initializer): 31 | 32 | def initialize(self, weight_shape): 33 | """ Return a new array of weights initialized by drawing from a uniform distribution with range [0, 1]. 34 | param: weight_shape: shape of new array 35 | returns (np.ndarray): array of the given shape 36 | """ 37 | # TODO: Implement 38 | pass 39 | 40 | 41 | class He(Initializer): 42 | 43 | def initialize(self, weight_shape): 44 | """ Return a new array of weights initialized according to He et al.: Delving Deep into Rectifiers. 45 | param: weight_shape: shape of the np.array to be returned, the second dimension is assumed to be the 46 | number of input nodes 47 | returns (np.ndarray): array of the given shape 48 | """ 49 | # TODO: Implement 50 | pass 51 | -------------------------------------------------------------------------------- /src/layers/pooling.py: -------------------------------------------------------------------------------- 1 | class MaxPoolLayer(BaseLayer): 2 | 3 | def __init__(self, neighborhood=(2, 2), stride=(2, 2)): 4 | """ Max pooling layer. 5 | param: neighborhood: tuple with shape (sp, sq) which denote the kernel size of the pooling operation in 6 | the spatial dimensions 7 | param: stride: tuple with shape (np, nq) which denote the subsampling factor of the pooling operation in 8 | the spacial dimensions 9 | """ 10 | # TODO: define necessary class variables 11 | pass 12 | 13 | def forward(self, x): 14 | """ Return the result of maxpooling on the input. 15 | param: x (np.ndarray) with shape (b, n_channels, p, q) where b is the batch size, 16 | n_channels is the number of input channels and p x q is the image size 17 | returns (np.ndarray): the result of max pooling, of shape (b, n_channels, p', q') 18 | where b is the batch size, n_channels is the number of input channels and 19 | p' x q' is the new image size reduced by the stride. 20 | """ 21 | # TODO: Implement forward pass of max pooling 22 | pass 23 | 24 | def backward(self, error): 25 | """ Return the gradient with respect to the previous layer. 26 | param: error(np.ndarray): the gradient passed own from the subsequent layer, 27 | of shape [b, n_channels, p', q'] where b is the batch size, n_channels is the 28 | number of channels and p' x q' is the image size reduced by the stride 29 | returns (np.ndarray): the gradient w.r.t. the previous layer, of shape [b, n_channels, p, q] 30 | where b is the batch size, n_channels is the number of input channels to this layer and 31 | p x q is the image size prior to downsampling. 32 | """ 33 | # TODO: Implement backward pass of max pooling 34 | pass -------------------------------------------------------------------------------- /src/layers/softmax_crossentropy.py: -------------------------------------------------------------------------------- 1 | class SoftMaxCrossEntropyLoss(BaseLayer): 2 | 3 | def forward(self, x, labels): 4 | """ Return the cross entropy loss of the input and the labels after applying the softmax to the input. 
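# --- Editor's sketch (not part of the original skeleton): max pooling with explicit bookkeeping
# --- of the argmax positions, which is exactly the information needed to route the error back in
# --- the backward pass. Non-overlapping pooling and the helper names are illustrative assumptions.
import numpy as np

def maxpool_forward(x, neighborhood=(2, 2), stride=(2, 2)):
    """x: [b, c, p, q] -> (pooled output [b, c, p', q'], argmax positions)."""
    b, c, p, q = x.shape
    kp, kq = neighborhood
    stp, stq = stride
    out_p = (p - kp) // stp + 1
    out_q = (q - kq) // stq + 1
    out = np.zeros((b, c, out_p, out_q))
    argmax = np.zeros((b, c, out_p, out_q, 2), dtype=int)   # row/col of each maximum
    for i in range(out_p):
        for j in range(out_q):
            window = x[:, :, i * stp:i * stp + kp, j * stq:j * stq + kq]
            flat = window.reshape(b, c, -1)
            idx = flat.argmax(axis=2)
            out[:, :, i, j] = flat.max(axis=2)
            argmax[:, :, i, j, 0] = i * stp + idx // kq
            argmax[:, :, i, j, 1] = j * stq + idx % kq
    return out, argmax

def maxpool_backward(error, argmax, input_shape):
    """Scatter each error value back to the position of its maximum."""
    grad = np.zeros(input_shape)
    b, c, out_p, out_q = error.shape
    for bi in range(b):
        for ci in range(c):
            for i in range(out_p):
                for j in range(out_q):
                    r, s = argmax[bi, ci, i, j]
                    grad[bi, ci, r, s] += error[bi, ci, i, j]
    return grad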
5 | param: x (np.ndarray): input, of shape [b, k] where b is the batch size and k is the input size 6 | param: labels (np.ndarray): the corresponding labels of the training set in one-hot encoding for 7 | the current input, of the same shape as x 8 | returns (float): the loss of the current prediction and the label 9 | """ 10 | # Todo: Implement forward pass 11 | pass 12 | 13 | def backward(self, labels): 14 | """ Return the gradient of the SoftMaxCrossEntropy loss with respect to the previous layer. 15 | param: labels (np.ndarray): (again) the corresponding labels of the training set for the current input, 16 | of shape [b, k] where b is the batch size and k is the input size 17 | returns (np.ndarray): the error w.r.t. the previous layer, of shape [b, k] where b is the batch 18 | size and n is the input size 19 | """ 20 | # TODO: Implement backward pass 21 | pass 22 | 23 | def predict(self, x): 24 | """ Return the softmax of the input. This can be interpreted as probabilistic prediction of the class. 25 | param: x (np.ndarray): input with shape [b, k], where b is the batch size and n is the input size 26 | returns (np.ndarray): the result softmax(x), of the same shape as x 27 | """ 28 | # TODO: Implement softmax 29 | pass -------------------------------------------------------------------------------- /src/network.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from src.base import Phase 3 | # Nothing to do in this cell: Just make yourself familiar with the NeuralNetwork class. 4 | 5 | 6 | class NeuralNetwork: 7 | def __init__(self, weights_initializer, bias_initializer): 8 | # list which will contain the loss after training 9 | self.loss = [] 10 | self.data_layer = None # the layer providing data 11 | self.loss_layer = None # the layer calculating the loss and the prediction 12 | self.layers = [] 13 | self.weights_initializer = weights_initializer 14 | self.bias_initializer = bias_initializer 15 | self.label_tensor = None # the labels of the current iteration 16 | 17 | def append_fixed_layer(self, layer): 18 | """ Add a non-trainable layer to the network. """ 19 | self.layers.append(layer) 20 | 21 | def append_trainable_layer(self, layer): 22 | """ Add a new layer with trainable parameters to the network. Initialize the parameters of 23 | the network using the object's initializers for weights and bias. 24 | """ 25 | layer.initialize(self.weights_initializer, self.bias_initializer) 26 | self.layers.append(layer) 27 | 28 | def forward(self): 29 | """ Compute the forward pass through the network. """ 30 | # fetch some training data 31 | input_tensor, self.label_tensor = self.data_layer.forward() 32 | # defer iterating through the network 33 | activation_tensor = self.__forward_input(input_tensor) 34 | # calculate the loss of the network using the final loss layer 35 | return self.loss_layer.forward(activation_tensor, self.label_tensor) 36 | 37 | def __forward_input(self, input_tensor): 38 | """ Compute the forward pass through the network, stopping before the 39 | loss layer. 40 | param: input_tensor (np.ndarray): input to the network 41 | returns: activation of the last "regular" layer 42 | """ 43 | activation_tensor = input_tensor 44 | # pass the input up the network 45 | for layer in self.layers: 46 | activation_tensor = layer.forward(activation_tensor) 47 | # return the activation of the last layer 48 | return activation_tensor 49 | 50 | def backward(self): 51 | """ Perform the backward pass during training. 
""" 52 | error_tensor = self.loss_layer.backward(self.label_tensor) 53 | # pass back the error recursively 54 | for layer in reversed(self.layers): 55 | error_tensor = layer.backward(error_tensor) 56 | 57 | def train(self, iterations): 58 | """ Train the network for a fixed number of steps. 59 | param: iterations (int): number of iterations for training 60 | """ 61 | for layer in self.layers: 62 | layer.phase = Phase.train # Make sure phase is set to "train" for all layers 63 | for i in range(iterations): 64 | loss = self.forward() # go up the network 65 | self.loss.append(loss) # save the loss 66 | self.backward() # and down again 67 | print('.', end='') 68 | 69 | 70 | def test(self, input_tensor): 71 | """ Apply the (trained) network to input data to generate a prediction. 72 | param: input_tensor (nd.nparray): input (image or vector) 73 | returns (np.ndarray): prediction by the network 74 | """ 75 | for layer in self.layers: 76 | layer.phase = Phase.test # Make sure phase is set to "test" for all layers 77 | activation_tensor = self.__forward_input(input_tensor) 78 | return self.loss_layer.predict(activation_tensor) -------------------------------------------------------------------------------- /tutorial_dl.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": false, 7 | "editable": false 8 | }, 9 | "source": [ 10 | "# Tutorial: How to Build a Deep Learning Framework\n", 11 | "\n", 12 | "by Katharina Breininger and Tobias Würfl\n", 13 | "\n", 14 | "Pattern Recognition Lab, Friedrich-Alexander-University Erlangen-Nürnberg, Erlangen, Germany " 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "deletable": false, 21 | "editable": false 22 | }, 23 | "source": [ 24 | "## Introduction\n", 25 | "\n", 26 | "Neural networks, especially convolutional neural networks (CNNs), have had an incredible impact on research in medical imaging and medical signal processing in recent years. Frameworks like TensorFlow, Caffe and PyTorch make it easy to implement network architectures to carry out experiments by simply stacking together operators. This has helped to speed up research immensely - it is easy to try out new ideas and translate insights from other fields. BUT: Never having to understand the technical details of the frameworks and operators deprives researchers of one avenue to potential innovation in the field. Improvements like trainable region-proposals and depth-wise separable convolutions are easier to come up with a thorough understanding the details of the machinery. Many essential advances in DL, such as the ReLU, batch normalization and better initialization strategies, have originated in understanding and improving drawbacks of building blocks in neural networks.\n", 27 | "\n", 28 | "With this tutorial, we aim to support you in understanding what's going in neural network frameworks in detail, and teach you how the most common operators work during inference in a network and how they are adapted by training. This will enable you to implement a broader range of ideas, relying on innovative new operators embedded into neural networks. 
\n", 29 | "\n", 30 | "As prerequisites we expect some conceptual knowledge about neural networks as acquired in offline or online courses, like the [Stanford DL course](http://cs231n.stanford.edu/), or our course ([DL_course_videos](https://www.video.uni-erlangen.de/clip/id/8947)), as well as basic Python/NumPy programming experience.\n", 31 | "\n", 32 | "- How it works:\n", 33 | " - We provide the necessary math and code skeletons of building blocks\n", 34 | " - You translate this math into code\n", 35 | " - Our unit-tests will give you feedback on the correctness of your implementation\n", 36 | " - At the end, we will put these building blocks together to a working network\n", 37 | "- What we don't do:\n", 38 | " - Teach you Python programming\n", 39 | " - Teach you about the fundamentals of machine learning\n", 40 | " - Give a thorough introduction into the subject of deep learning\n", 41 | " - Implement a framework with a focus on performance and efficiency\n", 42 | "- Elements in this tutorial\n", 43 | " - Implementing a multilayer perceptron framework\n", 44 | " - Extending this framework with state-of-the-art initialization\n", 45 | " - Adding the basic operators of CNNs\n", 46 | " - Including some operators for regularization to the framework\n", 47 | " \n", 48 | "If you have feedback or suggestions for improvement, please contact us at katharina.breininger@fau.de and tobias.wuerfl@fau.de. The most recent version can be found at https://github.com/kbreininger/tutorial-dlframework.\n", 49 | "\n", 50 | "Have fun!" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# minor set-up work\n", 62 | "import numpy # we will definitely need this\n", 63 | "\n", 64 | "# automatic reloading\n", 65 | "%load_ext autoreload\n", 66 | "%autoreload 2\n", 67 | "\n", 68 | "%matplotlib inline" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "deletable": false, 75 | "editable": false 76 | }, 77 | "source": [ 78 | "## The General Idea of the Framework\n", 79 | "\n", 80 | "\n", 81 | "Almost all tasks in this tutorial will revolve around implementing \"layers\". All layers are derived from the base class defined in the next cell. Each layer needs to implement the methods ```forward``` and ```backward```. We will use the term \"layer\" to represent any operator in the network that can be considered as a \"unit\" during forward and backward pass, e.g., a \"fully connected layer\", an \"activation layer\" or a \"loss layer\". \n", 82 | "\n", 83 | "In ```forward(x)```, the forward pass of the layer is computed by applying the respective operation to the input ```x```. Furthermore, intermediate results necessary to compute the gradients in the backward pass have to be stored. \n", 84 | "In ```backward(error)```, the layer receives the error passed down from the subsequent layer, updates its parameters accordingly and returns the error with respect to its input.\n", 85 | "\n", 86 | "This way, a simple network for classification can be expressed by a list of layer objects. Given an initial input ```x``` and a corresponding ```label```, the forward pass through the network is computed by subsequently calling ```forward``` for each layer in the list. The respective output is passed as input to the next layer. The very last layer, the \"loss\" layer, additionally receives the label to compute the loss. 
To adapt the weights in each layer, we then go backwards through the list, calling ```backward```, backpropagating the error through the network. The network is trained by alternating the forward and backward pass through the network while iterating through the training data.\n", 87 | "\n", 88 | "During test-time, only the forward pass through the network is computed to generate a prediction.\n", 89 | "\n", 90 | "### Basic notation and terminology\n", 91 | "\n", 92 | "We will work with the following notation and terminology:\n", 93 | "\n", 94 | "- $\\mathbf{X}$ and $\\mathbf{x}$ represent the input, \n", 95 | "- $\\mathbf{W}$ and $\\mathbf{w}$ the trainable weights/parameters and\n", 96 | "- $\\mathbf{Y}$ and $\\mathbf{y}$ the output of a layer.\n", 97 | "- $L$ represents the loss. Accordingly,\n", 98 | "- $E_\\mathbf{Y} = \\frac{\\partial L}{\\partial \\mathbf{Y}}$ is the error passed down from the subsequent layer,\n", 99 | "- $E_\\mathbf{W} = \\frac{\\partial L}{\\partial \\mathbf{W}}$ the error with respect to the weights and\n", 100 | "- $E_\\mathbf{X} = \\frac{\\partial L}{\\partial \\mathbf{X}}$ is the error with respect to the input.\n", 101 | "\n", 102 | "Note that $x$ and $y$ always have \"local\" meaning, i.e., with respect to the __current__ layer. The $y$ of the previous layer is the $x$ to the next, and vice versa.\n", 103 | "\n", 104 | "\n", 105 | "Have a look at the class definitions below and make yourself familiar with the concepts before continuing with the next part of the tutorial, the fully connected layer." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": true, 113 | "deletable": false, 114 | "editable": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "# %load src/base.py\n", 119 | "def enum(*sequential, **named):\n", 120 | " # Enum definition for backcompatibility\n", 121 | " enums = dict(zip(sequential, range(len(sequential))), **named)\n", 122 | " return type('Enum', (), enums)\n", 123 | "\n", 124 | "# Enum to encode the which phase a layer is in at the moment.\n", 125 | "Phase = enum('train', 'test', 'validation')\n", 126 | "\n", 127 | "class BaseLayer:\n", 128 | " \n", 129 | " def __init__(self):\n", 130 | " self.phase = Phase.train\n", 131 | " \n", 132 | " def forward(self, x):\n", 133 | " \"\"\" Return the result of the forward pass of this layer. Save intermediate results\n", 134 | " necessary to compute the gradients in the backward pass. \n", 135 | " \"\"\"\n", 136 | " raise NotImplementedError('Base class - method is not implemented')\n", 137 | " \n", 138 | " def backward(self, error):\n", 139 | " \"\"\" Update the parameters/weights of this layer (if applicable), \n", 140 | " and return the gradient with respect to the input.\n", 141 | " \"\"\"\n", 142 | " raise NotImplementedError('Base class - method is not implemented')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": { 148 | "deletable": false, 149 | "editable": false 150 | }, 151 | "source": [ 152 | "## Fully Connected Layers\n", 153 | "\n", 154 | "Fully connected (FC) layers are the essential building blocks in (multi-layer) perceptrons. 
Inspired by biological neurons, they are able to represent any connection topology between two layers (without same-layer connections).\n", 155 | "\n", 156 | "\n", 157 | "\n", 158 | "Let's have a look at the forward pass: Given an input vector $\\mathbf{x} \\in \\mathbb{R}^{n}$ to an FC layer, the output $y$ of a single neuron can be described as a weighted sum of the input values plus a bias:\n", 159 | "\\begin{equation}\n", 160 | "y = w_{n+1} + \\sum_{j=1}^n w_j x_j ,\n", 161 | "\\end{equation}\n", 162 | "\n", 163 | "where we collect the weights in a vector $\\mathbf{w} \\in \\mathbb{R}^{n + 1}$.\n", 164 | "\n", 165 | "This is simply a vector-vector multiplication: \n", 166 | "\n", 167 | "\\begin{equation}\n", 168 | "y = \\begin{pmatrix} \n", 169 | " w_{1}&\\dots&w_{n}&w_{n+1} \\end{pmatrix}\n", 170 | "\\begin{pmatrix} \n", 171 | " x_{1} \\\\ \n", 172 | " \\vdots \\\\\n", 173 | " x_{n} \\\\\n", 174 | " 1\n", 175 | "\\end{pmatrix}\n", 176 | "\\end{equation}\n", 177 | "\n", 178 | "By extending $\\mathbf{x}$ with an additional \"1\", we can include the bias directly in the multiplication. \n", 179 | "\n", 180 | "\n", 181 | "Since we want to have a layer able to generate multiple outputs, we need multiple neurons:\n", 182 | "\n", 183 | "\n", 184 | "\n", 185 | "To achieve this, we extend the weight vector to a matrix to allow for an output vector $\\mathbf{y} \\in \\mathbb{R}^{m}$:\n", 186 | "\n", 187 | "\\begin{align}\n", 188 | "\\begin{pmatrix} \n", 189 | "y_1 \\\\ \n", 190 | "\\vdots \\\\\n", 191 | "y_m\n", 192 | "\\end{pmatrix} &=\n", 193 | "\\begin{pmatrix} \n", 194 | "w_{1,1} & \\dots & w_{n,1} & w_{n+1,1} \\\\\n", 195 | "\\vdots & \\ddots & \\vdots & \\vdots \\\\%\n", 196 | "w_{1,m} & \\dots & w_{n,m} & w_{n+1,m}\n", 197 | "\\end{pmatrix}\n", 198 | "\\begin{pmatrix} \n", 199 | "x_1 \\\\ \n", 200 | "\\vdots \\\\\n", 201 | "x_n\t \\\\\n", 202 | "1\n", 203 | "\\end{pmatrix}\\\\\n", 204 | "\\mathbf{y} &= \\mathbf{W}\\mathbf{x} \n", 205 | "\\end{align}\n", 206 | "\n", 207 | "For batch processing, we can accordingly stack multiple input vectors in a matrix $\\mathbf{X}$:\n", 208 | "\n", 209 | "\\begin{equation}\n", 210 | "\\mathbf{Y} = \\mathbf{W}\\mathbf{X}\n", 211 | "\\end{equation}\n", 212 | "\n", 213 | "The weight matrix represents the trainable parameters of the FC layer. To be able to update the parameters, we need the gradient of the loss with respect to these weights.\n", 214 | "Given the error with respect to the output $\\mathbf{Y}$ of the current layer $\\frac{\\partial L}{\\partial \\mathbf{Y}} = E_\\mathbf{Y}$, we can compute the gradient with respect to the weights $\\frac{\\partial L}{\\partial \\mathbf{W}} = E_\\mathbf{W}$ using backpropagation, i.e., the chain rule. 
To backpropagate the error to the previous layer (and then update the weights there), we further need to compute the error with respect to the inputs $\\frac{\\partial L}{\\partial \\mathbf{X}} = E_\\mathbf{X}$.\n", 215 | "\n", 216 | "Using the formula of the fully connected layer $\\mathbf{Y} = \\mathbf{W}\\mathbf{X}$, we can compute the wanted gradients:\n", 217 | "\n", 218 | "\\begin{align}\n", 219 | "\\frac{\\partial L}{\\partial \\mathbf{W}} &= \\frac{\\partial L}{\\partial \\mathbf{Y}} \\frac{\\partial \\mathbf{Y}}{\\partial \\mathbf{W}}\\\\\n", 220 | " &= E_\\mathbf{Y} \\mathbf{X}^T\\\\\n", 221 | "\\end{align}\n", 222 | "\n", 223 | "\\begin{align}\n", 224 | "\\frac{\\partial L}{\\partial \\mathbf{X}} &= \\frac{\\partial L}{\\partial \\mathbf{Y}} \\frac{\\partial \\mathbf{Y}}{\\partial \\mathbf{X}}\\\\\n", 225 | " &= \\mathbf{W}^T E_\\mathbf{Y}\\\\\n", 226 | "\\end{align}\n", 227 | "\n", 228 | "We will use (mini-batch) stochastic gradient descent in this tutorial, so the update rule for the weights is as follows:\n", 229 | "\n", 230 | "\\begin{equation}\n", 231 | "\\mathbf{W}^{t+1} = \\mathbf{W}^{t} - \\eta E_{\\mathbf{W}^t} \\enspace{,}\n", 232 | "\\end{equation}\n", 233 | "\n", 234 | "where $\\eta$ is the learning rate and ${t}$ denotes the iteration.\n", 235 | "\n", 236 | "\n", 237 | "### Implementation task\n", 238 | "\n", 239 | "**Now it is your turn**: In the next cell, implement the methods ```init```, ```forward```, ```backward```, and ```get_gradient_weights``` and test the method by running the cell after the next. The method ```get_gradient_weights``` should return the gradient with respect to the weights and biases of the last backward pass.\n", 240 | "\n", 241 | "**Note that input and output, and accordingly the respective errors, are actually transposed compared to the formulas above**. This is due to performance reasons and consistency with known frameworks. Make sure to consider this in your implementation.\n", 242 | "\n", 243 | "Furthermore, implement the method ```initialize```. For the moment, take the initializer objects as given, we will return to them later. Just make sure to use them with the correct weight shapes to initialize weights and biases. Implement the update of these parameters as part of the backward pass." 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "# %load src/layers/fully_connected.py\n", 255 | "class FullyConnectedLayer(BaseLayer):\n", 256 | " def __init__(self, input_size, output_size, learning_rate):\n", 257 | " \"\"\" A fully connected layer.\n", 258 | " param: input_size (int): dimension n of the input vector\n", 259 | " param: output_size (int): dimension m of the output vector\n", 260 | " param: learning_rate (float): the learning rate of this layer\n", 261 | " \"\"\"\n", 262 | " # TODO: define the neccesary class variables\n", 263 | " self.weights = ... 
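# --- Editor's sketch (not part of the tutorial skeleton): one possible NumPy translation of the
# --- fully connected layer in the transposed convention described above (inputs of shape [b, n]).
# --- Storing the bias in the last weight column, the random initialization and the class name are
# --- illustrative assumptions; the skeleton's separate initialize() method is omitted here.
import numpy as np

class FullyConnectedSketch:
    def __init__(self, input_size, output_size, learning_rate):
        # weights hold the bias in the last column: shape [m, n + 1]
        self.weights = np.random.rand(output_size, input_size + 1)
        self.learning_rate = learning_rate
        self.x_homogeneous = None
        self.gradient_weights = None

    def forward(self, x):
        # append a column of ones so the bias is part of the matrix product
        b = x.shape[0]
        self.x_homogeneous = np.hstack([x, np.ones((b, 1))])
        return self.x_homogeneous @ self.weights.T            # shape [b, m]

    def backward(self, error):
        # E_W = E_Y^T X (transposed convention), E_X = E_Y W without the bias column
        self.gradient_weights = error.T @ self.x_homogeneous  # shape [m, n + 1]
        grad_input = error @ self.weights[:, :-1]             # shape [b, n], uses the old weights
        self.weights -= self.learning_rate * self.gradient_weights
        return grad_input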
#\n", 264 | " pass\n", 265 | "\n", 266 | " def forward(self, x):\n", 267 | " \"\"\" Compute the foward pass through the layer.\n", 268 | " param: x (np.ndarray): input with shape [b, n] where b is the batch size and n is the input size\n", 269 | " returns (np.ndarray): result of the forward pass, of shape [b, m] where b is the batch size and\n", 270 | " m is the output size\n", 271 | " \"\"\"\n", 272 | " # TODO: Implement forward pass of the fully connected layer\n", 273 | " # Hint: Think about what you need to store during the forward pass to be able to compute \n", 274 | " # the gradients in the backward pass \n", 275 | " pass\n", 276 | " \n", 277 | " def get_gradient_weights(self):\n", 278 | " \"\"\" \n", 279 | " returns (np.ndarray): the gradient with respect to the weights and biases from the last call of backward(...)\n", 280 | " \"\"\"\n", 281 | " # TODO: Implement \n", 282 | " pass\n", 283 | " \n", 284 | " def backward(self, error):\n", 285 | " \"\"\" Update the weights of this layer and return the gradient with respect to the previous layer.\n", 286 | " param: error (np.ndarray): of shape [b, m] where b is the batch size and m is the output size\n", 287 | " returns (np.ndarray): the gradient w.r.t. the previous layer, of shape [b, n] where b is the \n", 288 | " batch size and n is the input size\n", 289 | " \"\"\"\n", 290 | " # TODO: Implement backward pass of the fully connected layer\n", 291 | " # Hint: Be careful about the order of applying the update to the weights and the calculation of \n", 292 | " # the error with respect to the previous layer.\n", 293 | " pass\n", 294 | " \n", 295 | " def initialize(self, weights_initializer, bias_initializer):\n", 296 | " \"\"\" Initializes the weights/bias of this layer with the given initializers.\n", 297 | " param: weights_initializer: object providing a method weights_initializer.initialize(weights_shape)\n", 298 | " which will return initialized weights with the given shape\n", 299 | " param: bias_initializer: object providing a method bias_initializer.initialize(bias_shape) \n", 300 | " which will return an initialized bias with the given shape\n", 301 | " \"\"\"\n", 302 | " # TODO: Implement\n", 303 | " pass" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": true, 311 | "deletable": false, 312 | "editable": false 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "# Running the testsuite\n", 317 | "%run Tests/TestFullyConnected.py\n", 318 | "TestFullyConnected.FullyConnected = FullyConnectedLayer\n", 319 | "unittest.main(argv=['first-arg-is-ignored'], exit=False)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "deletable": false, 326 | "editable": false 327 | }, 328 | "source": [ 329 | "## Activation Functions\n", 330 | "\n", 331 | "Activation functions play an essential role in neural networks: They introduce non-linearity. In this tutorial, we are going to implement two activation functions: The sigmoid and the rectified linear unit (ReLU).\n", 332 | "\n", 333 | "### Sigmoid activation function\n", 334 | "Historically, the Sigmoid function has played a big role in the development of neural networks. 
Given the motivation of biological neurons and their all-or-nothing response, the sigmoid is an obvious choice close to a true step function: It scales the input between 0 and 1, and its gradient exists everywhere.\n", 335 | "For each element of the input, it is defined as:\n", 336 | "\\begin{equation}\n", 337 | "\\mathrm{sig}(x) = \\frac{1}{1 + e^{-x}} \\enspace{.}\n", 338 | "\\end{equation}\n", 339 | "\n", 340 | "To be able to backpropagate the error through the network, we need the gradient with respect to the input. \n", 341 | "\n", 342 | "\\begin{align}\n", 343 | "\\frac{\\partial \\mathrm{sig}(x)}{\\partial x} &= \\frac{1}{1 + e^{-x}} (1 - \\frac{1}{1 + e^{-x}}) \\\\\n", 344 | " &= \\mathrm{sig}(x) (1-\\mathrm{sig}(x)) \\enspace{.}\n", 345 | "\\end{align}\n", 346 | "\n", 347 | "### ReLU activation function\n", 348 | "\n", 349 | "While the sigmoid function is still frequently used for example in recurrent networks and as the last layer for binary segmentation/classification, it has been overtaken by the rectified linear unit (ReLU) and its variants in many other settings.\n", 350 | "The main drawback of the sigmoid function is that its gradient is close to zero everywhere apart from a small region around the origin. This can cause the so-called vanishing gradient problem, meaning that the network will learn very slowly or will stop learning completely. The ReLU is much less affected by this problem, as the output is linear for inputs $>0$:\n", 351 | "\n", 352 | "\\begin{equation}\n", 353 | "\\mathrm{relu}(x) = \n", 354 | "\\begin{cases}\n", 355 | "x \\quad \\text{if}~x > 0,\\\\\n", 356 | "0 \\quad \\text{else}.\n", 357 | "\\end{cases}\n", 358 | "\\end{equation}\n", 359 | "\n", 360 | "However, due to the kink at position 0, the function is not continuously differentiable. Instead, we need to compute subgradients in the backward pass:\n", 361 | "\n", 362 | "\\begin{equation}\n", 363 | "\\frac{\\partial \\mathrm{relu}(x)}{\\partial x} = \n", 364 | "\\begin{cases}\n", 365 | "1 \\quad \\text{if}~x > 0,\\\\\n", 366 | "0 \\quad \\text{else}.\n", 367 | "\\end{cases}\n", 368 | "\\end{equation}\n", 369 | "\n", 370 | "For both activation functions, we need to apply the chain rule to compute the result of the backward pass:\n", 371 | "\\begin{align}\n", 372 | "\\frac{\\partial L}{\\partial x} &= \\frac{\\partial L}{\\partial f(x)} \\frac{\\partial f(x)}{\\partial x} \\enspace{,}\n", 373 | "\\end{align}\n", 374 | "where $f(x)$ stands for any of the two functions.\n", 375 | "\n", 376 | "### Implementation task\n", 377 | "\n", 378 | "In the following, implement the ```Sigmoid``` and ```ReLU``` activation functions. Test your implementation by running the cell below." 
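For orientation, one possible NumPy translation of the formulas above follows as an editor's sketch: each layer stores what it needs from the forward pass and applies the chain rule in the backward pass. The class names are illustrative and this is not necessarily the form expected by the unit tests.

import numpy as np

class SigmoidSketch:
    def forward(self, x):
        self.out = 1.0 / (1.0 + np.exp(-x))    # sig(x), kept for the backward pass
        return self.out

    def backward(self, error):
        return error * self.out * (1.0 - self.out)   # chain rule with sig'(x)

class ReLUSketch:
    def forward(self, x):
        self.mask = x > 0                      # remember where the input was positive
        return x * self.mask

    def backward(self, error):
        return error * self.mask               # subgradient: pass the error only where x > 0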
379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "collapsed": true 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "# %load src/layers/activation_functions.py\n", 390 | "class Sigmoid(BaseLayer):\n", 391 | " \n", 392 | " def forward(self, x):\n", 393 | " \"\"\" Return the element-wise sigmoid of the input.\n", 394 | " param: x (np.ndarray): input to the activation function, of arbitrary shape\n", 395 | " returns (np.ndarray): element-wise sigmoid(x), of the same shape as x\n", 396 | " \"\"\"\n", 397 | " # TODO: Implement forward pass of the Sigmoid\n", 398 | " pass\n", 399 | " \n", 400 | " def backward(self, error):\n", 401 | " \"\"\" Return the gradient with respect to the input.\n", 402 | " param: error (np.ndarray): the gradient passed down from the subsequent layer, of the same \n", 403 | " shape as x in the forward pass\n", 404 | " returns (np.ndarray): the gradient with respect to the previous layer, of the same shape as error \n", 405 | " \"\"\"\n", 406 | " # TODO: Implement backward pass of the Sigmoid\n", 407 | " pass\n", 408 | " \n", 409 | "\n", 410 | "class ReLU(BaseLayer):\n", 411 | " \n", 412 | " def forward(self, x):\n", 413 | " \"\"\" Return the result of a ReLU activation of the input.\n", 414 | " param: x (np.ndarray): input to the activation function, of arbitrary shape\n", 415 | " returns (np.ndarray): element-wise ReLU(x), of the same shape as x\n", 416 | " \"\"\"\n", 417 | " # TODO: Implement forward pass of the ReLU\n", 418 | " pass\n", 419 | " \n", 420 | " def backward(self, error):\n", 421 | " \"\"\" Return the gradient with respect to the input.\n", 422 | " param: error (np.ndarray): the gradient passed down from the subsequent layer, arbitrary shape (same as x)\n", 423 | " returns (np.ndarray): gradient with respect to the input, of the same shape as error \n", 424 | " \"\"\"\n", 425 | " # TODO: Implement backward pass of the ReLU\n", 426 | " pass" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "collapsed": true, 434 | "deletable": false, 435 | "editable": false 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "%run Tests/TestActivationFunctions.py\n", 440 | "TestReLU.ReLU = ReLU\n", 441 | "TestSigmoid.Sigmoid = Sigmoid\n", 442 | "unittest.main(argv=['first-arg-is-ignored'], exit=False)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": { 448 | "deletable": false, 449 | "editable": false 450 | }, 451 | "source": [ 452 | "## Softmax and Loss Layer\n", 453 | "\n", 454 | "By combining the layers we implemented so far, we can represent a non-linear function of the input. For example, we can compute an output vector with $K$ elements to classify between $K$ classes.\n", 455 | "\n", 456 | "### Softmax\n", 457 | "The output of this computation is not further restricted. In many cases, however, it is beneficial if a prediction for the targeted classification has the properties of a probability distribution, i.e., \n", 458 | "\n", 459 | "\\begin{align}\n", 460 | "\\sum_{k=1}^{K} y_k &= 1 \\enspace{,}\\\\\n", 461 | "y_k &\\ge 0 \\quad \\forall k \\in \\{1, ..., K\\} \\enspace{.}\n", 462 | "\\end{align}\n", 463 | "\n", 464 | "This makes it easier, for example, to compare the prediction with the ground truth of the classification task.\n", 465 | "We can achieve these properties by applying the softmax function as a last activation function. 
It is defined as: \n", 466 | "\n", 467 | "\\begin{equation}\n", 468 | "\\mathrm{softmax}(x_k) = \\frac{\\mathrm{exp}(x_k)}{\\sum_{j=1}^{K}\\mathrm{exp}(x_j)} \\enspace{.}\n", 469 | "\\end{equation}\n", 470 | "\n", 471 | "However, if the activations in $\\mathbf{x}$ are high, $\\mathrm{exp}(x_k)$ can become very large. This can cause numerical instabilities. To avoid this, the activations can be shifted by the maximum value of $\\mathbf{x}$ before applying the softmax:\n", 472 | "\n", 473 | "\\begin{equation}\n", 474 | "\\mathbf{\\widetilde{x}} = \\mathbf{x} - \\mathrm{max}(\\mathbf{x}) \\enspace{.}\n", 475 | "\\end{equation}\n", 476 | "\n", 477 | "After the softmax, the predictions of the network have the properties of a probability distribution.\n", 478 | "\n", 479 | "### Loss function\n", 480 | "To adapt the parameters of the network, we need to know how \"well\" the network performs compared to a given ground truth (or label) - in other words, we need a loss function. Then, we can \"train\" the network by minimizing this loss, iteratively adapting the weights and biases using our training data.\n", 481 | "\n", 482 | "A common loss function is cross entropy. To compute it, we need the ground truth $\\mathbf{y^*}$ in \"one-hot\"-vector encoding. The ground truth is represented as a vector with $K$ elements where only the entry that corresponds to the true class is 1 and all other entries are 0:\n", 483 | "\n", 484 | "\\begin{equation}\n", 485 | "\\mathbf{y^*} = \n", 486 | "\\begin{pmatrix}\n", 487 | " 0 \\\\\n", 488 | " \\vdots\\\\\n", 489 | " 1\\\\\n", 490 | " \\vdots\\\\\n", 491 | " 0\n", 492 | "\\end{pmatrix}\n", 493 | "\\end{equation}\n", 494 | "\n", 495 | "Then, the cross entropy loss for a batch of $b$ samples is defined as:\n", 496 | "\n", 497 | "\\begin{equation}\n", 498 | "L(\\mathbf{Y^*},\\mathbf{Y}) = - \\sum_b \\sum_{k=1}^K \\ln( y_{b, k} ) y^*_{b, k}\n", 499 | "\\end{equation}\n", 500 | "\n", 501 | "### Combining both\n", 502 | "\n", 503 | "The softmax activation and the cross entropy loss are frequently combined, and sometimes called the \"SoftMax loss\". Together, their gradient has a simple and elegant form:\n", 504 | "\n", 505 | "\\begin{equation}\n", 506 | "e_k = \n", 507 | "y_k - y^*_k \\enspace{,}\n", 508 | "\\end{equation}\n", 509 | "\n", 510 | "for every element of the batch.\n", 511 | "\n", 512 | "### Implementation task\n", 513 | "\n", 514 | "Implement the softmax function and the cross entropy loss combined in the class ```SoftMaxCrossEntropyLoss```. Since the two functions are combined in ```forward```, additionally implement a function ```predict``` that computes only the softmax of the input. This function can be used at test time, when we are interested in a prediction for unseen data." 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": { 521 | "collapsed": true 522 | }, 523 | "outputs": [], 524 | "source": [ 525 | "# %load src/layers/softmax_crossentropy.py\n", 526 | "class SoftMaxCrossEntropyLoss(BaseLayer):\n", 527 | " \n", 528 | " def forward(self, x, labels):\n", 529 | " \"\"\" Return the cross entropy loss of the input and the labels after applying the softmax to the input. 
\n", 530 | " param: x (np.ndarray): input, of shape [b, k] where b is the batch size and k is the input size\n", 531 | " param: labels (np.ndarray): the corresponding labels of the training set in one-hot encoding for \n", 532 | " the current input, of the same shape as x\n", 533 | " returns (float): the loss of the current prediction and the label\n", 534 | " \"\"\"\n", 535 | " # Todo: Implement forward pass\n", 536 | " pass\n", 537 | " \n", 538 | " def backward(self, labels):\n", 539 | " \"\"\" Return the gradient of the SoftMaxCrossEntropy loss with respect to the previous layer.\n", 540 | " param: labels (np.ndarray): (again) the corresponding labels of the training set for the current input, \n", 541 | " of shape [b, k] where b is the batch size and k is the input size\n", 542 | " returns (np.ndarray): the error w.r.t. the previous layer, of shape [b, k] where b is the batch \n", 543 | " size and n is the input size\n", 544 | " \"\"\"\n", 545 | " # TODO: Implement backward pass\n", 546 | " pass\n", 547 | " \n", 548 | " def predict(self, x):\n", 549 | " \"\"\" Return the softmax of the input. This can be interpreted as probabilistic prediction of the class.\n", 550 | " param: x (np.ndarray): input with shape [b, k], where b is the batch size and n is the input size\n", 551 | " returns (np.ndarray): the result softmax(x), of the same shape as x\n", 552 | " \"\"\"\n", 553 | " # TODO: Implement softmax\n", 554 | " pass" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": { 561 | "collapsed": true, 562 | "deletable": false, 563 | "editable": false 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "%run Tests/TestSoftMaxCrossEntropyLoss.py\n", 568 | "TestSoftMaxCrossEntropyLoss.SoftMaxCrossEntropyLoss = SoftMaxCrossEntropyLoss\n", 569 | "unittest.main(argv=['first-arg-is-ignored'], exit=False)" 570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": { 575 | "deletable": false, 576 | "editable": false 577 | }, 578 | "source": [ 579 | "## Initialization\n", 580 | "\n", 581 | "Initialization is very critical for non-convex optimization problems, and neural networks are no exception. The most simple strategy is initialization with a constant value, which is frequently used for bias initialization. Generally, bias initialization with a constant of 0 is common, however, with ReLU as activation function, a small positive value is sensible to reduce the risk of \"dying ReLUs\". \n", 582 | "\n", 583 | "For other weights in FC layers and for weights in convolutional layers that we will look at in a bit, we need a different initialization strategy. If all weights are initialized with the same value, each node would receive the same update and training becomes impossible. One option to break this symmetry is uniform random initialization. Each element of $\\mathbf{W}$ is drawn from a uniform distribution with a certain range, commonly [0, 1].\n", 584 | "\n", 585 | "However, even with random initialization, finding the right range for weights is still tricky. If the weights are too small, activations become subsequently smaller when they are passed through the layers. Conversely, if they are too large, the signal grows which each subsequent layer. Both effects hinder effective training.\n", 586 | "\n", 587 | "Glorot and Bengio$^1$ investigated this problem in more detail and presented a strategy to find the \"sweet spot\" for weight initialization that keeps the variance of the input and output gradient the same. 
Given certain assumptions, this can be achieved by drawing the weights from a Gaussian distribution $\\mathcal{N}(0, \\sigma^2)$ with zero mean and a standard deviation $\\sigma$ that depends on the number of inputs $n_\\mathrm{in}$ and outputs $n_\\mathrm{out}$ of the layer. He et al.$^2$ showed that for ReLU activations, an adapted version is required to retain this property:\n", 588 | "\n", 589 | "\\begin{equation}\n", 590 | "\\sigma = \\sqrt{\\frac{2}{n_\\mathrm{in}}} \\enspace{.}\n", 591 | "\\end{equation}\n", 592 | "\n", 593 | "### Implementation task\n", 594 | "\n", 595 | "As the next task, implement the initializers ```Const```, ```UniformRandom``` and ```He``` that provide the method ```initialize``` for arbitrary weight shapes. For He initialization, the second dimension of ```weight_shape``` is assumed to be the number of input nodes. As before, run the cell below to test your implementation.\n", 596 | "\n", 597 | "$^1$ Glorot X. and Bengio Y. Understanding the difficulty of training deep feedforward neural networks. In Proc. AISTATS, PMLR 9:249-256, 2010.\n", 598 | "\n", 599 | "$^2$ He K. et al. Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification. In CoRR, abs/1502.01852, 2015." 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": { 606 | "collapsed": true 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "# %load src/layers/initializers.py\n", 611 | "class Initializer:\n", 612 | " \"\"\" Base class for initializers. \"\"\"\n", 613 | " def initialize(self, weight_shape):\n", 614 | " \"\"\" Return weights initialized according to the subclass definition. \n", 615 | " Required to work for arbitrary weight shapes.\n", 616 | " Base class. \n", 617 | " \"\"\"\n", 618 | " \n", 619 | " # Raises an exception in the base class.\n", 620 | " raise NotImplementedError('Method is not implemented')\n", 621 | "\n", 622 | " \n", 623 | "class Const(Initializer):\n", 624 | " \n", 625 | " def __init__(self, value):\n", 626 | " \"\"\" Create a constant initializer.\n", 627 | " param: value (float): constant that is used for initialization of weights\n", 628 | " \"\"\"\n", 629 | " # TODO: Implement\n", 630 | " pass\n", 631 | "\n", 632 | " def initialize(self, weight_shape):\n", 633 | " \"\"\" Return a new array of weights initialized with the constant value provided by self.value.\n", 634 | " param: weight_shape: shape of the new array\n", 635 | " returns (np.ndarray): array of the given shape\n", 636 | " \"\"\"\n", 637 | " # TODO: Implement\n", 638 | " pass\n", 639 | "\n", 640 | "class UniformRandom(Initializer):\n", 641 | " \n", 642 | " def initialize(self, weight_shape):\n", 643 | " \"\"\" Return a new array of weights initialized by drawing from a uniform distribution with range [0, 1].\n", 644 | " param: weight_shape: shape of the new array\n", 645 | " returns (np.ndarray): array of the given shape\n", 646 | " \"\"\"\n", 647 | " # TODO: Implement\n", 648 | " pass\n", 649 | "\n", 650 | " \n", 651 | "class He(Initializer):\n", 652 | " \n", 653 | " def initialize(self, weight_shape):\n", 654 | " \"\"\" Return a new array of weights initialized according to He et al.: Delving Deep into Rectifiers.\n", 655 | " param: weight_shape: shape of the np.array to be returned, the second dimension is assumed to be the \n", 656 | " number of input nodes\n", 657 | " returns (np.ndarray): array of the given shape\n", 658 | " \"\"\" \n", 659 | " # TODO: Implement\n", 660 | " pass\n", 661 | " " 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 
666 | "execution_count": null, 667 | "metadata": { 668 | "collapsed": true, 669 | "deletable": false, 670 | "editable": false 671 | }, 672 | "outputs": [], 673 | "source": [ 674 | "%run Tests/TestInitializers.py\n", 675 | "TestInitializers.Const = Const\n", 676 | "TestInitializers.Uniform = UniformRandom\n", 677 | "TestInitializers.He = He\n", 678 | "unittest.main(argv=['first-arg-is-ignored'], exit=False) " 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": { 684 | "deletable": false, 685 | "editable": false 686 | }, 687 | "source": [ 688 | "## Convolutional layers\n", 689 | "\n", 690 | "Convolutional layers are without doubt one of the key elements of the success of neural networks in recent years. The main idea is simple: Convolution with trainable filters. They allow to learn which features are important for a given task in a data driven manner. One of their big advantages is that they inherently consider the spatial layout of the data. The animation below shows an example of a 2-D convolution of a padded input (blue) with a $3 \\times 3$ filter kernel that generates the output in green:\n", 691 | "\n", 692 | "
\n", 693 | "\n", 694 | "
Source: https://github.com/vdumoulin/conv_arithmetic
\n", 695 | "
\n", 696 | "\n", 697 | "In this tutorial, we will implement a 2-D convolutional layer that is fully connected in the depth/channel direction. Accordingly, given an input with $C$ channels, each filter has a shape of $M \\times N \\times C$, where $M$ and $N$ describe the spacial dimensions of the filter. The number of channels of the output depends on the number of filters $S$ in the convolutional layer.\n", 698 | "\n", 699 | "\n", 700 | "\n", 701 | "In the example above, the input has $C = 3$ channels and the convolutional layer has $S = 2$ filters fully connected in depth direction. Accordingly, the output has two channels.\n", 702 | "\n", 703 | "### Forward pass in a Conv layer:\n", 704 | "We can compute the forward pass in multiple ways:\n", 705 | "\n", 706 | "#### As a special case of a fully connected layer: Matrix multiplication\n", 707 | "Given a fixed input size, a convolutional layer can be considered as a special case of a fully connected layer. Accordingly, we can express the forward pass using a multiplication with a sparse matrix that represents the local connections within a convolutional layer. This allows us to use the same formulas as in the forward pass for the FC layer. While this presents a rather inefficient implementation, it can help to illustrate the connection between the convolutional and the FC layer.\n", 708 | "\n", 709 | "#### Convolution\n", 710 | "The forward pass of a *convolutional* layer can of course also be straight forwardly implemented as a convolution. Different very efficient low-level implementations of convolutions are available, e.g., implementations that use fast Fourier transforms (FFT), generalized matrix multiplication (GEMM) or that are based on Winograd minimal filtering algorithms$^3$. In this tutorial, we will consider a \"naive\" convolution where we slide a filter over the image to facilitate a better understanding, and subordinate efficiency.\n", 711 | "\n", 712 | "#### Cross-correlation\n", 713 | "Cross-correlation is simply a convolution without a flipped filter. For filters that are initialized randomly, we are free to use cross-correlation instead of convolution in the forward pass. We will see that it saves us a bit of kernel flipping in the backward pass.\n", 714 | "\n", 715 | "In all cases, the bias in a convolutional layer is an element-wise addition of a scalar value for each output channel.\n", 716 | "\n", 717 | "### Backward pass in a Conv layer:\n", 718 | "\n", 719 | "In the backward pass, we need to compute the gradient with respect to the weights of the convolutional kernel, the bias and the input, given the backpropagated error tensor $E_Y$.\n", 720 | "\n", 721 | "#### Matrix multiplication\n", 722 | "Like in the forward pass, we can implement the backward pass by reusing the formulas from the fully connected layer if we express the convolution as a matrix multiplication. \n", 723 | "\n", 724 | "#### Convolution/cross-correlation\n", 725 | "We may want to have a detailed look at the animation above, pick up pen and paper and track which pixels of the input/weight and correspondingly which pixels of the error contribute to respective gradient. For the gradient with respect to the input, we can then see that we need flipped kernel weights in the spacial dimensions (width and height). 
Alternatively, if we used convolution in the forward pass, we can now apply cross-correlation, and vice versa.\n", 726 | "\n", 727 | "Next, let's have a look at the channels: If we have $S$ kernels in the forward pass, and the input has $C$ channels, we obviously need to re-arrange the weights to $C$ kernels with $S$ channels for the backward pass. \n", 728 | "\n", 729 | "(animation: img/restacking_filters.gif)\n", 730 | "\n", 731 | "The animation above shows that channel $c$ of $E_X$ depends only on channel $c$ of the kernel weights. You can further see how the channels of the kernels can be recombined to compute the gradient with respect to the input. \n", 732 | "\n", 733 | "For the gradient with respect to the weights, you can observe that a correlation operation is necessary: First, the input has to be padded by half the kernel width. Then, each channel of the input has to be correlated with channel $s$ of $E_Y$ to yield the gradient for kernel $s$. We have to compute\n", 734 | "\n", 735 | "\\begin{equation}\n", 736 | "\\frac{\\partial L}{\\partial W_{c, s}} = X_c \\star E_{Y_s} \\end{equation}\n", 737 | "\n", 738 | "for $c$ in $\\{1, ..., C\\}$ to stack together $W_s$:\n", 739 | "\n", 740 | "(figure: img/conv_back_weights.png)\n", 741 | "\n", 742 | "If convolution was used in the forward pass, the result of this correlation represents the flipped gradient, so it has to be flipped back before an update. If correlation was used instead, we save this flipping operation. To really understand this, you may want to grab pen and paper again.\n", 743 | "\n", 744 | "The gradient with respect to the bias can be computed by simply summing the error $E_Y$ over the respective output channel.\n", 745 | "\n", 746 | "Like in the fully connected layer, the gradients for the full mini-batch are the sums of the gradients of the individual elements of the batch.\n", 747 | "\n", 748 | "### Stride\n", 749 | "(animation: img/padding_strides.gif)\n", 750 | "\n", 751 | "Source: https://github.com/vdumoulin/conv_arithmetic\n", 752 | "
\n", 753 | "\n", 754 | "A strided convolution can be used to downsample the input. From a mathematical perspective, this can be expressed as a convolution followed by subsampling. Similarly, in the backward pass, $E_Y$ is first upsampled (introducing zeros), and then processed as before.\n", 755 | "\n", 756 | "### Padding\n", 757 | "In this tutorial, we will restrict the padding strategy to \"same\" padding, meaning the input will be padded with zeros such that output after the convolution has the same size as the original input.\n", 758 | "\n", 759 | "### Reshaping\n", 760 | "Convolutional layers inherently expect the input to have a certain spatial layout with possibly arbitrary size, which is different to FC layers that expect a vector of fixed size. There are two common ways to make these operations interoperable: \n", 761 | "\n", 762 | " - Flatten the input before passing it to an FC layer\n", 763 | " - Have the convolutional layers reshape the input to the correct spatial layout\n", 764 | " \n", 765 | "Here, we will implement the first option. To this end, a FlattenLayer is introduced with the sole purpose of reshaping the input to be compatible with FC layers. As no computation is involved, the backward pass simply consists of reversing the reshaping.\n", 766 | "\n", 767 | "### Implementation task\n", 768 | "\n", 769 | "In the following, implement the classes ```FlattenLayer``` and ```ConvolutionalLayer``` as described above. The necessary parameters are further described in the method documentation. \n", 770 | "\n", 771 | "Note: If you use 3D convolution/correlation (which makes sense from an implementation perspective), keep in mind that you potentially need to compensate for \"unnecessary\" flipping in the channel dimension in your implementation. Check your implementation by running the unit tests in the subsequent cell.\n", 772 | "\n", 773 | "$^3$ Lavin A., Gray S. Fast Algorithms for Convolutional Neural Networks. In Proc. CVPR, 2016. arXiv:1509.09308." 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": null, 779 | "metadata": { 780 | "collapsed": true 781 | }, 782 | "outputs": [], 783 | "source": [ 784 | "# %load src/layers/conv.py\n", 785 | "class FlattenLayer(BaseLayer):\n", 786 | " def __init__(self):\n", 787 | " # TODO: define the necessary class variables\n", 788 | " pass\n", 789 | " \n", 790 | " def forward(self, x):\n", 791 | " \"\"\" Return a flattened version of the input.\n", 792 | " param: x (np.ndarray): input, of shape [b, n_channels, p, q] where b is the batch size, \n", 793 | " n_channels is the number of channels and p x q is the image size\n", 794 | " returns (np.ndarray): a flattened representation of x of shape [b, v] \n", 795 | " where b is the batch size and v is the output size = n_channels * p * q\n", 796 | " \"\"\"\n", 797 | " # TODO: Implement flattening of the image\n", 798 | " pass\n", 799 | " \n", 800 | " def backward(self, error):\n", 801 | " \"\"\" Return the gradient with respect to the input.\n", 802 | " param: error (np.ndarray): the gradient passed down from the subsequent layer, of shape [b, m],\n", 803 | " where b is the batch size and m is the output size with m = n_channels * p * q from \n", 804 | " the forward pass\n", 805 | " returns (np.ndarray): the error with restored dimensions from the forward pass, i.e. 
with \n", 806 | " shape [b, n_channels, p, q] where b is the batch size, n_channels is the number of \n", 807 | " channels and p x q is the image size\n", 808 | " \"\"\"\n", 809 | " # TODO: Restore the image dimensions\n", 810 | " pass\n", 811 | "\n", 812 | "\n", 813 | "class ConvolutionalLayer(BaseLayer):\n", 814 | " \n", 815 | " def __init__(self, stride_shape, kernel_shape, n_kernels, learning_rate, weights_initializer=UniformRandom(), bias_initializer=Const(0.1)):\n", 816 | " \"\"\" \n", 817 | " param: stride_shape: tuple in the form of (np, nq) which denotes the subsampling factor of the \n", 818 | " convolution operation in the spatial dimensions\n", 819 | " param: kernel_shape: integer tuple in the form of (n_channels, m, n) where n_channels is \n", 820 | " the number of input channels and m x n is the size of the filter kernels\n", 821 | " param: n_kernels (int): number of kernels and therefore the number of output channels\n", 822 | " param: learning_rate (float): learning rate of this layer\n", 823 | " param: weights_initializer: initializer object for the filter weights\n", 824 | " param: bias_initializer: initializer object for the bias\n", 825 | " \"\"\"\n", 826 | " # TODO: define the necessary class variables, initialize the weights and bias\n", 827 | " self.weights = ...\n", 828 | " self.bias = ...\n", 829 | " pass \n", 830 | " \n", 831 | " def forward(self, x):\n", 832 | " \"\"\" Return the result of the forward pass of the convolutional layer.\n", 833 | " param: x (np.ndarray): input, of shape [b, n_channels, p, q], where b is the batch size, \n", 834 | " n_channels is the number of input channels and p x q is the image size\n", 835 | " returns (np.ndarray): result of the forward pass, of shape (b, n_kernels, p', q') \n", 836 | " where b is the batch size, n_kernels is the number of kernels in this layer and \n", 837 | " p' x q' is the output image size (which depends on the stride)\n", 838 | " \"\"\"\n", 839 | " # TODO: Implement forward pass of the convolutional layer\n", 840 | " pass\n", 841 | " \n", 842 | " def backward(self, error):\n", 843 | " \"\"\" Update the weights of this layer and return the gradient with respect to the input.\n", 844 | " param: error (np.ndarray): of shape (b, n_kernels, p', q') where b is the batch size, n_kernels\n", 845 | " is the number of kernels and p' x q' is the spatial error size (depends on the stride)\n", 846 | " returns (np.ndarray): the gradient with respect to the input, of shape (b, n_channels, p, q) \n", 847 | " where b is the batch size, n_channels is the number of input channels to this layer and \n", 848 | " p x q is the image size.\n", 849 | " \"\"\" \n", 850 | " # TODO: Implement backward pass of the convolutional layer\n", 851 | " pass\n", 852 | " \n", 853 | " def get_gradient_weights(self):\n", 854 | " \"\"\" Returns the gradient with respect to the weights from the last call of backward() \"\"\"\n", 855 | " # TODO: Implement\n", 856 | " pass\n", 857 | "\n", 858 | " def get_gradient_bias(self):\n", 859 | " \"\"\" Returns the gradient with respect to the bias from the last call of backward() \"\"\"\n", 860 | " # TODO: Implement\n", 861 | " pass\n", 862 | " \n", 863 | " def initialize(self, weights_initializer, bias_initializer):\n", 864 | " \"\"\" Initializes the weights/bias of this layer with the given initializers.\n", 865 | " param: weights_initializer: object providing a method weights_initializer.initialize(weights_shape)\n", 866 | " which will return initialized weights with the given shape\n", 867 | " param: 
bias_initializer: object providing a method bias_initializer.initialize(bias_shape) \n", 868 | " which will return an initialized bias with the given shape\n", 869 | " \"\"\"\n", 870 | " # TODO: Implement. To make sure that He initialization works as intended, the second dimension \n", 871 | " # of weights_shape should contain the number of input nodes, which can be computed as n_in = n_channels * m * n;\n", 872 | " # reshape the weights to the correct shape afterwards.\n", 873 | " pass" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": null, 879 | "metadata": { 880 | "collapsed": true, 881 | "deletable": false, 882 | "editable": false 883 | }, 884 | "outputs": [], 885 | "source": [ 886 | "%run Tests/TestConv.py\n", 887 | "TestConv.Conv = ConvolutionalLayer\n", 888 | "TestConv.FullyConnected = FullyConnectedLayer\n", 889 | "TestConv.He = He\n", 890 | "TestConv.Constant = Const\n", 891 | "TestConv.Flatten = FlattenLayer\n", 892 | "unittest.main(argv=['first-arg-is-ignored'], exit=False)" 893 | ] 894 | }, 895 | { 896 | "cell_type": "markdown", 897 | "metadata": { 898 | "deletable": false, 899 | "editable": false 900 | }, 901 | "source": [ 902 | "## Pooling Layers\n", 903 | "\n", 904 | "As an alternative to striding in a convolutional layer, specific pooling layers can be used to downsample the data and condense spatial information. We will look at max pooling as one example. In the forward pass, the output for each pixel is the maximum value in a neighborhood of the corresponding input pixel, calculated separately for every channel. The downsampling is again achieved by using a stride > 1.\n", 905 | "\n", 906 | "(animation: max pooling with a 3 x 3 neighborhood)\n", 907 | "\n", 908 | "Source: https://github.com/vdumoulin/conv_arithmetic\n", 909 | "
\n", 910 | "\n", 911 | "The above example shows maxpooling with a neighborhood of $3 \\times 3$ and a stride of $[1, 1]$.\n", 912 | "\n", 913 | "The maximum operation can be thought of as an on/off switch for the backpropagation of the gradient for each pixel. We therefore need to store the location of the maximum value in the forward pass. Since the layer has no trainable parameters, we only need to compute the gradient with respect to the input. In the backward pass, the subgradient is given by the colloquial rule \"the winner takes it all\". The error is routed only towards the maximum locations; for all other input pixels, the gradient is zero. If the stride is smaller than the neighborhood, the routed gradients for the respective pixels are summed up.\n", 914 | "\n", 915 | "### Implementation task\n", 916 | "\n", 917 | "In the following, implement the class ```MaxPoolLayer```. Check your implementation as usual by running the unittests in the cell below the implementation." 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 922 | "execution_count": null, 923 | "metadata": { 924 | "collapsed": true 925 | }, 926 | "outputs": [], 927 | "source": [ 928 | "# %load src/layers/pooling\n", 929 | "class MaxPoolLayer(BaseLayer):\n", 930 | " \n", 931 | " def __init__(self, neighborhood=(2, 2), stride=(2, 2)):\n", 932 | " \"\"\" Max pooling layer.\n", 933 | " param: neighborhood: tuple with shape (sp, sq) which denote the kernel size of the pooling operation in \n", 934 | " the spatial dimensions\n", 935 | " param: stride: tuple with shape (np, nq) which denote the subsampling factor of the pooling operation in\n", 936 | " the spacial dimensions\n", 937 | " \"\"\"\n", 938 | " # TODO: define necessary class variables\n", 939 | " pass\n", 940 | " \n", 941 | " def forward(self, x):\n", 942 | " \"\"\" Return the result of maxpooling on the input.\n", 943 | " param: x (np.ndarray) with shape (b, n_channels, p, q) where b is the batch size, \n", 944 | " n_channels is the number of input channels and p x q is the image size\n", 945 | " returns (np.ndarray): the result of max pooling, of shape (b, n_channels, p', q')\n", 946 | " where b is the batch size, n_channels is the number of input channels and \n", 947 | " p' x q' is the new image size reduced by the stride. \n", 948 | " \"\"\"\n", 949 | " # TODO: Implement forward pass of max pooling\n", 950 | " pass\n", 951 | " \n", 952 | " def backward(self, error):\n", 953 | " \"\"\" Return the gradient with respect to the previous layer.\n", 954 | " param: error(np.ndarray): the gradient passed own from the subsequent layer, \n", 955 | " of shape [b, n_channels, p', q'] where b is the batch size, n_channels is the \n", 956 | " number of channels and p' x q' is the image size reduced by the stride\n", 957 | " returns (np.ndarray): the gradient w.r.t. 
the previous layer, of shape [b, n_channels, p, q] \n", 958 | " where b is the batch size, n_channels is the number of input channels to this layer and \n", 959 | " p x q is the image size prior to downsampling.\n", 960 | " \"\"\"\n", 961 | " # TODO: Implement backward pass of max pooling\n", 962 | " pass" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": null, 968 | "metadata": { 969 | "collapsed": true, 970 | "deletable": false, 971 | "editable": false 972 | }, 973 | "outputs": [], 974 | "source": [ 975 | "%run Tests/TestMaxPoolLayer.py\n", 976 | "TestMaxPooling.MaxPooling = MaxPoolLayer\n", 977 | "TestMaxPooling.FullyConnected = FullyConnectedLayer\n", 978 | "TestMaxPooling.Flatten = FlattenLayer\n", 979 | "unittest.main(argv=['first-arg-is-ignored'], exit=False)" 980 | ] 981 | }, 982 | { 983 | "cell_type": "markdown", 984 | "metadata": { 985 | "deletable": false, 986 | "editable": false 987 | }, 988 | "source": [ 989 | "## Dropout\n", 990 | "\n", 991 | "Most successful deep learning models use regularization techniques intended to decrease the gap between training and test accuracy. The goal is to bias training towards a model with lower training accuracy but better generalization capability. One prominent technique is dropout. It was used, for example, in the famous AlexNet network. \n", 992 | "The idea of this technique is to break dependencies between features by setting random activations to zero during training. This is typically done with a Bernoulli distribution: In each training iteration, the probability for a certain activation to \"drop out\" is $p$.\n", 993 | "The application of dropout shifts the mean of the activations because many elements are set to zero during training. At test time, when no elements are dropped out, the mean is different, which can decrease performance. To combat this, the \"training mean\" can be restored by multiplying all activations with $1 - p$ at test time.\n", 994 | " \n", 995 | "### Inverted dropout\n", 996 | "The multiplication at test time can be avoided by adapting the dropout behaviour during training. This means that the dropout layer can actually be skipped completely at test time, allowing for faster inference. To this end, the activations are multiplied by $\\frac{1}{1 - p}$ after applying the stochastic function during training. This way, the mean is not changed by the layer and no operation needs to be performed at test time.\n", 997 | "\n", 998 | "\n", 999 | "### Implementation task\n", 1000 | "In the following, implement the ```DropOut``` layer with inverted dropout. As usual, check your implementation by running the unittests. Note that dropout operates on each element of the input vector independently." 
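The inverted-dropout rescaling described above can be summarized in a short sketch. It assumes NumPy and a plain boolean training flag instead of the notebook's Phase mechanism, and the class name is a placeholder; it is only one possible solution to the exercise below.

```python
import numpy as np

class InvertedDropoutSketch:
    """Minimal sketch of inverted dropout (placeholder name)."""

    def __init__(self, probability):
        self.probability = probability  # probability of dropping an activation
        self.training = True            # stand-in for the Phase.train / Phase.test mechanism

    def forward(self, x):
        if not self.training:
            # Inverted dropout needs no operation at test time.
            return x
        keep = 1.0 - self.probability
        # Bernoulli mask, rescaled by 1 / (1 - p) so the expected activation stays the same.
        self.mask = (np.random.rand(*x.shape) < keep) / keep
        return x * self.mask

    def backward(self, error):
        # The gradient flows only through the activations that were kept (same scaling).
        return error * self.mask
```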
1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": null, 1006 | "metadata": { 1007 | "collapsed": true 1008 | }, 1009 | "outputs": [], 1010 | "source": [ 1011 | "# %load src/layers/dropout\n", 1012 | "class DropOut(BaseLayer):\n", 1013 | " \n", 1014 | " def __init__(self, probability):\n", 1015 | " \"\"\" DropOut Layer.\n", 1016 | " param: drop_probability: probability of each individual activation to be set to zero, in range [0, 1] \n", 1017 | " \"\"\"\n", 1018 | " # TODO: Implement initialization\n", 1019 | " \n", 1020 | " pass\n", 1021 | " \n", 1022 | " def forward(self, x):\n", 1023 | " \"\"\" Forward pass through the layer: Set activations of the input randomly to zero.\n", 1024 | " param: x (np.ndarray): input\n", 1025 | " returns (np.ndarray): a new array of the same shape as x, after dropping random elements\n", 1026 | " \"\"\"\n", 1027 | " # TODO: Implement forward pass of the Dropout layer\n", 1028 | " # Hint: Make sure to treat training and test phase accordingly.\n", 1029 | " pass\n", 1030 | " \n", 1031 | " def backward(self, error):\n", 1032 | " \"\"\" Backward pass through the layer: Return the gradient with respect to the input.\n", 1033 | " param: error (np.ndarray): error passed down from the subsequent layer, of the same shape as the \n", 1034 | " output of the forward pass\n", 1035 | " returns (np.ndarray): gradient with respect to the input, of the same shape as error\n", 1036 | " \"\"\"\n", 1037 | " # TODO: Implement backward pass of the Dropout layer\n", 1038 | " pass" 1039 | ] 1040 | }, 1041 | { 1042 | "cell_type": "code", 1043 | "execution_count": null, 1044 | "metadata": { 1045 | "collapsed": true, 1046 | "deletable": false, 1047 | "editable": false 1048 | }, 1049 | "outputs": [], 1050 | "source": [ 1051 | "%run Tests/TestDropout.py\n", 1052 | "TestDropout.DropOut = DropOut\n", 1053 | "TestDropout.Phase = Phase\n", 1054 | "unittest.main(argv=['first-arg-is-ignored'], exit=False)" 1055 | ] 1056 | }, 1057 | { 1058 | "cell_type": "markdown", 1059 | "metadata": { 1060 | "deletable": false, 1061 | "editable": false 1062 | }, 1063 | "source": [ 1064 | "## LeNet\n", 1065 | "\n", 1066 | "As the last part of this tutorial, we use our developed operators to construct a simple neural network inspired by the traditional LeNet architecture:\n", 1067 | "\n", 1068 | "
\n", 1069 | "\n", 1070 | "
Source: LeCun et al, 1998.$^4$
\n", 1071 | "
\n", 1072 | "\n", 1073 | "Use two convolutional layers with $5 \\times 5$ kernels and $6$ respectively $10$ channels. Each convolution is followed by a ReLU unit and max pooling of with a neighborhood and stride of 2 in each dimension. The top of the network is formed by three FC layers with ReLU activations producing outputs of dimensionality $120$, $84$ and subsequently the number of categories. Finally, use the SoftMaxCrossEntropyLoss as loss layer.\n", 1074 | "\n", 1075 | "First, have a look at the class ```NeuralNetwork```, that provides the basic framework in which you can use the different layers and stack them together to a functioning network. You don't need to adapt this class, but you can use it to implement the LeNet architecture. You may also want to refer back to the [description](#network_description) in the beginning.\n", 1076 | "\n", 1077 | "### Implementation task\n", 1078 | "\n", 1079 | "Next, implement the LeNet architecture in the ```build``` function and train your network in with the script provided below. \n", 1080 | "\n", 1081 | "Experiment for example with the activation function and DropOut, tune the learning rate or look at the effect of initialization. Feel free to add your own evaluations and plots. You can get the full test data of the MNIST data object by calling ```net.data_layer.get_test_set```.\n", 1082 | "\n", 1083 | "$^4$ LeCun Y., Bottou L., Bengio Y. and Haffner P. Gradient-based Learning Applied to Document Recognition. In Proc. IEEE, 1989." 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "code", 1088 | "execution_count": null, 1089 | "metadata": { 1090 | "collapsed": true, 1091 | "deletable": false 1092 | }, 1093 | "outputs": [], 1094 | "source": [ 1095 | "# %load src/network.py\n", 1096 | "\n", 1097 | "# Nothing to do in this cell: Just make yourself familiar with the NeuralNetwork class.\n", 1098 | "class NeuralNetwork:\n", 1099 | " def __init__(self, weights_initializer, bias_initializer):\n", 1100 | " # list which will contain the loss after training\n", 1101 | " self.loss = []\n", 1102 | " self.data_layer = None # the layer providing data\n", 1103 | " self.loss_layer = None # the layer calculating the loss and the prediction\n", 1104 | " self.layers = []\n", 1105 | " self.weights_initializer = weights_initializer\n", 1106 | " self.bias_initializer = bias_initializer\n", 1107 | " self.label_tensor = None # the labels of the current iteration\n", 1108 | "\n", 1109 | " def append_fixed_layer(self, layer):\n", 1110 | " \"\"\" Add a non-trainable layer to the network. \"\"\"\n", 1111 | " self.layers.append(layer)\n", 1112 | " \n", 1113 | " def append_trainable_layer(self, layer):\n", 1114 | " \"\"\" Add a new layer with trainable parameters to the network. Initialize the parameters of \n", 1115 | " the network using the object's initializers for weights and bias.\n", 1116 | " \"\"\"\n", 1117 | " layer.initialize(self.weights_initializer, self.bias_initializer)\n", 1118 | " self.layers.append(layer)\n", 1119 | "\n", 1120 | " def forward(self):\n", 1121 | " \"\"\" Compute the forward pass through the network. 
\"\"\"\n", 1122 | " # fetch some training data\n", 1123 | " input_tensor, self.label_tensor = self.data_layer.forward()\n", 1124 | " # defer iterating through the network\n", 1125 | " activation_tensor = self.__forward_input(input_tensor)\n", 1126 | " # calculate the loss of the network using the final loss layer\n", 1127 | " return self.loss_layer.forward(activation_tensor, self.label_tensor)\n", 1128 | "\n", 1129 | " def __forward_input(self, input_tensor):\n", 1130 | " \"\"\" Compute the forward pass through the network, stopping before the \n", 1131 | " loss layer.\n", 1132 | " param: input_tensor (np.ndarray): input to the network\n", 1133 | " returns: activation of the last \"regular\" layer\n", 1134 | " \"\"\"\n", 1135 | " activation_tensor = input_tensor\n", 1136 | " # pass the input up the network\n", 1137 | " for layer in self.layers:\n", 1138 | " activation_tensor = layer.forward(activation_tensor)\n", 1139 | " # return the activation of the last layer\n", 1140 | " return activation_tensor\n", 1141 | "\n", 1142 | " def backward(self):\n", 1143 | " \"\"\" Perform the backward pass during training. \"\"\"\n", 1144 | " error_tensor = self.loss_layer.backward(self.label_tensor)\n", 1145 | " # pass back the error recursively\n", 1146 | " for layer in reversed(self.layers):\n", 1147 | " error_tensor = layer.backward(error_tensor)\n", 1148 | "\n", 1149 | " def train(self, iterations):\n", 1150 | " \"\"\" Train the network for a fixed number of steps.\n", 1151 | " param: iterations (int): number of iterations for training \n", 1152 | " \"\"\"\n", 1153 | " for layer in self.layers:\n", 1154 | " layer.phase = Phase.train # Make sure phase is set to \"train\" for all layers\n", 1155 | " for i in range(iterations):\n", 1156 | " loss = self.forward() # go up the network\n", 1157 | " self.loss.append(loss) # save the loss\n", 1158 | " self.backward() # and down again\n", 1159 | " print('.', end='')\n", 1160 | "\n", 1161 | "\n", 1162 | " def test(self, input_tensor):\n", 1163 | " \"\"\" Apply the (trained) network to input data to generate a prediction. 
\n", 1164 | " param: input_tensor (nd.nparray): input (image or vector)\n", 1165 | " returns (np.ndarray): prediction by the network\n", 1166 | " \"\"\"\n", 1167 | " for layer in self.layers:\n", 1168 | " layer.phase = Phase.test # Make sure phase is set to \"test\" for all layers\n", 1169 | " activation_tensor = self.__forward_input(input_tensor)\n", 1170 | " return self.loss_layer.predict(activation_tensor)" 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "code", 1175 | "execution_count": null, 1176 | "metadata": { 1177 | "collapsed": true 1178 | }, 1179 | "outputs": [], 1180 | "source": [ 1181 | "def build():\n", 1182 | " # returns: a neural network architecture built according to the provided specification\n", 1183 | " \n", 1184 | " net = NeuralNetwork(He(), Const(0.1))\n", 1185 | " learning_rate = 0.001\n", 1186 | " categories = 10 # MNIST, numbers 0-9\n", 1187 | " \n", 1188 | " # TODO: Implement the architecture by adding layers to net\n", 1189 | "\n", 1190 | " return net" 1191 | ] 1192 | }, 1193 | { 1194 | "cell_type": "code", 1195 | "execution_count": null, 1196 | "metadata": { 1197 | "collapsed": true, 1198 | "deletable": false 1199 | }, 1200 | "outputs": [], 1201 | "source": [ 1202 | "import matplotlib\n", 1203 | "import numpy as np\n", 1204 | "import matplotlib.pyplot as plt\n", 1205 | "\n", 1206 | "net = build()\n", 1207 | "\n", 1208 | "from Tests import Helpers\n", 1209 | "net.data_layer = Helpers.MNISTData(20)\n", 1210 | "n_iters = 100\n", 1211 | "net.train(n_iters)\n", 1212 | "\n", 1213 | "plt.plot(range(n_iters), net.loss)\n", 1214 | "\n" 1215 | ] 1216 | }, 1217 | { 1218 | "cell_type": "code", 1219 | "execution_count": null, 1220 | "metadata": { 1221 | "collapsed": true 1222 | }, 1223 | "outputs": [], 1224 | "source": [ 1225 | "# Perform the prediction for a random test sample from the dataset:\n", 1226 | "x, l = net.data_layer.get_random_test_sample()\n", 1227 | "plt.imshow(x[:28*28].reshape(28, 28), cmap='gray')\n", 1228 | "\n", 1229 | "print(x.shape)\n", 1230 | "print('Prediction with highest output: {}'.format(np.argmax(net.test(x))))\n", 1231 | "print('Ground truth: {}'.format(np.argmax(l)))\n" 1232 | ] 1233 | }, 1234 | { 1235 | "cell_type": "markdown", 1236 | "metadata": { 1237 | "deletable": false, 1238 | "editable": false 1239 | }, 1240 | "source": [ 1241 | "## Summary and Outlook\n", 1242 | "In this tutorial, we implemented some of the most common building blocks of neural networks, including fully connected layers, activation functions, convolutional layers and regularization operators. Finally, we combined these operators to working network.\n", 1243 | "\n", 1244 | "We covered only a small subset of elements that are relevant for neural networks. We encourage you to play with other operators, for example batch normalization$^5$, alternative activation functions, initialization strategies or recurrent units. You may also refactor the framework to experiment with different optimizers, like SGD with momentum, Adam or AdaGrad, or extend the framework to allow for weight decay.\n", 1245 | "\n", 1246 | "We hope you enjoyed this tutorial and gained a deeper understanding of neural network operators and frameworks. Have fun on your journey further into deep learning and neural networks!\n", 1247 | "\n", 1248 | "$^5$ Ioffe S., Szegedy C. Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. In Proc. ICML, 2015." 
1249 | ] 1250 | } 1251 | ], 1252 | "metadata": { 1253 | "celltoolbar": "Edit Metadata", 1254 | "kernelspec": { 1255 | "display_name": "Python 3", 1256 | "language": "python", 1257 | "name": "python3" 1258 | }, 1259 | "language_info": { 1260 | "codemirror_mode": { 1261 | "name": "ipython", 1262 | "version": 3 1263 | }, 1264 | "file_extension": ".py", 1265 | "mimetype": "text/x-python", 1266 | "name": "python", 1267 | "nbconvert_exporter": "python", 1268 | "pygments_lexer": "ipython3", 1269 | "version": "3.6.1" 1270 | } 1271 | }, 1272 | "nbformat": 4, 1273 | "nbformat_minor": 2 1274 | } 1275 | --------------------------------------------------------------------------------